
Commit 4c597de

torch backend bugfix and speedup ut (#793)
1 parent add749a commit 4c597de

9 files changed: +287 additions, -18 deletions


README.md

Lines changed: 5 additions & 6 deletions
@@ -19,7 +19,7 @@ AutoRound
 AutoRound is an advanced quantization library designed for Large Language Models (LLMs) and Vision-Language Models (VLMs). It delivers high accuracy at ultra-low bit widths (2–4 bits) with minimal tuning by leveraging sign-gradient descent and offering broad hardware compatibility. Check out our paper on [arxiv](https://arxiv.org/pdf/2309.05516) for more details and quantized models in several
 Hugging Face Spaces,
 e.g. [Intel](https://huggingface.co/Intel), [OPEA](https://huggingface.co/OPEA), [Kaitchup](https://huggingface.co/kaitchup)
-and [fbaldassarri](https://huggingface.co/fbaldassarri).
+and [fbaldassarri](https://huggingface.co/fbaldassarri). Please check out [User guide](./docs/step_by_step.md) for more details

 <p align="center">
   <img src="docs/imgs/autoround_overview.png" alt="AutoRound Overview" width="80%">
@@ -33,13 +33,12 @@ and [fbaldassarri](https://huggingface.co/fbaldassarri).

 [2025/07] AutoRound now offers experimental support for **GGUF** format, and recommends using optimized RTN mode (--iters 0) for
 all bits other than 3 bits. Example
-models: [Intel/Qwen3-235B-A22B-q2ks-mixed-ar](https://huggingface.co/Intel/Qwen3-235B-A22B-q2ks-ar)
-and [Intel/DeepSeek-R1-0528-q2ks-mixed-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-q2ks-mixed-ar). **A more advanced algorithm** tailored for specific configurations may be available in
+models: [Intel/Qwen3-235B-A22B-q2ks-mixed-AutoRound](https://huggingface.co/Intel/Qwen3-235B-A22B-q2ks-mixed-AutoRound)
+and [Intel/DeepSeek-R1-0528-q2ks-mixed-AutoRound](https://huggingface.co/Intel/DeepSeek-R1-0528-q2ks-mixed-AutoRound). **A more advanced algorithm** tailored for specific configurations may be available in
 v0.6.2.

 [2025/05] AutoRound provides some recipes for **DeepSeek-R1-0528**, please refer
-to [Intel/DeepSeek-R1-0528-int2-mixed-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-int2-mixed-ar), [Intel/DeepSeek-R1-0528-int4-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-int4-ar)
-and [Intel/DeepSeek-R1-0528-int4-asym-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-int4-asym-ar) for
+to [OPEA/DeepSeek-R1-0528-int2-mixed-AutoRound](https://huggingface.co/OPEA/DeepSeek-R1-0528-int2-mixed-AutoRound) and [OPEA/DeepSeek-R1-0528-int4-AutoRound](https://huggingface.co/OPEA/DeepSeek-R1-0528-int4-AutoRound) for
 more details.

 [2025/05] AutoRound has been integrated into **vLLM**. You can now run models in the AutoRound format directly with
@@ -112,7 +111,6 @@ pip install auto-round-lib

 ## Model Quantization (CPU/Intel GPU/Gaudi/CUDA)

-Please check out [User guide](./docs/step_by_step.md) for more details
 ### Command Line Usage
 Please change to `auto-round-mllm` for visual-language models (VLMs) quantization. The full list of supported arguments is provided by calling `auto-round -h` on the terminal.

@@ -327,3 +325,4 @@ If you find AutoRound helpful, please ⭐ star the repo and share it with your c



+
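For reference, a minimal sketch of the quantization flow the README points to, using the Python API exercised elsewhere in this commit; the model name, bit width, and output directory are illustrative, and `iters=0` selects the RTN mode mentioned in the news entry above:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

# Illustrative small model; any causal LM supported by AutoRound should work similarly.
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# iters=0 enables the optimized RTN mode; a positive iters runs the tuning loop instead.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=True, iters=0)

# Export the quantized checkpoint; other formats such as "auto_round:gptqmodel" or
# "gguf:q4_k_m" appear in the tests touched by this commit.
autoround.quantize_and_save(output_dir="./tmp_autoround", format="auto_round")
```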

auto_round_extension/torch/qlinear_torch.py

Lines changed: 0 additions & 3 deletions
@@ -127,7 +127,6 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None):

         if isinstance(zeros, torch.Tensor):
             zeros = zeros.t().contiguous()
-            zeros -= 1
             # zeros = zeros.numpy().astype(np.uint32)
             qzeros = torch.zeros(
                 (zeros.shape[0], zeros.shape[1] // 32 * self.bits), device=self.device, dtype=torch.int32
@@ -143,7 +142,6 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None):
                 col += 1
             self.qzeros = qzeros.cpu()
         else:
-            zeros -= 1
             shape = scales_t.shape
             value = 0
             for j in range(0, (32 // self.bits)):
@@ -338,7 +336,6 @@ def forward(self, x):
         repeat_scales = self.scales.repeat_interleave(self.group_size, dim=0)
         repeat_zeros = zeros.repeat_interleave(self.group_size, dim=0)
         weights = repeat_scales * (weight - repeat_zeros)
-
         weights = weights.to(x_dtype)
         out = torch.matmul(x, weights)
         out = out.to(x_dtype)

test/test_cpu/test_export.py

Lines changed: 2 additions & 0 deletions
@@ -218,6 +218,7 @@ def test_static_afp8_export(self, static_kv_dtype):
             iters=0,
             act_bits=8,
             nsamples=2,
+            seqlen=2,
             data_type="fp8",
             act_data_type="fp8",
             act_dynamic=False,
@@ -249,6 +250,7 @@ def test_static_afp8_export(self, static_kv_dtype):
             iters=1,
             act_bits=8,
             nsamples=2,
+            seqlen=2,
             data_type="fp8",
             act_data_type="fp8",
             act_dynamic=False,

test/test_cpu/test_gguf_format.py

Lines changed: 11 additions & 1 deletion
@@ -54,7 +54,15 @@ def test_basic_usage(self):
     def test_q4_0(self):
         bits, group_size, sym = 4, 32, True
         autoround = AutoRound(
-            self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int", nsamples=1
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=1,
+            data_type="int",
+            nsamples=1,
+            seqlen=8,
         )
         quantized_model_path = "./saved"

@@ -101,6 +109,7 @@ def test_func(self):
             # sym=sym,
             iters=1,
             nsamples=1,
+            seqlen=10,
             # data_type="int"
         )
         quantized_model_path = "./saved"
@@ -197,6 +206,7 @@ def test_gguf_baseline(self):
             sym=True,
             iters=0,
             nsamples=8,
+            seqlen=2,
             data_type="rtn_int_sym_dq",
             super_group_size=16,
             super_bits=6,

test/test_cpu/test_llmcompressor_w8a8.py

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,8 @@ def test_llmcompressor_w8a8(self):
             group_size=group_size,
             sym=sym,
             act_bits=act_bits,
+            seqlen=8,
+            nsamples=2,
             iters=0,
         )
         autoround.quantize()

test/test_cpu/test_scheme.py

Lines changed: 25 additions & 5 deletions
@@ -2,42 +2,54 @@
 import sys
 import unittest

+import torch
+
 from auto_round.schemes import QuantizationScheme

 sys.path.insert(0, "../..")

 from auto_round import AutoRound


+class LLMDataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
 class TestAutoRound(unittest.TestCase):
     @classmethod
     def setUpClass(self):
         self.model_name = "facebook/opt-125m"
         self.save_folder = "./saved"
+        self.llm_dataloader = LLMDataLoader()

     @classmethod
     def tearDownClass(self):
         shutil.rmtree(self.save_folder, ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)

     def test_gguf(self):
-        ar = AutoRound("Qwen/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1)
+        ar = AutoRound("Qwen/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader)
         ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m")
         self.assertEqual(ar.bits, 4)
         shutil.rmtree(self.save_folder, ignore_errors=True)

     def test_w4a16(self):
-        ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1)
+        ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader)
         self.assertEqual(ar.bits, 4)
         ar.quantize()

     def test_w2a16_rtn(self):
-        ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0)
+        ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=self.llm_dataloader)
         self.assertEqual(ar.bits, 2)
         ar.quantize()

     def test_mxfp4(self):
-        ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1)
+        ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader)
         self.assertEqual(ar.bits, 4)
         self.assertEqual(ar.act_bits, 4)
         self.assertEqual(ar.data_type, "mx_fp")
@@ -50,7 +62,15 @@ def test_scheme_in_layer_config(self):
             "model.decoder.layers.3.self_attn.v_proj": "W8A16",
             "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}),
         }
-        ar = AutoRound(self.model_name, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config)
+        ar = AutoRound(
+            self.model_name,
+            scheme="W3A16",
+            nsamples=1,
+            iters=1,
+            layer_config=layer_config,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )

         ar.quantize()
         for n, m in ar.model.named_modules():
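The test updates above share a single speedup pattern: a tiny synthetic calibration iterator plus small `seqlen`/`nsamples` values, so quantization finishes in seconds instead of minutes. A minimal standalone sketch of the same pattern (the model name and token shape are illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound


class TinyDataLoader:
    """Yields a couple of short dummy token sequences for calibration."""

    def __init__(self):
        self.batch_size = 1

    def __iter__(self):
        for _ in range(2):
            yield torch.ones([1, 10], dtype=torch.long)


model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Small seqlen/nsamples and the dummy dataset keep calibration cheap, as in the tests above.
ar = AutoRound(model, tokenizer, bits=4, group_size=128, iters=1, nsamples=1, seqlen=2, dataset=TinyDataLoader())
ar.quantize()
```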
Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
+import shutil
+import sys
+import unittest
+
+import pytest
+
+sys.path.insert(0, "../..")
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from auto_round import AutoRound, AutoRoundConfig
+from auto_round.eval.evaluation import simple_evaluate_user_model
+from auto_round.testing_utils import require_autogptq, require_gptqmodel
+
+
+class LLMDataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+class TestAutoRoundTorchBackend(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(self):
+        self.model_name = "facebook/opt-125m"
+        self.save_folder = "./saved"
+        self.llm_dataloader = LLMDataLoader()
+
+    def model_infer(self, model, tokenizer):
+        prompts = [
+            "Hello,my name is",
+            # "The president of the United States is",
+            # "The capital of France is",
+            # "The future of AI is",
+        ]
+
+        inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True)
+
+        outputs = model.generate(
+            input_ids=inputs["input_ids"].to(model.device),
+            attention_mask=inputs["attention_mask"].to(model.device),
+            do_sample=False,  ## change this to follow official usage
+            max_new_tokens=5,
+        )
+        generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)]
+
+        decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+        for i, prompt in enumerate(prompts):
+            print(f"Prompt: {prompt}")
+            print(f"Generated: {decoded_outputs[i]}")
+            print("-" * 50)
+        return decoded_outputs[0]
+
+    @classmethod
+    def tearDownClass(self):
+        shutil.rmtree(self.save_folder, ignore_errors=True)
+        shutil.rmtree("runs", ignore_errors=True)
+
+    def test_torch_4bits_asym(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
+        bits, group_size, sym = 4, 128, False
+        autoround = AutoRound(
+            model,
+            tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=0,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        quantized_model_path = self.save_folder
+        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+
+        quantization_config = AutoRoundConfig(backend="torch")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        self.model_infer(model, tokenizer)
+        result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
+        print(result["results"]["lambada_openai"]["acc,none"])
+        self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35)
+        torch.cuda.empty_cache()
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.save_folder, torch_dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        self.model_infer(model, tokenizer)
+        result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
+        print(result["results"]["lambada_openai"]["acc,none"])
+        self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35)
+        torch.cuda.empty_cache()
+        shutil.rmtree("./saved", ignore_errors=True)
+
+    def test_torch_4bits_sym(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
+        bits, group_size, sym = 4, 128, True
+        autoround = AutoRound(
+            model,
+            tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=0,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        quantized_model_path = self.save_folder
+        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+
+        quantization_config = AutoRoundConfig(backend="torch")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        self.model_infer(model, tokenizer)
+        result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
+        print(result["results"]["lambada_openai"]["acc,none"])
+        self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28)
+        torch.cuda.empty_cache()
+        shutil.rmtree(self.save_folder, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
