
Commit 4c597de

torch backend bugfix and speedup ut (#793)
1 parent add749a commit 4c597de

9 files changed: +287 additions, -18 deletions


README.md

Lines changed: 5 additions & 6 deletions
@@ -19,7 +19,7 @@ AutoRound
 AutoRound is an advanced quantization library designed for Large Language Models (LLMs) and Vision-Language Models (VLMs). It delivers high accuracy at ultra-low bit widths (2–4 bits) with minimal tuning by leveraging sign-gradient descent and offering broad hardware compatibility. Check out our paper on [arxiv](https://arxiv.org/pdf/2309.05516) for more details and quantized models in several
 Hugging Face Spaces,
 e.g. [Intel](https://huggingface.co/Intel), [OPEA](https://huggingface.co/OPEA), [Kaitchup](https://huggingface.co/kaitchup)
-and [fbaldassarri](https://huggingface.co/fbaldassarri).
+and [fbaldassarri](https://huggingface.co/fbaldassarri). Please check out [User guide](./docs/step_by_step.md) for more details

 <p align="center">
   <img src="docs/imgs/autoround_overview.png" alt="AutoRound Overview" width="80%">
@@ -33,13 +33,12 @@ and [fbaldassarri](https://huggingface.co/fbaldassarri).

 [2025/07] AutoRound now offers experimental support for **GGUF** format, and recommends using optimized RTN mode (--iters 0) for
 all bits other than 3 bits. Example
-models: [Intel/Qwen3-235B-A22B-q2ks-mixed-ar](https://huggingface.co/Intel/Qwen3-235B-A22B-q2ks-ar)
-and [Intel/DeepSeek-R1-0528-q2ks-mixed-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-q2ks-mixed-ar). **A more advanced algorithm** tailored for specific configurations may be available in
+models: [Intel/Qwen3-235B-A22B-q2ks-mixed-AutoRound](https://huggingface.co/Intel/Qwen3-235B-A22B-q2ks-mixed-AutoRound)
+and [Intel/DeepSeek-R1-0528-q2ks-mixed-AutoRound](https://huggingface.co/Intel/DeepSeek-R1-0528-q2ks-mixed-AutoRound). **A more advanced algorithm** tailored for specific configurations may be available in
 v0.6.2.

 [2025/05] AutoRound provides some recipes for **DeepSeek-R1-0528**, please refer
-to [Intel/DeepSeek-R1-0528-int2-mixed-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-int2-mixed-ar), [Intel/DeepSeek-R1-0528-int4-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-int4-ar)
-and [Intel/DeepSeek-R1-0528-int4-asym-ar](https://huggingface.co/Intel/DeepSeek-R1-0528-int4-asym-ar) for
+to [OPEA/DeepSeek-R1-0528-int2-mixed-AutoRound](https://huggingface.co/OPEA/DeepSeek-R1-0528-int2-mixed-AutoRound) and [OPEA/DeepSeek-R1-0528-int4-AutoRound](https://huggingface.co/OPEA/DeepSeek-R1-0528-int4-AutoRound) for
 more details.

 [2025/05] AutoRound has been integrated into **vLLM**. You can now run models in the AutoRound format directly with
@@ -112,7 +111,6 @@ pip install auto-round-lib

 ## Model Quantization (CPU/Intel GPU/Gaudi/CUDA)

-Please check out [User guide](./docs/step_by_step.md) for more details
 ### Command Line Usage
 Please change to `auto-round-mllm` for visual-language models (VLMs) quantization. The full list of supported arguments is provided by calling `auto-round -h` on the terminal.

@@ -327,3 +325,4 @@ If you find AutoRound helpful, please ⭐ star the repo and share it with your c



+
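For reference, a minimal sketch of the quantization flow the README points to, using the Python API exercised elsewhere in this commit; the model name, bit width, and output directory are illustrative, and `iters=0` selects the RTN mode mentioned in the news entry above:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

# Illustrative small model; any causal LM supported by AutoRound should work similarly.
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# iters=0 enables the optimized RTN mode; a positive iters runs the tuning loop instead.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=True, iters=0)

# Export the quantized checkpoint; other formats such as "auto_round:gptqmodel" or
# "gguf:q4_k_m" appear in the tests touched by this commit.
autoround.quantize_and_save(output_dir="./tmp_autoround", format="auto_round")
```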

auto_round_extension/torch/qlinear_torch.py

Lines changed: 0 additions & 3 deletions
@@ -127,7 +127,6 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None):

         if isinstance(zeros, torch.Tensor):
             zeros = zeros.t().contiguous()
-            zeros -= 1
             # zeros = zeros.numpy().astype(np.uint32)
             qzeros = torch.zeros(
                 (zeros.shape[0], zeros.shape[1] // 32 * self.bits), device=self.device, dtype=torch.int32
@@ -143,7 +142,6 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None):
                 col += 1
             self.qzeros = qzeros.cpu()
         else:
-            zeros -= 1
             shape = scales_t.shape
             value = 0
             for j in range(0, (32 // self.bits)):
@@ -338,7 +336,6 @@ def forward(self, x):
         repeat_scales = self.scales.repeat_interleave(self.group_size, dim=0)
         repeat_zeros = zeros.repeat_interleave(self.group_size, dim=0)
         weights = repeat_scales * (weight - repeat_zeros)
-
         weights = weights.to(x_dtype)
         out = torch.matmul(x, weights)
         out = out.to(x_dtype)

test/test_cpu/test_export.py

Lines changed: 2 additions & 0 deletions
@@ -218,6 +218,7 @@ def test_static_afp8_export(self, static_kv_dtype):
             iters=0,
             act_bits=8,
             nsamples=2,
+            seqlen=2,
             data_type="fp8",
             act_data_type="fp8",
             act_dynamic=False,
@@ -249,6 +250,7 @@ def test_static_afp8_export(self, static_kv_dtype):
             iters=1,
             act_bits=8,
             nsamples=2,
+            seqlen=2,
             data_type="fp8",
             act_data_type="fp8",
             act_dynamic=False,

test/test_cpu/test_gguf_format.py

Lines changed: 11 additions & 1 deletion
@@ -54,7 +54,15 @@ def test_basic_usage(self):
     def test_q4_0(self):
         bits, group_size, sym = 4, 32, True
         autoround = AutoRound(
-            self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int", nsamples=1
+            self.model,
+            self.tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=1,
+            data_type="int",
+            nsamples=1,
+            seqlen=8,
         )
         quantized_model_path = "./saved"

@@ -101,6 +109,7 @@ def test_func(self):
             # sym=sym,
             iters=1,
             nsamples=1,
+            seqlen=10,
             # data_type="int"
         )
         quantized_model_path = "./saved"
@@ -197,6 +206,7 @@ def test_gguf_baseline(self):
             sym=True,
             iters=0,
             nsamples=8,
+            seqlen=2,
             data_type="rtn_int_sym_dq",
             super_group_size=16,
             super_bits=6,

test/test_cpu/test_llmcompressor_w8a8.py

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,8 @@ def test_llmcompressor_w8a8(self):
             group_size=group_size,
             sym=sym,
             act_bits=act_bits,
+            seqlen=8,
+            nsamples=2,
             iters=0,
         )
         autoround.quantize()

test/test_cpu/test_scheme.py

Lines changed: 25 additions & 5 deletions
@@ -2,42 +2,54 @@
 import sys
 import unittest

+import torch
+
 from auto_round.schemes import QuantizationScheme

 sys.path.insert(0, "../..")

 from auto_round import AutoRound


+class LLMDataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
 class TestAutoRound(unittest.TestCase):
     @classmethod
     def setUpClass(self):
         self.model_name = "facebook/opt-125m"
         self.save_folder = "./saved"
+        self.llm_dataloader = LLMDataLoader()

     @classmethod
     def tearDownClass(self):
         shutil.rmtree(self.save_folder, ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)

     def test_gguf(self):
-        ar = AutoRound("Qwen/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1)
+        ar = AutoRound("Qwen/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader)
         ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m")
         self.assertEqual(ar.bits, 4)
         shutil.rmtree(self.save_folder, ignore_errors=True)

     def test_w4a16(self):
-        ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1)
+        ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader)
         self.assertEqual(ar.bits, 4)
         ar.quantize()

     def test_w2a16_rtn(self):
-        ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0)
+        ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=self.llm_dataloader)
         self.assertEqual(ar.bits, 2)
         ar.quantize()

     def test_mxfp4(self):
-        ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1)
+        ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader)
         self.assertEqual(ar.bits, 4)
         self.assertEqual(ar.act_bits, 4)
         self.assertEqual(ar.data_type, "mx_fp")
@@ -50,7 +62,15 @@ def test_scheme_in_layer_config(self):
             "model.decoder.layers.3.self_attn.v_proj": "W8A16",
             "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}),
         }
-        ar = AutoRound(self.model_name, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config)
+        ar = AutoRound(
+            self.model_name,
+            scheme="W3A16",
+            nsamples=1,
+            iters=1,
+            layer_config=layer_config,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )

         ar.quantize()
         for n, m in ar.model.named_modules():
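The test updates above share a single speedup pattern: a tiny synthetic calibration iterator plus small `seqlen`/`nsamples` values, so quantization finishes in seconds instead of minutes. A minimal standalone sketch of the same pattern (the model name and token shape are illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound


class TinyDataLoader:
    """Yields a couple of short dummy token sequences for calibration."""

    def __init__(self):
        self.batch_size = 1

    def __iter__(self):
        for _ in range(2):
            yield torch.ones([1, 10], dtype=torch.long)


model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Small seqlen/nsamples and the dummy dataset keep calibration cheap, as in the tests above.
ar = AutoRound(model, tokenizer, bits=4, group_size=128, iters=1, nsamples=1, seqlen=2, dataset=TinyDataLoader())
ar.quantize()
```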
Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
+import shutil
+import sys
+import unittest
+
+import pytest
+
+sys.path.insert(0, "../..")
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from auto_round import AutoRound, AutoRoundConfig
+from auto_round.eval.evaluation import simple_evaluate_user_model
+from auto_round.testing_utils import require_autogptq, require_gptqmodel
+
+
+class LLMDataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+class TestAutoRoundTorchBackend(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(self):
+        self.model_name = "facebook/opt-125m"
+        self.save_folder = "./saved"
+        self.llm_dataloader = LLMDataLoader()
+
+    def model_infer(self, model, tokenizer):
+        prompts = [
+            "Hello,my name is",
+            # "The president of the United States is",
+            # "The capital of France is",
+            # "The future of AI is",
+        ]
+
+        inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True)
+
+        outputs = model.generate(
+            input_ids=inputs["input_ids"].to(model.device),
+            attention_mask=inputs["attention_mask"].to(model.device),
+            do_sample=False,  ## change this to follow official usage
+            max_new_tokens=5,
+        )
+        generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)]
+
+        decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+        for i, prompt in enumerate(prompts):
+            print(f"Prompt: {prompt}")
+            print(f"Generated: {decoded_outputs[i]}")
+            print("-" * 50)
+        return decoded_outputs[0]
+
+    @classmethod
+    def tearDownClass(self):
+        shutil.rmtree(self.save_folder, ignore_errors=True)
+        shutil.rmtree("runs", ignore_errors=True)
+
+    def test_torch_4bits_asym(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
+        bits, group_size, sym = 4, 128, False
+        autoround = AutoRound(
+            model,
+            tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=0,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        quantized_model_path = self.save_folder
+        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel")
+
+        quantization_config = AutoRoundConfig(backend="torch")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        self.model_infer(model, tokenizer)
+        result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
+        print(result["results"]["lambada_openai"]["acc,none"])
+        self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35)
+        torch.cuda.empty_cache()
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.save_folder, torch_dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        self.model_infer(model, tokenizer)
+        result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
+        print(result["results"]["lambada_openai"]["acc,none"])
+        self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35)
+        torch.cuda.empty_cache()
+        shutil.rmtree("./saved", ignore_errors=True)
+
+    def test_torch_4bits_sym(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
+        bits, group_size, sym = 4, 128, True
+        autoround = AutoRound(
+            model,
+            tokenizer,
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            iters=0,
+            seqlen=2,
+            dataset=self.llm_dataloader,
+        )
+        quantized_model_path = self.save_folder
+        autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round")  ##will convert to gptq model
+
+        quantization_config = AutoRoundConfig(backend="torch")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        self.model_infer(model, tokenizer)
+        result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
+        print(result["results"]["lambada_openai"]["acc,none"])
+        self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28)
+        torch.cuda.empty_cache()
+        shutil.rmtree(self.save_folder, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
