From 8c9af971a462a9b540be36339da05d650f485362 Mon Sep 17 00:00:00 2001
From: youn17
Date: Tue, 16 Sep 2025 17:52:20 +0900
Subject: [PATCH 1/7] Summary: Adds SMOOTHQUANT-W8A8 quantization method to the
 TorchAO model release pipeline

- Adjusted defaults: increased calibration samples from 10 to 128 to ensure
  consistency, and reduced the max sequence length (SeqLen) from 2048 to 1024
- Updated HF CLI command: `huggingface-cli login` to `hf auth login`

Test plan:
```bash
python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant SMOOTHQUANT-W8A8 --push_to_hub --task bbh
```
---
 .../scripts/torchao_model_releases/README.md |  9 ++--
 .../quantize_and_upload.py                   | 51 ++++++++++++++++---
 2 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md
index 67866ade26..fef000a605 100644
--- a/.github/scripts/torchao_model_releases/README.md
+++ b/.github/scripts/torchao_model_releases/README.md
@@ -8,8 +8,7 @@ By default, we release FP8, INT4, INT8-INT4 checkpoints, with model card pre-fil
 
 Examples:
 ```
-# Note: first login with `huggingface-cli login`, the quantized model will be uploaded to
-# the logged in user
+# Note: first login with `hf auth login`; the quantized model will be uploaded to the logged-in user
 
 # release with default quant options (FP8, INT4, INT8-INT4)
 ./release.sh --model_id Qwen/Qwen3-8B
@@ -20,8 +19,10 @@ Examples:
 
 Note: for initial release, please include `--populate_model_card_template` to populate the model card template.
 
-### AWQ-INT4
-[AWQ](https://arxiv.org/abs/2306.00978) is a technique to improve accuracy for weight only quantization. It improves accuracy by preserving "salient" weight channels that has high impact on the accuracy of output, through multiplying the weight channel by a scale, and do the reverse for the correspnoding activation, since activation is not quantized, there is no additional loss from activation, while the quantization loss from weight can be reduced.
+### SMOOTHQUANT-W8A8 & AWQ-INT4
+[SmoothQuant](https://arxiv.org/abs/2211.10438) smooths activation outliers by migrating quantization difficulty from activations to weights through a mathematically equivalent per-channel scaling transformation. This means SmoothQuant must observe the activation distribution through calibration before applying quantization.
+
+Similar to SmoothQuant, [AWQ](https://arxiv.org/abs/2306.00978) improves accuracy by preserving "salient" weight channels that have a high impact on the accuracy of the output. Notably, AWQ uses the activation distribution to find the salient weight channels, not the weight distribution: it multiplies each such weight channel by a scale and divides the corresponding activation by the same scale. Since activations are not quantized, there is no additional loss from activations, while the quantization loss from the weights can be reduced.
 
 After eval for the INT4 checkpoint is done, we might find that some tasks have a large accuracy drop compared to the high-precision baseline. In that case, we can run calibration for that task with a few samples; tasks are selected from [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md). You can follow the [new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md) to add new tasks to lm-eval.
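To make the "mathematically equivalent per-channel scaling transformation" concrete, here is a minimal sketch of the idea (plain PyTorch with synthetic tensors; the outlier channel and the choice of alpha = 0.5 are illustrative assumptions, and this is not the torchao implementation). SmoothQuant computes a per-input-channel smoothing factor s = max|X|^alpha / max|W|^(1 - alpha), divides the activations by s, and multiplies the weights by s, which leaves the layer output unchanged:

```python
# Sketch of SmoothQuant-style rescaling (illustrative; not torchao code).
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)                # activations: 4 tokens, 8 channels
x[:, 2] *= 50.0                      # channel 2 is an activation outlier
w = torch.randn(16, 8)               # weight of a Linear(8, 16); y = x @ w.T

alpha = 0.5                          # migration strength hyperparameter
act_max = x.abs().amax(dim=0)        # per-input-channel activation range
w_max = w.abs().amax(dim=0)          # per-input-channel weight range
s = act_max.pow(alpha) / w_max.pow(1.0 - alpha)

x_smooth = x / s                     # outlier channel is flattened
w_smooth = w * s                     # quantization difficulty moves into the weights

# Mathematically equivalent: the linear layer's output is unchanged.
print(torch.allclose(x @ w.T, x_smooth @ w_smooth.T, rtol=1e-4, atol=1e-4))  # True
```

After the rescaling, the activation ranges are much more uniform across channels, which is what makes INT8 activation quantization viable; alpha trades the difficulty between the two sides (alpha = 0 leaves it all on the activations, alpha = 1 pushes it all onto the weights).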
diff --git a/.github/scripts/torchao_model_releases/quantize_and_upload.py b/.github/scripts/torchao_model_releases/quantize_and_upload.py
index 22ce6ee6df..9f28671d7a 100644
--- a/.github/scripts/torchao_model_releases/quantize_and_upload.py
+++ b/.github/scripts/torchao_model_releases/quantize_and_upload.py
@@ -18,6 +18,7 @@
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Int4WeightOnlyConfig,
+    Int8DynamicActivationInt8WeightConfig,
     Int8DynamicActivationIntxWeightConfig,
     IntxWeightOnlyConfig,
     ModuleFqnToConfig,
@@ -242,6 +243,42 @@ def _untie_weights_and_save_locally(model_id):
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 """
 
+
+_smoothquant_w8a8_quant_code = """
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig, quantize_
+from torchao.prototype.smoothquant import SmoothQuantConfig
+
+from torchao._models._eval import TransformerEvalWrapper
+model = AutoModelForCausalLM.from_pretrained(
+    model_to_quantize,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+base_config = Int8DynamicActivationInt8WeightConfig()
+quant_config = SmoothQuantConfig(base_config, step="prepare")
+quantize_(
+    model,
+    quant_config,
+)
+TransformerEvalWrapper(
+    model=model,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+).run_eval(
+    tasks=tasks,
+    limit=calibration_limit,
+)
+quant_config = SmoothQuantConfig(base_config, step="convert")
+quantize_(model, quant_config)
+
+quantized_model = model
+quant_config = SmoothQuantConfig(base_config, step="prepare_for_loading")
+quantized_model.config.quantization_config = TorchAoConfig(quant_config)
+"""
+
+
 _awq_int4_quant_code = """
 from torchao.quantization import Int4WeightOnlyConfig, quantize_
 from torchao.prototype.awq import (
@@ -592,7 +629,7 @@ def _untie_weights_and_save_locally(model_id):
 python -m executorch.examples.models.qwen3.convert_weights $(hf download {quantized_model}) pytorch_model_converted.bin
 ```
 
 Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows.
 
 (Note: ExecuTorch LLM export script requires config.json to have certain key names. The correct config to use for the LLM export script is located at examples/models/qwen3/config/4b_config.json within the ExecuTorch repo.)
@@ -651,6 +688,7 @@ def quantize_and_upload(
                 "model.embed_tokens": _int8_int4_embedding_config,
             }
         ),
+        "SMOOTHQUANT-W8A8": Int8DynamicActivationInt8WeightConfig(),
     }
 
     quant_to_quant_code = {
@@ -658,6 +696,7 @@ def quantize_and_upload(
         "INT4": _int4_quant_code,
         "INT8-INT4": _int8_int4_quant_code,
         "AWQ-INT4": _awq_int4_quant_code,
+        "SMOOTHQUANT-W8A8": _smoothquant_w8a8_quant_code,
     }
 
     # preparation
@@ -812,7 +851,7 @@ def quantize_and_upload(
     parser.add_argument(
         "--quant",
         type=str,
-        help="Quantization method. Options are FP8, INT4, INT8-INT4, AWQ-INT4",
+        help="Quantization method. Options are FP8, INT4, INT8-INT4, AWQ-INT4, SMOOTHQUANT-W8A8",
     )
     parser.add_argument(
         "--tasks",
@@ -824,14 +863,14 @@ def quantize_and_upload(
     parser.add_argument(
         "--calibration_limit",
         type=int,
-        default=10,
-        help="Number of samples to use for calibration. Default is 10.",
+        default=128,
+        help="Number of samples to use for calibration. Default is 128.",
     )
     parser.add_argument(
         "--max_seq_length",
         type=int,
-        default=2048,
-        help="Maximum sequence length of examples to calibrate and evaluate model on. Default is 2048",
+        default=1024,
+        help="Maximum sequence length of examples to calibrate and evaluate model on. Default is 1024",
     )
     parser.add_argument(
         "--push_to_hub",

From 0b367db936929ef07fca971bb6dbf640fe5837c7 Mon Sep 17 00:00:00 2001
From: youn17
Date: Thu, 18 Sep 2025 00:18:29 +0900
Subject: [PATCH 2/7] add SmoothQuant uploader

---
 .../quantize_and_upload.py | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/.github/scripts/torchao_model_releases/quantize_and_upload.py b/.github/scripts/torchao_model_releases/quantize_and_upload.py
index 9f28671d7a..379a395799 100644
--- a/.github/scripts/torchao_model_releases/quantize_and_upload.py
+++ b/.github/scripts/torchao_model_releases/quantize_and_upload.py
@@ -27,6 +27,7 @@
     PerRow,
     quantize_,
 )
+from torchao.prototype.smoothquant import SmoothQuantConfig
 
 
 def _get_username():
@@ -736,6 +737,35 @@ def quantize_and_upload(
         quantized_model = model
         quant_config = AWQConfig(base_config, step="prepare_for_loading")
         quantized_model.config.quantization_config = TorchAoConfig(quant_config)
+    elif quant == "SMOOTHQUANT-W8A8":
+        model = AutoModelForCausalLM.from_pretrained(
+            model_to_quantize,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        base_config = Int8DynamicActivationInt8WeightConfig()
+        quant_config = SmoothQuantConfig(base_config, step="prepare")
+        quantize_(
+            model,
+            quant_config,
+        )
+        TransformerEvalWrapper(
+            model=model,
+            tokenizer=tokenizer,
+            max_seq_length=max_seq_length,
+        ).run_eval(
+            tasks=tasks,
+            limit=calibration_limit,
+        )
+        quant_config = SmoothQuantConfig(base_config, step="convert")
+        quantize_(model, quant_config)
+
+        quantized_model = model
+
+        load_config = SmoothQuantConfig(base_config, step="prepare_for_loading")
+        quantized_model.config.quantization_config = TorchAoConfig(load_config)
     else:
         # other quantization methods are integrated with `from_pretrained` in huggingface transformers
         assert quant in quant_to_config, f"Unsupported quant option: {quant}"

From ede04483b38f964c412838b18d4411c0f5cc4f5a Mon Sep 17 00:00:00 2001
From: youn17
Date: Thu, 18 Sep 2025 17:47:27 +0900
Subject: [PATCH 3/7] separate docs for AWQ & SmoothQuant

---
 .github/scripts/torchao_model_releases/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md
index fef000a605..41f5719cd9 100644
--- a/.github/scripts/torchao_model_releases/README.md
+++ b/.github/scripts/torchao_model_releases/README.md
@@ -19,10 +19,11 @@ Examples:
 
 Note: for initial release, please include `--populate_model_card_template` to populate the model card template.
 
-### SMOOTHQUANT-W8A8 & AWQ-INT4
+### SMOOTHQUANT-W8A8
 [SmoothQuant](https://arxiv.org/abs/2211.10438) smooths activation outliers by migrating quantization difficulty from activations to weights through a mathematically equivalent per-channel scaling transformation. This means SmoothQuant must observe the activation distribution through calibration before applying quantization.
 
+### AWQ-INT4
+Similar to SmoothQuant, [AWQ](https://arxiv.org/abs/2306.00978) improves accuracy by preserving "salient" weight channels that have a high impact on the accuracy of the output. Notably, AWQ uses the activation distribution to find the salient weight channels, not the weight distribution: it multiplies each such weight channel by a scale and divides the corresponding activation by the same scale. Since activations are not quantized, there is no additional loss from activations, while the quantization loss from the weights can be reduced.
 
 After eval for the INT4 checkpoint is done, we might find that some tasks have a large accuracy drop compared to the high-precision baseline. In that case, we can run calibration for that task with a few samples; tasks are selected from [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md). You can follow the [new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md) to add new tasks to lm-eval.

From d3cc18a31c52fae1c4d439c33085cec0e33ed290 Mon Sep 17 00:00:00 2001
From: youn17
Date: Tue, 23 Sep 2025 02:54:44 +0900
Subject: [PATCH 4/7] rename SMOOTHQUANT-W8A8 to SMOOTHQUANT-INT8-INT8

---
 .github/scripts/torchao_model_releases/README.md  |  2 +-
 .../torchao_model_releases/quantize_and_upload.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md
index 41f5719cd9..c415678dc9 100644
--- a/.github/scripts/torchao_model_releases/README.md
+++ b/.github/scripts/torchao_model_releases/README.md
@@ -19,7 +19,7 @@ Examples:
 
 Note: for initial release, please include `--populate_model_card_template` to populate the model card template.
 
-### SMOOTHQUANT-W8A8
+### SMOOTHQUANT-INT8-INT8
 [SmoothQuant](https://arxiv.org/abs/2211.10438) smooths activation outliers by migrating quantization difficulty from activations to weights through a mathematically equivalent per-channel scaling transformation. This means SmoothQuant must observe the activation distribution through calibration before applying quantization.
 
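As an illustration of why observing the activation distribution before quantizing pays off, the hedged sketch below compares per-tensor symmetric INT8 quantization error on an outlier-heavy activation with and without a SmoothQuant-style smoothing factor (plain PyTorch with synthetic data; the factor assumes alpha = 0.5 and unit weight ranges, and the helper is hypothetical rather than a torchao API):

```python
# Illustrative sketch: smoothing shrinks per-tensor INT8 activation error.
import torch

def int8_fake_quantize(t):
    # per-tensor symmetric INT8 quantize-dequantize
    scale = t.abs().max() / 127.0
    return (t / scale).round().clamp(-127, 127) * scale

torch.manual_seed(0)
x = torch.randn(256, 8)
x[:, 2] *= 50.0                        # one outlier channel dominates the range

smooth = x.abs().amax(dim=0).sqrt()    # stand-in smoothing factor (alpha = 0.5, unit weight range)
err_plain = (int8_fake_quantize(x) - x).pow(2).mean()
err_smooth = (int8_fake_quantize(x / smooth) * smooth - x).pow(2).mean()

print(f"INT8 MSE without smoothing: {err_plain.item():.5f}")
print(f"INT8 MSE with smoothing:    {err_smooth.item():.5f}")  # much smaller
```

Without smoothing, the outlier channel sets the quantization step for every channel and the small channels lose most of their resolution; after smoothing, the ranges are balanced and the same quantizer preserves them. In the full method, the division of the activations is folded into the preceding layer's parameters, so the transformation adds no inference-time cost.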
### AWQ-INT4 diff --git a/.github/scripts/torchao_model_releases/quantize_and_upload.py b/.github/scripts/torchao_model_releases/quantize_and_upload.py index 379a395799..a98aad9f4f 100644 --- a/.github/scripts/torchao_model_releases/quantize_and_upload.py +++ b/.github/scripts/torchao_model_releases/quantize_and_upload.py @@ -245,7 +245,7 @@ def _untie_weights_and_save_locally(model_id): """ -_smoothquant_w8a8_quant_code = """ +_smoothquant_int8_int8_quant_code = """ from torchao.quantization import Int8DynamicActivationInt8WeightConfig, quantize_ from torchao.prototype.smoothquant import SmoothQuantConfig @@ -689,7 +689,7 @@ def quantize_and_upload( "model.embed_tokens": _int8_int4_embedding_config, } ), - "SMOOTHQUANT-W8A8": Int8DynamicActivationInt8WeightConfig(), + "SMOOTHQUANT-INT8-INT8": Int8DynamicActivationInt8WeightConfig(), } quant_to_quant_code = { @@ -697,7 +697,7 @@ def quantize_and_upload( "INT4": _int4_quant_code, "INT8-INT4": _int8_int4_quant_code, "AWQ-INT4": _awq_int4_quant_code, - "SMOOTHQUANT-W8A8": _smoothquant_w8a8_quant_code, + "SMOOTHQUANT-INT8-INT8": _smoothquant_int8_int8_quant_code, } # preparation @@ -737,7 +737,7 @@ def quantize_and_upload( quantized_model = model quant_config = AWQConfig(base_config, step="prepare_for_loading") quantized_model.config.quantization_config = TorchAoConfig(quant_config) - elif quant == "SMOOTHQUANT-W8A8": + elif quant == "SMOOTHQUANT-INT8-INT8": model = AutoModelForCausalLM.from_pretrained( model_to_quantize, device_map="auto", @@ -881,7 +881,7 @@ def quantize_and_upload( parser.add_argument( "--quant", type=str, - help="Quantization method. Options are FP8, INT4, INT8-INT4, AWQ-INT4, SMOOTHQUANT-W8A8", + help="Quantization method. Options are FP8, INT4, INT8-INT4, AWQ-INT4, SMOOTHQUANT-INT8-INT8", ) parser.add_argument( "--tasks", From bdea42cedd29566a6880bf1a32f2a423fde4e3be Mon Sep 17 00:00:00 2001 From: youn17 Date: Tue, 23 Sep 2025 02:55:22 +0900 Subject: [PATCH 5/7] add SmoothQuant release example --- .github/scripts/torchao_model_releases/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md index c415678dc9..969435de3b 100644 --- a/.github/scripts/torchao_model_releases/README.md +++ b/.github/scripts/torchao_model_releases/README.md @@ -32,6 +32,9 @@ Examples: # release AWQ-INT4 model, calibrated with a specific task # with some calibration_limit (number of samples) python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant AWQ-INT4 --push_to_hub --task bbh --calibration_limit 2 + +# release SMOOTHQUANT-INT8-INT8 model, calibrated with a specific task +python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant SMOOTHQUANT-INT8-INT8 --push_to_hub --task bbh --populate_model_card_template ``` ### Update checkpoints for a different user_id (e.g. 
pytorch)

From 6c62463b383bf32730c7ad008f9010b47ddc5358 Mon Sep 17 00:00:00 2001
From: younn17
Date: Thu, 25 Sep 2025 05:29:53 +0900
Subject: [PATCH 6/7] update example in docs

---
 .github/scripts/torchao_model_releases/README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md
index 969435de3b..84070ab79f 100644
--- a/.github/scripts/torchao_model_releases/README.md
+++ b/.github/scripts/torchao_model_releases/README.md
@@ -22,6 +22,12 @@ Note: for initial release, please include `--populate_model_card_template` to po
 ### SMOOTHQUANT-INT8-INT8
 [SmoothQuant](https://arxiv.org/abs/2211.10438) smooths activation outliers by migrating quantization difficulty from activations to weights through a mathematically equivalent per-channel scaling transformation. This means SmoothQuant must observe the activation distribution through calibration before applying quantization.
 
+Examples:
+```
+# release SMOOTHQUANT-INT8-INT8 model, calibrated with a specific task
+python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant SMOOTHQUANT-INT8-INT8 --push_to_hub --task bbh --populate_model_card_template
+```
+
 ### AWQ-INT4
 Similar to SmoothQuant, [AWQ](https://arxiv.org/abs/2306.00978) improves accuracy by preserving "salient" weight channels that have a high impact on the accuracy of the output. Notably, AWQ uses the activation distribution to find the salient weight channels, not the weight distribution: it multiplies each such weight channel by a scale and divides the corresponding activation by the same scale. Since activations are not quantized, there is no additional loss from activations, while the quantization loss from the weights can be reduced.
 
@@ -32,9 +38,6 @@ Examples:
 # release AWQ-INT4 model, calibrated with a specific task
 # with some calibration_limit (number of samples)
 python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant AWQ-INT4 --push_to_hub --task bbh --calibration_limit 2
-
-# release SMOOTHQUANT-INT8-INT8 model, calibrated with a specific task
-python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant SMOOTHQUANT-INT8-INT8 --push_to_hub --task bbh --populate_model_card_template
 ```
 
 ### Update checkpoints for a different user_id (e.g. 
pytorch) From cc58f524f577c528a51a6eb3344edfdd30d0f377 Mon Sep 17 00:00:00 2001 From: younn17 Date: Thu, 25 Sep 2025 15:49:51 +0900 Subject: [PATCH 7/7] rename SMOOTHQUANT-INT8-INT8 to SmoothQuant-INT8-INT8 --- .../scripts/torchao_model_releases/quantize_and_upload.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/scripts/torchao_model_releases/quantize_and_upload.py b/.github/scripts/torchao_model_releases/quantize_and_upload.py index a98aad9f4f..8bceca3b8e 100644 --- a/.github/scripts/torchao_model_releases/quantize_and_upload.py +++ b/.github/scripts/torchao_model_releases/quantize_and_upload.py @@ -689,7 +689,7 @@ def quantize_and_upload( "model.embed_tokens": _int8_int4_embedding_config, } ), - "SMOOTHQUANT-INT8-INT8": Int8DynamicActivationInt8WeightConfig(), + "SmoothQuant-INT8-INT8": Int8DynamicActivationInt8WeightConfig(), } quant_to_quant_code = { @@ -697,7 +697,7 @@ def quantize_and_upload( "INT4": _int4_quant_code, "INT8-INT4": _int8_int4_quant_code, "AWQ-INT4": _awq_int4_quant_code, - "SMOOTHQUANT-INT8-INT8": _smoothquant_int8_int8_quant_code, + "SmoothQuant-INT8-INT8": _smoothquant_int8_int8_quant_code, } # preparation @@ -737,7 +737,7 @@ def quantize_and_upload( quantized_model = model quant_config = AWQConfig(base_config, step="prepare_for_loading") quantized_model.config.quantization_config = TorchAoConfig(quant_config) - elif quant == "SMOOTHQUANT-INT8-INT8": + elif quant == "SmoothQuant-INT8-INT8": model = AutoModelForCausalLM.from_pretrained( model_to_quantize, device_map="auto", @@ -881,7 +881,7 @@ def quantize_and_upload( parser.add_argument( "--quant", type=str, - help="Quantization method. Options are FP8, INT4, INT8-INT4, AWQ-INT4, SMOOTHQUANT-INT8-INT8", + help="Quantization method. Options are FP8, INT4, INT8-INT4, AWQ-INT4, SmoothQuant-INT8-INT8", ) parser.add_argument( "--tasks",