vllm-project · kylesayrs · Oct 22, 2025 · Oct 18, 2025 · Oct 18, 2025 · Oct 18, 2025
diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -0,0 +1,125 @@
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+from llmcompressor import oneshot
+from llmcompressor.modeling import replace_modules_for_calibration
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# NOTE: Requires a minimum of transformers 4.57.0
+
+MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+
+# Load model in bfloat16; `dtype` supersedes the deprecated `torch_dtype` keyword.
+model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+    MODEL_ID, dtype=torch.bfloat16, device_map=None, trust_remote_code=True
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = replace_modules_for_calibration(model)
+
+DATASET_ID = "neuralmagic/calibration"
+NUM_CALIBRATION_SAMPLES = 256  # also the size of the dataset slice loaded below
+MAX_SEQUENCE_LENGTH = 8192  # truncation limit, in tokens, for each sample
+
+ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)  # fixed seed so the sample order is reproducible
+
+
+def preprocess_function(example):
+    """Tokenize one calibration row with the processor's chat template.
+
+    Every message keeps its role; its content is wrapped as one "text" part.
+    """
+    chat = [
+        {"role": turn["role"], "content": [{"type": "text", "text": turn["content"]}]}
+        for turn in example["messages"]
+    ]
+    return processor.apply_chat_template(
+        chat,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+        add_generation_prompt=False,
+        add_special_tokens=False,
+        padding=False,
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+    )
+
+
+# Tokenize row by row, keeping only the columns produced by preprocess_function.
+ds = ds.map(preprocess_function, remove_columns=ds.column_names, batched=False)
+
+
+def data_collator(batch):
+    """Turn the single sample in ``batch`` into a dict of tensors (batch size 1)."""
+    assert len(batch) == 1
+    collated = {}
+    for name, column in batch[0].items():
+        if name == "pixel_values":  # bfloat16, with any size-1 leading dim squeezed
+            collated[name] = torch.tensor(column, dtype=torch.bfloat16).squeeze(0)
+        else:
+            collated[name] = torch.tensor(column)
+    return collated
+
+
+# Configure AWQ with an explicit weight-only quantization scheme (duo_scaling on).
+# NOTE: Using W4A16 quantization with group_size=32
+# (default W4A16 preset uses 128)
+recipe = AWQModifier(
+    ignore=[
+        "re:.*embed_tokens",
+        "re:.*input_layernorm$",
+        "re:.*mlp[.]gate$",  # NOTE(review): presumably the MoE router — confirm
+        "re:.*post_attention_layernorm$",
+        "re:.*norm$",
+        "re:model[.]visual.*",  # visual modules, with and without a "model." prefix
+        "re:visual.*",
+        "lm_head",
+    ],
+    duo_scaling=True,
+    config_groups={
+        "group_0": {
+            "targets": ["Linear"],
+            "weights": {
+                "num_bits": 4,  # 4-bit integer weights: the "W4" in W4A16
+                "type": "int",
+                "symmetric": True,
+                "group_size": 32,  # the W4A16 preset uses 128
+                "strategy": "group",
+                "block_structure": None,
+                "dynamic": False,
+                "actorder": None,
+                "observer": "mse",
+                "observer_kwargs": {},
+            },
+            "input_activations": None,  # no activation quantization configured
+            "output_activations": None,
+            "format": None,
+        }
+    },
+)
+
+# Apply AWQ quantization.
+oneshot(
+    model=model,
+    processor=processor,
+    recipe=recipe,
+    dataset=ds,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    data_collator=data_collator,
+)
+# Sanity check: generate a short continuation on whichever device holds the model.
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids
+output = model.generate(input_ids.to(model.device), max_new_tokens=20)
+print(processor.decode(output[0]))
+print("==========================================")
+
+# Save in compressed-tensors format; the suffix names the W4A16 scheme used above.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W4A16-mse-seq"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)