diff --git a/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
new file mode 100644
index 000000000..dceaad8ee
--- /dev/null
+++ b/examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -0,0 +1,120 @@
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+from llmcompressor import oneshot
+from llmcompressor.modeling import replace_modules_for_calibration
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# NOTE: Requires a minimum of transformers 4.57.0
+
+MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+
+# Load model.
+model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+    MODEL_ID, torch_dtype=torch.bfloat16, device_map=None, trust_remote_code=True
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = replace_modules_for_calibration(model)
+
+DATASET_ID = "neuralmagic/calibration"
+NUM_CALIBRATION_SAMPLES = 256
+MAX_SEQUENCE_LENGTH = 8192
+
+ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess_function(example):
+    messages = []
+    for message in example["messages"]:
+        messages.append(
+            {
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}],
+            }
+        )
+
+    return processor.apply_chat_template(
+        messages,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+        tokenize=True,
+        add_special_tokens=False,
+        return_dict=True,
+        add_generation_prompt=False,
+    )
+
+
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+
+
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        )
+        for key, value in batch[0].items()
+    }
+
+
+# Configure AWQ quantization with smoothing and balancing
+# NOTE: This recipe uses W4A16 quantization with group_size=32
+# rather than the default preset with group_size=128
+recipe = AWQModifier(
+    ignore=[
+        "re:.*embed_tokens",
+        "re:.*input_layernorm$",
+        "re:.*mlp[.]gate$",
+        "re:.*post_attention_layernorm$",
+        "re:.*norm$",
+        "re:model[.]visual.*",
+        "re:visual.*",
+        "lm_head",
+    ],
+    duo_scaling=True,
+    config_groups={
+        "group_0": {
+            "targets": ["Linear"],
+            "weights": {
+                "num_bits": 4,
+                "type": "int",
+                "symmetric": True,
+                "group_size": 32,
+                "strategy": "group",
+                "dynamic": False,
+                "actorder": None,
+                "observer": "mse",
+            },
+        }
+    },
+)
+
+# Apply AWQ quantization.
+oneshot(
+    model=model,
+    processor=processor,
+    recipe=recipe,
+    dataset=ds,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    data_collator=data_collator,
+)
+
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=20)
+print(processor.decode(output[0]))
+print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W4A16-mse-seq"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
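Not part of the diff: a minimal sketch of serving the saved checkpoint afterwards, assuming vLLM's compressed-tensors support covers this Qwen3-VL MoE W4A16 scheme; the directory name mirrors the SAVE_DIR built above, and the prompt and sampling settings are purely illustrative.

    # Hypothetical follow-up, not included in the example file above.
    # Assumes vLLM can load this model with the compressed-tensors W4A16 checkpoint.
    from vllm import LLM, SamplingParams

    llm = LLM(model="Qwen3-VL-30B-A3B-Instruct-AWQ-W4A16-mse-seq")
    outputs = llm.generate(
        ["Hello my name is"],
        SamplingParams(max_tokens=20, temperature=0.0),
    )
    print(outputs[0].outputs[0].text)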