revert example script

George Ohashi · George Ohashi · commit 189e9d5e7c49 · 2025-03-06T23:22:49.000-05:00
Signed-off-by: George Ohashi <george@neuralmagic.com>
diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -6,7 +6,6 @@
 
 # Select model and load it.
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
@@ -20,7 +19,7 @@
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 10
+NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -50,43 +49,29 @@ def process_and_tokenize(example):
 quant_stage:
     quant_modifiers:
         QuantizationModifier:
+            ignore: ["lm_head"]
             config_groups:
-                fp8_attention:
-                    output_activations:
+                group_0:
+                    weights:
+                        num_bits: 8
+                        type: float
+                        strategy: tensor
+                        dynamic: false
+                        symmetric: true
+                    input_activations:
                         num_bits: 8
                         type: float
-                        strategy: channel
+                        strategy: tensor
                         dynamic: false
                         symmetric: true
-                    # targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
-                    targets: ['re:.*q_proj',]
-                    
+                    targets: ["Linear"]
+            kv_cache_scheme:
+                num_bits: 8
+                type: float
+                strategy: tensor
+                dynamic: false
+                symmetric: true
 """
-# recipe = """
-# quant_stage:
-#     quant_modifiers:
-#         QuantizationModifier:
-#             config_groups:
-#                 fp8_attention_q_proj:
-#                     output_activations:
-#                         num_bits: 8
-#                         type: float
-#                         strategy: channel
-#                         # group_size: 512
-#                         dynamic: false
-#                         symmetric: true
-#                     targets: ['re:.*q_proj']
-#                 # fp8_attention_kv_proj:
-#                 #     output_activations:
-#                 #         num_bits: 8
-#                 #         type: float
-#                 #         strategy: group
-#                 #         group_size: 128
-#                 #         dynamic: false
-#                 #         symmetric: true
-#                 #     targets: ['re:.*k_proj', 're:.*v_proj']
-
-# """
 
 # Apply algorithms.
 oneshot(
@@ -111,6 +96,6 @@ def process_and_tokenize(example):
 print("==========================================\n\n")
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-AttnQuantOnly-Group"
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)