6
6
7
7
# Select model and load it.
8
8
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
9
-
10
9
model = AutoModelForCausalLM .from_pretrained (
11
10
MODEL_ID ,
12
11
device_map = "auto" ,
20
19
21
20
# Select the number of calibration samples. 512 samples is a good place to start.
22
21
# Increasing the number of samples can improve accuracy.
23
- NUM_CALIBRATION_SAMPLES = 10
22
+ NUM_CALIBRATION_SAMPLES = 512
24
23
MAX_SEQUENCE_LENGTH = 2048
25
24
26
25
# Load dataset and preprocess.
@@ -50,43 +49,29 @@ def process_and_tokenize(example):
50
49
quant_stage:
51
50
quant_modifiers:
52
51
QuantizationModifier:
52
+ ignore: ["lm_head"]
53
53
config_groups:
54
- fp8_attention:
55
- output_activations:
54
+ group_0:
55
+ weights:
56
+ num_bits: 8
57
+ type: float
58
+ strategy: tensor
59
+ dynamic: false
60
+ symmetric: true
61
+ input_activations:
56
62
num_bits: 8
57
63
type: float
58
- strategy: channel
64
+ strategy: tensor
59
65
dynamic: false
60
66
symmetric: true
61
- # targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
62
- targets: ['re:.*q_proj',]
63
-
67
+ targets: ["Linear"]
68
+ kv_cache_scheme:
69
+ num_bits: 8
70
+ type: float
71
+ strategy: tensor
72
+ dynamic: false
73
+ symmetric: true
64
74
"""
65
- # recipe = """
66
- # quant_stage:
67
- # quant_modifiers:
68
- # QuantizationModifier:
69
- # config_groups:
70
- # fp8_attention_q_proj:
71
- # output_activations:
72
- # num_bits: 8
73
- # type: float
74
- # strategy: channel
75
- # # group_size: 512
76
- # dynamic: false
77
- # symmetric: true
78
- # targets: ['re:.*q_proj']
79
- # # fp8_attention_kv_proj:
80
- # # output_activations:
81
- # # num_bits: 8
82
- # # type: float
83
- # # strategy: group
84
- # # group_size: 128
85
- # # dynamic: false
86
- # # symmetric: true
87
- # # targets: ['re:.*k_proj', 're:.*v_proj']
88
-
89
- # """
90
75
91
76
# Apply algorithms.
92
77
oneshot (
@@ -111,6 +96,6 @@ def process_and_tokenize(example):
111
96
print ("==========================================\n \n " )
112
97
113
98
# Save the model (in compressed form) and the tokenizer to disk.
114
- SAVE_DIR = MODEL_ID .split ("/" )[1 ] + "-AttnQuantOnly-Group "
99
+ SAVE_DIR = MODEL_ID .split ("/" )[1 ] + "-FP8-KV "
115
100
model .save_pretrained (SAVE_DIR , save_compressed = True )
116
101
tokenizer .save_pretrained (SAVE_DIR )
0 commit comments