
Commit 65c52d0

Author: Maigee (committed)
add cpm
1 parent 0510ba2 commit 65c52d0

21 files changed, +88888 −13 lines changed
Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
seed: 0
run_mode: 'train'
output_dir: './output' # Custom paths are not supported yet; do not change this default value
load_checkpoint: '/home/m30024275/cpm_model_10b.ckpt'
auto_trans_ckpt: False  # If true, auto transform load_checkpoint to load in distributed model
only_save_strategy: False
resume_training: False

# ==== context config ====
context:
  mode: 0  # 0--Graph Mode; 1--Pynative Mode
  device_target: "Ascend"
  enable_graph_kernel: False
  graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true"
  max_call_depth: 10000
  max_device_memory: "30GB"
  save_graphs: False
  device_id: 0

# aicc
remote_save_url: "Please input obs url on AICC platform."

# ==== model config ====
model:
  model_config:
    type: CPMBeeConfig
    vocab_size: 86592
    dim_model: 4096
    dim_ff: 10240
    num_layers: 48
    num_heads: 32
    dim_head: 128
    dropout_p: 0.0
    position_bias_num_buckets: 256
    position_bias_num_segment_buckets: 256
    position_bias_max_distance: 2048
    eps: 1.e-6
    half: True
  arch:
    type: CPMForPreTraining

trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'cpm_10b'
# If True, evaluate during the training process; if False, do nothing.
# Note that the task trainer should support the _evaluate_in_training function.
do_eval: False
eval_step_interval: -1   # num of step intervals between each eval, -1 means no step-end eval.
eval_epoch_interval: 1   # num of epoch intervals between each eval, 1 means eval on every epoch end.

metric:
  type: ADGENMetric
  tokenizer_type: "glm_6b"  # use ChatGLMTokenizer

processor:
  return_tensors: ms
  tokenizer:
    type: CPMBeeTokenizer
  type: CPMProcessor

# ==== dataset config ====
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: "/home/m30024275/cpm_mindrecord"
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment",
                   "segment_rel_offset", "segment_rel", "spans", "ext_table_ids", "ext_table_sub", "label" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0

train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset

eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment_ids",
                   "segment_rel_offset", "segment_rel", "spans", "ext_ids", "ext_sub", "target" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0

eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset

# ==== runner config ====
runner_config:
  epochs: 1
  batch_size: 1
  sink_mode: False
  sink_size: -1

runner_wrapper:
  type: ScaleTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 32768
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True

# lr schedule
lr_schedule:
  type: noam
  learning_rate: 1.e-4
  warmup_iter: 1
  end_iter: 2000

# optimizer
optimizer:
  type: AdamWeightDecayWithScale
  weight_decay: 0.01
  param_group: False

# parallel config
use_parallel: False
parallel:
  parallel_mode: 2  # 0-dataset, 1-semi, 2-auto, 3-hybrid
  gradients_mean: False
  loss_repeated_mean: True
  enable_alltoall: False
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: True  # optimizer shard
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
parallel_config:
  data_parallel: 8
  model_parallel: 1
  pipeline_stage: 1
  expert_parallel: 1
  optimizer_shard: True  # optimizer shard
  micro_batch_num: 1
  vocab_emb_dp: True
  gradient_aggregation_group: 8
micro_batch_interleave_num: 1

# moe
moe_config:
  expert_num: 1
  capacity_factor: 1.05
  aux_loss_factor: 0.05
  num_experts_chosen: 1

# recompute
recompute_config:
  recompute: False
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: False

# autotune
auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10

# profile
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: True
profile_communication: True
profile_memory: True

# callbacks
callbacks:
  - type: MFLossMonitor
  - type: SummaryMonitor
    keep_default_action: True
  - type: CheckpointMointor
    prefix: "cpm-2b"
    save_checkpoint_steps: 500
    keep_checkpoint_max: 2
    integrated_save: False
    async_save: False
  - type: ObsMonitor
    keep_last: False
eval_callbacks:
  - type: ObsMonitor
    keep_last: False
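The two YAML files in this commit differ only in model scale (dim_ff, dim_head, half, mask_modules) and in the parallel settings, so a quick way to sanity-check an edited copy before launching a run is to load it with PyYAML and confirm that the parallel_config factors match the number of devices you intend to use. The snippet below is purely illustrative and not part of this commit; the file name run_cpm_10b.yaml and the device count are assumptions.

# Minimal sanity check for a CPM training YAML (illustrative only).
# Assumes the config above was saved as "run_cpm_10b.yaml"; adjust to your path.
import yaml

NUM_DEVICES = 8  # assumption: one Ascend node with 8 devices

with open("run_cpm_10b.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

pc = cfg["parallel_config"]
model_splits = pc["data_parallel"] * pc["model_parallel"] * pc["pipeline_stage"]
print(f"data_parallel={pc['data_parallel']}, model_parallel={pc['model_parallel']}, "
      f"pipeline_stage={pc['pipeline_stage']} -> {model_splits} device slices")

if cfg["use_parallel"] and model_splits != NUM_DEVICES:
    raise ValueError(f"parallel_config implies {model_splits} devices, "
                     f"but {NUM_DEVICES} are available")

mc = cfg["model"]["model_config"]
print(f"model: {mc['num_layers']} layers, dim_model={mc['dim_model']}, "
      f"dim_ff={mc['dim_ff']}, heads={mc['num_heads']} x dim_head={mc['dim_head']}")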
Lines changed: 203 additions & 0 deletions
@@ -0,0 +1,203 @@
seed: 0
run_mode: 'train'
output_dir: './output' # Custom paths are not supported yet; do not change this default value
load_checkpoint: '/home/m30024275/cpm_model_2b.ckpt'
auto_trans_ckpt: False  # If true, auto transform load_checkpoint to load in distributed model
only_save_strategy: False
resume_training: False

# ==== context config ====
context:
  mode: 0  # 0--Graph Mode; 1--Pynative Mode
  device_target: "Ascend"
  enable_graph_kernel: False
  graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true"
  max_call_depth: 10000
  max_device_memory: "30GB"
  save_graphs: False
  device_id: 0

# aicc
remote_save_url: "Please input obs url on AICC platform."

# ==== model config ====
model:
  model_config:
    type: CPMBeeConfig
    vocab_size: 86592
    dim_model: 4096
    dim_ff: 5120
    num_layers: 48
    num_heads: 32
    dim_head: 64
    dropout_p: 0.0
    position_bias_num_buckets: 256
    position_bias_num_segment_buckets: 256
    position_bias_max_distance: 2048
    eps: 1.e-6
    half: False
    mask_modules: [[False, False], [True, False], [False, False], [True, False], [True, True], [True, False],
                   [True, True], [True, True], [False, False], [False, False], [True, True], [True, False],
                   [True, False], [True, True], [False, False], [True, True], [False, False], [False, True],
                   [True, False], [True, True], [False, False], [False, True], [True, True], [True, True],
                   [False, False], [True, True], [False, False], [True, True], [True, True], [False, False],
                   [True, True], [False, False], [True, True], [False, False], [True, True], [True, False],
                   [True, True], [True, True], [True, True], [False, False], [True, True], [False, False],
                   [True, True], [True, True], [False, False], [True, True], [False, False], [False, False]]
  arch:
    type: CPMForPreTraining

trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'cpm_2b'
# If True, evaluate during the training process; if False, do nothing.
# Note that the task trainer should support the _evaluate_in_training function.
do_eval: False
eval_step_interval: -1   # num of step intervals between each eval, -1 means no step-end eval.
eval_epoch_interval: 1   # num of epoch intervals between each eval, 1 means eval on every epoch end.

metric:
  type: ADGENMetric
  tokenizer_type: "glm_6b"  # use ChatGLMTokenizer

processor:
  return_tensors: ms
  tokenizer:
    type: CPMBeeTokenizer
  type: CPMProcessor

# ==== dataset config ====
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: "/home/m30024275/cpm_mindrecord"
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment",
                   "segment_rel_offset", "segment_rel", "spans", "ext_table_ids", "ext_table_sub", "label" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0

train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset

eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment_ids",
                   "segment_rel_offset", "segment_rel", "spans", "ext_ids", "ext_sub", "target" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0

eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset

# ==== runner config ====
runner_config:
  epochs: 1
  batch_size: 1
  sink_mode: False
  sink_size: -1

runner_wrapper:
  type: ScaleTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 32768
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True

# lr schedule
lr_schedule:
  type: noam
  learning_rate: 1.e-4
  warmup_iter: 1
  end_iter: 2000

# optimizer
optimizer:
  type: AdamWeightDecayWithScale
  weight_decay: 0.01
  param_group: False

# parallel config
use_parallel: False
parallel:
  parallel_mode: 2  # 0-dataset, 1-semi, 2-auto, 3-hybrid
  gradients_mean: False
  loss_repeated_mean: True
  enable_alltoall: False
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: True  # optimizer shard
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
parallel_config:
  data_parallel: 4
  model_parallel: 1
  pipeline_stage: 1
  expert_parallel: 1
  optimizer_shard: True  # optimizer shard
  micro_batch_num: 1
  vocab_emb_dp: True
  gradient_aggregation_group: 4
micro_batch_interleave_num: 1

# moe
moe_config:
  expert_num: 1
  capacity_factor: 1.05
  aux_loss_factor: 0.05
  num_experts_chosen: 1

# recompute
recompute_config:
  recompute: False
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: False

# autotune
auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10

# profile
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: True
profile_communication: True
profile_memory: True

# callbacks
callbacks:
  - type: MFLossMonitor
  - type: SummaryMonitor
    keep_default_action: True
  - type: CheckpointMointor
    prefix: "cpm-2b"
    save_checkpoint_steps: 500
    keep_checkpoint_max: 2
    integrated_save: False
    async_save: False
  - type: ObsMonitor
    keep_last: False
eval_callbacks:
  - type: ObsMonitor
    keep_last: False
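Both configs select a 'noam' learning-rate schedule with learning_rate: 1.e-4, warmup_iter: 1 and end_iter: 2000. The exact curve is defined by the MindFormers implementation, which this diff does not show; the sketch below uses the classic Noam formula from "Attention Is All You Need", rescaled so the peak value equals learning_rate at the end of warmup. Treat it as an assumption about the schedule's shape, not the authoritative code.

# Illustrative Noam-style schedule (assumption: classic formula,
# peak value = learning_rate at the end of warmup).
import math

def noam_lr(step: int, learning_rate: float = 1e-4, warmup_iter: int = 1) -> float:
    """Linear warmup to `learning_rate`, then inverse-square-root decay."""
    step = max(step, 1)
    return learning_rate * min(step / warmup_iter, math.sqrt(warmup_iter / step))

# With warmup_iter=1 the warmup phase is effectively skipped and the rate
# decays as 1/sqrt(step) from the very first iteration:
for s in (1, 4, 100, 400, 2000):
    print(s, f"{noam_lr(s):.2e}")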
