seed: 0
run_mode: 'train'
output_dir: './output' # Custom values are not supported yet; do not modify this default.
load_checkpoint: '/home/m30024275/cpm_model_2b.ckpt'
auto_trans_ckpt: False # If True, automatically transform load_checkpoint so it can be loaded into the distributed model
only_save_strategy: False
resume_training: False

# ==== context config ====
context:
  mode: 0 # 0: Graph Mode; 1: PyNative Mode
  device_target: "Ascend"
  enable_graph_kernel: False
  graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true"
  max_call_depth: 10000
  max_device_memory: "30GB"
  save_graphs: False
  device_id: 0
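  # Note: mode 0 (Graph Mode) compiles the whole network before execution, which the
  # parallel features further down generally require; max_device_memory caps MindSpore's
  # per-device allocation, presumably leaving headroom on a 32 GB Ascend 910 card.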

# aicc
remote_save_url: "Please input OBS URL on AICC platform."

# ==== model config ====
model:
  model_config:
    type: CPMBeeConfig
    vocab_size: 86592
    dim_model: 4096
    dim_ff: 5120
    num_layers: 48
    num_heads: 32
    dim_head: 64
    dropout_p: 0.0
    position_bias_num_buckets: 256
    position_bias_num_segment_buckets: 256
    position_bias_max_distance: 2048
    eps: 1.e-6
    half: False
    mask_modules: [[False, False], [True, False], [False, False], [True, False], [True, True], [True, False],
                   [True, True], [True, True], [False, False], [False, False], [True, True], [True, False],
                   [True, False], [True, True], [False, False], [True, True], [False, False], [False, True],
                   [True, False], [True, True], [False, False], [False, True], [True, True], [True, True],
                   [False, False], [True, True], [False, False], [True, True], [True, True], [False, False],
                   [True, True], [False, False], [True, True], [False, False], [True, True], [True, False],
                   [True, True], [True, True], [True, True], [False, False], [True, True], [False, False],
                   [True, True], [True, True], [False, False], [True, True], [False, False], [False, False]]
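    # The 48 pairs above line up with num_layers: 48, one [attention, ffn] pair per
    # layer. A True entry presumably masks (prunes) that sub-block, which is how this
    # config carves the 2B variant out of the full CPM-Bee stack. Sketch of the
    # intended shape (an assumption from the pairing, not stated in the source):
    #   mask_modules[i] == [mask_self_attention_i, mask_feed_forward_i]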
  arch:
    type: CPMForPreTraining

trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'cpm_2b'
# If True, evaluate during the training process; if False, do nothing.
# Note that the task trainer must support the _evaluate_in_training function.
do_eval: False
eval_step_interval: -1 # number of steps between evaluations; -1 means no step-end evaluation.
eval_epoch_interval: 1 # number of epochs between evaluations; 1 means evaluate at every epoch end.

metric:
  type: ADGENMetric
  tokenizer_type: "glm_6b" # use ChatGLMTokenizer

processor:
  return_tensors: ms
  tokenizer:
    type: CPMBeeTokenizer
  type: CPMProcessor

# ==== dataset config ====
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: "/home/m30024275/cpm_mindrecord"
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment",
                   "segment_rel_offset", "segment_rel", "spans", "ext_table_ids", "ext_table_sub", "label" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0
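# The thirteen input columns above are CPM-Bee's packed-sample format: token ids plus
# segment, span, and relative-position bookkeeping. A rough sizing note, assuming
# data-parallel training with full_batch as configured below:
#   global batch size = batch_size * data_parallel, e.g. 1 * 4 = 4 samples per step.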

train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset

eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: True
  input_columns: [ "inputs", "inputs_sub", "length", "context", "sample_ids", "num_segments", "segment_ids",
                   "segment_rel_offset", "segment_rel", "spans", "ext_ids", "ext_sub", "target" ]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
  seed: 0

eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset

# ==== runner config ====
runner_config:
  epochs: 1
  batch_size: 1
  sink_mode: False
  sink_size: -1

runner_wrapper:
  type: ScaleTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 32768
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True
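# Dynamic loss scaling as configured above, following standard MindSpore
# DynamicLossScaleUpdateCell behavior: start at 32768; on gradient overflow, divide the
# scale by scale_factor (2) and skip that parameter update; after scale_window (1000)
# consecutive overflow-free steps, multiply the scale by 2 again.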

# lr schedule
lr_schedule:
  type: noam
  learning_rate: 1.e-4
  warmup_iter: 1
  end_iter: 2000
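# The noam type is presumably the Transformer schedule from Vaswani et al.; one common
# parameterization, scaled so the peak rate equals learning_rate at the end of warmup
# (an assumption, not taken from the source):
#   lr(step) = learning_rate * warmup_iter^0.5 * min(step^-0.5, step * warmup_iter^-1.5)
# With warmup_iter: 1 the peak is hit immediately and the rate decays as step^-0.5.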

# optimizer
optimizer:
  type: AdamWeightDecayWithScale
  weight_decay: 0.01
param_group: False

# parallel config
use_parallel: False
parallel:
  parallel_mode: 2 # 0: data parallel, 1: semi-auto parallel, 2: auto parallel, 3: hybrid parallel
  gradients_mean: False
  loss_repeated_mean: True
  enable_alltoall: False
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: True # optimizer shard
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
parallel_config:
  data_parallel: 4
  model_parallel: 1
  pipeline_stage: 1
  expert_parallel: 1
  optimizer_shard: True # optimizer shard
  micro_batch_num: 1
  vocab_emb_dp: True
  gradient_aggregation_group: 4
micro_batch_interleave_num: 1
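# Sizing check: data_parallel * model_parallel * pipeline_stage = 4 * 1 * 1 = 4, so this
# layout expects a 4-device run. These settings only take effect once use_parallel is
# set to True; as shipped, the config runs on a single device.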

# moe
moe_config:
  expert_num: 1
  capacity_factor: 1.05
  aux_loss_factor: 0.05
  num_experts_chosen: 1
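# With expert_num: 1 and num_experts_chosen: 1, mixture-of-experts routing is effectively
# disabled and the feed-forward layers run dense; capacity_factor and aux_loss_factor
# only come into play once expert_num is raised above 1.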

# recompute
recompute_config:
  recompute: False
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: False
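# Recompute trades compute for memory: when recompute is True, forward activations are
# discarded and recomputed during backprop instead of being stored. It is off here,
# presumably because max_device_memory ("30GB") is not the bottleneck at this model size.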

# autotune
auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10

# profile
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: True
profile_communication: True
profile_memory: True

# callbacks
callbacks:
  - type: MFLossMonitor
  - type: SummaryMonitor
    keep_default_action: True
  - type: CheckpointMointor
    prefix: "cpm-2b"
    save_checkpoint_steps: 500
    keep_checkpoint_max: 2
    integrated_save: False
    async_save: False
  - type: ObsMonitor
    keep_last: False
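# Checkpointing above saves a "cpm-2b" checkpoint every 500 steps and keeps at most the
# 2 most recent. CheckpointMointor is left spelled as-is because it presumably matches
# the callback class name registered in this MindFormers version; "correcting" it to
# CheckpointMonitor could break the type lookup.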
eval_callbacks:
  - type: ObsMonitor
    keep_last: False