
Commit d1a3d88

add warm load (PaddlePaddle#11029)
1 parent adc2f36 commit d1a3d88

File tree

4 files changed: +298 additions, −5 deletions

paddlenlp/trainer/trainer.py
paddlenlp/trainer/training_args.py
paddlenlp/trainer/utils/load_utils.py
paddlenlp/transformers/deepseek_v2/modeling_pp.py


paddlenlp/trainer/trainer.py

Lines changed: 28 additions & 0 deletions
@@ -189,6 +189,7 @@
     nested_numpify,
     nested_truncate,
 )
+from .utils.load_utils import load_paddle_model_from_safetensors
 from .utils.sharding_io import ShardingIO
 
 DEFAULT_CALLBACKS = [DefaultFlowCallback]
@@ -1108,6 +1109,13 @@ def _inner_training_loop(
         if self.args.ignore_data_skip:
             self.timers and self.timers("read-data").start()
 
+        if self.args.hf_ckpt_dir is not None:
+            print("Start loading the Hugging Face model with warm start")
+            weight_map_path = os.path.join(self.args.hf_ckpt_dir, "model.safetensors.index.json")
+            ckpt_pre = self.args.hf_ckpt_dir
+
+            load_paddle_model_from_safetensors(model, weight_map_path, ckpt_pre, verbose=True)
+
         for epoch in range(epochs_trained, num_train_epochs):
             if isinstance(train_dataloader, paddle.io.DataLoader) and isinstance(
                 train_dataloader.batch_sampler, DistributedBatchSampler
@@ -1343,8 +1351,28 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
                             f"optimizer not run, scale_before: {scale_before_value[0]}, scale_after: {scale_after_value[0]}"
                         )
                     elif isinstance(self.optimizer, HybridParallelOptimizer):
+                        # print("hack for moe grad")
+                        # for p in parameters_list:
+                        #     if getattr(p, 'is_moe_param', False):
+                        #         if p.grad is not None:
+                        #             # print(p.name, p.grad)
+                        #             p.grad /= 8
+                        #         if p.main_grad is not None:
+                        #             # print(p.name, p.main_grad)
+                        #             p.main_grad /= 8
+
                         self.optimizer._step(parameters_list)
                     else:
+                        # print("hack for moe grad")
+                        # for p in parameters_list:
+                        #     if getattr(p, 'is_moe_param', False):
+                        #         if p.grad is not None:
+                        #             print(p.name, p.grad)
+                        #             p.grad /= 4
+                        #         if p.main_grad is not None:
+                        #             print(p.name, p.main_grad)
+                        #             p.main_grad /= 4
+
                         self.optimizer.step()
 
                     if self.args.offload_optim:
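The hook above is opt-in: it only fires when the new hf_ckpt_dir training argument (added in training_args.py below) is set. A minimal, hedged usage sketch, assuming the public paddlenlp.trainer API; paths are placeholders and model/dataset setup is omitted. Note that the loader concatenates the directory prefix directly with each shard filename, so passing the directory with a trailing separator is the safe choice.

from paddlenlp.trainer import Trainer, TrainingArguments

# Placeholder paths; any Trainer built with these args will call
# load_paddle_model_from_safetensors() once, right before the first training epoch.
args = TrainingArguments(
    output_dir="./checkpoints",
    hf_ckpt_dir="/path/to/hf_checkpoint/",  # must contain model.safetensors.index.json
)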

paddlenlp/trainer/training_args.py

Lines changed: 6 additions & 1 deletion
@@ -1081,12 +1081,17 @@ class TrainingArguments:
     nccl_comm_group_config: Optional[str] = field(
         default=None, metadata={"help": "Path to the config file for fine-grained control of NCCL communication groups; defaults to None, which disables this configuration"}
     )
-
+
     pre_alloc_memory: int = field(
         default=0,
         metadata={"help": "pre allocate memory size GB"},
     )
 
+    hf_ckpt_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Hugging Face checkpoint directory used for warm-start loading"},
+    )
+
     def __post_init__(self):
         world_size = paddle.distributed.get_world_size()
         if in_auto_parallel_align_mode():
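For reference, hf_ckpt_dir is expected to point at a sharded Hugging Face safetensors checkpoint, i.e. a directory containing model.safetensors.index.json plus the shard files it references. A hedged sketch of how the loader below consumes that index; shard names are illustrative:

import json
import os

hf_ckpt_dir = "/path/to/hf_checkpoint"  # placeholder
with open(os.path.join(hf_ckpt_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

# Only the "weight_map" section is used: it maps each Hugging Face parameter
# name to the shard file that stores it, for example
#   "model.embed_tokens.weight"           -> "model-00001-of-00055.safetensors"
#   "model.layers.0.mlp.gate_proj.weight" -> "model-00002-of-00055.safetensors"
weight_map = index["weight_map"]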
paddlenlp/trainer/utils/load_utils.py

Lines changed: 258 additions & 0 deletions
@@ -0,0 +1,258 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import re
from collections import defaultdict
from typing import List, Optional

import paddle
from paddle.distributed import fleet
from safetensors import safe_open

# develop: "_layers.<idx>.<rest>"
_LAYER_RE = re.compile(r"^_layers\.(\d+)(?:\.(.*))?$")
_EXPERT_W1_RE = re.compile(r"^mlp\.experts\.(\d+)\.w1(?:\.weight)?$")
_EXPERT_W2_RE = re.compile(r"^mlp\.experts\.(\d+)\.w2(?:\.weight)?$")

custom_name_map = {
    "mlp.router.weight": "mlp.gate.weight",
    "mlp.router.e_score_correction_bias": "mlp.gate.e_score_correction_bias",
}


def _layers_match(name: str):
    return _LAYER_RE.match(name)


def simple_safe_call(model, method_name, *args, **kwargs):
    if hasattr(model, method_name):
        return getattr(model, method_name)(*args, **kwargs)
    if hasattr(model, "_layers") and hasattr(model._layers, method_name):
        return getattr(model._layers, method_name)(*args, **kwargs)
    raise AttributeError(f"{type(model).__name__} (or its wrapper) has no method {method_name}")


def add_prefix_to_keys(d, prefix):
    print("Input dict:", d)

    mappings = {}
    for key, value in d.items():
        if key == "embed_tokens.weight":
            new_key = "_layers.0.embed_tokens.weight"
        elif key == "lm_head.weight":
            new_key = "_layers.64.weight"
        else:
            new_key = f"{prefix}{key}"
        mappings[new_key] = value
    return mappings


def _get_hf_prefix_develop(idx: int) -> str:
    if idx == 0:
        return "model"  # embedding
    if idx == 63:
        return "model"  # final norm
    if idx == 64:
        return "lm_head"  # lm_head
    return f"model.layers.{idx - 1}"  # decoder layer


def _handle_expert_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
    if m := _EXPERT_W1_RE.match(rest):
        expert_id = int(m.group(1))
        return [
            f"{hf_prefix}.mlp.experts.{expert_id}.gate_proj.weight",
            f"{hf_prefix}.mlp.experts.{expert_id}.up_proj.weight",
        ]
    if m := _EXPERT_W2_RE.match(rest):
        expert_id = int(m.group(1))
        return [
            f"{hf_prefix}.mlp.experts.{expert_id}.down_proj.weight",
        ]
    return None


def _handle_mlp_weights(hf_prefix: str, rest: str) -> Optional[List[str]]:
    if rest == "mlp.w1":
        return [
            f"{hf_prefix}.mlp.gate_proj.weight",
            f"{hf_prefix}.mlp.up_proj.weight",
        ]
    if rest == "mlp.w2":
        return [
            f"{hf_prefix}.mlp.down_proj.weight",
        ]
    return None


def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
    """
    Map a Paddle parameter name to the corresponding Hugging Face parameter name(s).
    """
    m = _layers_match(paddle_name)
    if not m:
        return []
    idx = int(m.group(1))
    rest = m.group(2) or ""

    hf_prefix = _get_hf_prefix_develop(idx)

    # special-case renames
    if rest in custom_name_map:
        return [f"{hf_prefix}.{custom_name_map[rest]}"]

    # legacy expert weights
    if expert_names := _handle_expert_weights(hf_prefix, rest):
        return expert_names

    # legacy MLP weights
    if mlp_names := _handle_mlp_weights(hf_prefix, rest):
        return mlp_names

    return [f"{hf_prefix}.{rest}"] if rest else [hf_prefix]


def prepare_tensor(tensor, pd_param, tensor_parallel_mappings, mp_degree, dst_shape):
    """
    Convert a loaded weight tensor so it matches the target parameter's shape,
    handling transposition, concatenation, and tensor-parallel splitting as needed.
    """
    # A list means two Hugging Face tensors map to one fused Paddle weight:
    # transpose each and concatenate along the last axis.
    if isinstance(tensor, list):
        tensor = paddle.concat(
            [
                paddle.transpose(tensor[0], perm=[1, 0]).contiguous(),
                paddle.transpose(tensor[1], perm=[1, 0]).contiguous(),
            ],
            axis=-1,
        )
    # match for transpose
    if len(tensor.shape) == 2:
        if (tensor.shape[0] == dst_shape[1] or tensor.shape[1] == dst_shape[0]) and tensor.shape != dst_shape:
            tensor = paddle.transpose(tensor, perm=[1, 0]).contiguous()
            print(f"after transpose get hf tensor shape {tensor.shape}, paddle shape {dst_shape}")

    if mp_degree > 1 and pd_param in tensor_parallel_mappings:
        tensor = tensor_parallel_mappings[pd_param](tensor)
    if tensor.shape == dst_shape:
        return tensor
    raise ValueError(f"Unexpected tensor shape: got {tensor.shape}, want {dst_shape}")


def load_paddle_model_from_safetensors(
    model,
    weight_map_path: str,
    ckpt_pre: str,
    verbose: bool = True,
):
    """
    Load safetensors weights into a Paddle model using the weight map from the checkpoint's index JSON.
    """
    tensor_parallel_mappings = {}
    mp_degree = fleet.get_hybrid_communicate_group().get_model_parallel_world_size()
    print("model parallel degree:", mp_degree)

    if mp_degree > 1:
        print("load with mp_degree:", mp_degree)
        tensor_parallel_mappings = simple_safe_call(model, "get_tensor_parallel_mappings", is_split=True)
        tensor_parallel_mappings = add_prefix_to_keys(tensor_parallel_mappings, "_")

    for k, v in tensor_parallel_mappings.items():
        print("tensor_parallel_mappings:", k, v)

    with open(weight_map_path, "r") as f:
        weight_map = json.load(f)["weight_map"]

    required_files = set()
    file_to_pd_param_name = defaultdict(list)
    pd_param_name_to_file = defaultdict(list)

    # Work out which safetensors shard(s) hold each Paddle parameter.
    for pd_name, _ in model.named_parameters():
        hf_names = paddle_name_to_hf_names(pd_name)
        if verbose:
            print(f"paddle_name_to_hf_names: {pd_name} -> {hf_names}")
        if not hf_names:
            if verbose:
                print(f"Warning: {pd_name} can not be mapped")
            continue
        for i, hf_name in enumerate(hf_names):
            if hf_name in weight_map:
                filename = weight_map[hf_name]
                required_files.add(filename)
                file_to_pd_param_name[filename].append(pd_name)
                if filename not in pd_param_name_to_file[pd_name]:
                    pd_param_name_to_file[pd_name].append(filename)
            else:
                if verbose:
                    print(f"Warning: {pd_name} -> {hf_name} not found in weight map")

    check_list = []
    if verbose:
        print("---- start load param ----")
    for key, value in tensor_parallel_mappings.items():
        print(key, value)
    for filename in required_files:
        try:
            with safe_open(ckpt_pre + filename, framework="paddle", device="cpu") as f:
                pd_params = file_to_pd_param_name[filename]
                for pd_param in pd_params:
                    if pd_param in check_list:
                        continue
                    if verbose:
                        print("load for pd_param:", pd_param)
                    hf_names = paddle_name_to_hf_names(pd_param)
                    if not hf_names:
                        continue
                    if len(hf_names) == 1:
                        tensor = f.get_tensor(hf_names[0])
                        value = prepare_tensor(
                            tensor, pd_param, tensor_parallel_mappings, mp_degree, model.state_dict()[pd_param].shape
                        )

                        model.state_dict()[pd_param].set_value(paddle.cast(value, model.state_dict()[pd_param].dtype))
                    else:
                        # Two Hugging Face tensors feed one fused Paddle parameter;
                        # the second tensor may live in a different shard file.
                        files = pd_param_name_to_file[pd_param]
                        if len(files) == 1:
                            tensor0 = f.get_tensor(hf_names[0])
                            tensor1 = f.get_tensor(hf_names[1])
                        else:
                            if weight_map[hf_names[0]] == filename:
                                tensor0 = f.get_tensor(hf_names[0])
                                with safe_open(
                                    ckpt_pre + weight_map[hf_names[1]], framework="paddle", device="cpu"
                                ) as f2:
                                    tensor1 = f2.get_tensor(hf_names[1])
                            else:
                                with safe_open(
                                    ckpt_pre + weight_map[hf_names[0]], framework="paddle", device="cpu"
                                ) as f2:
                                    tensor0 = f2.get_tensor(hf_names[0])
                                tensor1 = f.get_tensor(hf_names[1])
                        value = prepare_tensor(
                            [tensor0, tensor1],
                            pd_param,
                            tensor_parallel_mappings,
                            mp_degree,
                            model.state_dict()[pd_param].shape,
                        )
                        model.state_dict()[pd_param].set_value(value)
                    check_list.append(pd_param)
        except Exception as e:
            print(f"Error loading {filename}: {str(e)}")
            raise

    if verbose:
        print("All parameters loaded.")

paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 6 additions & 4 deletions
@@ -170,7 +170,7 @@ def forward(self, args):
         batch_size, seq_length, _ = inputs_embeds.shape
 
         if self.sequence_parallel:
-            inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2])  # [B, S, H] --> [S, B, H]
+            inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2])  # [B, S, H] --> [S, B, H]
             # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim]
             # inputs_embeds = paddle.reshape(inputs_embeds, [-1, inputs_embeds.shape[-1]])
             # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism)
@@ -185,7 +185,7 @@ def forward(self, args):
                 axis=1,
             )
             if self.sequence_parallel:
-                inputs_embeds_mtp = paddle.transpose(inputs_embeds_mtp, [1, 0, 2])  # [B, S, H] --> [S, B, H]
+                inputs_embeds_mtp = paddle.transpose(inputs_embeds_mtp, [1, 0, 2])  # [B, S, H] --> [S, B, H]
                 # inputs_embeds_mtp = inputs_embeds_mtp.reshape([-1, inputs_embeds_mtp.shape[-1]])
                 inputs_embeds_mtp = ScatterOp.apply(inputs_embeds_mtp)
             embeds_res.append(inputs_embeds_mtp)
@@ -197,7 +197,7 @@ def forward(self, args):
             return return_args(inputs_embeds, attention_mask, attn_mask_startend_row_indices, position_ids)
         else:
             if self.sequence_parallel:
-                inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2])  # [B, S, H] --> [S, B, H]
+                inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2])  # [B, S, H] --> [S, B, H]
                 # inputs_embeds = inputs_embeds.reshape([-1, inputs_embeds.shape[-1]])
                 inputs_embeds = ScatterOp.apply(inputs_embeds)
                 return return_args(inputs_embeds, attention_mask, attn_mask_startend_row_indices, position_ids)
@@ -270,7 +270,6 @@ def forward(self, args):
 class DeepseekV2MTPLayerPipe(DeepseekV2MTPLayer):
     def forward(self, args):
         hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids = parse_args(args)
-
         hidden_states_list = paddle.split(hidden_states, self.config.num_nextn_predict_layers + 1, axis=-1)
         hidden_states_main_model = hidden_states_list[0]
         inputs_embeds_cur_depth_list = hidden_states_list[1:]
@@ -525,3 +524,6 @@ def get_hcg():
 
     def get_loss_fn(self, config):
         return DeepseekV2PretrainingCriterionPipe(config)
+
+    def get_tensor_parallel_mappings(self, is_split=True):
+        return type(self)._get_tensor_parallel_mappings(self.config, is_split)
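This new get_tensor_parallel_mappings hook is what load_paddle_model_from_safetensors reaches through simple_safe_call when the model-parallel degree is greater than one. A hedged sketch of that path, assuming the module location above and that the DeepseekV2 pipeline model may be wrapped so its layers sit under model._layers; the helper name is hypothetical:

from paddlenlp.trainer.utils.load_utils import add_prefix_to_keys, simple_safe_call

def tp_split_functions(model):
    """Mirror of the loader's mp_degree > 1 branch: fetch the tensor-parallel
    split functions exposed by the new hook and rename their keys to the
    pipeline parameter names ("embed_tokens.weight" -> "_layers.0.embed_tokens.weight",
    "lm_head.weight" -> "_layers.64.weight", everything else gets a "_" prefix)."""
    mappings = simple_safe_call(model, "get_tensor_parallel_mappings", is_split=True)
    return add_prefix_to_keys(mappings, "_")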
