
Commit d7d2efa

Support auto device mapping (#781)
1 parent 4c597de commit d7d2efa

File tree

4 files changed: +159 -7 lines changed

    auto_round/autoround.py
    auto_round/script/llm.py
    auto_round/script/mllm.py
    auto_round/utils.py

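Usage note (not part of the diff): with this commit a caller can pass device_map="auto", or a comma-separated device list such as "0,1", straight to AutoRound instead of exporting CUDA_VISIBLE_DEVICES, along with an optional mem_per_param_scale kwarg. A minimal sketch, assuming the usual AutoRound(model, tokenizer, ...) constructor; the model name below is only an example:

    # Sketch only: exercises the new device_map handling from this commit.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from auto_round import AutoRound

    model_name = "facebook/opt-125m"  # example model, not part of the commit
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    ar = AutoRound(
        model,
        tokenizer,
        bits=4,
        group_size=128,
        device_map="auto",       # or "0,1"; multi-device strings previously raised a ValueError
        mem_per_param_scale=13,  # new kwarg: scale factor for per-parameter memory estimation
    )
    ar.quantize()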

auto_round/autoround.py

Lines changed: 84 additions & 5 deletions
@@ -59,9 +59,11 @@
     convert_fp8_layer_to_linear,
     convert_fp8_model_to_16b_model,
     detect_device,
+    estimate_tuning_block_mem,
     find_matching_blocks,
     flatten_list,
     get_block_names,
+    get_device_memory,
     get_layer_config_by_gguf_format,
     get_layer_features,
     get_layer_names_in_block,
@@ -228,20 +230,19 @@ def __init__(
             logger.warning("`device` is deprecated, please use `device_map` instead")

         self.vlm = kwargs.pop("vlm") if "vlm" in kwargs else False
+        # Scale factor for RAM usage per parameter.
+        self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None)

         if kwargs:
             logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")

-        if device_map is not None and "," in str(device_map):
-            raise ValueError(
-                "API does not support explicit set multiple devices," " please set CUDA_VISIBLE_DEVICES=0,1 yourself"
-            )
         if device_map is None:
             device_map = 0

         # Set device, must place after model loading
         if isinstance(device_map, (str, torch.device, int)):
             self.device = detect_device(device_map)
+
         elif isinstance(device_map, dict) and device_map:
             tmp_devices = []
             for val in device_map.values():
@@ -258,8 +259,12 @@ def __init__(

             self.device = tmp_devices[0]

-        if isinstance(device_map, dict) and device_map:
+        if (isinstance(device_map, dict) and device_map) or device_map == "auto":
             self.device_map = device_map
+        elif isinstance(device_map, str) and "," in device_map:
+            device_map = device_map.replace(" ", "")  # Remove any spaces
+            self.device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()]
+            self.device_map = "auto"
         else:
             self.device_map = None
         self._set_device_map_in_blocks(self.device_map)
@@ -543,6 +548,8 @@ def _set_device_map_in_blocks(self, device_map: Union[str, dict, None]) -> None:
             self.device_map = None
         if not device_map:
             return
+        if self.device_map == "auto" and device_map == "auto":
+            return
         if isinstance(device_map, str):
             device_map = device_map.replace(" ", "")
             infos = device_map.split(",")
@@ -583,6 +590,71 @@ def _set_device_for_matching_module(self, name: str, device: str) -> None:
         else:
             module.tuning_device = device

+    def _set_auto_device_map_in_block(self, block: torch.nn.Module, input_ids: list[torch.Tensor]) -> None:
+        """Automatically sets the device map for the block based on available GPUs and memory constraints."""
+        if torch.cuda.is_available():
+            num_gpus = torch.cuda.device_count()
+        elif torch.xpu.is_available():
+            logger.warning_once("XPU does not support auto device map yet, using device 0 for tuning.")
+            return
+        else:
+            raise RuntimeError("No CUDA or XPU devices found.")
+        if num_gpus <= 1:
+            self.device_map = None
+            return
+
+        if hasattr(self, "device_list") and self.device_list:
+            cuda_devices = [f"cuda:{i}" for i in self.device_list]
+            device_0 = cuda_devices[0]
+        else:
+            cuda_devices = [f"cuda:{i}" for i in range(num_gpus)]
+            device_0 = "cuda:0"
+
+        device_0_memory = get_device_memory(
+            self.device_list[0] if hasattr(self, "device_list") and self.device_list else 0
+        )
+        block_memory, input_output_memory = estimate_tuning_block_mem(block, input_ids)
+        if self.low_gpu_mem_usage:
+            input_output_memory = 0
+
+        mem_per_param_scale = 13 if self.mem_per_param_scale is None else self.mem_per_param_scale
+        if self.iters == 0:
+            mem_per_param_scale = 1  # for RTN
+
+        if (block_memory * mem_per_param_scale + input_output_memory) < device_0_memory:
+            return  # the block fits on one GPU
+
+        device_map = {}
+        device_memory = {device: get_device_memory(int(device.split(":")[1])) for device in cuda_devices}
+        device_memory[device_0] = device_0_memory - input_output_memory
+
+        device_idx = 0
+        # First fill device 0 to its maximum capacity, then distribute the remaining layers across the other devices
+        for n, m in block.named_modules():
+            if check_to_quantized(m):
+                layer_name = block.tmp_name + "." + n
+                layer_memory = m.weight.nbytes / 1024**3
+                if device_idx == 0 and layer_memory * mem_per_param_scale < device_memory[cuda_devices[device_idx]]:
+                    device_map[layer_name] = cuda_devices[device_idx]
+                    device_memory[cuda_devices[device_idx]] -= layer_memory * mem_per_param_scale
+                elif device_idx == 0:
+                    device_idx += 1  # Move to the next device once device 0 is full
+                    device_map[layer_name] = cuda_devices[device_idx]
+                    device_memory[cuda_devices[device_idx]] -= layer_memory * mem_per_param_scale
+                else:
+                    # Assign the layer to the device with the most remaining memory
+                    sorted_devices = sorted(cuda_devices, key=lambda d: device_memory[d], reverse=True)
+                    device_idx = sorted_devices[0]
+                    if layer_memory * mem_per_param_scale < device_memory[device_idx]:
+                        device_map[layer_name] = device_idx
+                        device_memory[device_idx] -= layer_memory * mem_per_param_scale
+                    else:
+                        logger.warning_once(
+                            f"Block {block.tmp_name} does not fit in the available GPU memory. "
+                            "Consider using more GPUs or reducing mem_per_param_scale if OOM occurs."
+                        )
+        self._set_device_map_in_blocks(device_map)
+
     def _dq_check(self) -> None:
         """Reset the default value of super_bits and super_group_size"""
         if self.data_type.endswith("_dq"):
@@ -1488,6 +1560,10 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
             block = block.to(self.device)
             if _is_fp8_model(self.model):
                 convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype)
+
+            if self.device_map == "auto":
+                self._set_auto_device_map_in_block(block, input_ids)
+
             # Dispatch model if needed
             if self.device_map is not None:
                 from accelerate.hooks import AlignDevicesHook, add_hook_to_module
@@ -2551,6 +2627,9 @@ def _quantize_block(
                 new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device)
                 set_module(block, n, new_layer)

+        if self.device_map == "auto":
+            self._set_auto_device_map_in_block(block, input_ids)
+
         if self.device_map is not None:
             for n, m in block.named_modules():
                 if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"):
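The core of the change is the greedy placement in _set_auto_device_map_in_block: fill device 0 up to its estimated capacity (after reserving room for the cached block inputs/outputs), then hand each remaining quantizable layer to the device with the most free memory, scaling the raw weight size by mem_per_param_scale to approximate tuning overhead. A standalone sketch of that heuristic, with made-up layer sizes and device capacities (it simplifies the committed code, which also tracks a current device index):

    # Sketch of the greedy placement heuristic; all numbers are illustrative.
    def greedy_device_map(layer_sizes_gb, device_caps_gb, scale=13, io_gb=0.0):
        devices = list(device_caps_gb)          # e.g. ["cuda:0", "cuda:1"]
        free = dict(device_caps_gb)             # remaining estimated memory per device
        free[devices[0]] -= io_gb               # device 0 also holds the block inputs/outputs
        mapping, on_device_0 = {}, True
        for name, size in layer_sizes_gb.items():
            need = size * scale                 # tuning needs far more than the raw weights
            if on_device_0 and need < free[devices[0]]:
                target = devices[0]
            else:
                on_device_0 = False
                target = max(free, key=free.get)    # device with the most remaining memory
                if need >= free[target]:
                    print(f"warning: {name} may not fit; use more GPUs or a smaller scale")
            mapping[name] = target
            free[target] -= need
        return mapping

    print(greedy_device_map(
        {"block.0.q_proj": 0.5, "block.0.k_proj": 0.5, "block.0.mlp": 1.5},
        {"cuda:0": 24.0, "cuda:1": 24.0},
        io_gb=4.0,
    ))
    # -> q_proj and k_proj stay on cuda:0; the mlp spills over to cuda:1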

auto_round/script/llm.py

Lines changed: 8 additions & 1 deletion
@@ -104,6 +104,13 @@ def __init__(self, *args, **kwargs):
             help="minmax learning rate, if None, it will beset to be the same with lr",
         )

+        self.add_argument(
+            "--mem_per_param_scale",
+            default=13,
+            type=float,
+            help="Scale factor for memory per parameter, used to adjust memory usage estimation for tuning",
+        )
+
         self.add_argument("--seed", default=42, type=int, help="random seed")

         self.add_argument("--adam", action="store_true", help="whether to use adam optimizer instead of SignSGD")
@@ -436,7 +443,7 @@ def tune(args):
         raise RuntimeError("marlin backend only supports sym quantization, please remove --asym")

     # Must set this before import torch
-    set_cuda_visible_devices(args.device_map)
+    # set_cuda_visible_devices(args.device_map)
     device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)

     import torch
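The scale factor is also exposed on the command line, and device selection now flows through the auto device mapping inside AutoRound instead of set_cuda_visible_devices (which is commented out rather than removed). A hedged example invocation, assuming the existing auto-round entry point and its --model/--bits/--group_size/--device_map flags; the model name is only an example:

    auto-round --model Qwen/Qwen2.5-7B-Instruct --bits 4 --group_size 128 --device_map 0,1 --mem_per_param_scale 13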

auto_round/script/mllm.py

Lines changed: 1 addition & 1 deletion
@@ -328,7 +328,7 @@ def tune(args):
         raise ValueError(f"{format} is not supported, we only support {SUPPORTED_FORMATS}")

     # Must set this before import torch
-    set_cuda_visible_devices(args.device_map)
+    # set_cuda_visible_devices(args.device_map)
     device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)

     import torch

auto_round/utils.py

Lines changed: 66 additions & 0 deletions
@@ -578,6 +578,10 @@ def is_valid_digit(s):
     if is_valid_digit(device):
         dev_idx = int(device)
         device = "auto"
+    if isinstance(device, str) and "," in device:  # device is "0,1,2"
+        device_list = [int(dev) for dev in device.split(",") if dev.isdigit()]
+        dev_idx = device_list[0] if device_list else None
+        device = "auto"
     if device is None or device == "auto":
         if torch.cuda.is_available():
             device = torch.device("cuda")
@@ -1426,6 +1430,8 @@ def llm_load_model(
     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)

     model_cls = AutoModel if is_glm else AutoModelForCausalLM
+    if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code:
+        logger.warning("trust_remote_code is enabled by default, please ensure its correctness.")

     if low_cpu_mem_tmp_dir is None:
         low_cpu_mem_tmp_dir = "low_cpu_mem_tmp"
@@ -2563,6 +2569,66 @@ def is_static_wfp8afp8(ar):
     return False


+def bytes_to_gigabytes(bytes) -> float:
+    """
+    Converts bytes to gigabytes.
+
+    Args:
+        bytes (int): The number of bytes.
+
+    Returns:
+        float: The equivalent number of gigabytes.
+    """
+    return bytes / 1024 / 1024 / 1024
+
+
+def get_device_memory(i: int = 0) -> float:
+    """
+    Gets the total memory of the specified device.
+
+    Args:
+        i (int, optional): Device index. Defaults to 0.
+
+    Returns:
+        float: Total device memory in gigabytes.
+    """
+    if torch.cuda.is_available():
+        total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory)
+    elif torch.xpu.is_available():
+        raise RuntimeError("XPU does not support device_map='auto' currently.")
+    else:
+        raise RuntimeError("No supported device found (CUDA or XPU).")
+    return total_memory
+
+
+def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]:
+    """
+    Estimates the memory consumption of a specific block in the model.
+
+    Args:
+        block (torch.nn.Module): The block of the model to analyze.
+        input_ids (list[torch.Tensor]): A list of input tensors for the block.
+
+    Returns:
+        tuple: A tuple containing the following:
+            - block_memory (float): The memory consumption (in GB) of the block's quantizable layers.
+            - input_output_memory (float): The memory consumption (in GB) of the block's input and
+              output tensors.
+    """
+    # Sum the weight memory of all quantizable layers in the block
+    total_param_mem = 0
+    for name, module in block.named_modules():
+        if check_to_quantized(module):
+            param_size = module.weight.nbytes
+            total_param_mem += param_size
+    block_memory = total_param_mem / 1024**3  # Convert to GB
+
+    # Assuming bfloat16 or float32, count both the input and output activations
+    input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3
+
+    return block_memory, input_output_memory
+
+
 def get_max_vram(ratio: float = 0.9) -> dict:
     max_memory = {}
     if torch.cuda.is_available():  # NVIDIA CUDA
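These helpers feed the "fits on one GPU" decision in _set_auto_device_map_in_block, which compares block_memory * mem_per_param_scale + input_output_memory against the memory of device 0. A worked example with illustrative numbers (the 24 GB capacity stands in for get_device_memory(0)):

    block_memory = 0.8          # GB of quantizable weights, as estimate_tuning_block_mem would report
    input_output_memory = 1.5   # GB of cached block inputs/outputs (treated as 0 when low_gpu_mem_usage is on)
    mem_per_param_scale = 13    # default scale; dropped to 1 when iters == 0 (plain RTN)

    need_gb = block_memory * mem_per_param_scale + input_output_memory   # 11.9 GB
    fits_on_one_gpu = need_gb < 24.0   # compare against get_device_memory(0) for, say, a 24 GB card
    print(need_gb, fits_on_one_gpu)    # 11.9 True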
