From 6c4d01def77743f864dbb535edff0aafb36875ba Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sat, 5 Jul 2025 17:47:06 +0800 Subject: [PATCH 1/3] add gguf kernel support Signed-off-by: Isotr0py <2037008807@qq.com> --- src/diffusers/quantizers/gguf/utils.py | 84 +++++++++++++++++++++++++- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/import_utils.py | 5 ++ 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 41d351712961..27010408541b 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -17,10 +17,11 @@ from contextlib import nullcontext import gguf +from gguf import GGMLQuantizationType as WeightType import torch import torch.nn as nn -from ...utils import is_accelerate_available +from ...utils import is_accelerate_available, is_kernels_available if is_accelerate_available(): @@ -29,6 +30,76 @@ from accelerate.hooks import add_hook_to_module, remove_hook_from_module +can_use_cuda_kernels = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 7 +if can_use_cuda_kernels and is_kernels_available(): + from kernels import get_kernel + ops = get_kernel("Isotr0py/ggml") +else: + ops = None + + +UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} +STANDARD_QUANT_TYPES = { + WeightType.Q4_0, + WeightType.Q4_1, + WeightType.Q5_0, + WeightType.Q5_1, + WeightType.Q8_0, + WeightType.Q8_1, +} +KQUANT_TYPES = { + WeightType.Q2_K, + WeightType.Q3_K, + WeightType.Q4_K, + WeightType.Q5_K, + WeightType.Q6_K, +} +IMATRIX_QUANT_TYPES = { + WeightType.IQ1_M, + WeightType.IQ1_S, + WeightType.IQ2_XXS, + WeightType.IQ2_XS, + WeightType.IQ2_S, + WeightType.IQ3_XXS, + WeightType.IQ3_S, + WeightType.IQ4_XS, + WeightType.IQ4_NL, +} +# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization. +# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add +# MMQ kernel for I-Matrix quantization. +DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES +MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES +MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES + + +def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, + qweight_type: int) -> torch.Tensor: + # there is no need to call any kernel for fp16/bf16 + if qweight_type in UNQUANTIZED_TYPES: + return x @ qweight.T + # enable MMVQ in contiguous batching with batch_size=1 + if qweight_type in MMVQ_QUANT_TYPES: + y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) + # Use MMQ Kernel if it's available (standard + k-quants) + elif qweight_type in MMQ_QUANT_TYPES: + y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) + # If there is no available MMQ kernel, fallback to dequantize + elif qweight_type in DEQUANT_TYPES: + block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] + shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) + weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype) + y = x @ weight.T + else: + # Raise an error if the quantization type is not supported. + # Might be useful if llama.cpp adds a new quantization type. + # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type. 
+ qweight_type = WeightType(qweight_type) + raise NotImplementedError( + f"Unsupported GGUF quantization type: {qweight_type}") + return y + + # Copied from diffusers.quantizers.bitsandbytes.utils._create_accelerate_new_hook def _create_accelerate_new_hook(old_hook): r""" @@ -451,11 +522,22 @@ def __init__( ) -> None: super().__init__(in_features, out_features, bias, device) self.compute_dtype = compute_dtype + self.device = device def forward(self, inputs): + if ops is not None and self.weight.is_cuda and inputs.is_cuda: + return self.forward_cuda(inputs) + return self.forward_native(inputs) + + def forward_native(self, inputs): weight = dequantize_gguf_tensor(self.weight) weight = weight.to(self.compute_dtype) bias = self.bias.to(self.compute_dtype) if self.bias is not None else None output = torch.nn.functional.linear(inputs, weight, bias) return output + + def forward_cuda(self, inputs): + quant_type = self.weight.quant_type + return _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) + diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 2df05cb8eb36..72f020ec193e 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -76,6 +76,7 @@ is_hpu_available, is_inflect_available, is_invisible_watermark_available, + is_kernels_available, is_k_diffusion_available, is_k_diffusion_version, is_librosa_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f12e9de33172..6174d5b72c32 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -192,6 +192,7 @@ def _is_package_available(pkg_name: str, get_dist_name: bool = False) -> Tuple[b _torch_npu_available, _torch_npu_version = _is_package_available("torch_npu") _transformers_available, _transformers_version = _is_package_available("transformers") _hf_hub_available, _hf_hub_version = _is_package_available("huggingface_hub") +_kernels_available, _kernels_version = _is_package_available("kernels") _inflect_available, _inflect_version = _is_package_available("inflect") _unidecode_available, _unidecode_version = _is_package_available("unidecode") _k_diffusion_available, _k_diffusion_version = _is_package_available("k_diffusion") @@ -274,6 +275,10 @@ def is_accelerate_available(): return _accelerate_available +def is_kernels_available(): + return _kernels_available + + def is_k_diffusion_available(): return _k_diffusion_available From 66bd237bc5fddafa813f6564cf041a539f39429d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 6 Jul 2025 01:00:01 +0800 Subject: [PATCH 2/3] fix Signed-off-by: Isotr0py <2037008807@qq.com> --- src/diffusers/quantizers/gguf/utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 27010408541b..03521eadb2b4 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -78,17 +78,21 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, # there is no need to call any kernel for fp16/bf16 if qweight_type in UNQUANTIZED_TYPES: return x @ qweight.T - # enable MMVQ in contiguous batching with batch_size=1 - if qweight_type in MMVQ_QUANT_TYPES: - y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) - # Use MMQ Kernel if it's available (standard + k-quants) - elif qweight_type in MMQ_QUANT_TYPES: - y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, 
qweight.shape[0]) + + # TODO(Isotr0py): GGUF's MMQ and MMVQ implementation are designed for + # contiguous batching and inefficient with diffusers' batching, + # so we disabled it now. + + # elif qweight_type in MMVQ_QUANT_TYPES: + # y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) + # elif qweight_type in MMQ_QUANT_TYPES: + # y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) # If there is no available MMQ kernel, fallback to dequantize + elif qweight_type in DEQUANT_TYPES: block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) - weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype) + weight = ops.ggml_dequantize(qweight, qweight_type, *shape) y = x @ weight.T else: # Raise an error if the quantization type is not supported. @@ -539,5 +543,10 @@ def forward_native(self, inputs): def forward_cuda(self, inputs): quant_type = self.weight.quant_type - return _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) + orig_shape = inputs.shape + inputs = inputs.view(-1, orig_shape[-1]) + output = _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) + if self.bias is not None: + output = output + self.bias.to(self.compute_dtype) + return output.view(*orig_shape[:-1], -1) From e46571a7aad95b2a4efc10d076740ad260e129fc Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 6 Jul 2025 01:47:13 +0800 Subject: [PATCH 3/3] optimize Signed-off-by: Isotr0py <2037008807@qq.com> --- src/diffusers/quantizers/gguf/utils.py | 68 ++++++++++++-------------- src/diffusers/utils/__init__.py | 2 +- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 03521eadb2b4..31f6ec3e7321 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -17,7 +17,6 @@ from contextlib import nullcontext import gguf -from gguf import GGMLQuantizationType as WeightType import torch import torch.nn as nn @@ -33,37 +32,37 @@ can_use_cuda_kernels = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 7 if can_use_cuda_kernels and is_kernels_available(): from kernels import get_kernel + ops = get_kernel("Isotr0py/ggml") else: ops = None - -UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} +UNQUANTIZED_TYPES = {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16} STANDARD_QUANT_TYPES = { - WeightType.Q4_0, - WeightType.Q4_1, - WeightType.Q5_0, - WeightType.Q5_1, - WeightType.Q8_0, - WeightType.Q8_1, + gguf.GGMLQuantizationType.Q4_0, + gguf.GGMLQuantizationType.Q4_1, + gguf.GGMLQuantizationType.Q5_0, + gguf.GGMLQuantizationType.Q5_1, + gguf.GGMLQuantizationType.Q8_0, + gguf.GGMLQuantizationType.Q8_1, } KQUANT_TYPES = { - WeightType.Q2_K, - WeightType.Q3_K, - WeightType.Q4_K, - WeightType.Q5_K, - WeightType.Q6_K, + gguf.GGMLQuantizationType.Q2_K, + gguf.GGMLQuantizationType.Q3_K, + gguf.GGMLQuantizationType.Q4_K, + gguf.GGMLQuantizationType.Q5_K, + gguf.GGMLQuantizationType.Q6_K, } IMATRIX_QUANT_TYPES = { - WeightType.IQ1_M, - WeightType.IQ1_S, - WeightType.IQ2_XXS, - WeightType.IQ2_XS, - WeightType.IQ2_S, - WeightType.IQ3_XXS, - WeightType.IQ3_S, - WeightType.IQ4_XS, - WeightType.IQ4_NL, + gguf.GGMLQuantizationType.IQ1_M, + gguf.GGMLQuantizationType.IQ1_S, + gguf.GGMLQuantizationType.IQ2_XXS, + gguf.GGMLQuantizationType.IQ2_XS, + 
gguf.GGMLQuantizationType.IQ2_S, + gguf.GGMLQuantizationType.IQ3_XXS, + gguf.GGMLQuantizationType.IQ3_S, + gguf.GGMLQuantizationType.IQ4_XS, + gguf.GGMLQuantizationType.IQ4_NL, } # TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization. # Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add @@ -73,8 +72,7 @@ MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES -def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, - qweight_type: int) -> torch.Tensor: +def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor: # there is no need to call any kernel for fp16/bf16 if qweight_type in UNQUANTIZED_TYPES: return x @ qweight.T @@ -87,8 +85,8 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, # y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) # elif qweight_type in MMQ_QUANT_TYPES: # y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) - # If there is no available MMQ kernel, fallback to dequantize + # If there is no available MMQ kernel, fallback to dequantize elif qweight_type in DEQUANT_TYPES: block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) @@ -98,9 +96,8 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, # Raise an error if the quantization type is not supported. # Might be useful if llama.cpp adds a new quantization type. # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type. - qweight_type = WeightType(qweight_type) - raise NotImplementedError( - f"Unsupported GGUF quantization type: {qweight_type}") + qweight_type = gguf.GGMLQuantizationType(qweight_type) + raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}") return y @@ -528,12 +525,12 @@ def __init__( self.compute_dtype = compute_dtype self.device = device - def forward(self, inputs): + def forward(self, inputs: torch.Tensor): if ops is not None and self.weight.is_cuda and inputs.is_cuda: return self.forward_cuda(inputs) return self.forward_native(inputs) - def forward_native(self, inputs): + def forward_native(self, inputs: torch.Tensor): weight = dequantize_gguf_tensor(self.weight) weight = weight.to(self.compute_dtype) bias = self.bias.to(self.compute_dtype) if self.bias is not None else None @@ -541,12 +538,9 @@ def forward_native(self, inputs): output = torch.nn.functional.linear(inputs, weight, bias) return output - def forward_cuda(self, inputs): + def forward_cuda(self, inputs: torch.Tensor): quant_type = self.weight.quant_type - orig_shape = inputs.shape - inputs = inputs.view(-1, orig_shape[-1]) output = _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) if self.bias is not None: - output = output + self.bias.to(self.compute_dtype) - return output.view(*orig_shape[:-1], -1) - + output += self.bias.to(self.compute_dtype) + return output diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 72f020ec193e..72b12badf269 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -76,9 +76,9 @@ is_hpu_available, is_inflect_available, is_invisible_watermark_available, - is_kernels_available, is_k_diffusion_available, is_k_diffusion_version, + is_kernels_available, is_librosa_available, is_matplotlib_available, is_nltk_available,
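
Reviewer note (not part of the patch): a minimal usage sketch of how the new kernel path gets exercised. With the `kernels` package installed (`pip install kernels`) and a CUDA device of compute capability >= 7.0, `GGUFLinear.forward` dispatches to `forward_cuda`, which calls `_fused_mul_mat_gguf`; otherwise it falls back to the existing `forward_native` dequantize path. The checkpoint repo and filename below are illustrative (taken from the existing diffusers GGUF docs), not something this patch introduces.

    import torch
    from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

    # Illustrative GGUF checkpoint (Q2_K); any quant type covered by DEQUANT_TYPES works.
    ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"

    transformer = FluxTransformer2DModel.from_single_file(
        ckpt_path,
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        transformer=transformer,
        torch_dtype=torch.bfloat16,
    ).to("cuda")

    # With weights and inputs on CUDA, GGUFLinear.forward picks forward_cuda
    # (`ops` resolved via kernels.get_kernel("Isotr0py/ggml")); without the
    # `kernels` package, or on GPUs with compute capability < 7.0, it silently
    # falls back to forward_native.
    image = pipe("a photo of a cat", num_inference_steps=20).images[0]
    image.save("cat.png")

Since MMQ/MMVQ are commented out in patch 2/3, the CUDA path currently always goes through ops.ggml_dequantize followed by a plain matmul, so outputs should match forward_native up to compute_dtype rounding.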