From 6c4d01def77743f864dbb535edff0aafb36875ba Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sat, 5 Jul 2025 17:47:06 +0800 Subject: [PATCH 1/3] add gguf kernel support Signed-off-by: Isotr0py <2037008807@qq.com> --- src/diffusers/quantizers/gguf/utils.py | 84 +++++++++++++++++++++++++- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/import_utils.py | 5 ++ 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 41d351712961..27010408541b 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -17,10 +17,11 @@ from contextlib import nullcontext import gguf +from gguf import GGMLQuantizationType as WeightType import torch import torch.nn as nn -from ...utils import is_accelerate_available +from ...utils import is_accelerate_available, is_kernels_available if is_accelerate_available(): @@ -29,6 +30,76 @@ from accelerate.hooks import add_hook_to_module, remove_hook_from_module +can_use_cuda_kernels = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 7 +if can_use_cuda_kernels and is_kernels_available(): + from kernels import get_kernel + ops = get_kernel("Isotr0py/ggml") +else: + ops = None + + +UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} +STANDARD_QUANT_TYPES = { + WeightType.Q4_0, + WeightType.Q4_1, + WeightType.Q5_0, + WeightType.Q5_1, + WeightType.Q8_0, + WeightType.Q8_1, +} +KQUANT_TYPES = { + WeightType.Q2_K, + WeightType.Q3_K, + WeightType.Q4_K, + WeightType.Q5_K, + WeightType.Q6_K, +} +IMATRIX_QUANT_TYPES = { + WeightType.IQ1_M, + WeightType.IQ1_S, + WeightType.IQ2_XXS, + WeightType.IQ2_XS, + WeightType.IQ2_S, + WeightType.IQ3_XXS, + WeightType.IQ3_S, + WeightType.IQ4_XS, + WeightType.IQ4_NL, +} +# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization. +# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add +# MMQ kernel for I-Matrix quantization. +DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES +MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES +MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES + + +def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, + qweight_type: int) -> torch.Tensor: + # there is no need to call any kernel for fp16/bf16 + if qweight_type in UNQUANTIZED_TYPES: + return x @ qweight.T + # enable MMVQ in contiguous batching with batch_size=1 + if qweight_type in MMVQ_QUANT_TYPES: + y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) + # Use MMQ Kernel if it's available (standard + k-quants) + elif qweight_type in MMQ_QUANT_TYPES: + y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) + # If there is no available MMQ kernel, fallback to dequantize + elif qweight_type in DEQUANT_TYPES: + block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] + shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) + weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype) + y = x @ weight.T + else: + # Raise an error if the quantization type is not supported. + # Might be useful if llama.cpp adds a new quantization type. + # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type. 
+ qweight_type = WeightType(qweight_type) + raise NotImplementedError( + f"Unsupported GGUF quantization type: {qweight_type}") + return y + + # Copied from diffusers.quantizers.bitsandbytes.utils._create_accelerate_new_hook def _create_accelerate_new_hook(old_hook): r""" @@ -451,11 +522,22 @@ def __init__( ) -> None: super().__init__(in_features, out_features, bias, device) self.compute_dtype = compute_dtype + self.device = device def forward(self, inputs): + if ops is not None and self.weight.is_cuda and inputs.is_cuda: + return self.forward_cuda(inputs) + return self.forward_native(inputs) + + def forward_native(self, inputs): weight = dequantize_gguf_tensor(self.weight) weight = weight.to(self.compute_dtype) bias = self.bias.to(self.compute_dtype) if self.bias is not None else None output = torch.nn.functional.linear(inputs, weight, bias) return output + + def forward_cuda(self, inputs): + quant_type = self.weight.quant_type + return _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) + diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 2df05cb8eb36..72f020ec193e 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -76,6 +76,7 @@ is_hpu_available, is_inflect_available, is_invisible_watermark_available, + is_kernels_available, is_k_diffusion_available, is_k_diffusion_version, is_librosa_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f12e9de33172..6174d5b72c32 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -192,6 +192,7 @@ def _is_package_available(pkg_name: str, get_dist_name: bool = False) -> Tuple[b _torch_npu_available, _torch_npu_version = _is_package_available("torch_npu") _transformers_available, _transformers_version = _is_package_available("transformers") _hf_hub_available, _hf_hub_version = _is_package_available("huggingface_hub") +_kernels_available, _kernels_version = _is_package_available("kernels") _inflect_available, _inflect_version = _is_package_available("inflect") _unidecode_available, _unidecode_version = _is_package_available("unidecode") _k_diffusion_available, _k_diffusion_version = _is_package_available("k_diffusion") @@ -274,6 +275,10 @@ def is_accelerate_available(): return _accelerate_available +def is_kernels_available(): + return _kernels_available + + def is_k_diffusion_available(): return _k_diffusion_available From 66bd237bc5fddafa813f6564cf041a539f39429d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 6 Jul 2025 01:00:01 +0800 Subject: [PATCH 2/3] fix Signed-off-by: Isotr0py <2037008807@qq.com> --- src/diffusers/quantizers/gguf/utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 27010408541b..03521eadb2b4 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -78,17 +78,21 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, # there is no need to call any kernel for fp16/bf16 if qweight_type in UNQUANTIZED_TYPES: return x @ qweight.T - # enable MMVQ in contiguous batching with batch_size=1 - if qweight_type in MMVQ_QUANT_TYPES: - y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) - # Use MMQ Kernel if it's available (standard + k-quants) - elif qweight_type in MMQ_QUANT_TYPES: - y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, 
qweight.shape[0]) + + # TODO(Isotr0py): GGUF's MMQ and MMVQ implementation are designed for + # contiguous batching and inefficient with diffusers' batching, + # so we disabled it now. + + # elif qweight_type in MMVQ_QUANT_TYPES: + # y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) + # elif qweight_type in MMQ_QUANT_TYPES: + # y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) # If there is no available MMQ kernel, fallback to dequantize + elif qweight_type in DEQUANT_TYPES: block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) - weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype) + weight = ops.ggml_dequantize(qweight, qweight_type, *shape) y = x @ weight.T else: # Raise an error if the quantization type is not supported. @@ -539,5 +543,10 @@ def forward_native(self, inputs): def forward_cuda(self, inputs): quant_type = self.weight.quant_type - return _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) + orig_shape = inputs.shape + inputs = inputs.view(-1, orig_shape[-1]) + output = _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) + if self.bias is not None: + output = output + self.bias.to(self.compute_dtype) + return output.view(*orig_shape[:-1], -1) From e46571a7aad95b2a4efc10d076740ad260e129fc Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 6 Jul 2025 01:47:13 +0800 Subject: [PATCH 3/3] optimize Signed-off-by: Isotr0py <2037008807@qq.com> --- src/diffusers/quantizers/gguf/utils.py | 68 ++++++++++++-------------- src/diffusers/utils/__init__.py | 2 +- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py index 03521eadb2b4..31f6ec3e7321 100644 --- a/src/diffusers/quantizers/gguf/utils.py +++ b/src/diffusers/quantizers/gguf/utils.py @@ -17,7 +17,6 @@ from contextlib import nullcontext import gguf -from gguf import GGMLQuantizationType as WeightType import torch import torch.nn as nn @@ -33,37 +32,37 @@ can_use_cuda_kernels = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 7 if can_use_cuda_kernels and is_kernels_available(): from kernels import get_kernel + ops = get_kernel("Isotr0py/ggml") else: ops = None - -UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} +UNQUANTIZED_TYPES = {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16} STANDARD_QUANT_TYPES = { - WeightType.Q4_0, - WeightType.Q4_1, - WeightType.Q5_0, - WeightType.Q5_1, - WeightType.Q8_0, - WeightType.Q8_1, + gguf.GGMLQuantizationType.Q4_0, + gguf.GGMLQuantizationType.Q4_1, + gguf.GGMLQuantizationType.Q5_0, + gguf.GGMLQuantizationType.Q5_1, + gguf.GGMLQuantizationType.Q8_0, + gguf.GGMLQuantizationType.Q8_1, } KQUANT_TYPES = { - WeightType.Q2_K, - WeightType.Q3_K, - WeightType.Q4_K, - WeightType.Q5_K, - WeightType.Q6_K, + gguf.GGMLQuantizationType.Q2_K, + gguf.GGMLQuantizationType.Q3_K, + gguf.GGMLQuantizationType.Q4_K, + gguf.GGMLQuantizationType.Q5_K, + gguf.GGMLQuantizationType.Q6_K, } IMATRIX_QUANT_TYPES = { - WeightType.IQ1_M, - WeightType.IQ1_S, - WeightType.IQ2_XXS, - WeightType.IQ2_XS, - WeightType.IQ2_S, - WeightType.IQ3_XXS, - WeightType.IQ3_S, - WeightType.IQ4_XS, - WeightType.IQ4_NL, + gguf.GGMLQuantizationType.IQ1_M, + gguf.GGMLQuantizationType.IQ1_S, + gguf.GGMLQuantizationType.IQ2_XXS, + gguf.GGMLQuantizationType.IQ2_XS, + 
gguf.GGMLQuantizationType.IQ2_S, + gguf.GGMLQuantizationType.IQ3_XXS, + gguf.GGMLQuantizationType.IQ3_S, + gguf.GGMLQuantizationType.IQ4_XS, + gguf.GGMLQuantizationType.IQ4_NL, } # TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization. # Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add @@ -73,8 +72,7 @@ MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES -def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, - qweight_type: int) -> torch.Tensor: +def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor: # there is no need to call any kernel for fp16/bf16 if qweight_type in UNQUANTIZED_TYPES: return x @ qweight.T @@ -87,8 +85,8 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, # y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) # elif qweight_type in MMQ_QUANT_TYPES: # y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) - # If there is no available MMQ kernel, fallback to dequantize + # If there is no available MMQ kernel, fallback to dequantize elif qweight_type in DEQUANT_TYPES: block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) @@ -98,9 +96,8 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, # Raise an error if the quantization type is not supported. # Might be useful if llama.cpp adds a new quantization type. # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type. - qweight_type = WeightType(qweight_type) - raise NotImplementedError( - f"Unsupported GGUF quantization type: {qweight_type}") + qweight_type = gguf.GGMLQuantizationType(qweight_type) + raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}") return y @@ -528,12 +525,12 @@ def __init__( self.compute_dtype = compute_dtype self.device = device - def forward(self, inputs): + def forward(self, inputs: torch.Tensor): if ops is not None and self.weight.is_cuda and inputs.is_cuda: return self.forward_cuda(inputs) return self.forward_native(inputs) - def forward_native(self, inputs): + def forward_native(self, inputs: torch.Tensor): weight = dequantize_gguf_tensor(self.weight) weight = weight.to(self.compute_dtype) bias = self.bias.to(self.compute_dtype) if self.bias is not None else None @@ -541,12 +538,9 @@ def forward_native(self, inputs): output = torch.nn.functional.linear(inputs, weight, bias) return output - def forward_cuda(self, inputs): + def forward_cuda(self, inputs: torch.Tensor): quant_type = self.weight.quant_type - orig_shape = inputs.shape - inputs = inputs.view(-1, orig_shape[-1]) output = _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type) if self.bias is not None: - output = output + self.bias.to(self.compute_dtype) - return output.view(*orig_shape[:-1], -1) - + output += self.bias.to(self.compute_dtype) + return output diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 72f020ec193e..72b12badf269 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -76,9 +76,9 @@ is_hpu_available, is_inflect_available, is_invisible_watermark_available, - is_kernels_available, is_k_diffusion_available, is_k_diffusion_version, + is_kernels_available, is_librosa_available, is_matplotlib_available, is_nltk_available,
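
Reviewer note (not part of the patch): a minimal usage sketch of how the new kernel path gets exercised. With the `kernels` package installed (`pip install kernels`) and a CUDA device of compute capability >= 7.0, `GGUFLinear.forward` dispatches to `forward_cuda`, which calls `_fused_mul_mat_gguf`; otherwise it falls back to the existing `forward_native` dequantize path. The checkpoint repo and filename below are illustrative (taken from the existing diffusers GGUF docs), not something this patch introduces.

    import torch
    from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

    # Illustrative GGUF checkpoint (Q2_K); any quant type covered by DEQUANT_TYPES works.
    ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"

    transformer = FluxTransformer2DModel.from_single_file(
        ckpt_path,
        quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        torch_dtype=torch.bfloat16,
    )
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        transformer=transformer,
        torch_dtype=torch.bfloat16,
    ).to("cuda")

    # With weights and inputs on CUDA, GGUFLinear.forward picks forward_cuda
    # (`ops` resolved via kernels.get_kernel("Isotr0py/ggml")); without the
    # `kernels` package, or on GPUs with compute capability < 7.0, it silently
    # falls back to forward_native.
    image = pipe("a photo of a cat", num_inference_steps=20).images[0]
    image.save("cat.png")

Since MMQ/MMVQ are commented out in patch 2/3, the CUDA path currently always goes through ops.ggml_dequantize followed by a plain matmul, so outputs should match forward_native up to compute_dtype rounding.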