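CUDA: conv2d minor fixes (guard element index in float_mma/half_mma loops, use uint64_t for in_idx, pass conv_params by const reference, drop <cstdint> include)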

mnehete32 · mnehete32 · commit 74f490759413 · 2025-09-16T04:58:58.000+05:30
diff --git a/ggml/src/ggml-cuda/conv2d.cu b/ggml/src/ggml-cuda/conv2d.cu
@@ -1,8 +1,6 @@
 #include "conv2d.cuh"
 #include "convert.cuh"
 
-#include <cstdint>
-
 struct conv_params {
     const uint IW, IH;
     const uint OW, OH;
@@ -88,6 +86,9 @@ template <typename layout> __device__ class float_mma {
 #pragma unroll
         for (uint i = 0; i < num_acc; i++) {
             const uint e = lane_id + i * WARP_SIZE;
+            if (e >= WMMA_M * WMMA_N) {
+                continue;
+            }
             const uint m = e / WMMA_N;
             const uint n = e % WMMA_N;
 
@@ -109,6 +110,9 @@ template <typename layout> __device__ class float_mma {
 #pragma unroll
         for (uint i = 0; i < num_acc; i++) {
             const uint e = lane_id + i * WARP_SIZE;
+            if (e >= WMMA_M * WMMA_N) {
+                continue;
+            }
             const uint m = e / WMMA_N;
             const uint n = e % WMMA_N;
 
@@ -164,6 +168,9 @@ template <typename layout> class half_mma {
 #    pragma unroll
         for (uint l = 0; l < tile_acc::ne; ++l) {
             const uint e = tile_acc::get_i(l) * WMMA_N + tile_acc::get_j(l);
+            if (e >= WMMA_M * WMMA_N) {
+                continue;
+            }
             const uint m = e / WMMA_N;
             const uint n = e % WMMA_N;
 
@@ -313,8 +320,8 @@ __global__ void __launch_bounds__(num_warps * WARP_SIZE) conv2d_kernel(const flo
                 const int in_y = calculate_input_coord(oh, kh, P.ST_Y, P.DL_Y, P.PD_Y);
                 const int in_x = calculate_input_coord(ow, kw, P.ST_X, P.DL_X, P.PD_X);
                 if (in_y >= 0 && in_y < P.IH && in_x >= 0 && in_x < P.IW) {
-                    const int64_t in_idx = layout::input_index(n, ic, in_y, in_x, P);
-                    val                  = ggml_cuda_cast<T>(IN[in_idx]);
+                    const uint64_t in_idx = layout::input_index(n, ic, in_y, in_x, P);
+                    val                   = ggml_cuda_cast<T>(IN[in_idx]);
                 }
             }
             B_sh[brow * BS_NOHOW + bcol] = val;
@@ -359,7 +366,7 @@ __global__ void __launch_bounds__(num_warps * WARP_SIZE) conv2d_kernel(const flo
 }
 
 template <typename T, template <typename> class mma>
-static void conv2d_cuda(const float * X_D, const T * K_D, float * Y_D, const conv_params P, cudaStream_t st) {
+static void conv2d_cuda(const float * X_D, const T * K_D, float * Y_D, const conv_params & P, cudaStream_t st) {
     GGML_ASSERT(BS_OC >= WMMA_M && BS_ICKHKW >= WMMA_K && BS_NOHOW >= WMMA_N);
 
     const uint NUM_BL_OC    = (P.OC + BS_OC - 1) / BS_OC;