
Commit f0c9b33

isuruf authored and pytorchmergebot committed
Support more dtypes for input, indices in gather (pytorch#151822)
Pull Request resolved: pytorch#151822 Approved by: https://github.com/ngimel
1 parent 4c8dee7 commit f0c9b33
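
The user-visible effect of this change is that gather (and the shared scatter/gather dtype check) accepts int32 index tensors in addition to int64. A minimal sketch of the intended behavior; the shapes and values below are illustrative and not taken from the PR's tests:

import torch

x = torch.arange(12, dtype=torch.float32).reshape(3, 4)
idx32 = torch.tensor([[0, 1], [2, 3], [1, 0]], dtype=torch.int32)

# Previously this raised "gather(): Expected dtype int64 for index";
# with this commit int32 indices are accepted as well.
out = torch.gather(x, 1, idx32)  # picks x[i, idx32[i, j]] for each position
print(out)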

10 files changed: 97 additions and 68 deletions


aten/src/ATen/native/ScatterGatherChecks.h

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@ inline void scatter_gather_dtype_check(
 ) {
   if (index.numel() != 0) {
     TORCH_CHECK(
-      index.scalar_type() == at::ScalarType::Long,
-      method_name, "(): Expected dtype int64 for index"
+      index.scalar_type() == at::ScalarType::Long || index.scalar_type() == at::ScalarType::Int,
+      method_name, "(): Expected dtype int32/int64 for index"
     );
   }

aten/src/ATen/native/TensorAdvancedIndexing.cpp

Lines changed: 3 additions & 2 deletions
@@ -175,9 +175,10 @@ TORCH_META_FUNC(gather)
   auto is_index_empty = index.numel() == 0;
   if (!is_index_empty) {
     TORCH_CHECK(
-        index.scalar_type() == at::ScalarType::Long,
+        index.scalar_type() == ScalarType::Long ||
+            index.scalar_type() == ScalarType::Int,
         "gather",
-        "(): Expected dtype int64 for index");
+        "(): Expected dtype int32/int64 for index");
   }
   if (is_index_empty)
     return;

aten/src/ATen/native/cpu/ScatterGatherKernel.cpp

Lines changed: 12 additions & 6 deletions
@@ -167,10 +167,11 @@ template <bool is_scatter_like = true>
 struct cpu_scatter_gather_base_kernel {
   template <typename func_t>
   void operator()(const Tensor& self, int64_t dim,
-    const Tensor& index, const Scalar& value,
+    const Tensor& _index, const Scalar& value,
     const std::string& method_name, func_t& kernel_func) {

     Tensor buffer;
+    Tensor index = _index.to(ScalarType::Long);
     bool need_acc = isReducedFloatingType(self.scalar_type());
     create_acc_buffer(buffer, self, need_acc);

@@ -263,10 +264,11 @@ struct cpu_scatter_gather_base_kernel {

   template <typename func_t>
   void operator()(const Tensor& self, int64_t dim,
-    const Tensor& index, const Tensor& src,
+    const Tensor& _index, const Tensor& src,
     const std::string& method_name, func_t& kernel_func) {

     Tensor buffer;
+    Tensor index = _index.to(ScalarType::Long);
     bool need_acc = isReducedFloatingType(self.scalar_type());
     create_acc_buffer(buffer, self, need_acc);

@@ -358,10 +360,11 @@ struct cpu_scatter_gather_base_kernel {
   }

   void operator()(const Tensor& self, int64_t dim,
-    const Tensor& index, const Tensor& src,
+    const Tensor& _index, const Tensor& src,
     const std::string& method_name, ReduceMean& kernel_func) {

     Tensor buffer;
+    Tensor index = _index.to(ScalarType::Long);
     bool need_acc = isReducedFloatingType(self.scalar_type());
     create_acc_buffer(buffer, self, need_acc);

@@ -453,9 +456,10 @@ struct cpu_scatter_gather_base_kernel {
   }

   void operator()(const Tensor& self, int64_t dim,
-    const Tensor& index, const Tensor& src,
+    const Tensor& _index, const Tensor& src,
     const std::string& method_name, ReduceMaximum& kernel_func) {
     Tensor buffer;
+    Tensor index = _index.to(ScalarType::Long);
     bool need_acc = isReducedFloatingType(self.scalar_type());
     create_acc_buffer(buffer, self, need_acc);

@@ -547,10 +551,11 @@ struct cpu_scatter_gather_base_kernel {
   }

   void operator()(const Tensor& self, int64_t dim,
-    const Tensor& index, const Tensor& src,
+    const Tensor& _index, const Tensor& src,
     const std::string& method_name, ReduceMinimum& kernel_func) {

     Tensor buffer;
+    Tensor index = _index.to(ScalarType::Long);
     bool need_acc = isReducedFloatingType(self.scalar_type());
     create_acc_buffer(buffer, self, need_acc);

@@ -810,7 +815,8 @@ void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index,
 }

 template <typename scalar_t>
-void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& index, const Tensor& self) {
+void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& _index, const Tensor& self) {
+  Tensor index = _index.to(ScalarType::Long);
   const int64_t* index_data = index.const_data_ptr<int64_t>();
   scalar_t* result_data = result.data_ptr<scalar_t>();
   const scalar_t* self_data = self.const_data_ptr<scalar_t>();
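
Note that the CPU kernels are not templated over the index dtype; each overload of cpu_scatter_gather_base_kernel (and cpu_gather_expanded_index_kernel) up-casts the incoming index to int64 before running. Functionally this should match pre-casting the index in Python, as in this rough sketch (shapes illustrative, not from the PR):

import torch

x = torch.randn(4, 5)
idx32 = torch.randint(0, 5, (4, 3), dtype=torch.int32)

# On CPU the kernel converts int32 indices to int64 internally, so these two
# calls are expected to produce identical results.
assert torch.equal(torch.gather(x, 1, idx32), torch.gather(x, 1, idx32.long()))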

aten/src/ATen/native/cuda/IndexKernel.cu

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
       auto inp_stride_bytes = index_stride[0];
       auto out_stride_bytes = iter.strides(0)[1];
       if (iter.numel() == 0) return;
-      at::native::vectorized_gather_kernel_launch<alignment>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
+      at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
                                                              slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
       return;
     }

aten/src/ATen/native/cuda/IndexKernelUtils.cu

Lines changed: 8 additions & 6 deletions
@@ -7,8 +7,8 @@
 #include <ATen/ceil_div.h>

 namespace at::native {
-template <int Alignment>
-__global__ void vectorized_gather_kernel(char * out, char * inp, int64_t * idx, int num_ind, int64_t slice_size, int64_t ind_dim_size, int64_t inp_stride, int64_t out_stride, bool allow_neg_indices) {
+template <int Alignment, typename index_t>
+__global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx, int num_ind, int64_t slice_size, int64_t ind_dim_size, int64_t inp_stride, int64_t out_stride, bool allow_neg_indices) {
   int64_t ind = idx[blockIdx.x];
   if (allow_neg_indices) {
     ind = (ind < 0) ? ind + ind_dim_size : ind;

@@ -22,8 +22,8 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, int64_t * idx,



-template <int64_t Alignment>
-void vectorized_gather_kernel_launch(char * out, char * inp, int64_t * idx, int num_ind,
+template <int64_t Alignment, typename index_t>
+void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int num_ind,
                                      int64_t slice_size_in_bytes, int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes, bool allow_neg_indices){

   constexpr int64_t max_num_threads=256;

@@ -32,13 +32,15 @@ void vectorized_gather_kernel_launch(char * out, char * inp, int64_t * idx, int
                                      static_cast<int64_t>(C10_WARP_SIZE));
   dim3 grid = {static_cast<uint32_t>(num_ind), static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1};
   auto block = std::min(max_num_threads, num_threads);
-  vectorized_gather_kernel<Alignment><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(out, inp, idx, num_ind, slice_size_in_bytes,
+  vectorized_gather_kernel<Alignment, index_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(out, inp, idx, num_ind, slice_size_in_bytes,
                                                                                             ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

 // explicit template instantiation
-template void vectorized_gather_kernel_launch<16>(char * out, char * inp, int64_t * idx, int num_ind, int64_t slice_size_in_bytes,
+template void vectorized_gather_kernel_launch<16, int64_t>(char * out, char * inp, int64_t * idx, int num_ind, int64_t slice_size_in_bytes,
+                                                           int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes, bool allow_neg_indices);
+template void vectorized_gather_kernel_launch<16, int32_t>(char * out, char * inp, int32_t * idx, int num_ind, int64_t slice_size_in_bytes,
                                                            int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes, bool allow_neg_indices);

 }

aten/src/ATen/native/cuda/IndexKernelUtils.h

Lines changed: 2 additions & 2 deletions
@@ -26,8 +26,8 @@ inline bool fast_gather_kernel_eligible(const TensorIterator& iter, char * const
          get_alignment(static_cast<size_t>(iter.strides(0)[1])) == alignment;
 }

-template <int64_t Alignment>
-void vectorized_gather_kernel_launch(char * out, char * inp, int64_t * idx, int num_ind,
+template <int64_t Alignment, typename index_t>
+void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int num_ind,
                                      int64_t slice_size_in_bytes, int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes,
                                      bool allow_neg_indices=false);

aten/src/ATen/native/cuda/ScatterGatherKernel.cu

Lines changed: 60 additions & 32 deletions
@@ -116,7 +116,7 @@ static void _launch_scatter_gather_kernel(int64_t N, const func_t& f) {
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

-template <bool is_scatter_like, typename scalar_t>
+template <bool is_scatter_like, typename scalar_t, typename index_t>
 struct _cuda_scatter_gather_internal_kernel {
   template <typename func_t>
   void operator() (

@@ -128,7 +128,7 @@ struct _cuda_scatter_gather_internal_kernel {
   ) {
     if (!iter.can_use_32bit_indexing()) {
       for (auto& sub_iter : iter.with_32bit_indexing()) {
-        _cuda_scatter_gather_internal_kernel<is_scatter_like, scalar_t>()(
+        _cuda_scatter_gather_internal_kernel<is_scatter_like, scalar_t, index_t>()(
          sub_iter, index_size, index_stride, numel, f
        );
      }

@@ -151,15 +151,15 @@ struct _cuda_scatter_gather_internal_kernel {
         auto inp_stride_bytes = index_stride * element_size;
         auto out_stride_bytes = iter.strides(0)[1];
         if (iter.numel() == 0) return;
-        at::native::vectorized_gather_kernel_launch<alignment>(self_ptr, src_ptr, (int64_t*)index_ptr, num_ind, slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes);
+        at::native::vectorized_gather_kernel_launch<alignment, index_t>(self_ptr, src_ptr, (index_t*)index_ptr, num_ind, slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes);
         return;
       }
     }
     auto offset_calc = make_offset_calculator<3>(iter);
     auto loop = [=]C10_DEVICE(int i) {
       auto offsets = offset_calc.get(i);

-      int64_t idx_dim = *(int64_t*)(index_ptr + offsets[2]);
+      int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
       CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
                          && "scatter gather kernel index out of bounds");

@@ -229,9 +229,11 @@ struct cuda_scatter_gather_base_kernel {
         using dtype = typename std::conditional<cast_to_opaque,
           OpaqueType<sizeof(scalar_t)>, scalar_t>::type;

-        _cuda_scatter_gather_internal_kernel<is_scatter_like, dtype>()(
-          iter, index_size, index_stride, self.numel(), f
-        );
+        AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "cuda_scatter_gather_base_kernel_func", [&] () {
+          _cuda_scatter_gather_internal_kernel<is_scatter_like, dtype, index_t>()(
+            iter, index_size, index_stride, self.numel(), f
+          );
+        });
       }
     );
   }

@@ -279,19 +281,40 @@ struct cuda_scatter_gather_base_kernel {
     auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
     auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;

-
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
-      at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
-      iter.dtype(),
-      "cuda_scatter_gather_base_kernel_func", [&] {
+    if (self.is_quantized()) {
+      TORCH_CHECK(
+        self.qscheme() == kPerTensorAffine,
+        "Only per_tensor quantized quantized tensors are supported by gather.")
+      AT_DISPATCH_QINT_TYPES(iter.dtype(), "gather_quant_cuda", [&] {
         using dtype = typename std::conditional<cast_to_opaque,
-          OpaqueType<sizeof(scalar_t)>, scalar_t>::type;
-
-        _cuda_scatter_gather_internal_kernel<is_scatter_like, dtype>()(
-          iter, index_size, index_stride, self.numel(), f
-        );
-      }
-    );
+          OpaqueType<sizeof(scalar_t)>, scalar_t>::type;
+        AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "cuda_scatter_gather_base_kernel_func", [&] () {
+          _cuda_scatter_gather_internal_kernel<is_scatter_like, dtype, index_t>()(
+            iter, index_size, index_stride, self.numel(), f
+          );
+        });
+      });
+    } else {
+      AT_DISPATCH_V2(
+        iter.dtype(),
+        "gather_cuda",
+        AT_WRAP([&] {
+          using dtype = typename std::conditional<cast_to_opaque,
+            OpaqueType<sizeof(scalar_t)>, scalar_t>::type;
+          AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "cuda_scatter_gather_base_kernel_func", [&] () {
+            _cuda_scatter_gather_internal_kernel<is_scatter_like, dtype, index_t>()(
+              iter, index_size, index_stride, self.numel(), f
+            );
+          });
+        }),
+        AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+        AT_EXPAND(AT_FLOAT8_TYPES),
+        kComplexHalf,
+        kHalf,
+        kBool,
+        kBFloat16);
+    }
   }

   template <typename func_t>

@@ -338,23 +361,24 @@ struct cuda_scatter_gather_base_kernel {
     auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
     auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;

-
     AT_DISPATCH_ALL_TYPES_AND2(
       at::ScalarType::Half, at::ScalarType::BFloat16,
       iter.dtype(),
       "cuda_scatter_gather_base_kernel_func", [&] {
         using dtype = typename std::conditional<cast_to_opaque,
           OpaqueType<sizeof(scalar_t)>, scalar_t>::type;

-        _cuda_scatter_gather_internal_kernel<is_scatter_like, dtype>()(
-          iter, index_size, index_stride, self.numel(), f
-        );
+        AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "cuda_scatter_gather_base_kernel_func", [&] () {
+          _cuda_scatter_gather_internal_kernel<is_scatter_like, dtype, index_t>()(
+            iter, index_size, index_stride, self.numel(), f
+          );
+        });
       }
     );
   }
 }; // struct cuda_scatter_gather_base_kernel

-template <typename scalar_t>
+template <typename scalar_t, typename index_t>
 struct _cuda_scatter_fill_internal_kernel {
   template <typename func_t>
   void operator()(

@@ -367,7 +391,7 @@ struct _cuda_scatter_fill_internal_kernel {
   ) {
     if (!iter.can_use_32bit_indexing()) {
       for (auto& sub_iter : iter.with_32bit_indexing()) {
-        _cuda_scatter_fill_internal_kernel<scalar_t>()(
+        _cuda_scatter_fill_internal_kernel<scalar_t, index_t>()(
          sub_iter, src_val, index_size, index_stride, numel, f
        );
      }

@@ -381,7 +405,7 @@ struct _cuda_scatter_fill_internal_kernel {
     auto loop = [=]C10_DEVICE(int i) {
       auto offsets = offset_calc.get(i);

-      int64_t idx_dim = *(int64_t*)(index_ptr + offsets[1]);
+      int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
       CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
                          && "index out of bounds"
       );

@@ -437,9 +461,11 @@ struct cuda_scatter_fill_base_kernel {
         auto src_scalar_val = src.to<scalar_t>();
         auto src_val = *(dtype*)&src_scalar_val;

-        _cuda_scatter_fill_internal_kernel<dtype>()(
-          iter, src_val, index_size, index_stride, self.numel(), f
-        );
+        AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "cuda_scatter_fill_base_kernel_func", [&] () {
+          _cuda_scatter_fill_internal_kernel<dtype, index_t>()(
+            iter, src_val, index_size, index_stride, self.numel(), f
+          );
+        });
       }
     );
   }

@@ -480,9 +506,11 @@ struct cuda_scatter_fill_base_kernel {
         auto src_scalar_val = src.to<scalar_t>();
         auto src_val = *(dtype*)&src_scalar_val;

-        _cuda_scatter_fill_internal_kernel<dtype>()(
-          iter, src_val, index_size, index_stride, self.numel(), f
-        );
+        AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "cuda_scatter_fill_base_kernel_reduce_multiply", [&] () {
+          _cuda_scatter_fill_internal_kernel<dtype, index_t>()(
+            iter, src_val, index_size, index_stride, self.numel(), f
+          );
+        });
       }
     );
   }
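
On CUDA, by contrast, the gather, scatter, and scatter-fill kernels are templated on index_t and dispatch on the index dtype via AT_DISPATCH_INDEX_TYPES instead of hard-coding int64_t. Since the shared dtype check is relaxed as well, scatter-style ops with int32 indices should also be reachable, though gather is the primary target of the PR. A hedged sketch with illustrative shapes and values:

import torch

if torch.cuda.is_available():
    src = torch.ones(2, 5, device="cuda")
    idx32 = torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]], dtype=torch.int32, device="cuda")
    out = torch.zeros(3, 5, device="cuda")
    # int32 index tensor; the relaxed check plus the index_t dispatch make this
    # path usable without casting to int64 first
    out.scatter_(0, idx32, src)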

torch/_inductor/lowering.py

Lines changed: 0 additions & 1 deletion
@@ -3335,7 +3335,6 @@ def gather(x, dim, index, sparse_grad=False):
         # Empty index case. Return an empty array with the same shape
         return new_empty(x, index.get_size())

-    assert index.get_dtype() == torch.int64
     size = x.get_size()
     offset = len(size) == 0
     dim = _validate_dim(x, dim, offset)
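
With the int64 assert removed from the Inductor gather lowering, a compiled gather should no longer reject int32 indices. A sketch of the kind of call this enables (function and tensor names are illustrative, not from the PR):

import torch

def take_rows(x, idx):
    return torch.gather(x, 0, idx)

compiled = torch.compile(take_rows)
x = torch.randn(8, 4)
idx32 = torch.randint(0, 8, (3, 4), dtype=torch.int32)
# Previously the lowering asserted index.get_dtype() == torch.int64.
out = compiled(x, idx32)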

torch/_meta_registrations.py

Lines changed: 4 additions & 4 deletions
@@ -5420,8 +5420,8 @@ def meta_gather(self, dim, index, sparse_grad=False):
     is_index_empty = guard_size_oblivious(index.numel() == 0)
     if not is_index_empty:
         torch._check(
-            index.dtype == torch.long,
-            lambda: f"gather(): Expected dtype int64 for index, but got {index.dtype}",
+            index.dtype == torch.long or index.dtype == torch.int,
+            lambda: f"gather(): Expected dtype int32/int64 for index, but got {index.dtype}",
         )
     gather_shape_check(self, wrapped_dim, index)
     return self.new_empty(index.shape)

@@ -5460,8 +5460,8 @@ def scatter_gather_dtype_check(method_name, self, index, src_opt=None):

     if guard_size_oblivious(index.numel() != 0):
         torch._check(
-            index.dtype == torch.long,
-            lambda: f"{method_name}(): Expected dtype int64 for index",
+            index.dtype == torch.long or index.dtype == torch.int,
+            lambda: f"{method_name}(): Expected dtype int32/int64 for index",
         )

     if src_opt is not None:
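
The relaxed meta registrations mean shape inference under FakeTensor/meta tensors also tolerates int32 indices. A small sketch, assuming a meta-device call is representative of the tracing path:

import torch

x = torch.empty(3, 4, device="meta")
idx32 = torch.zeros(3, 2, dtype=torch.int32, device="meta")
out = torch.gather(x, 1, idx32)  # meta kernel now accepts int32 indices
print(out.shape)  # torch.Size([3, 2])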
