
Commit 6962b97

Merge remote-tracking branch 'upstream/main' into port_event_less
2 parents: f1cc530 + f8b1ee9

File tree

9 files changed (+398 / -31 lines)


src/ATen/native/xpu/sycl/Loops.h

Lines changed: 5 additions & 3 deletions
@@ -620,7 +620,7 @@ void gpu_kernel_nocast(TensorIteratorBase& iter, const func_t& f) {
   gpu_kernel_impl_nocast(iter, f);
 }
 
-template <typename func_t>
+template <typename func_t, bool enable_broadcast_vec = true>
 void gpu_kernel(TensorIteratorBase& iter, const func_t& f) {
   for (int arg = 0; arg < iter.ntensors(); arg++) {
     TORCH_INTERNAL_ASSERT(
@@ -637,12 +637,14 @@ void gpu_kernel(TensorIteratorBase& iter, const func_t& f) {
 
   if (!iter.can_use_32bit_indexing()) {
     for (auto& sub_iter : iter.with_32bit_indexing()) {
-      gpu_kernel(sub_iter, f);
+      // Broadcasting vectorization is disabled for sub-iterators to prevent
+      // potential output offset calculation issues.
+      gpu_kernel<func_t, false>(sub_iter, f);
     }
     return;
   }
 
-  gpu_kernel_impl(iter, f);
+  gpu_kernel_impl<func_t, enable_broadcast_vec>(iter, f);
 }
 
 template <typename arg1_t, typename arg2_t, typename return_t, typename func_t>
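Context only, not part of the commit: a minimal sketch of how a caller can pick the new enable_broadcast_vec template parameter. AddScalarFunctor and launch_add_scalar_kernel are made-up names, and the sketch assumes it is compiled inside the same at::native::xpu namespace that Loops.h uses.

#include <ATen/native/xpu/sycl/Loops.h>

namespace at::native::xpu {

// Hypothetical elementwise functor for illustration.
struct AddScalarFunctor {
  float alpha;
  float operator()(float x) const {
    return x + alpha;
  }
};

// Hypothetical launcher: the default instantiation keeps broadcast
// vectorization enabled (enable_broadcast_vec == true).
inline void launch_add_scalar_kernel(TensorIteratorBase& iter, float alpha) {
  gpu_kernel(iter, AddScalarFunctor{alpha});
  // Explicit opt-out, mirroring the sub-iterator path in the diff above:
  // gpu_kernel<AddScalarFunctor, false>(iter, AddScalarFunctor{alpha});
}

} // namespace at::native::xpu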

src/ATen/native/xpu/sycl/LossNLLKernel.cpp

Lines changed: 20 additions & 15 deletions
@@ -1,3 +1,4 @@
+#include <ATen/AccumulateType.h>
 #include <ATen/Functions.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/core/Reduction.h>
@@ -126,7 +127,7 @@ struct NllLossForwardReduce1DKernelFunctor {
   int64_t reduction;
 };
 
-template <typename scalar_t, typename index_t>
+template <typename scalar_t, typename index_t, typename accscalar_t>
 struct NllLossForwardReduce2DKernelFunctor
     : public __SYCL_KER_CONFIG_CONVENTION__ {
   void operator()(sycl::nd_item<1> item_id) const {
@@ -136,17 +137,18 @@ struct NllLossForwardReduce2DKernelFunctor
     auto total_weight_ptr = total_weight_data;
     auto output_ptr = output_data;
     int64_t local_id = item_id.get_local_id(0);
-    local_output_acc[local_id] = 0.0;
-    local_total_weight_acc[local_id] = 0.0;
+    local_output_acc[local_id] = accscalar_t(0);
+    local_total_weight_acc[local_id] = accscalar_t(0);
     for (int i = local_id; i < batch_size; i += local_size) {
       int cur_target = target_ptr[i];
       if (cur_target != ignore_index) {
         scalar_t cur_weight =
             has_weight ? weight_ptr[cur_target] : static_cast<scalar_t>(1.0f);
-        local_total_weight_acc[local_id] += cur_weight;
+        local_total_weight_acc[local_id] +=
+            static_cast<accscalar_t>(cur_weight);
         local_output_acc[local_id] -=
-            static_cast<scalar_t>(input_ptr[i * n_target + cur_target]) *
-            static_cast<scalar_t>(cur_weight);
+            static_cast<accscalar_t>(input_ptr[i * n_target + cur_target]) *
+            static_cast<accscalar_t>(cur_weight);
       }
     }
 
@@ -161,11 +163,13 @@ struct NllLossForwardReduce2DKernelFunctor
     }
     item_id.barrier(sycl_global_and_local_fence);
 
-    output_ptr[0] = local_output_acc[0];
-    total_weight_ptr[0] = local_total_weight_acc[0];
     if (reduction == at::Reduction::Mean) {
-      output_ptr[0] /= total_weight_ptr[0];
+      output_ptr[0] = static_cast<scalar_t>(
+          local_output_acc[0] / local_total_weight_acc[0]);
+    } else {
+      output_ptr[0] = static_cast<scalar_t>(local_output_acc[0]);
     }
+    total_weight_ptr[0] = static_cast<scalar_t>(local_total_weight_acc[0]);
   }
   NllLossForwardReduce2DKernelFunctor(
       scalar_t* input_data_,
@@ -192,8 +196,8 @@ struct NllLossForwardReduce2DKernelFunctor
         reduction(reduction_) {}
 
   void sycl_ker_config_convention(sycl::handler& cgh) {
-    local_output_acc = sycl_local_acc_t<scalar_t>(local_size, cgh);
-    local_total_weight_acc = sycl_local_acc_t<scalar_t>(local_size, cgh);
+    local_output_acc = sycl_local_acc_t<accscalar_t>(local_size, cgh);
+    local_total_weight_acc = sycl_local_acc_t<accscalar_t>(local_size, cgh);
   }
 
 private:
@@ -207,8 +211,8 @@ struct NllLossForwardReduce2DKernelFunctor
   int64_t local_size;
   int64_t ignore_index;
   int n_target;
-  sycl_local_acc_t<scalar_t> local_output_acc;
-  sycl_local_acc_t<scalar_t> local_total_weight_acc;
+  sycl_local_acc_t<accscalar_t> local_output_acc;
+  sycl_local_acc_t<accscalar_t> local_total_weight_acc;
   int64_t reduction;
 };
 
@@ -309,8 +313,9 @@ void nll_loss_forward_template(
 
     sycl_kernel_submit(sycl::range<1>(local_size), queue, kfn);
   } else if (input_cont.dim() == 2) {
+    using accscalar_t = at::acc_type<scalar_t, true>;
     using NllLossForwardReduce2DKernel =
-        NllLossForwardReduce2DKernelFunctor<scalar_t, index_t>;
+        NllLossForwardReduce2DKernelFunctor<scalar_t, index_t, accscalar_t>;
 
     int64_t batch_size = input.size(0);
     int n_target = input.size(1);
@@ -322,7 +327,7 @@ void nll_loss_forward_template(
     auto target_data = _target_data;
    auto total_weight_data = _total_weight_data;
    auto output_data = _output_data;
-    NllLossForwardReduce2DKernelFunctor<scalar_t, index_t> kfn(
+    NllLossForwardReduce2DKernelFunctor<scalar_t, index_t, accscalar_t> kfn(
        input_data,
        target_data,
        weight_data,
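For reference, a standalone sketch (not part of the commit) of what at::acc_type from ATen/AccumulateType.h resolves to. This is why the 2D reduction now keeps its local-memory partial sums in accscalar_t: half and bfloat16 inputs accumulate in float, and the result is cast back to scalar_t only when written out.

#include <ATen/AccumulateType.h>
#include <type_traits>

// Reduced-precision inputs get a wider accumulator on device code paths
// (second template argument /*is_cuda=*/true); wider types are unchanged.
static_assert(
    std::is_same_v<at::acc_type<c10::Half, /*is_cuda=*/true>, float>,
    "half accumulates in float");
static_assert(
    std::is_same_v<at::acc_type<c10::BFloat16, /*is_cuda=*/true>, float>,
    "bfloat16 accumulates in float");
static_assert(
    std::is_same_v<at::acc_type<double, /*is_cuda=*/true>, double>,
    "double keeps accumulating in double");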

src/xccl/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -2,10 +2,13 @@
 
 file(GLOB xccl_h "*.hpp")
 file(GLOB xccl_cpp "*.cpp")
+list(REMOVE_ITEM xccl_cpp "${CMAKE_CURRENT_SOURCE_DIR}/NanCheck_XPU.cpp")
 
 list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp})
+list(APPEND ATen_XPU_SYCL_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/NanCheck_XPU.cpp")
 
 set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE)
+set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
 
 # Why copy the header file to the build directory?
 # We want register XCCL backend to PyTorch c10d in torch/csrc/distributed/c10d/init.cpp#L27-L29.

src/xccl/NanCheck_XPU.cpp

Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
+#include <ATen/Dispatch.h>
+#include <ATen/NumericUtils.h>
+#include <ATen/native/xpu/sycl/MemoryAccessUtils.h>
+#include <ATen/xpu/XPUContext.h>
+#include <comm/SYCLContext.h>
+#include <stdint.h>
+#include <torch/torch.h>
+#include <xccl/NanCheck_XPU.hpp>
+#include <algorithm>
+
+namespace c10d {
+
+using BytePack = at::native::memory::aligned_vector<uint64_t, 2>;
+
+template <typename T, int EltPerPack>
+struct CheckBytePack {
+  static void check(BytePack* tmp) {
+    T* data = (T*)tmp;
+#pragma unroll 8
+    for (int i = 0; i < EltPerPack; i++) {
+      if (at::_isnan(data[i]))
+        assert(0);
+    }
+  }
+};
+
+template <typename T>
+struct CheckBytePack<T, /*EltPerPack*/ 2> {
+  static void check(BytePack* tmp) {
+    T* data = (T*)tmp;
+    if (at::_isnan(data[0]) || at::_isnan(data[1]))
+      assert(0);
+  }
+};
+
+template <typename T>
+struct CheckBytePack<T, /*EltPerPack*/ 4> {
+  static void check(BytePack* tmp) {
+    T* data = (T*)tmp;
+    if (at::_isnan(data[0]) || at::_isnan(data[1]) || at::_isnan(data[2]) ||
+        at::_isnan(data[3]))
+      assert(0);
+  }
+};
+
+template <typename T>
+struct CheckBytePack<T, /*EltPerPack*/ 8> {
+  static void check(BytePack* tmp) {
+    T* data = (T*)tmp;
+    if (at::_isnan(data[0]) || at::_isnan(data[1]) || at::_isnan(data[2]) ||
+        at::_isnan(data[3]) || at::_isnan(data[4]) || at::_isnan(data[5]) ||
+        at::_isnan(data[6]) || at::_isnan(data[7])) {
+      assert(0);
+    }
+  }
+};
+
+template <typename T>
+struct HasNanFP8x8 {
+  static bool check(uint64_t fp8x8) = delete;
+  /*
+  {
+    // `static_assert` in template definition requires c++23 onwards.
+    // But the error message still applies if you find yourself here.
+    static_assert(
+        false,
+        "You should never call this template definition because it is empty. You "
+        "can follow the example of Float8_e4m3fn below to implement the check for "
+        "your new datatype."
+    );
+  }
+  */
+};
+
+template <>
+struct HasNanFP8x8<c10::Float8_e4m3fn> {
+  static bool check(uint64_t fp8x8) {
+    auto t = fp8x8 & 0x7F7F7F7F7F7F7F7FULL;
+    auto incremented = t + 0x0101010101010101ULL;
+    auto overflow = incremented & 0x8080808080808080ULL;
+    return overflow != 0;
+  }
+};
+
+template <>
+struct HasNanFP8x8<c10::Float8_e5m2> {
+  static bool check(uint64_t fp8x8) {
+    auto t = fp8x8 & 0x7F7F7F7F7F7F7F7FULL;
+    auto incremented = t + 0x0303030303030303ULL;
+    auto overflow = incremented & 0x8080808080808080ULL;
+    return overflow != 0;
+  }
+};
+
+template <typename T>
+struct CheckBytePack<T, /*EltPerPack*/ 16> {
+  static void check(BytePack* tmp) {
+    if (HasNanFP8x8<T>::check(tmp->val[0]) ||
+        HasNanFP8x8<T>::check(tmp->val[1]))
+      assert(0);
+  }
+};
+
+#define UNROLL 8
+
+template <typename T>
+void checkChunk(BytePack* ptr, int nWorkers) {
+  BytePack tmp[UNROLL];
+
+#pragma unroll 8
+  for (int j = 0; j < UNROLL; j++) {
+    tmp[j] = ptr[nWorkers * j];
+  }
+  // Then check each BytePack in the tmp buffer
+#pragma unroll 8
+  for (int j = 0; j < UNROLL; j++) {
+    CheckBytePack<T, sizeof(BytePack) / sizeof(T)>::check(tmp + j);
+  }
+  // Note: we separate the check from the load for efficient loading
+}
+
+// Align address of `ptr` up, to the alignment of `T`
+#define ALIGN_UP(ptr, T) \
+  (((uintptr_t)ptr + sizeof(T) - 1) / sizeof(T) * sizeof(T))
+
+template <typename T>
+struct checkForNaN {
+  void operator()(sycl::nd_item<1> item) const {
+    constexpr int EltPerPack = sizeof(BytePack) / sizeof(T);
+
+    size_t offset = item.get_global_id(0);
+
+    // Align input address up to BytePack in case it is not
+    T* ptrAlign = (T*)ALIGN_UP(data, BytePack);
+    size_t preProcElts =
+        std::min<size_t>(static_cast<size_t>(ptrAlign - data), size);
+
+    size_t size_left = size;
+
+    if (offset < preProcElts) {
+      if (at::_isnan(data[offset]))
+        assert(0);
+    }
+    size_left -= preProcElts;
+
+    BytePack* ptr = (BytePack*)ptrAlign;
+    size_t sizeInBP = size_left * sizeof(T) / sizeof(BytePack);
+    size_t loopSize = item.get_global_range(0) * UNROLL;
+
+    for (; offset + loopSize <= sizeInBP; offset += loopSize) {
+      checkChunk<T>(ptr + offset, item.get_global_range(0));
+    }
+
+    for (; offset < sizeInBP; offset += item.get_global_range(0)) {
+      BytePack tmp = ptr[offset];
+      CheckBytePack<T, EltPerPack>::check(&tmp);
+    }
+
+    if (item.get_local_id(0) < size_left % EltPerPack) {
+      T* tailPtr = (T*)(ptr + sizeInBP);
+      if (at::_isnan(tailPtr[item.get_local_id(0)]))
+        assert(0);
+    }
+  }
+  checkForNaN(T* data, size_t size) : data(data), size(size) {}
+
+ private:
+  T* data;
+  size_t size;
+};
+
+template <typename T>
+void checkfornan_impl_xpu(
+    const at::Tensor& tensor,
+    at::xpu::XPUStream& stream) {
+  // skip check for non float types
+  if (!torch::is_floating_point(tensor)) {
+    return;
+  }
+
+  int64_t maxNumThreadsPerBlock = syclMaxWorkGroupSize<checkForNaN<T>>();
+
+  const size_t numThreadsPerBlock =
+      std::min<size_t>(maxNumThreadsPerBlock, tensor.numel());
+
+  if (!(numThreadsPerBlock > 0)) {
+    return;
+  }
+
+  int64_t numBlocks =
+      (tensor.numel() + numThreadsPerBlock - 1) / numThreadsPerBlock;
+  auto global_range{numBlocks * numThreadsPerBlock};
+  auto local_range{numThreadsPerBlock};
+
+  using Kernel = checkForNaN<T>;
+  auto kfn = Kernel(tensor.data_ptr<T>(), tensor.numel());
+
+  sycl_kernel_submit(global_range, local_range, stream.queue(), kfn);
+}
+
+// CHECK if a Tensor contains NAN in any of its element
+void checkForNan(const at::Tensor& tensor, at::xpu::XPUStream& stream) {
+  AT_DISPATCH_FLOATING_TYPES_AND4(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Float8_e4m3fn,
+      at::ScalarType::Float8_e5m2,
+      tensor.scalar_type(),
+      "checkForNaN_XPU",
+      [&]() { checkfornan_impl_xpu<scalar_t>(tensor, stream); });
+}
+
+} // namespace c10d
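The HasNanFP8x8 specializations use a byte-parallel trick: clear each byte's sign bit, then add a constant chosen so that only NaN encodings carry into bit 7 of their byte (0x01 per byte for e4m3fn, whose only NaN pattern is 0x7F; 0x03 per byte for e5m2, whose NaNs are 0x7D-0x7F while infinity 0x7C must not be flagged). A standalone host-side sketch of the single-byte e4m3fn case, for illustration only (not part of the commit):

#include <cassert>
#include <cstdint>

// Single-byte version of the e4m3fn check done 8 bytes at a time above.
bool byte_is_nan_e4m3fn(uint8_t b) {
  uint8_t t = b & 0x7F;            // clear the sign bit
  uint8_t incremented = t + 0x01;  // only 0x7F carries into bit 7
  return (incremented & 0x80) != 0;
}

int main() {
  assert(byte_is_nan_e4m3fn(0x7F));   // NaN
  assert(byte_is_nan_e4m3fn(0xFF));   // NaN with sign bit set
  assert(!byte_is_nan_e4m3fn(0x7E));  // largest finite e4m3fn value (448)
  assert(!byte_is_nan_e4m3fn(0x00));  // +0
  return 0;
}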

src/xccl/NanCheck_XPU.hpp

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+#pragma once
+
+#ifdef USE_C10D_XCCL
+
+#include <ATen/ATen.h>
+#include <c10/xpu/XPUStream.h>
+
+namespace c10d {
+
+void checkForNan(const at::Tensor& tensor, at::xpu::XPUStream& stream);
+
+} // namespace c10d
+
+#endif // USE_C10D_XCCL
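A hypothetical call-site sketch (not part of this commit): validate_before_collective is a made-up helper showing how the new entry point could be used to screen a tensor before handing it to a collective on the same stream.

#ifdef USE_C10D_XCCL
#include <xccl/NanCheck_XPU.hpp>

// `input` is assumed to live on an XPU device and `stream` to be the stream
// the collective will later run on.
inline void validate_before_collective(
    const at::Tensor& input,
    at::xpu::XPUStream& stream) {
  // Launches the NaN-check kernel on `stream`; the kernel asserts if any
  // element of `input` is NaN.
  c10d::checkForNan(input, stream);
}
#endif // USE_C10D_XCCL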
