
Commit d55dc00

XuehaiPan authored and pytorchmergebot committed
[BE][11/16] fix typos in torch/ (torch/csrc/distributed/) (pytorch#156321)
Pull Request resolved: pytorch#156321
Approved by: https://github.com/jingsh
ghstack dependencies: pytorch#156313, pytorch#156314, pytorch#156315, pytorch#156316, pytorch#156317, pytorch#156319
1 parent 5b210bb commit d55dc00

34 files changed: +58 -57 lines changed

.lintrunner.toml

Lines changed: 0 additions & 1 deletion
@@ -1179,7 +1179,6 @@ exclude_patterns = [
     'torch/utils/**',
     'torch/csrc/jit/**',
     'torch/csrc/jit/[a-o]*/**',
-    'torch/csrc/distributed/**',
 ]
 init_command = [
     'python3',

tools/linter/dictionary.txt

Lines changed: 1 addition & 0 deletions
@@ -24,5 +24,6 @@ rebuilt
 reenable
 reenabled
 requestor
+ser'de
 supercedes
 te

torch/csrc/distributed/autograd/engine/dist_engine.h

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ class BackwardPassCleanupGuard;
 
 // This is a singleton class responsible for running distributed backward
 // passes. This engine relies heavily on the vanilla autograd engine and tries
-// to re-use it as much as possible. This class is mostly responsible for the
+// to reuse it as much as possible. This class is mostly responsible for the
 // distributed aspects of autograd and tries to hook into the autograd engine
 // where convenient.
 

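For context, the distributed autograd engine described in the comment above is driven from Python through torch.distributed.autograd. A minimal sketch, assuming rpc.init_rpc(...) has already been called on each worker and that "worker1" is an illustrative peer name:

    import torch
    import torch.distributed.autograd as dist_autograd
    import torch.distributed.rpc as rpc

    # Assumes the RPC framework is already initialized on every worker.
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        loss = rpc.rpc_sync("worker1", torch.add, args=(t1, t2)).sum()
        # Kicks off the distributed backward pass handled by this engine.
        dist_autograd.backward(context_id, [loss])
        grads = dist_autograd.get_gradients(context_id)
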
torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_req.cpp

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ RpcWithProfilingReq::RpcWithProfilingReq(
       tensors_(std::move(tensors)),
       profilerConfig_(std::move(profilerConfig)),
       profilingKeyId_(profilingKeyId) {
-  TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cant be null");
+  TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc can't be null");
 }
 
 rpc::MessageType RpcWithProfilingReq::wrappedMessageType() const {

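The profiling request above wraps another RPC message; on the Python side this roughly corresponds to running an RPC under the autograd profiler. A rough sketch, assuming RPC is initialized and "worker1" is a hypothetical peer:

    import torch
    import torch.distributed.rpc as rpc

    # Assumes rpc.init_rpc(...) has been called; "worker1" is illustrative.
    with torch.autograd.profiler.profile() as prof:
        fut = rpc.rpc_async("worker1", torch.add, args=(torch.ones(2), torch.ones(2)))
        fut.wait()
    print(prof.key_averages().table())
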
torch/csrc/distributed/c10d/FileStore.cpp

Lines changed: 1 addition & 1 deletion
@@ -323,7 +323,7 @@ FileStore::~FileStore() {
   auto numFinishedWorker = addHelper(cleanupKey_, 1);
   auto refCount = addHelper(refCountKey_, -1);
   // The last worker cleans up the file. If numWorkers was not initialized to
-  // a specific postive value (i.e. meaning that there was not a fixed number
+  // a specific positive value (i.e. meaning that there was not a fixed number
   // of workers), we don't attempt to clean.
   // Clean up the file if number of references is 0.
   if (refCount == 0 && numWorkers_ >= 0 && numFinishedWorker >= numWorkers_) {

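The cleanup logic touched above is visible from the Python FileStore binding: when the store is constructed with a positive worker count, the last store object to be destroyed removes the backing file; with an unknown (negative) count the file is left in place. A minimal sketch, with an illustrative path:

    import torch.distributed as dist

    # Two workers are expected to attach to this file (illustrative path).
    store = dist.FileStore("/tmp/example_filestore", 2)
    store.set("key0", "value0")
    print(store.get("key0"))  # b'value0'
    # Once the second (last) worker's store is destroyed, the file is cleaned up.
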
torch/csrc/distributed/c10d/FlightRecorder.hpp

Lines changed: 1 addition & 1 deletion
@@ -145,7 +145,7 @@ struct FlightRecorder {
   std::optional<c10::time_t> time_discovered_started_;
 
   // timestamp when our CPU threads discovered that the kernel completed.
-  // will always be _after_ it actually complated, and can be the same time
+  // will always be _after_ it actually completed, and can be the same time
   // as the discovery of the start if the watchdog thread is stuck on CUDA
   // APIs
   std::optional<c10::time_t> time_discovered_completed_;

torch/csrc/distributed/c10d/ProcessGroupGloo.cpp

Lines changed: 1 addition & 1 deletion
@@ -965,7 +965,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::allreduce_sparse(
     const AllreduceOptions& opts) {
   // all reduce sparse calls into default allreduce which
   // implemented with all_gathering indices and values
-  // we do ths we do not have a native cuda implementation
+  // we do this we do not have a native cuda implementation
   return allreduce(inputs, opts);
 }
 

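The fallback described in that comment (sparse allreduce built on the default allreduce by gathering indices and values) is what a user hits when allreducing a sparse COO tensor on the gloo backend. A sketch, assuming a gloo process group has already been initialized:

    import torch
    import torch.distributed as dist

    # Assumes dist.init_process_group("gloo", ...) has already run on each rank.
    indices = torch.tensor([[0, 3]])
    values = torch.tensor([1.0, 2.0])
    sparse = torch.sparse_coo_tensor(indices, values, (8,))
    dist.all_reduce(sparse)  # gloo gathers indices/values under the hood
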
torch/csrc/distributed/c10d/ProcessGroupMPI.hpp

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ struct WorkEntry {
 // That is, The process may be multi-threaded, and multiple threads may make
 // MPI calls, but only one at a time: MPI calls are not made concurrently from
 // two distinct threads (all MPI calls are serialized). However, with
-// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
+// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a single process
 // group. In other words, no more than 1 process group can be created globally.
 //
 // If you would like to use multiple ProcessGroupMPI, it requires your MPI

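The single-process-group restriction noted above is what users see when initializing the MPI backend from Python. A sketch, assuming PyTorch was built with MPI support and the script is launched via mpirun:

    import torch
    import torch.distributed as dist

    # Rank and world size come from the MPI launcher, not from environment variables.
    dist.init_process_group(backend="mpi")
    t = torch.ones(4) * (dist.get_rank() + 1)
    dist.all_reduce(t)  # only this one global MPI process group is supported
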
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 16 additions & 16 deletions
@@ -1423,7 +1423,7 @@ void ProcessGroupNCCL::abortCommsFromMap(
 bool ProcessGroupNCCL::abortComms(
     const std::optional<std::string>& abortReason) {
   // Remove record from global ncclCommMemPoolMapMutex before aboarting,
-  // so that a new cache segment would not register to already aborded
+  // so that a new cache segment would not register to already aborted
   // communicators. Note that ncclCommMemPoolMap is a global container which may
   // contain other PG's communicators, thus we need to only erase communicators
   // for the current PG.
@@ -1451,9 +1451,9 @@ void ProcessGroupNCCL::abort() {
   terminateProcessGroup_.store(true);
   watchdog_->notify();
 
-  // lauch abort asynchrounously and wait for it to complete or timeout
+  // launch abort asynchronously and wait for it to complete or timeout
   LOG(INFO) << logPrefix()
-            << "Launching ProcessGroupNCCL abort asynchrounously.";
+            << "Launching ProcessGroupNCCL abort asynchronously.";
   std::future<bool> fut =
       std::async(std::launch::async, [this]() { return this->abortComms(); });
 
@@ -1655,7 +1655,7 @@ std::string ProcessGroupNCCL::HeartbeatMonitor::getNCCLWatchdogTimeoutExitMsg(
 
 void ProcessGroupNCCL::HeartbeatMonitor::setLastWorkListUpdateTime(
     std::chrono::time_point<std::chrono::steady_clock> time) {
-  // We intentially let the race condition to happen but this is ok
+  // We intentionally let the race condition to happen but this is ok
   // as long as we update the time, we know we are making progress.
   lastWorkListUpdateTime_ = time;
 }
@@ -1761,7 +1761,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
       // 1. The current rank is the first to observe a timeout in watchdog.
       // (shouldDump_ was set to true by the watchdog thread).
       // 2. Other ranks detected the timeout and signal the current rank to
-      // dump. In addtion, monitor threads will dump if watchdog threads has no
+      // dump. In addition, monitor threads will dump if watchdog threads has no
       // heartbeat or dumpPipe is not empty.
       if (shouldDump_.load()) {
        errorMsg = getNCCLWatchdogTimeoutErrorMsg("this local rank");
@@ -3030,7 +3030,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
 
   bool useScalableInit = false;
   // (nranks / nroots) == 128 was the default NCCL recommended
-  // accoring to
+  // according to
   // https://github.com/pytorch/pytorch/pull/136789#discussion_r1779171615.
   auto ranksPerRoot = getCvarInt(TORCH_NCCL_RANKS_PER_ROOT, 128);
 #if defined(NCCL_HAS_INIT_RANK_SCALABLE) && defined(NCCL_HAS_CONFIG)
@@ -3327,7 +3327,7 @@ c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> ProcessGroupNCCL::initWork(
     // - initially, moved record() into workEnqueue(), but found that makes it
     // hard to get access to profilingTitle,
     // inputs, and outputs for metadata recording, and we don't want to attach
-    // these objects to the Work becuase it has implications for keeping those
+    // these objects to the Work because it has implications for keeping those
     // tensors alive longer and adds overhead when copying Work objects
     // between threads
     r->trace_id_ = FlightRecorderCUDA::get()->record(
@@ -3442,7 +3442,7 @@ void ProcessGroupNCCL::startCoalescing() {
   // ops from a coalesce group into the flight recorder, we want to have the
   // same seq_ for those ops and its 'endCoalescing' op. Hence we bump during
   // start, which has one minor downside- we burn a seq_ if someone ever does a
-  // 'start' and 'end' coalescing region without doing an operation inbetween.
+  // 'start' and 'end' coalescing region without doing an operation in between.
 
   coalescedDevice_.set_index(-1);
   coalescedComm_ = nullptr;
@@ -3462,7 +3462,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::endCoalescing(OpType optype) {
   }
   TORCH_CHECK(
       coalescedDevice_.index() >= 0,
-      "Somthing went wrong. Did you call end_coalescing before start_coalescing?");
+      "Something went wrong. Did you call end_coalescing before start_coalescing?");
 
   // `coalescedComm_` should have same set of comms across collectives
   auto comm = coalescedComm_;
@@ -3618,7 +3618,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
       device, rank_, opType, false, profilingTitle, inputs, outputs, enqueue);
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -3781,7 +3781,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
   // collective so there is no flight record and we increment seqCollective_ and
   // op_id_ together. Compare this to startCoalescing/endCoalescing flow where
   // we increment either seqP2P_ or seqCollective_ once per group and increment
-  // op_id_ once per indvidual operation within the group
+  // op_id_ once per individual operation within the group
   op_id_++;
 
   const auto key = getKeyFromDevice(device);
@@ -4016,7 +4016,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
   c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> work;
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -4397,7 +4397,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       tensors, // inputTensors
       tensors, // outputTensors
@@ -4694,7 +4694,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
         // User-facing outputTensors should be held by the user until after
         // waiting on work_, or the call makes no sense. We do a stashing here
        // in case user doesn't hold the outputTensors in downstream code,
-        // which can cause an early recyle by the CachingAllocator, which can
+        // which can cause an early recycle by the CachingAllocator, which can
        // lead to segfault or data corruption.
        if (opts.asyncOp) {
          work->stashed_for_allocator_safety_->stash(outputTensors_);
@@ -4742,7 +4742,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors
@@ -4956,7 +4956,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors

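Several of the comments fixed above concern coalesced collectives and the lifetime of tensors passed to async work. From Python, the coalesced allreduce path can be exercised with all_reduce_coalesced (which may warn as deprecated on newer releases), and the allocator-safety note is why outputs should outlive work.wait() for async ops. A sketch, assuming an initialized NCCL process group with one CUDA device per rank:

    import torch
    import torch.distributed as dist

    # Assumes dist.init_process_group("nccl", ...) and one GPU per rank.
    device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
    tensors = [torch.ones(4, device=device) for _ in range(3)]
    work = dist.all_reduce_coalesced(tensors, async_op=True)
    work.wait()  # keep `tensors` alive at least until wait() returns
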
torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

Lines changed: 2 additions & 2 deletions
@@ -1291,7 +1291,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // communication, the key will be "1:2" on both processes. Note: this is for
   // the scenario where there is only 1 GPU per process. When it comes to
   // multiple GPUs per process, this part may need to redesigned.
-  // TODO: we probably need a separte map for P2P comms
+  // TODO: we probably need a separate map for P2P comms
   std::unordered_map<std::string, std::shared_ptr<NCCLComm>> devNCCLCommMap_;
 
   // The NCCL communicators currently in process of being initialized.
@@ -1316,7 +1316,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::atomic<bool> hasPendingHooks_{};
 
   // This is the signal from watchdog threads to indicate whether the monitor
-  // thread should dump. Making it static so that it is accessiable from all the
+  // thread should dump. Making it static so that it is accessible from all the
   // PGs. With this flag, monitor thread would dump debug info under any one of
   // the three conditions:
   //