@@ -1423,7 +1423,7 @@ void ProcessGroupNCCL::abortCommsFromMap(
 bool ProcessGroupNCCL::abortComms(
     const std::optional<std::string>& abortReason) {
   // Remove record from global ncclCommMemPoolMapMutex before aborting,
-  // so that a new cache segment would not register to already aborded
+  // so that a new cache segment would not register to already aborted
   // communicators. Note that ncclCommMemPoolMap is a global container which may
   // contain other PG's communicators, thus we need to only erase communicators
   // for the current PG.
@@ -1451,9 +1451,9 @@ void ProcessGroupNCCL::abort() {
   terminateProcessGroup_.store(true);
   watchdog_->notify();

-  // lauch abort asynchrounously and wait for it to complete or timeout
+  // launch abort asynchronously and wait for it to complete or timeout
   LOG(INFO) << logPrefix()
-            << "Launching ProcessGroupNCCL abort asynchrounously.";
+            << "Launching ProcessGroupNCCL abort asynchronously.";
   std::future<bool> fut =
       std::async(std::launch::async, [this]() { return this->abortComms(); });
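For context, the "launch asynchronously and wait for it to complete or timeout" pattern in this hunk is plain `std::async` plus `future::wait_for`. A minimal self-contained sketch; the timeout value and the teardown body are stand-ins, not the real members:

```cpp
#include <chrono>
#include <future>
#include <iostream>

// Stand-in for the real communicator teardown.
bool abortComms() {
  return true;
}

int main() {
  // Launch the abort on a separate thread so the caller can bound the wait.
  std::future<bool> fut =
      std::async(std::launch::async, [] { return abortComms(); });

  // Hypothetical timeout; the real value comes from the process group's options.
  const auto waitTimeout = std::chrono::seconds(10);
  if (fut.wait_for(waitTimeout) == std::future_status::timeout) {
    std::cout << "abort timed out\n";
  } else {
    std::cout << "abort returned " << std::boolalpha << fut.get() << '\n';
  }
}
```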
@@ -1655,7 +1655,7 @@ std::string ProcessGroupNCCL::HeartbeatMonitor::getNCCLWatchdogTimeoutExitMsg(

 void ProcessGroupNCCL::HeartbeatMonitor::setLastWorkListUpdateTime(
     std::chrono::time_point<std::chrono::steady_clock> time) {
-  // We intentially let the race condition to happen but this is ok
+  // We intentionally let the race condition happen, but this is ok:
   // as long as we update the time, we know we are making progress.
   lastWorkListUpdateTime_ = time;
 }
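The tolerated race is an unsynchronized write to the timestamp; any sufficiently fresh value proves progress. If one wanted the update race-free, storing the tick count in an atomic is one option. A sketch under that assumption, with illustrative names rather than the real members:

```cpp
#include <atomic>
#include <chrono>
#include <cstdint>

// Illustrative stand-in for lastWorkListUpdateTime_: an atomic nanosecond
// count instead of a raw time_point, so concurrent writers never race.
std::atomic<int64_t> lastUpdateNs{0};

void setLastWorkListUpdateTime(
    std::chrono::time_point<std::chrono::steady_clock> t) {
  // Relaxed ordering suffices: readers only need "some recent value"
  // to conclude that the watchdog is making progress.
  lastUpdateNs.store(t.time_since_epoch().count(), std::memory_order_relaxed);
}

int main() {
  setLastWorkListUpdateTime(std::chrono::steady_clock::now());
}
```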
@@ -1761,7 +1761,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
       // 1. The current rank is the first to observe a timeout in watchdog.
       // (shouldDump_ was set to true by the watchdog thread).
       // 2. Other ranks detected the timeout and signal the current rank to
-      // dump. In addtion, monitor threads will dump if watchdog threads has no
+      // dump. In addition, the monitor thread will dump if the watchdog thread has no
       // heartbeat or dumpPipe is not empty.
       if (shouldDump_.load()) {
         errorMsg = getNCCLWatchdogTimeoutErrorMsg("this local rank");
@@ -3030,7 +3030,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(

   bool useScalableInit = false;
   // (nranks / nroots) == 128 was the default NCCL recommended
-  // accoring to
+  // according to
   // https://github.com/pytorch/pytorch/pull/136789#discussion_r1779171615.
   auto ranksPerRoot = getCvarInt(TORCH_NCCL_RANKS_PER_ROOT, 128);
 #if defined(NCCL_HAS_INIT_RANK_SCALABLE) && defined(NCCL_HAS_CONFIG)
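A rough sketch of the environment-variable lookup assumed above: parse the variable if set, else fall back to the default of 128. The real `getCvarInt` has a richer signature, and the `> 1` root heuristic here is my assumption from the surrounding comment, not verified against the file:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Illustrative only; not the real getCvarInt.
int getCvarIntSketch(const char* name, int deflt) {
  const char* v = std::getenv(name);
  return (v != nullptr) ? std::stoi(std::string(v)) : deflt;
}

int main() {
  int ranksPerRoot = getCvarIntSketch("TORCH_NCCL_RANKS_PER_ROOT", 128);
  int nranks = 1024;  // example world size
  // Assumed heuristic: more than one root enables scalable init.
  bool useScalableInit = (nranks / ranksPerRoot) > 1;
  std::cout << "ranksPerRoot=" << ranksPerRoot
            << " useScalableInit=" << std::boolalpha << useScalableInit << '\n';
}
```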
@@ -3327,7 +3327,7 @@ c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> ProcessGroupNCCL::initWork(
   // - initially, moved record() into workEnqueue(), but found that makes it
   // hard to get access to profilingTitle,
   // inputs, and outputs for metadata recording, and we don't want to attach
-  // these objects to the Work becuase it has implications for keeping those
+  // these objects to the Work because it has implications for keeping those
   // tensors alive longer and adds overhead when copying Work objects
   // between threads
   r->trace_id_ = FlightRecorderCUDA::get()->record(
@@ -3442,7 +3442,7 @@ void ProcessGroupNCCL::startCoalescing() {
   // ops from a coalesce group into the flight recorder, we want to have the
   // same seq_ for those ops and its 'endCoalescing' op. Hence we bump during
   // start, which has one minor downside- we burn a seq_ if someone ever does a
-  // 'start' and 'end' coalescing region without doing an operation inbetween.
+  // 'start' and 'end' coalescing region without doing an operation in between.

   coalescedDevice_.set_index(-1);
   coalescedComm_ = nullptr;
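The bump-at-start bookkeeping that comment describes can be modeled with a toy counter; all names here are illustrative, not the real `seq_` machinery:

```cpp
#include <cassert>
#include <cstdint>

// Toy model: bumping at start() gives every op in the coalescing region, and
// the closing end() op, the same sequence number. The cost is one burned
// number for an empty start/end pair, exactly the downside the comment notes.
struct CoalescingSeq {
  uint64_t seq = 0;

  uint64_t start() { return ++seq; }   // bump once per group
  uint64_t recordOp() { return seq; }  // each coalesced op shares it
  uint64_t end() { return seq; }       // the end op matches the group
};

int main() {
  CoalescingSeq s;
  s.start();                  // group 1 gets seq 1
  assert(s.recordOp() == 1);  // ops in the region share seq 1
  assert(s.end() == 1);       // so does the end op
  s.start();
  s.end();                    // empty region: seq 2 is burned
  assert(s.start() == 3);     // the next real group gets seq 3
}
```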
@@ -3462,7 +3462,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::endCoalescing(OpType optype) {
   }
   TORCH_CHECK(
       coalescedDevice_.index() >= 0,
-      "Somthing went wrong. Did you call end_coalescing before start_coalescing?");
+      "Something went wrong. Did you call end_coalescing before start_coalescing?");

   // `coalescedComm_` should have same set of comms across collectives
   auto comm = coalescedComm_;
@@ -3618,7 +3618,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
       device, rank_, opType, false, profilingTitle, inputs, outputs, enqueue);
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -3781,7 +3781,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
   // collective so there is no flight record and we increment seqCollective_ and
   // op_id_ together. Compare this to startCoalescing/endCoalescing flow where
   // we increment either seqP2P_ or seqCollective_ once per group and increment
-  // op_id_ once per indvidual operation within the group
+  // op_id_ once per individual operation within the group
   op_id_++;

   const auto key = getKeyFromDevice(device);
@@ -4016,7 +4016,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
   c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> work;
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -4397,7 +4397,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       tensors, // inputTensors
       tensors, // outputTensors
@@ -4694,7 +4694,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
       // User-facing outputTensors should be held by the user until after
       // waiting on work_, or the call makes no sense. We do a stashing here
       // in case user doesn't hold the outputTensors in downstream code,
-      // which can cause an early recyle by the CachingAllocator, which can
+      // which can cause an early recycle by the CachingAllocator, which can
      // lead to segfault or data corruption.
      if (opts.asyncOp) {
        work->stashed_for_allocator_safety_->stash(outputTensors_);
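The stashing described here reduces to lifetime extension: the Work holds a reference to the user's output tensors until the collective finishes, so their storage cannot be recycled mid-flight. A toy sketch; `TensorStash` and the `Tensor` stand-in are illustrative, while the real code stashes refcounted `at::Tensor` objects on `WorkNCCL`:

```cpp
#include <vector>

struct Tensor {};  // stand-in for at::Tensor

// Toy version of the stash: holding copies of the tensors keeps their
// storage alive, so the caching allocator cannot recycle it while the
// async collective is still writing.
struct TensorStash {
  std::vector<Tensor> held;

  void stash(const std::vector<Tensor>& ts) {
    held.insert(held.end(), ts.begin(), ts.end());  // extend lifetimes
  }
  void release() { held.clear(); }  // safe once the work has completed
};

int main() {
  TensorStash stash;
  std::vector<Tensor> outputs(4);
  stash.stash(outputs);  // the work now co-owns the outputs' lifetime
  // ... async collective runs, user calls wait() ...
  stash.release();       // drop the extra references afterwards
}
```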
@@ -4742,7 +4742,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors
@@ -4956,7 +4956,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors