
Commit d55dc00

XuehaiPan authored and pytorchmergebot committed
[BE][11/16] fix typos in torch/ (torch/csrc/distributed/) (pytorch#156321)
Pull Request resolved: pytorch#156321
Approved by: https://github.com/jingsh
ghstack dependencies: pytorch#156313, pytorch#156314, pytorch#156315, pytorch#156316, pytorch#156317, pytorch#156319
1 parent 5b210bb commit d55dc00

34 files changed: +58 -57 lines changed

.lintrunner.toml

Lines changed: 0 additions & 1 deletion
@@ -1179,7 +1179,6 @@ exclude_patterns = [
     'torch/utils/**',
     'torch/csrc/jit/**',
     'torch/csrc/jit/[a-o]*/**',
-    'torch/csrc/distributed/**',
 ]
 init_command = [
     'python3',

tools/linter/dictionary.txt

Lines changed: 1 addition & 0 deletions
@@ -24,5 +24,6 @@ rebuilt
 reenable
 reenabled
 requestor
+ser'de
 supercedes
 te

torch/csrc/distributed/autograd/engine/dist_engine.h

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ class BackwardPassCleanupGuard;
 
 // This is a singleton class responsible for running distributed backward
 // passes. This engine relies heavily on the vanilla autograd engine and tries
-// to re-use it as much as possible. This class is mostly responsible for the
+// to reuse it as much as possible. This class is mostly responsible for the
 // distributed aspects of autograd and tries to hook into the autograd engine
 // where convenient.
 

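For context, the distributed autograd engine described in the comment above is driven from Python through torch.distributed.autograd. A minimal sketch, assuming rpc.init_rpc(...) has already been called on each worker and that "worker1" is an illustrative peer name:

    import torch
    import torch.distributed.autograd as dist_autograd
    import torch.distributed.rpc as rpc

    # Assumes the RPC framework is already initialized on every worker.
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        loss = rpc.rpc_sync("worker1", torch.add, args=(t1, t2)).sum()
        # Kicks off the distributed backward pass handled by this engine.
        dist_autograd.backward(context_id, [loss])
        grads = dist_autograd.get_gradients(context_id)
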
torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_req.cpp

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ RpcWithProfilingReq::RpcWithProfilingReq(
       tensors_(std::move(tensors)),
       profilerConfig_(std::move(profilerConfig)),
       profilingKeyId_(profilingKeyId) {
-  TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cant be null");
+  TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc can't be null");
 }
 
 rpc::MessageType RpcWithProfilingReq::wrappedMessageType() const {

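The profiling request above wraps another RPC message; on the Python side this roughly corresponds to running an RPC under the autograd profiler. A rough sketch, assuming RPC is initialized and "worker1" is a hypothetical peer:

    import torch
    import torch.distributed.rpc as rpc

    # Assumes rpc.init_rpc(...) has been called; "worker1" is illustrative.
    with torch.autograd.profiler.profile() as prof:
        fut = rpc.rpc_async("worker1", torch.add, args=(torch.ones(2), torch.ones(2)))
        fut.wait()
    print(prof.key_averages().table())
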
torch/csrc/distributed/c10d/FileStore.cpp

Lines changed: 1 addition & 1 deletion
@@ -323,7 +323,7 @@ FileStore::~FileStore() {
   auto numFinishedWorker = addHelper(cleanupKey_, 1);
   auto refCount = addHelper(refCountKey_, -1);
   // The last worker cleans up the file. If numWorkers was not initialized to
-  // a specific postive value (i.e. meaning that there was not a fixed number
+  // a specific positive value (i.e. meaning that there was not a fixed number
   // of workers), we don't attempt to clean.
   // Clean up the file if number of references is 0.
   if (refCount == 0 && numWorkers_ >= 0 && numFinishedWorker >= numWorkers_) {

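The cleanup logic touched above is visible from the Python FileStore binding: when the store is constructed with a positive worker count, the last store object to be destroyed removes the backing file; with an unknown (negative) count the file is left in place. A minimal sketch, with an illustrative path:

    import torch.distributed as dist

    # Two workers are expected to attach to this file (illustrative path).
    store = dist.FileStore("/tmp/example_filestore", 2)
    store.set("key0", "value0")
    print(store.get("key0"))  # b'value0'
    # Once the second (last) worker's store is destroyed, the file is cleaned up.
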
torch/csrc/distributed/c10d/FlightRecorder.hpp

Lines changed: 1 addition & 1 deletion
@@ -145,7 +145,7 @@ struct FlightRecorder {
   std::optional<c10::time_t> time_discovered_started_;
 
   // timestamp when our CPU threads discovered that the kernel completed.
-  // will always be _after_ it actually complated, and can be the same time
+  // will always be _after_ it actually completed, and can be the same time
   // as the discovery of the start if the watchdog thread is stuck on CUDA
   // APIs
   std::optional<c10::time_t> time_discovered_completed_;

torch/csrc/distributed/c10d/ProcessGroupGloo.cpp

Lines changed: 1 addition & 1 deletion
@@ -965,7 +965,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::allreduce_sparse(
     const AllreduceOptions& opts) {
   // all reduce sparse calls into default allreduce which
   // implemented with all_gathering indices and values
-  // we do ths we do not have a native cuda implementation
+  // we do this we do not have a native cuda implementation
   return allreduce(inputs, opts);
 }
 

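The fallback described in that comment (sparse allreduce built on the default allreduce by gathering indices and values) is what a user hits when allreducing a sparse COO tensor on the gloo backend. A sketch, assuming a gloo process group has already been initialized:

    import torch
    import torch.distributed as dist

    # Assumes dist.init_process_group("gloo", ...) has already run on each rank.
    indices = torch.tensor([[0, 3]])
    values = torch.tensor([1.0, 2.0])
    sparse = torch.sparse_coo_tensor(indices, values, (8,))
    dist.all_reduce(sparse)  # gloo gathers indices/values under the hood
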
torch/csrc/distributed/c10d/ProcessGroupMPI.hpp

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ struct WorkEntry {
 // That is, The process may be multi-threaded, and multiple threads may make
 // MPI calls, but only one at a time: MPI calls are not made concurrently from
 // two distinct threads (all MPI calls are serialized). However, with
-// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
+// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a single process
 // group. In other words, no more than 1 process group can be created globally.
 //
 // If you would like to use multiple ProcessGroupMPI, it requires your MPI

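The single-process-group restriction noted above is what users see when initializing the MPI backend from Python. A sketch, assuming PyTorch was built with MPI support and the script is launched via mpirun:

    import torch
    import torch.distributed as dist

    # Rank and world size come from the MPI launcher, not from environment variables.
    dist.init_process_group(backend="mpi")
    t = torch.ones(4) * (dist.get_rank() + 1)
    dist.all_reduce(t)  # only this one global MPI process group is supported
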
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 16 additions & 16 deletions
@@ -1423,7 +1423,7 @@ void ProcessGroupNCCL::abortCommsFromMap(
 bool ProcessGroupNCCL::abortComms(
     const std::optional<std::string>& abortReason) {
   // Remove record from global ncclCommMemPoolMapMutex before aboarting,
-  // so that a new cache segment would not register to already aborded
+  // so that a new cache segment would not register to already aborted
   // communicators. Note that ncclCommMemPoolMap is a global container which may
   // contain other PG's communicators, thus we need to only erase communicators
   // for the current PG.
@@ -1451,9 +1451,9 @@ void ProcessGroupNCCL::abort() {
   terminateProcessGroup_.store(true);
   watchdog_->notify();
 
-  // lauch abort asynchrounously and wait for it to complete or timeout
+  // launch abort asynchronously and wait for it to complete or timeout
   LOG(INFO) << logPrefix()
-            << "Launching ProcessGroupNCCL abort asynchrounously.";
+            << "Launching ProcessGroupNCCL abort asynchronously.";
   std::future<bool> fut =
       std::async(std::launch::async, [this]() { return this->abortComms(); });
 
@@ -1655,7 +1655,7 @@ std::string ProcessGroupNCCL::HeartbeatMonitor::getNCCLWatchdogTimeoutExitMsg(
 
 void ProcessGroupNCCL::HeartbeatMonitor::setLastWorkListUpdateTime(
     std::chrono::time_point<std::chrono::steady_clock> time) {
-  // We intentially let the race condition to happen but this is ok
+  // We intentionally let the race condition to happen but this is ok
   // as long as we update the time, we know we are making progress.
   lastWorkListUpdateTime_ = time;
 }
@@ -1761,7 +1761,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
       // 1. The current rank is the first to observe a timeout in watchdog.
       // (shouldDump_ was set to true by the watchdog thread).
       // 2. Other ranks detected the timeout and signal the current rank to
-      // dump. In addtion, monitor threads will dump if watchdog threads has no
+      // dump. In addition, monitor threads will dump if watchdog threads has no
       // heartbeat or dumpPipe is not empty.
       if (shouldDump_.load()) {
        errorMsg = getNCCLWatchdogTimeoutErrorMsg("this local rank");
@@ -3030,7 +3030,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
 
   bool useScalableInit = false;
   // (nranks / nroots) == 128 was the default NCCL recommended
-  // accoring to
+  // according to
   // https://github.com/pytorch/pytorch/pull/136789#discussion_r1779171615.
   auto ranksPerRoot = getCvarInt(TORCH_NCCL_RANKS_PER_ROOT, 128);
 #if defined(NCCL_HAS_INIT_RANK_SCALABLE) && defined(NCCL_HAS_CONFIG)
@@ -3327,7 +3327,7 @@ c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> ProcessGroupNCCL::initWork(
     // - initially, moved record() into workEnqueue(), but found that makes it
     // hard to get access to profilingTitle,
     // inputs, and outputs for metadata recording, and we don't want to attach
-    // these objects to the Work becuase it has implications for keeping those
+    // these objects to the Work because it has implications for keeping those
     // tensors alive longer and adds overhead when copying Work objects
     // between threads
     r->trace_id_ = FlightRecorderCUDA::get()->record(
@@ -3442,7 +3442,7 @@ void ProcessGroupNCCL::startCoalescing() {
   // ops from a coalesce group into the flight recorder, we want to have the
   // same seq_ for those ops and its 'endCoalescing' op. Hence we bump during
   // start, which has one minor downside- we burn a seq_ if someone ever does a
-  // 'start' and 'end' coalescing region without doing an operation inbetween.
+  // 'start' and 'end' coalescing region without doing an operation in between.
 
   coalescedDevice_.set_index(-1);
   coalescedComm_ = nullptr;
@@ -3462,7 +3462,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::endCoalescing(OpType optype) {
   }
   TORCH_CHECK(
       coalescedDevice_.index() >= 0,
-      "Somthing went wrong. Did you call end_coalescing before start_coalescing?");
+      "Something went wrong. Did you call end_coalescing before start_coalescing?");
 
   // `coalescedComm_` should have same set of comms across collectives
   auto comm = coalescedComm_;
@@ -3618,7 +3618,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
       device, rank_, opType, false, profilingTitle, inputs, outputs, enqueue);
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -3781,7 +3781,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
   // collective so there is no flight record and we increment seqCollective_ and
   // op_id_ together. Compare this to startCoalescing/endCoalescing flow where
   // we increment either seqP2P_ or seqCollective_ once per group and increment
-  // op_id_ once per indvidual operation within the group
+  // op_id_ once per individual operation within the group
   op_id_++;
 
   const auto key = getKeyFromDevice(device);
@@ -4016,7 +4016,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
   c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> work;
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -4397,7 +4397,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       tensors, // inputTensors
       tensors, // outputTensors
@@ -4694,7 +4694,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
         // User-facing outputTensors should be held by the user until after
         // waiting on work_, or the call makes no sense. We do a stashing here
        // in case user doesn't hold the outputTensors in downstream code,
-        // which can cause an early recyle by the CachingAllocator, which can
+        // which can cause an early recycle by the CachingAllocator, which can
        // lead to segfault or data corruption.
        if (opts.asyncOp) {
          work->stashed_for_allocator_safety_->stash(outputTensors_);
@@ -4742,7 +4742,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors
@@ -4956,7 +4956,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       std::make_tuple(
           static_cast<int64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-                  // in coalesed range
+                  // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors

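Several of the comments fixed above concern coalesced collectives and the lifetime of tensors passed to async work. From Python, the coalesced allreduce path can be exercised with all_reduce_coalesced (which may warn as deprecated on newer releases), and the allocator-safety note is why outputs should outlive work.wait() for async ops. A sketch, assuming an initialized NCCL process group with one CUDA device per rank:

    import torch
    import torch.distributed as dist

    # Assumes dist.init_process_group("nccl", ...) and one GPU per rank.
    device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
    tensors = [torch.ones(4, device=device) for _ in range(3)]
    work = dist.all_reduce_coalesced(tensors, async_op=True)
    work.wait()  # keep `tensors` alive at least until wait() returns
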
torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

Lines changed: 2 additions & 2 deletions
@@ -1291,7 +1291,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // communication, the key will be "1:2" on both processes. Note: this is for
   // the scenario where there is only 1 GPU per process. When it comes to
   // multiple GPUs per process, this part may need to redesigned.
-  // TODO: we probably need a separte map for P2P comms
+  // TODO: we probably need a separate map for P2P comms
   std::unordered_map<std::string, std::shared_ptr<NCCLComm>> devNCCLCommMap_;
 
   // The NCCL communicators currently in process of being initialized.
@@ -1316,7 +1316,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::atomic<bool> hasPendingHooks_{};
 
   // This is the signal from watchdog threads to indicate whether the monitor
-  // thread should dump. Making it static so that it is accessiable from all the
+  // thread should dump. Making it static so that it is accessible from all the
   // PGs. With this flag, monitor thread would dump debug info under any one of
   // the three conditions:
   //