1 parent 5707c3d · commit 681c66f
torchtitan/distributed/utils.py
@@ -207,11 +207,6 @@ def _get_distributed_backend(job_config):
     os.makedirs(dump_dir, exist_ok=True)
     _warn_overwrite_env(TRACE_FILE, f"{dump_dir}/rank_")

-    # to mitigate the memory issue that collectives using
-    # async_op=True hold memory longer than they should
-    # such as those in tensor parallelism
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-
     torch.distributed.init_process_group(
         backend=_get_distributed_backend(job_config),
         timeout=timedelta(seconds=job_config.comm.init_timeout_seconds),
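The diff above removes the hard-coded TORCH_NCCL_AVOID_RECORD_STREAMS=1 override that torchtitan previously applied before initializing the process group. For readers who still want that behavior, a minimal sketch of restoring it from user code (outside torchtitan) is below; the backend string and timeout are illustrative placeholders rather than values taken from this commit, and the snippet assumes the usual torchrun rendezvous environment (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE) is already set.

    import os
    from datetime import timedelta

    import torch.distributed as dist

    # Opt back in to the removed workaround: avoid recordStream so that
    # async_op=True collectives (e.g. in tensor parallelism) do not hold
    # memory longer than necessary. setdefault keeps any value the user
    # already exported in the shell.
    os.environ.setdefault("TORCH_NCCL_AVOID_RECORD_STREAMS", "1")

    # Illustrative initialization; torchtitan itself derives the backend
    # and timeout from job_config as shown in the diff above.
    dist.init_process_group(
        backend="nccl",
        timeout=timedelta(seconds=600),
    )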