Merged
Changes from 6 commits
1 change: 0 additions & 1 deletion .github/workflows/ci-tests-pytorch.yml
@@ -133,7 +133,6 @@ jobs:
run: |
uv pip install ".[${EXTRA_PREFIX}extra,${EXTRA_PREFIX}test,${EXTRA_PREFIX}strategies]" \
--upgrade \
- -r requirements/_integrations/accelerators.txt \
--find-links="${TORCH_URL}" \
--find-links="https://download.pytorch.org/whl/torch-tensorrt"
uv pip list
8 changes: 0 additions & 8 deletions docs/source-pytorch/common/index.rst
@@ -16,7 +16,6 @@
Save memory with half-precision <precision>
../advanced/model_parallel
Train on single or multiple GPUs <../accelerators/gpu>
- Train on single or multiple HPUs <../integrations/hpu/index>
Train on single or multiple TPUs <../accelerators/tpu>
Train on MPS <../accelerators/mps>
Use a pretrained model <../advanced/pretrained>
@@ -161,13 +160,6 @@ How-to Guides
:col_css: col-md-4
:height: 180

- .. displayitem::
- :header: Train on single or multiple HPUs
- :description: Train models faster with HPU accelerators
- :button_link: ../integrations/hpu/index.html
- :col_css: col-md-4
- :height: 180
-
.. displayitem::
:header: Train on single or multiple TPUs
:description: Train models faster with TPU accelerators
7 changes: 0 additions & 7 deletions docs/source-pytorch/common_usecases.rst
@@ -126,13 +126,6 @@ Customize and extend Lightning for things like custom hardware or distributed st
:button_link: accelerators/gpu.html
:height: 100

- .. displayitem::
- :header: Train on single or multiple HPUs
- :description: Train models faster with HPUs.
- :col_css: col-md-12
- :button_link: integrations/hpu/index.html
- :height: 100
-
.. displayitem::
:header: Train on single or multiple TPUs
:description: Train models faster with TPUs.
20 changes: 0 additions & 20 deletions docs/source-pytorch/conf.py
@@ -86,20 +86,6 @@ def _load_py_module(name: str, location: str) -> ModuleType:
os.path.join(_PATH_HERE, _FOLDER_GENERATED, "CHANGELOG.md"),
)

- # Copy Accelerator docs
- assist_local.AssistantCLI.pull_docs_files(
- gh_user_repo="Lightning-AI/lightning-Habana",
- target_dir="docs/source-pytorch/integrations/hpu",
- # checkout="refs/tags/1.6.0",
- checkout="5549fa927d5501d31aac0c9b2ed479be62a02cbc",
- )
- # the HPU also need some images
- URL_RAW_DOCS_HABANA = "https://raw.githubusercontent.com/Lightning-AI/lightning-Habana/1.5.0/docs/source"
- for img in ["_images/HPUProfiler.png", "_images/IGP.png"]:
- img_ = os.path.join(_PATH_HERE, "integrations", "hpu", img)
- os.makedirs(os.path.dirname(img_), exist_ok=True)
- urllib.request.urlretrieve(f"{URL_RAW_DOCS_HABANA}/{img}", img_)
-
# Copy strategies docs as single pages
assist_local.AssistantCLI.pull_docs_files(
gh_user_repo="Lightning-Universe/lightning-Hivemind",
@@ -360,7 +346,6 @@ def _load_py_module(name: str, location: str) -> ModuleType:
"numpy": ("https://numpy.org/doc/stable/", None),
"PIL": ("https://pillow.readthedocs.io/en/stable/", None),
"torchmetrics": ("https://lightning.ai/docs/torchmetrics/stable/", None),
"lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None),
"tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None),
# needed for referencing Fabric from lightning scope
"lightning.fabric": ("https://lightning.ai/docs/fabric/stable/", None),
@@ -468,10 +453,6 @@ def _load_py_module(name: str, location: str) -> ModuleType:
("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfigType"),
("py:class", "lightning.pytorch.utilities.types.OptimizerConfig"),
("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfig"),
("py:class", "lightning_habana.pytorch.plugins.precision.HPUPrecisionPlugin"),
("py:class", "lightning_habana.pytorch.strategies.HPUDDPStrategy"),
("py:class", "lightning_habana.pytorch.strategies.HPUParallelStrategy"),
("py:class", "lightning_habana.pytorch.strategies.SingleHPUStrategy"),
("py:obj", "logger.experiment"),
("py:class", "mlflow.tracking.MlflowClient"),
("py:attr", "model"),
@@ -648,7 +629,6 @@ def package_list_from_file(file):
r"^../common/trainer.html#trainer-flags$",
"https://medium.com/pytorch-lightning/quick-contribution-guide-86d977171b3a",
"https://deepgenerativemodels.github.io/assets/slides/cs236_lecture11.pdf",
"https://developer.habana.ai", # returns 403 error but redirects to intel.com documentation
"https://www.supermicro.com", # returns 403 error
"https://www.intel.com/content/www/us/en/products/docs/processors/what-is-a-gpu.html",
"https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/", # noqa: E501
12 changes: 2 additions & 10 deletions docs/source-pytorch/expertise_levels.rst
@@ -190,23 +190,15 @@ Configure all aspects of Lightning for advanced usecases.
:tag: advanced

- .. displayitem::
- :header: Level 18: Explore HPUs
- :description: Explore Havana Gaudi Processing Unit (HPU) for model scaling.
- :col_css: col-md-6
- :button_link: levels/advanced_level_19.html
- :height: 150
- :tag: advanced
-
.. displayitem::
- :header: Level 19: Master TPUs
+ :header: Level 18: Master TPUs
:description: Master TPUs and run on cloud TPUs.
:col_css: col-md-6
:button_link: levels/advanced_level_20.html
:height: 150
:tag: advanced

.. displayitem::
- :header: Level 20: Train models with billions of parameters
+ :header: Level 19: Train models with billions of parameters
:description: Scale GPU training to models with billions of parameters
:col_css: col-md-6
:button_link: levels/advanced_level_21.html
1 change: 0 additions & 1 deletion docs/source-pytorch/extensions/accelerator.rst
@@ -10,7 +10,6 @@ Currently there are accelerators for:
- CPU
- :doc:`GPU <../accelerators/gpu>`
- :doc:`TPU <../accelerators/tpu>`
- - :doc:`HPU <../integrations/hpu/index>`
- :doc:`MPS <../accelerators/mps>`

The Accelerator is part of the Strategy which manages communication across multiple devices (distributed communication).
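For context only (not part of this diff): accelerators are selected by name through the Trainer. A minimal, illustrative sketch; the device counts below are placeholders.

```python
from lightning.pytorch import Trainer

# Pick the hardware backend by name; "auto" lets Lightning detect what is available.
trainer_gpu = Trainer(accelerator="gpu", devices=2)
trainer_tpu = Trainer(accelerator="tpu", devices=8)
trainer_auto = Trainer(accelerator="auto", devices="auto")
```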
6 changes: 0 additions & 6 deletions docs/source-pytorch/extensions/strategy.rst
@@ -78,12 +78,6 @@ The below table lists all relevant strategies available in Lightning with their
* - deepspeed
- :class:`~lightning.pytorch.strategies.DeepSpeedStrategy`
- Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. :doc:`Learn more. <../advanced/model_parallel/deepspeed>`
- * - hpu_parallel
- - ``HPUParallelStrategy``
- - Strategy for distributed training on multiple HPU devices. :doc:`Learn more. <../integrations/hpu/index>`
- * - hpu_single
- - ``SingleHPUStrategy``
- - Strategy for training on a single HPU device. :doc:`Learn more. <../integrations/hpu/index>`
* - xla
- :class:`~lightning.pytorch.strategies.XLAStrategy`
- Strategy for training on multiple TPU devices using the :func:`torch_xla.distributed.xla_multiprocessing.spawn` method. :doc:`Learn more. <../accelerators/tpu>`
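For context only (not part of this diff): the registered names in the table above are what the Trainer's `strategy` flag accepts. A minimal sketch; the device counts are arbitrary placeholders.

```python
from lightning.pytorch import Trainer
from lightning.pytorch.strategies import DDPStrategy

# Select a strategy by its registered name ...
trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed")

# ... or pass a strategy object directly for finer-grained control.
trainer = Trainer(accelerator="gpu", devices=4, strategy=DDPStrategy(find_unused_parameters=False))
```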
8 changes: 0 additions & 8 deletions docs/source-pytorch/glossary/index.rst
@@ -21,7 +21,6 @@
GPU <../accelerators/gpu>
Half precision <../common/precision>
Hooks <../common/hooks>
- HPU <../integrations/hpu/index>
Inference <../deploy/production_intermediate>
Lightning CLI <../cli/lightning_cli>
LightningDataModule <../data/datamodule>
@@ -187,13 +186,6 @@ Glossary
:button_link: ../common/hooks.html
:height: 100

- .. displayitem::
- :header: HPU
- :description: Habana Gaudi AI Processor Unit for faster training
- :col_css: col-md-12
- :button_link: ../integrations/hpu/index.html
- :height: 100
-
.. displayitem::
:header: Inference
:description: Making predictions by applying a trained model to unlabeled examples
40 changes: 0 additions & 40 deletions docs/source-pytorch/integrations/hpu/index.rst

This file was deleted.

12 changes: 2 additions & 10 deletions docs/source-pytorch/levels/advanced.rst
@@ -46,23 +46,15 @@ Configure all aspects of Lightning for advanced usecases.
:tag: advanced

- .. displayitem::
- :header: Level 18: Explore HPUs
- :description: Explore Habana Gaudi Processing Unit (HPU) for model scaling.
- :col_css: col-md-6
- :button_link: advanced_level_19.html
- :height: 150
- :tag: advanced
-
.. displayitem::
- :header: Level 19: Master TPUs
+ :header: Level 18: Master TPUs
:description: Master TPUs and run on cloud TPUs.
:col_css: col-md-6
:button_link: advanced_level_20.html
:height: 150
:tag: advanced

.. displayitem::
- :header: Level 20: Train models with billions of parameters
+ :header: Level 19: Train models with billions of parameters
:description: Scale GPU training to models with billions of parameters
:col_css: col-md-6
:button_link: advanced_level_21.html
37 changes: 0 additions & 37 deletions docs/source-pytorch/levels/advanced_level_19.rst

This file was deleted.

2 changes: 1 addition & 1 deletion docs/source-pytorch/levels/advanced_level_20.rst
@@ -1,7 +1,7 @@
:orphan:

#####################
- Level 19: Master TPUs
+ Level 18: Master TPUs
#####################

Master cloud TPU training with profiling and scaling techniques.
2 changes: 1 addition & 1 deletion docs/source-pytorch/levels/advanced_level_21.rst
@@ -1,7 +1,7 @@
:orphan:

##################################################
- Level 20: Train models with billions of parameters
+ Level 19: Train models with billions of parameters
##################################################

Scale to billions of parameters with multiple distributed strategies.
73 changes: 0 additions & 73 deletions examples/pytorch/hpu/mnist_sample.py

This file was deleted.

2 changes: 0 additions & 2 deletions examples/pytorch/hpu/ops_bf16_mnist.txt

This file was deleted.

1 change: 0 additions & 1 deletion examples/pytorch/hpu/ops_fp32_mnist.txt

This file was deleted.

2 changes: 0 additions & 2 deletions requirements/_integrations/accelerators.txt

This file was deleted.

15 changes: 0 additions & 15 deletions src/lightning/fabric/utilities/distributed.py
@@ -12,7 +12,6 @@

import torch
import torch.nn.functional as F
- from lightning_utilities.core.imports import package_available
from torch import Tensor
from torch.utils.data import Dataset, DistributedSampler, Sampler
from typing_extensions import Self, TypeGuard, override
@@ -210,20 +209,6 @@ def _sync_ddp(result: Tensor, group: Optional[Any] = None, reduce_op: Optional[U
else:
op = reduce_op

- # HPU doesn't support Long types, forcefully set it to float
- # TODO: move this to the `lightning_habana` package
- if (
- package_available("habana_frameworks")
- and os.environ.get("HCCL_DISTRIBUTED_BACKEND") == "1"
- and result.type()
- in (
- "torch.LongTensor",
- "torch.hpu.LongTensor",
- )
- ):
- rank_zero_info("Long tensor unsupported on HPU, casting to float")
- result = result.float()
-
# Sync all processes before reduction
torch.distributed.barrier(group=group)
torch.distributed.all_reduce(result, op=op, group=group, async_op=False)
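The deleted branch above cast Long tensors to float before the collective because the HPU backend cannot all-reduce 64-bit integers. A minimal sketch of that pattern as it could live in an external plugin; the helper name and flag below are hypothetical, not Lightning or lightning-habana API.

```python
import torch
from torch import Tensor


def _cast_long_for_reduce(result: Tensor, backend_lacks_int64: bool) -> Tensor:
    """Cast Long tensors to float on backends that cannot all-reduce int64 (hypothetical helper)."""
    if backend_lacks_int64 and result.dtype == torch.long:
        return result.float()
    return result


# Sketched usage around the collective shown above:
# result = _cast_long_for_reduce(result, backend_lacks_int64=is_hpu_backend)
# torch.distributed.all_reduce(result, op=op, group=group, async_op=False)
```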