bump: try deepspeed >=0.14.1,<=0.15.0 #21076

Merged
merged 22 commits into from Aug 19, 2025

Changes from all commits
27 changes: 0 additions & 27 deletions .actions/assistant.py
@@ -341,33 +341,6 @@ def create_mirror_package(source_dir: str, package_mapping: dict[str, str]) -> N


class AssistantCLI:
    @staticmethod
    def requirements_prune_pkgs(packages: Sequence[str], req_files: Sequence[str] = REQUIREMENT_FILES_ALL) -> None:
        """Remove some packages from given requirement files."""
        if isinstance(req_files, str):
            req_files = [req_files]
        for req in req_files:
            AssistantCLI._prune_packages(req, packages)

    @staticmethod
    def _prune_packages(req_file: str, packages: Sequence[str]) -> None:
        """Remove some packages from given requirement files."""
        path = Path(req_file)
        assert path.exists()
        text = path.read_text()
        lines = text.splitlines()
        final = []
        for line in lines:
            ln_ = line.strip()
            if not ln_ or ln_.startswith("#"):
                final.append(line)
                continue
            req = list(_parse_requirements([ln_]))[0]
            if req.name not in packages:
                final.append(line)
        print(final)
        path.write_text("\n".join(final) + "\n")

    @staticmethod
    def copy_replace_imports(
        source_dir: str,
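The helper deleted above is superseded by the lightning-utilities CLI (python -m lightning_utilities.cli requirements prune-pkgs ...), which the CI changes below switch to. For reference, a minimal standalone sketch of the same pruning behavior; the function name prune_packages is hypothetical, and it uses packaging's requirement parser in place of the repo-internal _parse_requirements:

    from pathlib import Path
    from typing import Sequence

    from packaging.requirements import Requirement


    def prune_packages(req_file: str, packages: Sequence[str]) -> None:  # hypothetical name
        """Drop the named packages from a requirements file, keeping comments and blank lines."""
        path = Path(req_file)
        kept = []
        for line in path.read_text().splitlines():
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                kept.append(line)  # preserve blanks and pure comments verbatim
                continue
            # strip a trailing "# comment" before parsing 'pkg >=1.0,<2.0; markers'
            name = Requirement(stripped.split("#")[0].strip()).name
            if name not in packages:
                kept.append(line)
        path.write_text("\n".join(kept) + "\n")


    # e.g. prune_packages("requirements/fabric/strategies.txt", ["deepspeed"])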
10 changes: 8 additions & 2 deletions .azure/gpu-tests-fabric.yml
@@ -99,10 +99,16 @@ jobs:
displayName: "Image info & NVIDIA"

- bash: |
cd requirements/fabric
set -ex
pip install "cython<3.0" wheel # for compatibility
pip install -U "lightning-utilities[cli]"
cd requirements/fabric
# replace version ranges by pinning to the minimal requirements
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
pip install "cython<3.0" wheel # for compatibility
# drop deepspeed since it is not supported by our minimal Torch requirements
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
# uninstall deepspeed since some older docker images have it pre-installed
pip uninstall -y deepspeed
condition: contains(variables['Agent.JobName'], 'oldest')
displayName: "setting oldest dependencies"

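The set-oldest step above rewrites each version range to its lower bound so the job exercises the minimal supported dependencies; the PyTorch job below repeats the same pattern. A sketch of what that rewrite amounts to — the function name pin_to_oldest is hypothetical, and this is behavior inferred from the comment, not the CLI's actual implementation:

    from packaging.requirements import Requirement


    def pin_to_oldest(line: str) -> str:  # hypothetical name
        """Rewrite 'pkg >=low,<high' to 'pkg ==low', keeping any environment marker."""
        req = Requirement(line)
        lower_bounds = [s.version for s in req.specifier if s.operator in (">=", "==")]
        if not lower_bounds:
            return line  # no lower bound to pin to
        marker = f"; {req.marker}" if req.marker else ""
        return f"{req.name} =={lower_bounds[0]}{marker}"


    print(pin_to_oldest("deepspeed >=0.14.1,<=0.15.0"))  # deepspeed ==0.14.1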
10 changes: 8 additions & 2 deletions .azure/gpu-tests-pytorch.yml
@@ -103,10 +103,16 @@ jobs:
displayName: "Image info & NVIDIA"

- bash: |
cd requirements/pytorch
set -ex
pip install "cython<3.0" wheel # for compatibility
pip install -U "lightning-utilities[cli]"
cd requirements/pytorch
# replace version ranges by pinning to the minimal requirements
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
pip install "cython<3.0" wheel # for compatibility
# drop deepspeed since it is not supported by our minimal Torch requirements
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
# uninstall deepspeed since some older docker images have it pre-installed
pip uninstall -y deepspeed
condition: contains(variables['Agent.JobName'], 'oldest')
displayName: "setting oldest dependencies"

10 changes: 9 additions & 1 deletion dockers/release/Dockerfile
@@ -21,6 +21,7 @@ FROM pytorchlightning/pytorch_lightning:base-cuda${CUDA_VERSION}-py${PYTHON_VERS
LABEL maintainer="Lightning-AI <https://github.com/Lightning-AI>"

ARG LIGHTNING_VERSION=""
ARG PYTORCH_VERSION

COPY ./ /home/pytorch-lightning/

@@ -39,7 +40,14 @@ RUN \
fi && \
# otherwise there is collision with folder name and pkg name on Pypi
cd pytorch-lightning && \
pip install setuptools==75.6.0 && \
# pip install setuptools==75.6.0 && \
pip install -U "lightning-utilities[cli]" && \
# drop deepspeed since it is not supported by our minimal Torch requirements \
echo "PYTORCH_VERSION is: '$PYTORCH_VERSION'" && \
if [[ "$PYTORCH_VERSION" =~ ^(2\.1|2\.2|2\.3|2\.4)$ ]]; then \
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/fabric/strategies.txt ; \
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files requirements/pytorch/strategies.txt ; \
fi && \
PACKAGE_NAME=lightning pip install '.[extra,loggers,strategies]' --no-cache-dir && \
PACKAGE_NAME=pytorch pip install '.[extra,loggers,strategies]' --no-cache-dir && \
cd .. && \
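The Dockerfile change gates the prune on the base image's PyTorch version via a bash regex. A hedged Python equivalent of that gate — the function name should_prune_deepspeed is hypothetical, and it assumes versions arrive as bare "major.minor" strings, as the regex implies:

    import re


    def should_prune_deepspeed(pytorch_version: str) -> bool:  # hypothetical name
        """Mirror the Dockerfile test [[ "$PYTORCH_VERSION" =~ ^(2\.1|2\.2|2\.3|2\.4)$ ]]."""
        return re.fullmatch(r"2\.[1-4]", pytorch_version) is not None


    assert should_prune_deepspeed("2.4")
    assert not should_prune_deepspeed("2.5")  # newer Torch keeps deepspeed installed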
1 change: 1 addition & 0 deletions pyproject.toml
@@ -179,6 +179,7 @@ markers = [
"cloud: Run the cloud tests for example",
]
filterwarnings = [
# "error::DeprecationWarning",
"error::FutureWarning",
"ignore::FutureWarning:onnxscript", # Temporary ignore until onnxscript is updated
]
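The filterwarnings entries above use pytest's action::Category[:module] syntax, with later entries taking precedence. Roughly the standard-library analogue of the two active lines (a sketch of equivalent warnings filters, not pytest's internals):

    import warnings

    # "error::FutureWarning": escalate every FutureWarning into an exception
    warnings.filterwarnings("error", category=FutureWarning)
    # "ignore::FutureWarning:onnxscript": warnings issued from onnxscript stay silent;
    # filterwarnings() prepends, so this later filter wins for that module
    warnings.filterwarnings("ignore", category=FutureWarning, module="onnxscript")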
2 changes: 1 addition & 1 deletion requirements/fabric/strategies.txt
@@ -5,5 +5,5 @@

# note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
# shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
Collaborator:
Not for this PR, but this was resolved, so maybe we can relax it in the future.

Member Author:
Yes, I will continue relaxing it; I just wanted to do this in smaller steps so it's safer to land...

deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict
deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
bitsandbytes >=0.45.2,<0.47.0; platform_system != "Darwin"
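Both deepspeed lines rely on PEP 508 environment markers to skip the package on Windows and macOS. A quick sketch of how such a marker is evaluated, using the packaging library:

    from packaging.markers import Marker

    marker = Marker('platform_system != "Windows" and platform_system != "Darwin"')
    print(marker.evaluate())  # True only on Linux and other non-Windows/non-macOS systems
    print(marker.evaluate({"platform_system": "Darwin"}))  # False: override to simulate macOS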
2 changes: 1 addition & 1 deletion requirements/pytorch/strategies.txt
@@ -3,4 +3,4 @@

# note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods`
# shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372
deepspeed >=0.9.3, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict
deepspeed >=0.14.1,<=0.15.0; platform_system != "Windows" and platform_system != "Darwin" # strict
6 changes: 1 addition & 5 deletions src/lightning/fabric/strategies/deepspeed.py
@@ -47,7 +47,6 @@
from torch.optim.lr_scheduler import _LRScheduler

_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed")
_DEEPSPEED_GREATER_EQUAL_0_14_1 = RequirementCache("deepspeed>=0.14.1")


# TODO(fabric): Links in the docstrings to PL-specific deepspeed user docs need to be replaced.
@@ -503,10 +502,7 @@ def load_checkpoint(
)
engine = engines[0]

if _DEEPSPEED_GREATER_EQUAL_0_14_1:
from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer
else:
from deepspeed.runtime import DeepSpeedOptimizer
from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer

optimzer_state_requested = any(isinstance(item, (Optimizer, DeepSpeedOptimizer)) for item in state.values())

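With the floor raised to deepspeed 0.14.1, the version gate removed above is dead code: deepspeed.runtime.base_optimizer exists on every supported version, so the import can be unconditional. For reference, the deleted pattern — boolean-evaluating a RequirementCache to pick an import path — looks like this (a sketch, assuming RequirementCache comes from lightning-utilities as it does at the top of this module):

    from lightning_utilities.core.imports import RequirementCache

    _DEEPSPEED_GREATER_EQUAL_0_14_1 = RequirementCache("deepspeed>=0.14.1")  # truthy if satisfied

    if _DEEPSPEED_GREATER_EQUAL_0_14_1:
        from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer  # post-0.14.1 location
    else:
        from deepspeed.runtime import DeepSpeedOptimizer  # legacy location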
@@ -30,6 +30,7 @@ def __init__(self):


@RunIf(skip_windows=True)
@pytest.mark.flaky(reruns=3)
@pytest.mark.parametrize("strategy", ["ddp_spawn", "ddp_fork"])
def test_memory_sharing_disabled(strategy):
"""Test that the multiprocessing launcher disables memory sharing on model parameters and buffers to avoid race
Expand Down
2 changes: 1 addition & 1 deletion tests/tests_pytorch/utilities/test_compile.py
@@ -32,7 +32,7 @@

# https://github.com/pytorch/pytorch/issues/95708
@pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
@RunIf(dynamo=True)
@RunIf(dynamo=True, deepspeed=True)
@mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
trainer_kwargs = {
Expand Down