diff --git a/docs/source/tutorials/ar.ipynb b/docs/source/tutorials/ar.ipynb
index 7cbf0fbc6..32a311de7 100644
--- a/docs/source/tutorials/ar.ipynb
+++ b/docs/source/tutorials/ar.ipynb
@@ -187,10 +187,16 @@
     "    max_prediction_length=prediction_length,\n",
     ")\n",
     "\n",
-    "validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff + 1)\n",
+    "validation = TimeSeriesDataSet.from_dataset(\n",
+    "    training, data, min_prediction_idx=training_cutoff + 1\n",
+    ")\n",
     "batch_size = 128\n",
-    "train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)\n",
-    "val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)"
+    "train_dataloader = training.to_dataloader(\n",
+    "    train=True, batch_size=batch_size, num_workers=0\n",
+    ")\n",
+    "val_dataloader = validation.to_dataloader(\n",
+    "    train=False, batch_size=batch_size, num_workers=0\n",
+    ")"
    ]
   },
   {
@@ -251,7 +257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -269,12 +275,18 @@
    "source": [
     "pl.seed_everything(42)\n",
     "trainer = pl.Trainer(accelerator=\"auto\", gradient_clip_val=0.1)\n",
-    "net = NBeats.from_dataset(training, learning_rate=3e-2, weight_decay=1e-2, widths=[32, 512], backcast_loss_ratio=0.1)"
+    "net = NBeats.from_dataset(\n",
+    "    training,\n",
+    "    learning_rate=3e-2,\n",
+    "    weight_decay=1e-2,\n",
+    "    widths=[32, 512],\n",
+    "    backcast_loss_ratio=0.1,\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -321,9 +333,18 @@
    ],
    "source": [
     "# find optimal learning rate\n",
-    "from lightning.pytorch.tuner import Tuner\n",
+    "# from lightning.pytorch.tuner import Tuner\n",
+    "# TODO: switch back once lightning.pytorch.tuner supports the weights_only param\n",
+    "from pytorch_forecasting.models.temporal_fusion_transformer.tuning import (\n",
+    "    _NewTuner as Tuner,\n",
+    ")\n",
     "\n",
-    "res = Tuner(trainer).lr_find(net, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5)\n",
+    "res = Tuner(trainer).lr_find(\n",
+    "    net,\n",
+    "    train_dataloaders=train_dataloader,\n",
+    "    val_dataloaders=val_dataloader,\n",
+    "    min_lr=1e-5,\n",
+    ")\n",
     "print(f\"suggested learning rate: {res.suggestion()}\")\n",
     "fig = res.plot(show=True, suggest=True)\n",
     "fig.show()\n",
@@ -340,7 +361,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -443,7 +464,9 @@
     }
    ],
    "source": [
-    "early_stop_callback = EarlyStopping(monitor=\"val_loss\", min_delta=1e-4, patience=10, verbose=False, mode=\"min\")\n",
+    "early_stop_callback = EarlyStopping(\n",
+    "    monitor=\"val_loss\", min_delta=1e-4, patience=10, verbose=False, mode=\"min\"\n",
+    ")\n",
     "trainer = pl.Trainer(\n",
     "    max_epochs=3,\n",
     "    accelerator=\"auto\",\n",
@@ -468,6 +491,7 @@
     "    net,\n",
     "    train_dataloaders=train_dataloader,\n",
     "    val_dataloaders=val_dataloader,\n",
+    "    weights_only=False,\n",
     ")"
    ]
   },
@@ -481,12 +505,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "best_model_path = trainer.checkpoint_callback.best_model_path\n",
-    "best_model = NBeats.load_from_checkpoint(best_model_path)"
+    "best_model = NBeats.load_from_checkpoint(best_model_path, weights_only=False)"
    ]
   },
   {
@@ -645,7 +669,9 @@
    ],
    "source": [
     "for idx in range(10):  # plot 10 examples\n",
-    "    best_model.plot_prediction(raw_predictions.x, raw_predictions.output, idx=idx, add_loss_to_title=True)"
+    "    best_model.plot_prediction(\n",
+    "        raw_predictions.x, raw_predictions.output, idx=idx, add_loss_to_title=True\n",
+    "    )"
    ]
   },
   {
diff --git a/docs/source/tutorials/deepar.ipynb b/docs/source/tutorials/deepar.ipynb
index 753a06d93..a44e722cc 100644
--- a/docs/source/tutorials/deepar.ipynb
+++ b/docs/source/tutorials/deepar.ipynb
@@ -301,7 +301,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -348,7 +348,11 @@
    ],
    "source": [
     "# find optimal learning rate\n",
-    "from lightning.pytorch.tuner import Tuner\n",
+    "# from lightning.pytorch.tuner import Tuner\n",
+    "# TODO: switch back once lightning.pytorch.tuner supports the weights_only param\n",
+    "from pytorch_forecasting.models.temporal_fusion_transformer.tuning import (\n",
+    "    _NewTuner as Tuner,\n",
+    ")\n",
     "\n",
     "res = Tuner(trainer).lr_find(\n",
     "    net,\n",
@@ -883,12 +887,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "best_model_path = trainer.checkpoint_callback.best_model_path\n",
-    "best_model = DeepAR.load_from_checkpoint(best_model_path)"
+    "best_model = DeepAR.load_from_checkpoint(best_model_path, weights_only=False)"
    ]
   },
   {
@@ -1268,7 +1272,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": ".venv (3.12.3)",
    "language": "python",
    "name": "python3"
   },
@@ -1282,7 +1286,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.12.3"
   }
  },
 "nbformat": 4,
diff --git a/docs/source/tutorials/nhits.ipynb b/docs/source/tutorials/nhits.ipynb
index 5970c892d..ffa1fc886 100644
--- a/docs/source/tutorials/nhits.ipynb
+++ b/docs/source/tutorials/nhits.ipynb
@@ -306,7 +306,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -353,7 +353,11 @@
    ],
    "source": [
     "# find optimal learning rate\n",
-    "from lightning.pytorch.tuner import Tuner\n",
+    "# from lightning.pytorch.tuner import Tuner\n",
+    "# TODO: switch back once lightning.pytorch.tuner supports the weights_only param\n",
+    "from pytorch_forecasting.models.temporal_fusion_transformer.tuning import (\n",
+    "    _NewTuner as Tuner,\n",
+    ")\n",
     "\n",
     "res = Tuner(trainer).lr_find(\n",
     "    net,\n",
@@ -553,12 +557,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "best_model_path = trainer.checkpoint_callback.best_model_path\n",
-    "best_model = NHiTS.load_from_checkpoint(best_model_path)"
+    "best_model = NHiTS.load_from_checkpoint(best_model_path, weights_only=False)"
    ]
   },
   {
diff --git a/docs/source/tutorials/stallion.ipynb b/docs/source/tutorials/stallion.ipynb
index ebd8afe9c..f777fa2d6 100644
--- a/docs/source/tutorials/stallion.ipynb
+++ b/docs/source/tutorials/stallion.ipynb
@@ -1012,7 +1012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {
     "collapsed": false,
     "jupyter": {
@@ -1067,7 +1067,11 @@
    ],
    "source": [
     "# find optimal learning rate\n",
-    "from lightning.pytorch.tuner import Tuner\n",
+    "# from lightning.pytorch.tuner import Tuner\n",
+    "# TODO: switch back once lightning.pytorch.tuner supports the weights_only param\n",
+    "from pytorch_forecasting.models.temporal_fusion_transformer.tuning import (\n",
+    "    _NewTuner as Tuner,\n",
+    ")\n",
     "\n",
     "res = Tuner(trainer).lr_find(\n",
     "    tft,\n",
@@ -2051,14 +2055,16 @@
   },
   {
    "cell_type": "code",
- "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# load the best model according to the validation loss\n", "# (given that we use early stopping, this is not necessarily the last epoch)\n", "best_model_path = trainer.checkpoint_callback.best_model_path\n", - "best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)" + "best_tft = TemporalFusionTransformer.load_from_checkpoint(\n", + " best_model_path, weights_only=False\n", + ")" ] }, { diff --git a/pyproject.toml b/pyproject.toml index ca7ef1240..cc966a0ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ description = "Forecasting timeseries with PyTorch - dataloaders, normalizers, m dependencies = [ "numpy<=3.0.0", "torch >=2.0.0,!=2.0.1,<3.0.0", - "lightning >=2.0.0,<2.6.0", + "lightning >=2.0.0,<3.0.0", "scipy >=1.8,<2.0", "pandas >=1.3.0,<3.0.0", "scikit-learn >=1.2,<2.0", diff --git a/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py b/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py index 6acb76203..19da91e56 100644 --- a/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py +++ b/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py @@ -3,6 +3,7 @@ """ import copy +import functools import logging import os from typing import Any, Union @@ -10,9 +11,12 @@ import lightning.pytorch as pl from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint from lightning.pytorch.loggers import TensorBoardLogger +from lightning.pytorch.trainer import Trainer from lightning.pytorch.tuner import Tuner import numpy as np import scipy._lib._util +from skbase.utils.dependencies import _check_soft_dependencies +import torch from torch.utils.data import DataLoader from pytorch_forecasting import TemporalFusionTransformer @@ -23,6 +27,26 @@ optuna_logger = logging.getLogger("optuna") +# todo: Remove this class once lightning allows the pass of weights_only to tuner +class _NewTuner(Tuner): + def lr_find(self, *args, **kwargs): + strategy = self._trainer.strategy + original_load_checkpoint = strategy.load_checkpoint + + @functools.wraps(original_load_checkpoint) + def new_load_checkpoint(*ckpt_args, **ckpt_kwargs): + ckpt_kwargs["weights_only"] = False + return original_load_checkpoint(*ckpt_args, **ckpt_kwargs) + + if not _check_soft_dependencies("lightning<2.6", severity="none"): + strategy.load_checkpoint = new_load_checkpoint + + try: + return super().lr_find(*args, **kwargs) + finally: + strategy.load_checkpoint = original_load_checkpoint + + # ToDo: remove this once statsmodels release a version compatible with latest # scipy version def _lazywhere(cond, arrays, f, fillvalue=np.nan, f2=None): @@ -209,7 +233,7 @@ def objective(trial: optuna.Trial) -> float: enable_progress_bar=False, enable_model_summary=False, ) - tuner = Tuner(lr_trainer) + tuner = _NewTuner(lr_trainer) res = tuner.lr_find( model, train_dataloaders=train_dataloaders, diff --git a/pytorch_forecasting/tests/test_all_estimators.py b/pytorch_forecasting/tests/test_all_estimators.py index e3f9980ea..c9aee5222 100644 --- a/pytorch_forecasting/tests/test_all_estimators.py +++ b/pytorch_forecasting/tests/test_all_estimators.py @@ -318,7 +318,7 @@ def _integration( assert len(test_outputs) > 0 # check loading net = estimator_cls.load_from_checkpoint( - trainer.checkpoint_callback.best_model_path + trainer.checkpoint_callback.best_model_path, weights_only=False ) # check prediction diff --git a/tests/test_models/test_deepar.py 
index 6b3e1f0cc..118852079 100644
--- a/tests/test_models/test_deepar.py
+++ b/tests/test_models/test_deepar.py
@@ -90,7 +90,9 @@ def _integration(
     test_outputs = trainer.test(net, dataloaders=test_dataloader)
     assert len(test_outputs) > 0
     # check loading
-    net = DeepAR.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
+    net = DeepAR.load_from_checkpoint(
+        trainer.checkpoint_callback.best_model_path, weights_only=False
+    )
 
     # check prediction
     net.predict(
diff --git a/tests/test_models/test_mlp.py b/tests/test_models/test_mlp.py
index 54e4a6191..55f4a2dda 100644
--- a/tests/test_models/test_mlp.py
+++ b/tests/test_models/test_mlp.py
@@ -71,7 +71,7 @@ def _integration(
     )
     # check loading
     net = DecoderMLP.load_from_checkpoint(
-        trainer.checkpoint_callback.best_model_path
+        trainer.checkpoint_callback.best_model_path, weights_only=False
     )
 
     # check prediction
diff --git a/tests/test_models/test_nbeats.py b/tests/test_models/test_nbeats.py
index c3379fbf1..b242e9a6b 100644
--- a/tests/test_models/test_nbeats.py
+++ b/tests/test_models/test_nbeats.py
@@ -50,7 +50,9 @@ def test_integration(dataloaders_fixed_window_without_covariates, tmp_path):
     test_outputs = trainer.test(net, dataloaders=test_dataloader)
     assert len(test_outputs) > 0
     # check loading
-    net = NBeats.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
+    net = NBeats.load_from_checkpoint(
+        trainer.checkpoint_callback.best_model_path, weights_only=False
+    )
 
     # check prediction
     net.predict(
diff --git a/tests/test_models/test_nhits.py b/tests/test_models/test_nhits.py
index a79e7a93f..1b8bf0e5e 100644
--- a/tests/test_models/test_nhits.py
+++ b/tests/test_models/test_nhits.py
@@ -66,7 +66,9 @@ def _integration(dataloader, tmp_path, trainer_kwargs=None, **kwargs):
     test_outputs = trainer.test(net, dataloaders=test_dataloader)
     assert len(test_outputs) > 0
     # check loading
-    net = NHiTS.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
+    net = NHiTS.load_from_checkpoint(
+        trainer.checkpoint_callback.best_model_path, weights_only=False
+    )
 
     # check prediction
     net.predict(
diff --git a/tests/test_models/test_rnn_model.py b/tests/test_models/test_rnn_model.py
index 69a9b558e..5f1b98215 100644
--- a/tests/test_models/test_rnn_model.py
+++ b/tests/test_models/test_rnn_model.py
@@ -76,7 +76,7 @@ def _integration(
     assert len(test_outputs) > 0
     # check loading
     net = RecurrentNetwork.load_from_checkpoint(
-        trainer.checkpoint_callback.best_model_path
+        trainer.checkpoint_callback.best_model_path, weights_only=False
     )
 
     # check prediction
diff --git a/tests/test_models/test_temporal_fusion_transformer.py b/tests/test_models/test_temporal_fusion_transformer.py
index 3e09b8b7e..491ca865c 100644
--- a/tests/test_models/test_temporal_fusion_transformer.py
+++ b/tests/test_models/test_temporal_fusion_transformer.py
@@ -185,7 +185,7 @@ def _integration(dataloader, tmp_path, loss=None, trainer_kwargs=None, **kwargs)
 
     # check loading
     net = TemporalFusionTransformer.load_from_checkpoint(
-        trainer.checkpoint_callback.best_model_path
+        trainer.checkpoint_callback.best_model_path, weights_only=False
     )
 
     # check prediction
@@ -505,7 +505,9 @@ def test_no_exogenous_variable():
         val_dataloaders=validation_data_loader,
     )
     best_model_path = trainer.checkpoint_callback.best_model_path
-    best_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
+    best_model = TemporalFusionTransformer.load_from_checkpoint(
+        best_model_path, weights_only=False
+    )
     best_model.predict(
        validation_data_loader,
        return_x=True,
diff --git a/tests/test_models/test_tide.py b/tests/test_models/test_tide.py
index 3b73ba380..48b75b1c4 100644
--- a/tests/test_models/test_tide.py
+++ b/tests/test_models/test_tide.py
@@ -84,7 +84,7 @@ def _integration(
     assert len(test_outputs) > 0
     # check loading
     net = estimator_cls.load_from_checkpoint(
-        trainer.checkpoint_callback.best_model_path
+        trainer.checkpoint_callback.best_model_path, weights_only=False
     )
 
     # check prediction
@@ -261,7 +261,7 @@ def test_no_exogenous_variable():
         val_dataloaders=validation_data_loader,
     )
     best_model_path = trainer.checkpoint_callback.best_model_path
-    best_model = TiDEModel.load_from_checkpoint(best_model_path)
+    best_model = TiDEModel.load_from_checkpoint(best_model_path, weights_only=False)
     best_model.predict(
         validation_data_loader,
         fast_dev_run=True,
diff --git a/tests/test_models/test_timexer.py b/tests/test_models/test_timexer.py
index 4ce98e6d6..510578582 100644
--- a/tests/test_models/test_timexer.py
+++ b/tests/test_models/test_timexer.py
@@ -115,7 +115,7 @@ def _integration(dataloader, tmp_path, loss=None, trainer_kwargs=None, **kwargs)
 
     # test the checkpointing feature
     net = TimeXer.load_from_checkpoint(
-        trainer.checkpoint_callback.best_model_path,
+        trainer.checkpoint_callback.best_model_path, weights_only=False
     )
     predictions = net.predict(
         val_dataloader,
diff --git a/tests/test_models/test_x_lstm.py b/tests/test_models/test_x_lstm.py
index a527957cf..5f1c7e7bf 100644
--- a/tests/test_models/test_x_lstm.py
+++ b/tests/test_models/test_x_lstm.py
@@ -57,7 +57,7 @@ def _integration(
     assert len(test_outputs) > 0
 
     net = xLSTMTime.load_from_checkpoint(
-        trainer.checkpoint_callback.best_model_path
+        trainer.checkpoint_callback.best_model_path, weights_only=False
     )
 
     net.predict(