[1022] Getting WG to work on santis (#1023)

tjhunter · clessig · web-flow · commit 87f612765170 · 2025-10-08T20:00:08.000+02:00
* working pytorch

* changes

* Fix for code to work on Alps-Santis

* changes

* cleanups

* changes

* reverting change

* having issues with the latest branch on santis

* changes

* changes

* changes

* override with cpu

* working for cpu

* flash-attn moved to gpu

* remove constraint

* simplifying

* trying

* working on atos

* changes

* macos

* changes

* cleanups

* actions

* actions

* actions

* actions

* changes

---------

Co-authored-by: Christian Lessig <christian.lessig@ecmwf.int>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -31,7 +31,7 @@ jobs:
       - name: Type checker (pyrefly, experimental)
         # Do not attempt to install the default dependencies, this is much faster.
         # Run temporarily on a sub directory before the main restyling.
-        run: ./scripts/actions.sh type-check-experimental || echo "::warning::typing issues found"
+        run: ./scripts/actions.sh type-check || echo "::warning::typing issues found"
   pr:
     name: PR checks
     runs-on: ubuntu-latest
diff --git a/config/default_config.yml b/config/default_config.yml
@@ -28,7 +28,9 @@ ae_global_num_blocks: 8
 ae_global_num_heads: 32
 ae_global_dropout_rate: 0.1
 ae_global_with_qk_lnorm: True
-ae_global_att_dense_rate: 0.2
+# TODO: switching to < 1 triggers triton-related issues.
+# See https://github.com/ecmwf/WeatherGenerator/issues/1050
+ae_global_att_dense_rate: 1.0
 ae_global_block_factor: 64
 ae_global_mlp_hidden_factor: 2
 
diff --git a/integration_tests/streams/era5_small.yml b/integration_tests/streams/era5_small.yml
@@ -9,7 +9,7 @@
 
 ERA5 :
   type : anemoi
-  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2022-6h-v6.zarr']
+  filenames : ['aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8.zarr']
   loss_weight : 1.
   source_exclude : ['w_', 'skt', 'sp', 'tcw', 'cp', 'tp']
   target_exclude : ['w_', 'skt', 'sp', 'tcw', 'cp', 'tp']
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,6 @@ authors = [
 requires-python = ">=3.12,<3.13"
 # TODO: split the plotting dependencies into their own dep groups, they are not required.
 dependencies = [
- 'torch==2.6.0',
  'numpy~=2.2',
  'astropy_healpix~=1.1.2',
  'zarr~=2.17',
@@ -22,7 +21,6 @@ dependencies = [
  'packaging',
  'wheel',
  'psutil',
- "flash-attn; sys_platform == 'linux'",
  "polars~=1.25.2",
  "omegaconf~=2.3.0",
  "dask~=2025.5.1",
@@ -32,6 +30,7 @@ dependencies = [
  "weathergen-evaluate",
 ]
 
+
 [project.urls]
 Homepage = "https://www.weathergenerator.eu"
 Documentation = "https://readthedocs.org"
@@ -66,6 +65,25 @@ dev = [
 ]
 
 
+# Torch listed as optional dependencies.
+# uv and python can only filter dependencies by platform, not by capability.
+# Following the recommendations from https://docs.astral.sh/uv/guides/integration/pytorch
+# We need to support:
+# x86_64: cpu (unit tests) + gpu 
+# aarch64: gpu
+[project.optional-dependencies]
+
+cpu = [
+  'torch==2.6.0',
+]
+
+gpu = [
+  'torch==2.6.0+cu126',
+  # flash-attn also has a torch dependency.
+  "flash-attn",
+]
+
+
 [tool.black]
 
 # Wide rows
@@ -125,6 +143,8 @@ ignore = [
 line-ending = "lf"
 
 
+
+
 [tool.uv]
 # Most work is done a distributed filesystem, where hardlink is not always possible.
 # Also, trying to resolve some permissions issue, see 44.
@@ -141,14 +161,26 @@ link-mode = "symlink"
 # Also, relatively recent versions are required to support workspaces.
 required-version = ">=0.7.0"
 
-# Following the recommendations from https://docs.astral.sh/uv/guides/integration/pytorch
-# The current setup is:
-# linux == GPU + flashattention
-# windows == GPU
-# macos == CPU
+# The supported environments
+# TODO: add macos and windows (CPU only, for running tests)
+environments = [
+    "sys_platform == 'linux' and platform_machine == 'aarch64'",
+    "sys_platform == 'linux' and platform_machine == 'x86_64'",
+#    "sys_platform == 'darwin'",
+]
+
+# One can only have cpu or gpu.
+conflicts = [
+  [
+    { extra = "cpu" },
+    { extra = "gpu" },
+  ],
+]
+
+
 [[tool.uv.index]]
-name = "pytorch-cu124"
-url = "https://download.pytorch.org/whl/cu124"
+name = "pytorch-cu126"
+url = "https://download.pytorch.org/whl/cu126"
 explicit = true
 
 
@@ -181,14 +213,26 @@ explicit = true
 [tool.uv.sources]
 weathergen-common = { workspace = true }
 weathergen-evaluate = { workspace = true }
-torch = [
-  { index = "pytorch-cu124", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
-  { index = "pytorch-cpu", marker = "sys_platform == 'macosx'"},
-]
-# This URL was evaluated this way:
-# uv run ~/WeatherGenerator-private/hpc/hpc2020/ecmwf/get-flash-atten.sh
+
+
 flash-attn = [
-  { url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux'"  },
+# The build of Cathal O'Brien is not compatible with the libc build on santis.
+# Hardcode the reference to the swiss cluster for the time being.
+# TODO: open issue
+#  { url = "https://github.com/cathalobrien/get-flash-attn/releases/download/v0.1-alpha/flash_attn-2.7.4+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64'" },
+# This version was rebuilt locally on santis and uploaded.
+  { url = "https://object-store.os-api.cci1.ecmwf.int/weathergenerator-dev/wheels/flash_attn-2.7.3-cp312-cp312-linux_aarch64.whl", marker = "sys_platform == 'linux' and platform_machine == 'aarch64'" },
+  { url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp312-cp312-linux_x86_64.whl", marker = "sys_platform == 'linux'  and platform_machine == 'x86_64'" },
+#  { index = "pytorch-cpu", marker = "sys_platform == 'darwin'"},
+]
+
+
+torch = [
+# Explicit pin for GPU
+  { url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-linux_aarch64.whl", marker = 'sys_platform == "linux" and platform_machine == "aarch64"', extra="gpu" },
+  { url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl", marker = 'sys_platform == "linux"  and platform_machine == "x86_64"', extra="gpu" },
+# Use the public repo for CPU versions.
+  { index = "pytorch-cpu", marker = "sys_platform == 'linux'", extra="cpu"},
 ]
 
 [tool.pytest.ini_options]
@@ -203,3 +247,4 @@ members = [
    "packages/evaluate",
    "packages/common"
 ]
+
diff --git a/scripts/actions.sh b/scripts/actions.sh
@@ -7,7 +7,7 @@ case "$1" in
   sync)
     (
       cd "$SCRIPT_DIR" || exit 1
-      uv sync --all-packages
+      uv sync --all-packages --extra gpu
     )
     ;;
   lint)
@@ -34,7 +34,7 @@ case "$1" in
        src/ scripts/ packages/
     )
     ;;
-  type-check-experimental)
+  type-check)
     (
       cd "$SCRIPT_DIR/packages/common" || exit 1
       uv run --all-packages pyrefly check
@@ -47,13 +47,15 @@ case "$1" in
   unit-test)
     (
       cd "$SCRIPT_DIR" || exit 1
-      uv run pytest src/
+      uv sync --extra cpu 
+      uv run --extra cpu pytest src/
     )
     ;;
   integration-test)
     (
       cd "$SCRIPT_DIR" || exit 1
-      srun uv run --offline pytest ./integration_tests/small1_test.py --verbose
+      uv sync --offline --all-packages --extra gpu
+      uv run --offline pytest ./integration_tests/small1_test.py --verbose -s
     )
     ;;
   create-links)
@@ -110,7 +112,7 @@ case "$1" in
     )
     ;;
   *)
-    echo "Usage: $0 {sync|lint|lint-check|unit-test|integration-test|create-links|create-jupyter-kernel|jupytext-sync}"
+    echo "Usage: $0 {sync|lint|lint-check|type-check|unit-test|integration-test|create-links|create-jupyter-kernel|jupytext-sync}"
     exit 1
     ;;
 esac
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
@@ -158,6 +158,7 @@ def run(self, cf, devices, run_id_contd=None, epoch_contd=None):
         self.init(cf, devices)
         cf = self.cf
 
+        # TODO: do not define new members outside of the init!!
         self.device_type = torch.accelerator.current_accelerator()
         self.device = torch.device(f"{self.device_type}:{cf.local_rank}")
 
@@ -676,6 +677,9 @@ def validate(self, epoch):
         self.dataset_val.advance()
 
     def batch_to_device(self, batch):
+        # TODO: do not define new members outside of the init!!
+        self.device_type = torch.accelerator.current_accelerator()
+        self.device = torch.device(f"{self.device_type}:{self.cf.local_rank}")
         # forecast_steps is dropped here from the batch
         return (
             [[d.to_device(self.device) for d in db] for db in batch[0]],
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -24,7 +24,7 @@
 DUMMY_STREAM_CONF = {
     "ERA5": {
         "type": "anemoi",
-        "filenames": ["aifs-ea-an-oper-0001-mars-o96-1979-2022-6h-v6.zarr"],
+        "filenames": ["aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8.zarr"],
         "source": ["u_", "v_", "10u", "10v"],
         "target": ["10u", "10v"],
         "loss_weight": 1.0,
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ case "$1" in`
`7`	`7`	`sync)`
`8`	`8`	`(`
`9`	`9`	`cd "$SCRIPT_DIR" \|\| exit 1`
`10`		`- uv sync --all-packages`
	`10`	`+ uv sync --all-packages --extra gpu`
`11`	`11`	`)`
`12`	`12`	`;;`
`13`	`13`	`lint)`
`@@ -34,7 +34,7 @@ case "$1" in`
`34`	`34`	`src/ scripts/ packages/`
`35`	`35`	`)`
`36`	`36`	`;;`
`37`		`- type-check-experimental)`
	`37`	`+ type-check)`
`38`	`38`	`(`
`39`	`39`	`cd "$SCRIPT_DIR/packages/common" \|\| exit 1`
`40`	`40`	`uv run --all-packages pyrefly check`
`@@ -47,13 +47,15 @@ case "$1" in`
`47`	`47`	`unit-test)`
`48`	`48`	`(`
`49`	`49`	`cd "$SCRIPT_DIR" \|\| exit 1`
`50`		`- uv run pytest src/`
	`50`	`+ uv sync --extra cpu`
	`51`	`+ uv run --extra cpu pytest src/`
`51`	`52`	`)`
`52`	`53`	`;;`
`53`	`54`	`integration-test)`
`54`	`55`	`(`
`55`	`56`	`cd "$SCRIPT_DIR" \|\| exit 1`
`56`		`- srun uv run --offline pytest ./integration_tests/small1_test.py --verbose`
	`57`	`+ uv sync --offline --all-packages --extra gpu`
	`58`	`+ uv run --offline pytest ./integration_tests/small1_test.py --verbose -s`
`57`	`59`	`)`
`58`	`60`	`;;`
`59`	`61`	`create-links)`
`@@ -110,7 +112,7 @@ case "$1" in`
`110`	`112`	`)`
`111`	`113`	`;;`
`112`	`114`	`*)`
`113`		`- echo "Usage: $0 {sync\|lint\|lint-check\|unit-test\|integration-test\|create-links\|create-jupyter-kernel\|jupytext-sync}"`
	`115`	`+ echo "Usage: $0 {sync\|lint\|lint-check\|type-check\|unit-test\|integration-test\|create-links\|create-jupyter-kernel\|jupytext-sync}"`
`114`	`116`	`exit 1`
`115`	`117`	`;;`
`116`	`118`	`esac`