ecmwf
diff --git a/config/default_config.yml b/config/default_config.yml
Lines changed: 2 additions & 2 deletions
diff --git a/config/evaluate/eval_config.yml b/config/evaluate/eval_config.yml
Lines changed: 2 additions & 2 deletions
diff --git a/integration_tests/small1.yaml b/integration_tests/small1.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/integration_tests/small1_test.py b/integration_tests/small1_test.py
Lines changed: 5 additions & 5 deletions
diff --git a/packages/common/src/weathergen/common/config.py b/packages/common/src/weathergen/common/config.py
Lines changed: 57 additions & 24 deletions
diff --git a/packages/evaluate/src/weathergen/evaluate/io_reader.py b/packages/evaluate/src/weathergen/evaluate/io_reader.py
Lines changed: 13 additions & 5 deletions
diff --git a/packages/evaluate/src/weathergen/evaluate/utils.py b/packages/evaluate/src/weathergen/evaluate/utils.py
Lines changed: 5 additions & 5 deletions
diff --git a/src/weathergen/datasets/masking.py b/src/weathergen/datasets/masking.py
Lines changed: 1 addition & 1 deletion
@@ -113,8 +113,8 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"],
                           "same_strategy_per_batch": false
                           }
 
-num_epochs: 32
-samples_per_epoch: 4096
+num_mini_epochs: 32
+samples_per_mini_epoch: 4096
 samples_per_validation: 512
 shuffle: True
 
 
@@ -31,7 +31,7 @@ run_ids :
   ar40mckx:
     label: "pretrained model ar40mckx"
     results_base_dir : "./results/"
-    epoch: 0
+    mini_epoch: 0
     rank: 0
     streams:
       ERA5:
@@ -62,7 +62,7 @@ run_ids :
   c8g5katp:
     label: "2 steps window"
     results_base_dir : "./results/"
-    epoch: 0
+    mini_epoch: 0
     rank: 0
     streams:
       ERA5:
 
@@ -3,8 +3,8 @@ run_path: "./results"
 model_path: "./models"
 loss_fcts: [["mse", 1.0]]
 loss_fcts_val: [["mse", 1.0]]
-num_epochs: 1
-samples_per_epoch: 10
+num_mini_epochs: 1
+samples_per_mini_epoch: 10
 samples_per_validation: 5
 lr_steps: 4
 lr_steps_warmup: 2
 
@@ -69,7 +69,7 @@ def test_train(setup, test_run_id):
 def infer(run_id):
     logger.info("run inference")
     inference_from_args(
-        ["-start", "2022-10-10", "-end", "2022-10-11", "--samples", "10", "--epoch", "0"]
+        ["-start", "2022-10-10", "-end", "2022-10-11", "--samples", "10", "--mini_epoch", "0"]
         + [
             "--from_run_id",
             run_id,
@@ -84,7 +84,7 @@ def infer(run_id):
 def infer_with_missing(run_id):
     logger.info("run inference")
     inference_from_args(
-        ["-start", "2022-10-10", "-end", "2022-10-11", "--samples", "10", "--epoch", "0"]
+        ["-start", "2022-10-10", "-end", "2022-10-11", "--samples", "10", "--mini_epoch", "0"]
         + [
             "--from_run_id",
             run_id,
@@ -128,7 +128,7 @@ def evaluate_results(run_id):
                         }
                     },
                     "label": "MTM ERA5",
-                    "epoch": 0,
+                    "mini_epoch": 0,
                     "rank": 0,
                 }
             },
@@ -171,7 +171,7 @@ def assert_train_loss_below_threshold(run_id):
     assert loss_metric is not None, (
         "'stream.ERA5.loss_mse.loss_avg' metric is missing in metrics file"
     )
-    # Check that the loss does not explode in a single epoch
+    # Check that the loss does not explode in a single mini_epoch
     # This is meant to be a quick test, not a convergence test
     target = 1.5
     assert loss_metric < target, (
@@ -193,7 +193,7 @@ def assert_val_loss_below_threshold(run_id):
     assert loss_metric is not None, (
         "'stream.ERA5.loss_mse.loss_avg' metric is missing in metrics file"
     )
-    # Check that the loss does not explode in a single epoch
+    # Check that the loss does not explode in a single mini_epoch
     # This is meant to be a quick test, not a convergence test
     assert loss_metric < 1.25, (
         f"'stream.ERA5.loss_mse.loss_avg' is {loss_metric}, expected to be below 0.25"
 
@@ -54,23 +54,23 @@ def format_cf(config: Config) -> str:
     return stream.getvalue()
 
 
-def save(config: Config, epoch: int | None):
+def save(config: Config, mini_epoch: int | None):
     """Save current config into the current runs model directory."""
     path_models = Path(config.model_path)
     # save in directory with model files
     dirname = path_models / config.run_id
     dirname.mkdir(exist_ok=True, parents=True)
 
-    fname = dirname / _get_model_config_file_name(config.run_id, epoch)
+    fname = _get_model_config_file_write_name(path_models, config.run_id, mini_epoch)
 
     json_str = json.dumps(OmegaConf.to_container(config))
     with fname.open("w") as f:
         f.write(json_str)
 
 
-def load_model_config(run_id: str, epoch: int | None, model_path: str | None) -> Config:
+def load_model_config(run_id: str, mini_epoch: int | None, model_path: str | None) -> Config:
     """
-    Load a configuration file from a given run_id and epoch.
+    Load a configuration file from a given run_id and mini_epoch.
     If run_id is a full path, loads it from the full path.
     """
     if Path(run_id).exists():  # load from the full path if a full path is provided
@@ -84,13 +84,13 @@ def load_model_config(run_id: str, epoch: int | None, model_path: str | None) ->
                 config=pconf, attribute_name="model_path", fallback="models"
             )
         path = Path(model_path)
-        fname = path / run_id / _get_model_config_file_name(run_id, epoch)
+        fname = _get_model_config_file_read_name(path, run_id, mini_epoch)
         assert fname.exists(), (
             "The fallback path to the model does not exist. Please provide a `model_path`.",
             fname,
         )
 
-    _logger.info(f"Loading config from specified run_id and epoch: {fname}")
+    _logger.info(f"Loading config from specified run_id and mini_epoch: {fname}")
 
     with fname.open() as f:
         json_str = f.read()
@@ -100,24 +100,49 @@ def load_model_config(run_id: str, epoch: int | None, model_path: str | None) ->
     return _apply_fixes(config)
 
 
-def _get_model_config_file_name(run_id: str, epoch: int | None):
-    if epoch is None:
-        epoch_str = ""
-    elif epoch == -1:
-        epoch_str = "_latest"
+def _get_model_config_file_write_name(path: Path, run_id: str, mini_epoch: int | None):
+    if mini_epoch is None:
+        mini_epoch_str = ""
+    elif mini_epoch == -1:
+        mini_epoch_str = "_latest"
     else:
-        epoch_str = f"_epoch{epoch:05d}"
-    return f"model_{run_id}{epoch_str}.json"
+        mini_epoch_str = f"_chkpt{mini_epoch:05d}"
 
+    return path / run_id / f"model_{run_id}{mini_epoch_str}.json"
 
-def get_model_results(run_id: str, epoch: int, rank: int) -> Path:
+
+def _get_model_config_file_read_name(path: Path, run_id: str, mini_epoch: int | None):
+    if mini_epoch is None:
+        mini_epoch_str = ""
+    elif mini_epoch == -1:
+        mini_epoch_str = "_latest"
+    elif (path / run_id / f"model_{run_id}_epoch{mini_epoch:05d}.json").exists():
+        mini_epoch_str = f"_epoch{mini_epoch:05d}"
+    else:
+        mini_epoch_str = f"_chkpt{mini_epoch:05d}"
+
+    return path / run_id / f"model_{run_id}{mini_epoch_str}.json"
+
+
+def get_model_results(run_id: str, mini_epoch: int, rank: int) -> Path:
     """
-    Get the path to the model results zarr store from a given run_id and epoch.
+    Get the path to the model results zarr store from a given run_id and mini_epoch.
     """
     run_results = Path(_load_private_conf(None)["path_shared_working_dir"]) / f"results/{run_id}"
-    zarr_path = run_results / f"validation_epoch{epoch:05d}_rank{rank:04d}.zarr"
-    if not zarr_path.exists() or not zarr_path.is_dir():
-        raise FileNotFoundError(f"Zarr file {zarr_path} does not exist or is not a directory.")
+
+    zarr_path_new = run_results / f"validation_chkpt{mini_epoch:05d}_rank{rank:04d}.zarr"
+    zarr_path_old = run_results / f"validation_epoch{mini_epoch:05d}_rank{rank:04d}.zarr"
+    # Prefer the new "chkpt" name, then the legacy "epoch" name; is_dir() is False if missing.
+    if zarr_path_new.is_dir():
+        zarr_path = zarr_path_new
+    elif zarr_path_old.is_dir():
+        zarr_path = zarr_path_old
+    else:
+        raise FileNotFoundError(
+            f"Zarr file with run_id {run_id}, mini_epoch {mini_epoch} and rank {rank} does not "
+            f"exist or is not a directory."
+        )
+
     return zarr_path
 
 
@@ -150,7 +175,7 @@ def _check_logging(config: Config) -> Config:
 def load_config(
     private_home: Path | None,
     from_run_id: str | None,
-    epoch: int | None,
+    mini_epoch: int | None,
     *overwrites: Path | dict | Config,
 ) -> Config:
     """
@@ -161,7 +186,7 @@ def load_config(
         private_home: Configuration file containing platform dependent information and secrets
         from_run_id: Run id of the pretrained WeatherGenerator model
         to continue training or inference
-        epoch: epoch of the checkpoint to load. -1 indicates last checkpoint available.
+        mini_epoch: mini_epoch of the checkpoint to load. -1 indicates last checkpoint available.
         *overwrites: Additional overwrites from different sources
 
     Note: The order of precedence for merging the final config is in ascending order:
@@ -191,13 +216,21 @@ def load_config(
     if from_run_id is None:
         base_config = _load_default_conf()
     else:
-        base_config = load_model_config(from_run_id, epoch, private_config.get("model_path", None))
+        base_config = load_model_config(
+            from_run_id, mini_epoch, private_config.get("model_path", None)
+        )
         from_run_id = base_config.run_id
     with open_dict(base_config):
         base_config.from_run_id = from_run_id
     # use OmegaConf.unsafe_merge if too slow
     c = OmegaConf.merge(base_config, private_config, *overwrite_configs)
     assert isinstance(c, Config)
+    
+    # Ensure the config has mini-epoch notation
+    if hasattr(c, "samples_per_epoch"):
+        c.samples_per_mini_epoch = c.samples_per_epoch
+        c.num_mini_epochs = c.num_epochs
+
     return c
 
 
@@ -456,9 +489,9 @@ def get_path_model(config: Config) -> Path:
     return Path(config.model_path) / config.run_id
 
 
-def get_path_output(config: Config, epoch: int) -> Path:
+def get_path_output(config: Config, mini_epoch: int) -> Path:
     base_path = get_path_run(config)
-    fname = f"validation_epoch{epoch:05d}_rank{config.rank:04d}.zarr"
+    fname = f"validation_chkpt{mini_epoch:05d}_rank{config.rank:04d}.zarr"
 
     return base_path / fname
 
@@ -523,7 +556,7 @@ def validate_forecast_policy_and_steps(cf: OmegaConf):
     valid_forecast_policies = (
         "Valid values for 'forecast_policy' are, e.g., 'fixed' when using constant "
         "forecast steps throughout the training, or 'sequential' when varying the forecast "
-        "steps over epochs, such as, e.g., 'forecast_steps: [2, 2, 4, 4]'. "
+        "steps over mini_epochs, such as, e.g., 'forecast_steps: [2, 2, 4, 4]'. "
     )
     valid_forecast_steps = (
         "'forecast_steps' must be a positive integer or a non-empty list of positive integers. "
 
@@ -469,7 +469,7 @@ def __init__(self, eval_cfg: dict, run_id: str, private_paths: dict | None = Non
 
         super().__init__(eval_cfg, run_id, private_paths)
 
-        self.epoch = eval_cfg.epoch
+        self.mini_epoch = eval_cfg.mini_epoch
         self.rank = eval_cfg.rank
 
         # Load model configuration and set (run-id specific) directories
@@ -498,9 +498,17 @@ def __init__(self, eval_cfg: dict, run_id: str, private_paths: dict | None = Non
             self.eval_cfg.get("metrics_dir", self.metrics_base_dir / self.run_id / "evaluation")
         )
 
-        self.fname_zarr = self.results_dir.joinpath(
-            f"validation_epoch{self.epoch:05d}_rank{self.rank:04d}.zarr"
+        fname_zarr_new = self.results_dir.joinpath(
+            f"validation_chkpt{self.mini_epoch:05d}_rank{self.rank:04d}.zarr"
         )
+        fname_zarr_old = self.results_dir.joinpath(
+            f"validation_epoch{self.mini_epoch:05d}_rank{self.rank:04d}.zarr"
+        )
+
+        if fname_zarr_new.exists() or fname_zarr_new.is_dir():
+            self.fname_zarr = fname_zarr_new
+        else:
+            self.fname_zarr = fname_zarr_old
 
         if not self.fname_zarr.exists() or not self.fname_zarr.is_dir():
             _logger.error(f"Zarr file {self.fname_zarr} does not exist.")
@@ -522,12 +530,12 @@ def get_inference_config(self):
             _logger.info(
                 f"Loading config for run {self.run_id} from private paths: {self.private_paths}"
             )
-            config = load_config(self.private_paths, self.run_id, self.epoch)
+            config = load_config(self.private_paths, self.run_id, self.mini_epoch)
         else:
             _logger.info(
                 f"Loading config for run {self.run_id} from model directory: {self.model_base_dir}"
             )
-            config = load_model_config(self.run_id, self.epoch, self.model_base_dir)
+            config = load_model_config(self.run_id, self.mini_epoch, self.model_base_dir)
 
         if type(config) not in [dict, oc.DictConfig]:
             _logger.warning("Model config not found. inference config will be empty.")
 
@@ -435,8 +435,8 @@ def metric_list_to_json(
         Output directory.
     run_id :
         Identifier of the inference run.
-    epoch :
-        Epoch number.
+    mini_epoch :
+        Mini_epoch number.
     """
     assert len(metrics_list) == len(npoints_sample_list) == len(streams), (
         "The lengths of metrics_list, npoints_sample_list, and streams must be the same."
@@ -460,16 +460,16 @@ def metric_list_to_json(
             # Match the expected filename pattern
             save_path = (
                 reader.metrics_dir
-                / f"{reader.run_id}_{stream}_{region}_{metric}_epoch{reader.epoch:05d}.json"
+                / f"{reader.run_id}_{stream}_{region}_{metric}_chkpt{reader.mini_epoch:05d}.json"
             )
 
             _logger.info(f"Saving results to {save_path}")
             with open(save_path, "w") as f:
                 json.dump(metric_dict, f, indent=4)
 
     _logger.info(
-        f"Saved all results of inference run {reader.run_id} - epoch {reader.epoch:d} successfully "
-        f"to {reader.metrics_dir}."
+        f"Saved all results of inference run {reader.run_id} - mini_epoch {reader.mini_epoch:d} "
+        f"successfully to {reader.metrics_dir}."
     )
 
 
 
@@ -97,7 +97,7 @@ def __init__(self, cf: Config):
 
     def reset_rng(self, rng) -> None:
         """
-        Reset rng after epoch to ensure proper randomization
+        Reset rng after mini_epoch to ensure proper randomization
         """
         self.rng = rng
Original file line number	Diff line number	Diff line change
`@@ -113,8 +113,8 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"],`
`113`	`113`	`"same_strategy_per_batch": false`
`114`	`114`	`}`
`115`	`115`
`116`		`-num_epochs: 32`
`117`		`-samples_per_epoch: 4096`
	`116`	`+num_mini_epochs: 32`
	`117`	`+samples_per_mini_epoch: 4096`
`118`	`118`	`samples_per_validation: 512`
`119`	`119`	`shuffle: True`
`120`	`120`