Skip to content

Commit 54e7cf7

Browse files
committed
Improvements to caching and DRY
1 parent 4e826e8 commit 54e7cf7

File tree

6 files changed

+130
-67
lines changed

6 files changed

+130
-67
lines changed

examples/create_or_query_scenarios.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
"scenarios = Scenarios.from_excel(\"example_input_excel.xlsx\")\n",
4949
"\n",
5050
"# Here we're also loading a scenario directly from the API and adding it to the scenarios loaded/created via the excel\n",
51-
"scenario_a = Scenario.load(1357691)\n",
51+
"scenario_a = Scenario.load(2690439)\n",
5252
"scenarios.add(scenario_a)"
5353
]
5454
},

inputs/example_input_excel.xlsx

25 Bytes
Binary file not shown.

src/pyetm/models/inputs.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -162,24 +162,23 @@ def __iter__(self):
162162
def keys(self):
163163
return [input.key for input in self.inputs]
164164

165-
# TODO: Check the efficiency of doing this in a loop
166165
def is_valid_update(self, key_vals: dict) -> dict[str, WarningCollector]:
167166
"""
168167
Returns a dict mapping input keys to their WarningCollectors when errors were found.
169168
"""
170-
warnings = {}
169+
warnings: dict[str, WarningCollector] = {}
171170

172-
# Check each input that has an update
173-
for input_obj in self.inputs:
174-
if input_obj.key in key_vals:
175-
input_warnings = input_obj.is_valid_update(key_vals[input_obj.key])
176-
if len(input_warnings) > 0:
177-
warnings[input_obj.key] = input_warnings
178-
179-
# Check for non-existent keys
180-
non_existent_keys = set(key_vals.keys()) - set(self.keys())
181-
for key in non_existent_keys:
182-
warnings[key] = WarningCollector.with_warning(key, "Key does not exist")
171+
input_map = {inp.key: inp for inp in self.inputs}
172+
173+
for key, value in key_vals.items():
174+
input_obj = input_map.get(key)
175+
if input_obj is None:
176+
warnings[key] = WarningCollector.with_warning(key, "Key does not exist")
177+
continue
178+
179+
input_warnings = input_obj.is_valid_update(value)
180+
if len(input_warnings) > 0:
181+
warnings[key] = input_warnings
183182

184183
return warnings
185184

src/pyetm/models/output_curves.py

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pandas as pd
44
from pathlib import Path
55
from typing import Optional
6+
import os
67

78
import yaml
89
from pyetm.clients import BaseClient
@@ -15,6 +16,18 @@
1516
)
1617

1718

19+
# Small LRU cache for reading CSVs from disk. Uses mtime to invalidate when file changes.
20+
def _read_csv_cached(path: Path) -> pd.DataFrame:
21+
return _read_csv_cached_impl(str(path), os.path.getmtime(path))
22+
23+
24+
# TODO determine appropriate maxsize
25+
@lru_cache(maxsize=64)
26+
def _read_csv_cached_impl(path_str: str, mtime: float) -> pd.DataFrame:
27+
df = pd.read_csv(path_str, index_col=0)
28+
return df.dropna(how="all")
29+
30+
1831
class OutputCurveError(Exception):
1932
"""Base carrier curve error"""
2033

@@ -34,18 +47,26 @@ class OutputCurve(Base):
3447
def available(self) -> bool:
3548
return bool(self.file_path)
3649

37-
def retrieve(self, client, scenario) -> Optional[pd.DataFrame]:
50+
def retrieve(
51+
self, client, scenario, force_refresh: bool = False
52+
) -> Optional[pd.DataFrame]:
3853
"""Process curve from client, save to file, set file_path"""
3954
file_path = (
4055
get_settings().path_to_tmp(str(scenario.id))
4156
/ f"{self.key.replace('/','-')}.csv"
4257
)
4358

44-
# TODO: Examine the caching situation in the future if time permits: could be particularly
45-
# relevant for bulk processing
46-
# if file_path.is_file():
47-
# self.file_path = file_path
48-
# return self.contents()
59+
# Reuse a cached file if present unless explicitly refreshing.
60+
if not force_refresh and file_path.is_file():
61+
self.file_path = file_path
62+
try:
63+
return _read_csv_cached(self.file_path)
64+
except Exception as e:
65+
# Fall through to re-download on cache read failure
66+
self.add_warning(
67+
"file_path",
68+
f"Failed to read cached curve file for {self.key}: {e}; refetching",
69+
)
4970
try:
5071
result = DownloadOutputCurveRunner.run(client, scenario, self.key)
5172
if result.success:
@@ -80,8 +101,7 @@ def contents(self) -> Optional[pd.DataFrame]:
80101
return None
81102

82103
try:
83-
df = pd.read_csv(self.file_path, index_col=0)
84-
return df.dropna(how="all")
104+
return _read_csv_cached(self.file_path)
85105
except Exception as e:
86106
self.add_warning(
87107
"file_path", f"Failed to read curve file for {self.key}: {e}"
@@ -147,6 +167,17 @@ def get_contents(self, scenario, curve_name: str) -> Optional[pd.DataFrame]:
147167
return None
148168

149169
if not curve.available():
170+
# Try to attach a cached file from disk first
171+
expected_path = (
172+
get_settings().path_to_tmp(str(scenario.id))
173+
/ f"{curve.key.replace('/', '-')}.csv"
174+
)
175+
if expected_path.is_file():
176+
curve.file_path = expected_path
177+
contents = curve.contents()
178+
self._merge_submodel_warnings(curve, key_attr="key")
179+
return contents
180+
150181
result = curve.retrieve(BaseClient(), scenario)
151182
self._merge_submodel_warnings(curve, key_attr="key")
152183
return result
@@ -193,17 +224,7 @@ def get_curves_by_carrier_type(
193224
Returns:
194225
Dictionary mapping curve names to DataFrames
195226
"""
196-
carrier_mapping = {
197-
"electricity": ["merit_order", "electricity_price", "residual_load"],
198-
"heat": [
199-
"heat_network",
200-
"agriculture_heat",
201-
"household_heat",
202-
"buildings_heat",
203-
],
204-
"hydrogen": ["hydrogen", "hydrogen_integral_cost"],
205-
"methane": ["network_gas"],
206-
}
227+
carrier_mapping = self._load_carrier_mappings()
207228

208229
if carrier_type not in carrier_mapping:
209230
valid_types = ", ".join(carrier_mapping.keys())

src/pyetm/models/scenario.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -288,12 +288,7 @@ def update_user_values(self, update_inputs: Dict[str, Any]) -> None:
288288
"""
289289
# Update them in the Inputs object, and check validation
290290
validity_errors = self.inputs.is_valid_update(update_inputs)
291-
if validity_errors:
292-
error_summary = []
293-
for key, warning_collector in validity_errors.items():
294-
warnings_list = [w.message for w in warning_collector]
295-
error_summary.append(f"{key}: {warnings_list}")
296-
raise ScenarioError(f"Could not update user values: {error_summary}")
291+
self._handle_validity_errors(validity_errors, "user values")
297292

298293
result = UpdateInputsRunner.run(BaseClient(), self, update_inputs)
299294

@@ -357,12 +352,7 @@ def update_sortables(self, update_sortables: Dict[str, List[Any]]) -> None:
357352
"""
358353
# Validate the updates first
359354
validity_errors = self.sortables.is_valid_update(update_sortables)
360-
if validity_errors:
361-
error_summary = []
362-
for key, warning_collector in validity_errors.items():
363-
warnings_list = [w.message for w in warning_collector]
364-
error_summary.append(f"{key}: {warnings_list}")
365-
raise ScenarioError(f"Could not update sortables: {error_summary}")
355+
self._handle_validity_errors(validity_errors, "sortables")
366356

367357
# Make individual API calls for each sortable as there is no bulk endpoint
368358
for name, order in update_sortables.items():
@@ -443,16 +433,9 @@ def update_custom_curves(self, custom_curves) -> None:
443433
Args:
444434
custom_curves: CustomCurves object containing curves to upload
445435
"""
446-
447436
# Validate curves before uploading
448437
validity_errors = custom_curves.validate_for_upload()
449-
# TODO: Extract all these validity_errors thingys to a single util or something, lots of repetition at the moment
450-
if validity_errors:
451-
error_summary = []
452-
for key, warning_collector in validity_errors.items():
453-
warnings_list = [w.message for w in warning_collector]
454-
error_summary.append(f"{key}: {warnings_list}")
455-
raise ScenarioError(f"Could not update custom curves: {error_summary}")
438+
self._handle_validity_errors(validity_errors, "custom curves")
456439

457440
# Upload curves
458441
result = UpdateCustomCurvesRunner.run(BaseClient(), self, custom_curves)
@@ -556,3 +539,19 @@ def show_all_warnings(self) -> None:
556539
if submodel is not None and len(submodel.warnings) > 0:
557540
print(f"\n{name} warnings:")
558541
submodel.show_warnings()
542+
543+
def _handle_validity_errors(
544+
self, validity_errors: Dict[str, Any], context: str
545+
) -> None:
546+
"""
547+
Helper method to format and raise ScenarioError for validity errors.
548+
"""
549+
if not validity_errors:
550+
return
551+
552+
error_summary = []
553+
for key, warning_collector in validity_errors.items():
554+
warnings_list = [w.message for w in warning_collector]
555+
error_summary.append(f"{key}: {warnings_list}")
556+
557+
raise ScenarioError(f"Could not update {context}: {error_summary}")

src/pyetm/models/scenario_packer.py

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -56,18 +56,41 @@ def extract_from_main_sheet(
5656
candidate_series = main_df.iloc[:, 0]
5757

5858
return ExportConfigResolver._parse_config_from_series(candidate_series)
59-
except Exception:
59+
except Exception as e:
60+
logger.exception("Error extracting from main sheet: %s", e)
6061
return None
6162

6263
@staticmethod
63-
def _parse_config_from_series(series: pd.Series) -> ExportConfig:
64+
def _parse_config_from_series(series: pd.Series) -> "ExportConfig":
6465
"""Parse ExportConfig from a pandas Series (column from main sheet)."""
65-
index_map = {str(idx).strip().lower(): idx for idx in series.index}
66+
67+
def _iter_rows():
68+
for label, value in zip(series.index, series.values):
69+
yield str(label).strip().lower(), value
70+
71+
def _value_after_output(name: str) -> Any:
72+
target = name.strip().lower()
73+
seen_output = False
74+
chosen: Any = None
75+
for lbl, val in _iter_rows():
76+
if lbl == "output":
77+
seen_output = True
78+
continue
79+
if seen_output and lbl == target:
80+
chosen = val
81+
return chosen
82+
83+
def _value_any(name: str) -> Any:
84+
target = name.strip().lower()
85+
chosen: Any = None
86+
for lbl, val in _iter_rows():
87+
if lbl == target:
88+
chosen = val
89+
return chosen
6690

6791
def get_cell_value(name: str) -> Any:
68-
key = name.strip().lower()
69-
original_key = index_map.get(key)
70-
return series.get(original_key) if original_key is not None else None
92+
val = _value_after_output(name)
93+
return val if val is not None else _value_any(name)
7194

7295
def parse_bool(value: Any) -> Optional[bool]:
7396
"""Parse boolean from various formats."""
@@ -88,26 +111,47 @@ def parse_bool(value: Any) -> Optional[bool]:
88111
return False
89112
return None
90113

114+
def parse_bool_field(*names: str) -> Optional[bool]:
115+
"""Return the first non-None boolean parsed from the provided field names."""
116+
for n in names:
117+
val = parse_bool(get_cell_value(n))
118+
if val is not None:
119+
return val
120+
return None
121+
91122
def parse_carriers(value: Any) -> Optional[List[str]]:
92123
"""Parse comma-separated carrier list."""
93124
if not isinstance(value, str) or not value.strip():
94125
return None
95126
return [carrier.strip() for carrier in value.split(",") if carrier.strip()]
96127

97-
carriers_raw = get_cell_value("exports") or get_cell_value("output_carriers")
128+
exports_val = get_cell_value("exports")
129+
carriers_val = get_cell_value("output_carriers")
98130

99-
return ExportConfig(
100-
include_inputs=parse_bool(get_cell_value("inputs")),
101-
include_sortables=parse_bool(get_cell_value("sortables")),
102-
include_custom_curves=parse_bool(get_cell_value("custom_curves")),
131+
exports_bool = parse_bool(exports_val)
132+
if exports_bool is True:
133+
output_carriers = ["electricity", "hydrogen", "heat", "methane"]
134+
elif exports_bool is False:
135+
output_carriers = None
136+
else:
137+
output_carriers = parse_carriers(carriers_val) or parse_carriers(
138+
exports_val
139+
)
140+
141+
config = ExportConfig(
142+
include_inputs=parse_bool_field("include_inputs", "inputs"),
143+
include_sortables=parse_bool_field("include_sortables", "sortables"),
144+
include_custom_curves=parse_bool_field(
145+
"include_custom_curves", "custom_curves"
146+
),
103147
include_gqueries=(
104-
parse_bool(get_cell_value("gquery_results"))
105-
or parse_bool(get_cell_value("gqueries"))
148+
parse_bool_field("include_gqueries", "gquery_results", "gqueries")
106149
),
107150
inputs_defaults=parse_bool(get_cell_value("defaults")),
108151
inputs_min_max=parse_bool(get_cell_value("min_max")),
109-
output_carriers=parse_carriers(carriers_raw),
152+
output_carriers=output_carriers,
110153
)
154+
return config
111155

112156

113157
class ScenarioPacker(BaseModel):

0 commit comments

Comments
 (0)