Skip to content

Commit 71c6d54

Browse files
committed
chore: minor stability improvements
1 parent a8654a3 commit 71c6d54

File tree

4 files changed

+9
-13
lines changed

4 files changed

+9
-13
lines changed

src/modalities/utils/benchmarking/benchmarking_utils.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,12 @@ def _is_experiment_done(config_file_path: Path, expected_steps: int, skip_except
6060
error_types = []
6161
for error_log_path in error_log_paths:
6262
with error_log_path.open("r", encoding="utf-8") as f:
63-
error_type = json.load(f)["error"]["type"]
63+
try:
64+
error_dict = json.load(f)
65+
error_type = error_dict["error"]["type"]
66+
except (json.JSONDecodeError, KeyError) as e:
67+
logger.warning(f"Failed to parse error log {error_log_path}: {e}")
68+
error_type = "ErrorFileParsingError"
6469
error_types.append(error_type)
6570
# Check if any of the error types are in the skip list
6671
if len(set(skip_exception_types).intersection(set(error_types))) > 0:

tutorials/scaling_up/configs/sweep_8B_fsdp2.yaml

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -173,15 +173,6 @@ fsdp_model:
173173
reduce_dtype: BF_16
174174
block_names: [GPT2Block]
175175

176-
compiled_model:
177-
component_key: model
178-
variant_key: compiled
179-
config:
180-
model:
181-
instance_key: model_raw
182-
pass_type: BY_REFERENCE
183-
block_names: [GPT2Block]
184-
185176
model_raw:
186177
component_key: model
187178
variant_key: gpt2

tutorials/scaling_up/scripts/create_sweep_configs.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,4 @@ set -eu
44
# this ensures that relative paths work correctly
55
cd "$(dirname "$0")" || exit 1
66

7-
modalities benchmark prepare_sweep_configs --sweep_config_path ../configs/sweep_config.yaml --output_dir ../experiments --world_sizes 2,4,8
7+
modalities benchmark prepare_sweep_configs --sweep_config_path ../configs/sweep_8B_fsdp2_compile.yaml --output_dir ../experiments --world_sizes 4,8,16,32,64,128,256,512,1024

tutorials/scaling_up/scripts/hpc/leonardo/submit_sweep.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,12 @@ cd "$(dirname "$0")" || exit 1
77

88

99
# --- Config ---
10-
EXPERIMENT_ROOT="/leonardo_scratch/fast/EUHPC_D21_101/max_lue/repositories/working/modalities/tutorials/scaling_up/experiments/2025-07-28__14-13-18_b7117a39"
10+
EXPERIMENT_ROOT="/leonardo_scratch/fast/EUHPC_D21_101/max_lue/repositories/working/modalities/tutorials/scaling_up/experiments/2025-07-30__21-50-11_fca0790e"
1111
EXPECTED_STEPS=20
1212
CONFIG_LIST_FILE="global_file_list.txt"
1313

1414
ACCOUNT=EUHPC_E05_119
15-
TIME_LIMIT=3:00:00
15+
TIME_LIMIT=03:00:00
1616
GPUS_PER_NODE=4
1717

1818
# Retrieve the list of configs to run

0 commit comments

Comments
 (0)