From 7479bedd67e8f442a4f3ae5e34e3c5113df1aa39 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Fri, 25 Jul 2025 17:03:05 -0700 Subject: [PATCH 1/7] Update script to call new oe-eval safety evals --- scripts/submit_eval_jobs.py | 172 +++++++++++++++++++++++------------- 1 file changed, 112 insertions(+), 60 deletions(-) diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index b23abeb3c..cb4df3e03 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -108,6 +108,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): parser.add_argument("--hf_upload_experiments", type=str, nargs="*", default=None, help="Upload given experiment to the Hugging Face model hub.") parser.add_argument("--run_oe_eval_experiments", action="store_true", help="Run the OE eval tool and experiments too.") parser.add_argument("--run_safety_evaluations", action="store_true", help="Run the OE safety evaluations too.") +parser.add_argument("--run_safety_evaluations_reasoning", action="store_true", help="Run the OE safety evaluations on a reasoning model too.") parser.add_argument("--skip_oi_evals", action="store_true", help="Don't run open instruct evals.") parser.add_argument("--oe_eval_max_length", type=int, default=4096, help="Max length for OE eval.") parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)") @@ -668,69 +669,120 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): print(f"Running OE eval with command: {oe_eval_cmd}") subprocess.Popen(oe_eval_cmd, shell=True) -# create an experiment that runs the safety eval tasks if args.run_safety_evaluations: - # just take the original spec we had, modify it for safety eval. - experiment_name = f"oi_safety_{model_name}" - experiment_name = experiment_name.replace('β', '').replace(r"{", "").replace(r"}", "") # hack: remove characters beaker doesn't like - d["description"] = experiment_name - # specific image for safety eval - d["tasks"][0]["image"]["beaker"] = "hamishivi/open-safety" - if args.use_alternate_safety_image: - d["tasks"][0]["image"]["beaker"] = args.use_alternate_safety_image - d["tasks"] = [d["tasks"][0]] - task_spec = d["tasks"][0] - task_spec["name"] = experiment_name - task_spec["arguments"][0] = ''' -VLLM_WORKER_MULTIPROC_METHOD=spawn PYTHONPATH=. python evaluation/run_all_generation_benchmarks.py \ - --model_name_or_path /model \ - --model_input_template_path_or_name hf \ - --report_output_path /output/metrics.json \ - --save_individual_results_path /output/all.json \ -''' - # some copied logic - if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub and delete mount `/model` - task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]} --hf_revision {args.hf_revision}")] - task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] - del task_spec['datasets'][1] - elif model_info[1].startswith("/"): # if it's a local model, load it from the local directory and delete mount `/model` - assert weka_available, "NFS / Weka is required for path-based models." # to be safe. 
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path "+model_info[1])]
-        del task_spec['datasets'][1]
-    else: # if it's a beaker model, mount the beaker dataset to `/model`
-        task_spec['datasets'][1]['source']['beaker'] = model_info[1]
+    # if so, run safety-fork through oe-eval. We assume oe-eval is cloned in the top-level repo directory.
+    oe_safety_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}"
+    if args.upload_to_hf:
+        oe_safety_cmd += f" --upload_to_hf {args.upload_to_hf}"
+    ## model location munging: if beaker, use beaker://. If hf, just name
+    if model_info[0].startswith("hf-"):
+        oe_safety_cmd += f" --model-location {model_info[1]}"
+    elif model_info[1].startswith("/"):
+        oe_safety_cmd += f" --model-location {model_info[1]}"
+    elif model_info[1].startswith("gs://"):
+        oe_safety_cmd += f" --model-location {model_info[1]}"
+    else:
+        oe_safety_cmd += f" --model-location beaker://{model_info[1]}"
+    if args.hf_revision:
+        oe_safety_cmd += f" --revision {args.hf_revision}"
+    if args.evaluate_on_weka:
+        oe_safety_cmd += " --evaluate_on_weka"
+    oe_safety_cmd += f" --tasks safety_eval"
+    if args.run_id:
+        oe_safety_cmd += f" --run-id {args.run_id}"
+    if args.step:
+        oe_safety_cmd += f" --step {args.step}"
+    # add string with number of gpus
+    num_gpus = task_spec['resources']['gpuCount']
+    # if num_gpus > 1, double it again for oe-eval configs
+    # open_instruct GPT adjustment wasn't quite enough
+    # adjusted here so the GPU configs in open-instruct eval are not impacted by the change
+    # tested reasonably extensively with 70B
+    if num_gpus > 1:
+        num_gpus *= 2
+    oe_safety_cmd += f" --num_gpus {num_gpus}"
+    if args.oe_eval_max_length:
+        oe_safety_cmd += f" --max-length {args.oe_eval_max_length}"
-    task_spec = adjust_gpus(
-        task_spec=task_spec,
-        experiment_group="safety_eval",
-        model_name=model_info[0],
-        gpu_multiplier=args.gpu_multiplier,
-    )
+    # add priority
+    oe_safety_cmd += f" --priority {args.priority}"
-    # add gpu information.
-    # we just assume you want to use all the gpus for one task at a time
-    if "70B" in model_info[0]:
-        task_spec['resources']['gpuCount'] = 8
-    num_gpus = task_spec['resources']['gpuCount']
-    task_spec["arguments"][0]+= f" --min_gpus_per_task {num_gpus}"
+    # Add stop sequences if provided
+    if args.oe_eval_stop_sequences:
+        oe_safety_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'"
+
+    # Add process output if provided
+    if args.process_output:
+        oe_safety_cmd += f" --process-output {args.process_output}"
+
+    # Add beaker image from existing argument
+    if args.beaker_image:
+        oe_safety_cmd += f" --beaker-image {args.beaker_image}"
+
+    # Add cluster parameter - use the existing cluster argument
+    # Join the list with commas since oe-eval.sh expects a comma-separated string
+    if args.cluster and len(args.cluster) > 0:
+        cluster_str = ",".join(args.cluster)
+        oe_safety_cmd += f" --cluster '{cluster_str}'"
+
+    print(f"Running OE safety eval with command: {oe_safety_cmd}")
+    subprocess.Popen(oe_safety_cmd, shell=True)
+
+if args.run_safety_evaluations_reasoning:
+    # if so, run safety-fork on reasoning tasks through oe-eval. We assume oe-eval is cloned in the top-level repo directory.
+ oe_safety_reasoning_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}" if args.upload_to_hf: - hf_dataset = args.upload_to_hf - # to match the way oe-eval script works. - # if we prepended hf- to the model name, remove it. - # if model_name.startswith("hf-"): - # model_name = model_name[3:] - # Above is no longer the case, oe-eval includes hf- again - task_spec['arguments'] = [task_spec['arguments'][0] + f" --upload_to_hf {hf_dataset} --hf_upload_name results/{model_name}"] - - d["tasks"] = [task_spec] - if not os.path.exists("configs/beaker_configs/auto_created"): - os.makedirs("configs/beaker_configs/auto_created") - fn = "configs/beaker_configs/auto_created/{}.yaml".format(experiment_name) - os.makedirs(os.path.dirname(fn), exist_ok=True) - with open(fn, "w") as file: - yaml.dump(d, file, default_flow_style=True) + oe_safety_reasoning_cmd += f" --upload_to_hf {args.upload_to_hf}" + ## model location munging: if beaker, use beaker://. If hf, just name + if model_info[0].startswith("hf-"): + oe_safety_reasoning_cmd += f" --model-location {model_info[1]}" + elif model_info[1].startswith("/"): + oe_safety_reasoning_cmd += f" --model-location {model_info[1]}" + elif model_info[1].startswith("gs://"): + oe_safety_reasoning_cmd += f" --model-location {model_info[1]}" + else: + oe_safety_reasoning_cmd += f" --model-location beaker://{model_info[1]}" + if args.hf_revision: + oe_safety_reasoning_cmd += f" --revision {args.hf_revision}" + if args.evaluate_on_weka: + oe_safety_reasoning_cmd += " --evaluate_on_weka" + oe_safety_reasoning_cmd += f" --tasks safety_eval_reasoning" + if args.run_id: + oe_safety_reasoning_cmd += f" --run-id {args.run_id}" + if args.step: + oe_safety_reasoning_cmd += f" --step {args.step}" + # add string with number of gpus + num_gpus = task_spec['resources']['gpuCount'] + # if num_gpus > 1, double it again for oe-eval configs + # open_instruct GPT adjustment wasn't quite enough + # adjusted here so the GPU configs in open-instruct eval are not impacted by the change + # tested reasonably extensively with 70B + if num_gpus > 1: + num_gpus *= 2 + oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" + if args.oe_eval_max_length: + oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" - cmd = "beaker experiment create {} --workspace ai2/{}".format(fn, workspace) - subprocess.Popen(cmd, shell=True) + # add priority + oe_safety_reasoning_cmd += f" --priority {args.priority}" + + # Add stop sequences if provided + if args.oe_eval_stop_sequences: + oe_safety_reasoning_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'" + + # Add process output if provided + if args.process_output: + oe_safety_reasoning_cmd += f" --process-output {args.process_output}" + + # Add beaker image from existing argument + if args.beaker_image: + oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" + + # Add cluster parameter - use the existing cluster argument + # Join the list with commas since oe-eval.sh expects a comma-separated string + if args.cluster and len(args.cluster) > 0: + cluster_str = ",".join(args.cluster) + oe_safety_reasoning_cmd += f" --cluster '{cluster_str}'" + + print(f"Running OE safety eval with command: {oe_safety_reasoning_cmd}") + subprocess.Popen(oe_safety_reasoning_cmd, shell=True) \ No newline at end of file From 50ab0decf2ba7906352b6af34752fe30da325de9 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Mon, 28 Jul 2025 16:35:01 -0700 Subject: [PATCH 2/7] Add num gpu constraints --- scripts/submit_eval_jobs.py | 57 
+++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index cb4df3e03..464d1ef2f 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -692,17 +692,25 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_cmd += f" --run-id {args.run_id}" if args.step: oe_safety_cmd += f" --step {args.step}" - # add string with number of gpus - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 + + # pull num gpu logic from old safety eval code + task_spec = adjust_gpus( + task_spec=task_spec, + experiment_group="safety_eval", + model_name=model_info[0], + gpu_multiplier=args.gpu_multiplier, + ) + # add gpu information. + # we just assume you want to use all the gpus for one task at a time + if "70B" in model_info[0]: + task_spec['resources']['gpuCount'] = 8 + num_gpus = task_spec['resources']['gpuCount'] + oe_safety_cmd += f" --num_gpus {num_gpus}" - if args.oe_eval_max_length: - oe_safety_cmd += f" --max-length {args.oe_eval_max_length}" + + # controlled by config file + # if args.oe_eval_max_length: + # oe_safety_cmd += f" --max-length {args.oe_eval_max_length}" # add priority oe_safety_cmd += f" --priority {args.priority}" @@ -751,17 +759,24 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_reasoning_cmd += f" --run-id {args.run_id}" if args.step: oe_safety_reasoning_cmd += f" --step {args.step}" - # add string with number of gpus - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 - oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" - if args.oe_eval_max_length: - oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" + + # pull num gpu logic from old safety eval code + task_spec = adjust_gpus( + task_spec=task_spec, + experiment_group="safety_eval", + model_name=model_info[0], + gpu_multiplier=args.gpu_multiplier, + ) + # add gpu information. 
+ # we just assume you want to use all the gpus for one task at a time + if "70B" in model_info[0]: + task_spec['resources']['gpuCount'] = 8 + num_gpus = task_spec['resources']['gpuCount'] + + oe_safety_cmd += f" --num_gpus {num_gpus}" + # controlled by config file + # if args.oe_eval_max_length: + # oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" # add priority oe_safety_reasoning_cmd += f" --priority {args.priority}" From 0bc92474d5f93cb08a7452b8733a044be901c7b7 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Thu, 31 Jul 2025 14:00:15 -0700 Subject: [PATCH 3/7] Add handling for alternative safety beaker image --- scripts/submit_eval_jobs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 464d1ef2f..347b0b951 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -724,7 +724,9 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_cmd += f" --process-output {args.process_output}" # Add beaker image from existing argument - if args.beaker_image: + if args.use_alternate_safety_image: + oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}" + elif args.beaker_image: oe_safety_cmd += f" --beaker-image {args.beaker_image}" # Add cluster parameter - use the existing cluster argument @@ -790,6 +792,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_reasoning_cmd += f" --process-output {args.process_output}" # Add beaker image from existing argument + if args.use_alternate_safety_image: + oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}" if args.beaker_image: oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" From 460a874c2c955f50a34625ff9f7120fc98e14bb8 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Fri, 1 Aug 2025 13:32:01 -0700 Subject: [PATCH 4/7] typos in script, add hf key to gantry args --- scripts/eval/oe-eval.sh | 2 +- scripts/submit_eval_jobs.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index 5d5e7cd15..73e7aabff 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -277,7 +277,7 @@ for TASK in "${TASKS[@]}"; do --task-args "{ \"generation_kwargs\": { \"max_gen_toks\": ${MAX_LENGTH}, \"truncate_context\": false${STOP_SEQUENCES_JSON} } }" \ ${HF_UPLOAD_ARG} \ --gpus "$GPU_COUNT" \ - --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key", "weka": "oe-adapt-default:/weka/oe-adapt-default", "env#132":"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "env-secret#42": "AZURE_EVAL_API_KEY=azure_eval_api_key"}' \ + --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key", "weka": "oe-adapt-default:/weka/oe-adapt-default", "env#132":"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "env-secret#42": "AZURE_EVAL_API_KEY=azure_eval_api_key", "env-secret#2":"HF_TOKEN=HF_TOKEN"}' \ ${REVISION_ARG} \ --cluster "$CLUSTER" \ --beaker-retries 2 \ diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 347b0b951..0c4ab6ac6 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -775,7 +775,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): task_spec['resources']['gpuCount'] = 8 num_gpus = task_spec['resources']['gpuCount'] - oe_safety_cmd += f" --num_gpus {num_gpus}" + oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" # controlled by config file # if args.oe_eval_max_length: # oe_safety_reasoning_cmd += f" 
--max-length {args.oe_eval_max_length}" @@ -793,8 +793,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): # Add beaker image from existing argument if args.use_alternate_safety_image: - oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}" - if args.beaker_image: + oe_safety_reasoning_cmd += f" --beaker-image {args.use_alternate_safety_image}" + elif args.beaker_image: oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" # Add cluster parameter - use the existing cluster argument From 995028b137dbc77ea6f10f24839fef67b1b57980 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Thu, 7 Aug 2025 09:56:14 -0700 Subject: [PATCH 5/7] move safety eval call into a task suite --- docs/safety-eval/safety.md | 20 +- docs/safety.md | 21 ++- scripts/eval/oe-eval.sh | 18 +- scripts/submit_eval_jobs.py | 175 +++--------------- ...ataset_model_upload_then_evaluate_model.py | 24 ++- 5 files changed, 99 insertions(+), 159 deletions(-) diff --git a/docs/safety-eval/safety.md b/docs/safety-eval/safety.md index f2be4e5e7..b9ee953da 100644 --- a/docs/safety-eval/safety.md +++ b/docs/safety-eval/safety.md @@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b ## Running at Ai2 -This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork). +This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). -An example command would be: +An example command on a reasoning model would be: ```bash python scripts/submit_eval_jobs.py \ --model_name \ @@ -17,10 +17,22 @@ python scripts/submit_eval_jobs.py \ --beaker_image nathanl/open_instruct_auto \ --upload_to_hf allenai/tulu-3-evals \ --run_oe_eval_experiments \ - --run_safety_evaluations + --oe_eval_task_suite "SAFETY_EVAL_REASONING" ``` -Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`. +An example command on a non-reasoning model would be: +```bash +python scripts/submit_eval_jobs.py \ + --model_name \ + --location \ + --is_tuned --workspace tulu-3-results \ + --preemptible \ + --use_hf_tokenizer_template \ + --beaker_image nathanl/open_instruct_auto \ + --upload_to_hf allenai/tulu-3-evals \ + --run_oe_eval_experiments \ + --oe_eval_task_suite "SAFETY_EVAL" +``` ## Running on an interactive session diff --git a/docs/safety.md b/docs/safety.md index 4f028559e..ff8a8a496 100644 --- a/docs/safety.md +++ b/docs/safety.md @@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b ## Running at Ai2 -This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). 
This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork). +This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). -An example command would be: +An example command on a reasoning model would be: ```bash python scripts/submit_eval_jobs.py \ --model_name \ @@ -17,10 +17,23 @@ python scripts/submit_eval_jobs.py \ --beaker_image nathanl/open_instruct_auto \ --upload_to_hf allenai/tulu-3-evals \ --run_oe_eval_experiments \ - --run_safety_evaluations + --oe_eval_task_suite "SAFETY_EVAL_REASONING" +``` + +An example command on a non-reasoning model would be: +```bash +python scripts/submit_eval_jobs.py \ + --model_name \ + --location \ + --is_tuned --workspace tulu-3-results \ + --preemptible \ + --use_hf_tokenizer_template \ + --beaker_image nathanl/open_instruct_auto \ + --upload_to_hf allenai/tulu-3-evals \ + --run_oe_eval_experiments \ + --oe_eval_task_suite "SAFETY_EVAL" ``` -Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`. ## Running on an interactive session diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index 73e7aabff..ba0383b1c 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -48,7 +48,7 @@ set -ex # Function to print usage usage() { echo "Usage: $0 --model-name MODEL_NAME --model-location MODEL_LOCATION [--num_gpus GPUS] [--upload_to_hf] [--revision REVISION] [--max-length ] [--task-suite TASK_SUITE] [--priority priority] [--tasks TASKS] [--evaluate_on_weka] [--stop-sequences ] [--beaker-image ] [--cluster ] [--process-output ]" - echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)" + echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)" echo "TASKS should be a comma-separated list of task specifications (e.g., 'gsm8k::tulu,bbh:cot::tulu')" echo "STOP_SEQUENCES should be a comma-separated list of strings to stop generation at (e.g., ',\\n\\n')" echo "PROCESS_OUTPUT should be a string specifying how to process the model output (e.g., 'r1_style')" @@ -212,6 +212,14 @@ NEXT_MODEL_UNSEEN=( "ifbench::tulu" ) +SAFETY_EVAL=( + "safety_eval" +) + +SAFETY_EVAL_REASONING=( + "safety_eval_reasoning" +) + # If custom tasks provided, convert comma-separated string to array if [[ -n "$CUSTOM_TASKS" ]]; then IFS=',' read -ra TASKS <<< "$CUSTOM_TASKS" @@ -230,6 +238,12 @@ else TULU_3_UNSEEN) TASKS=("${TULU_3_UNSEEN[@]}") ;; + SAFETY_EVAL) + TASKS=("${SAFETY_EVAL[@]}") + ;; + SAFETY_EVAL_REASONING) + TASKS=("${SAFETY_EVAL_REASONING[@]}") + ;; *) echo "Error: Unknown task suite '$TASK_SUITE'" usage @@ -284,7 +298,7 @@ for TASK in "${TASKS[@]}"; do --beaker-image "$BEAKER_IMAGE" \ --beaker-priority "$PRIORITY" \ --push-datalake \ - --datalake-tags "$DATALAKE_ARGS" + --datalake-tags "$DATALAKE_ARGS" else python oe-eval-internal/oe_eval/launch.py \ --model "$MODEL_NAME" \ diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 0c4ab6ac6..15b752f6d 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -107,12 +107,9 @@ def 
adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): parser.add_argument("--upload_to_hf", type=str, default=None, help="If given, upload the eval results to the Hugging Face model hub. Provide the HF dataset and path in form //.") parser.add_argument("--hf_upload_experiments", type=str, nargs="*", default=None, help="Upload given experiment to the Hugging Face model hub.") parser.add_argument("--run_oe_eval_experiments", action="store_true", help="Run the OE eval tool and experiments too.") -parser.add_argument("--run_safety_evaluations", action="store_true", help="Run the OE safety evaluations too.") -parser.add_argument("--run_safety_evaluations_reasoning", action="store_true", help="Run the OE safety evaluations on a reasoning model too.") parser.add_argument("--skip_oi_evals", action="store_true", help="Don't run open instruct evals.") parser.add_argument("--oe_eval_max_length", type=int, default=4096, help="Max length for OE eval.") -parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)") -parser.add_argument("--use_alternate_safety_image", type=str, default=None, help="Use a different image for safety eval.") +parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)") parser.add_argument("--evaluate_on_weka", action="store_true", help="Evaluate OE eval on Beaker.") # NOTE: evaluate on weka is expected to be on by default. If not, the evals will run on the google augusta cluster. # TODO: fix this logic at a future date @@ -631,14 +628,33 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_eval_cmd += f" --run-id {args.run_id}" if args.step: oe_eval_cmd += f" --step {args.step}" + # add string with number of gpus - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 + if args.oe_eval_task_suite == 'SAFETY_EVAL' or args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': + # pull num gpu logic from old safety eval code + task_spec = adjust_gpus( + task_spec=task_spec, + experiment_group="safety_eval", + model_name=model_info[0], + gpu_multiplier=args.gpu_multiplier, + ) + # add gpu information. 
+ # we just assume you want to use all the gpus for one task at a time + if "70B" in model_info[0]: + task_spec['resources']['gpuCount'] = 8 + num_gpus = task_spec['resources']['gpuCount'] + # double GPUs for reasoning models + if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': + num_gpus *= 2 + else: + num_gpus = task_spec['resources']['gpuCount'] + # if num_gpus > 1, double it again for oe-eval configs + # open_instruct GPT adjustment wasn't quite enough + # adjusted here so the GPU configs in open-instruct eval are not impacted by the change + # tested reasonably extensively with 70B + if num_gpus > 1: + num_gpus *= 2 + oe_eval_cmd += f" --num_gpus {num_gpus}" if args.oe_eval_max_length: oe_eval_cmd += f" --max-length {args.oe_eval_max_length}" @@ -668,140 +684,3 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): print(f"Running OE eval with command: {oe_eval_cmd}") subprocess.Popen(oe_eval_cmd, shell=True) - -if args.run_safety_evaluations: - # if so, run safety-fork through oe-eval. We assume oe-eval is cloned in the top-level repo directory. - oe_safety_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}" - if args.upload_to_hf: - oe_safety_cmd += f" --upload_to_hf {args.upload_to_hf}" - ## model location munging: if beaker, use beaker://. If hf, just name - if model_info[0].startswith("hf-"): - oe_safety_cmd += f" --model-location {model_info[1]}" - elif model_info[1].startswith("/"): - oe_safety_cmd += f" --model-location {model_info[1]}" - elif model_info[1].startswith("gs://"): - oe_safety_cmd += f" --model-location {model_info[1]}" - else: - oe_safety_cmd += f" --model-location beaker://{model_info[1]}" - if args.hf_revision: - oe_safety_cmd += f" --revision {args.hf_revision}" - if args.evaluate_on_weka: - oe_safety_cmd += " --evaluate_on_weka" - oe_safety_cmd += f" --tasks safety_eval" - if args.run_id: - oe_safety_cmd += f" --run-id {args.run_id}" - if args.step: - oe_safety_cmd += f" --step {args.step}" - - # pull num gpu logic from old safety eval code - task_spec = adjust_gpus( - task_spec=task_spec, - experiment_group="safety_eval", - model_name=model_info[0], - gpu_multiplier=args.gpu_multiplier, - ) - # add gpu information. 
-    # we just assume you want to use all the gpus for one task at a time
-    if "70B" in model_info[0]:
-        task_spec['resources']['gpuCount'] = 8
-    num_gpus = task_spec['resources']['gpuCount']
-
-    oe_safety_cmd += f" --num_gpus {num_gpus}"
-
-    # controlled by config file
-    # if args.oe_eval_max_length:
-    #     oe_safety_cmd += f" --max-length {args.oe_eval_max_length}"
-
-    # add priority
-    oe_safety_cmd += f" --priority {args.priority}"
-
-    # Add stop sequences if provided
-    if args.oe_eval_stop_sequences:
-        oe_safety_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'"
-
-    # Add process output if provided
-    if args.process_output:
-        oe_safety_cmd += f" --process-output {args.process_output}"
-
-    # Add beaker image from existing argument
-    if args.use_alternate_safety_image:
-        oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}"
-    elif args.beaker_image:
-        oe_safety_cmd += f" --beaker-image {args.beaker_image}"
-
-    # Add cluster parameter - use the existing cluster argument
-    # Join the list with commas since oe-eval.sh expects a comma-separated string
-    if args.cluster and len(args.cluster) > 0:
-        cluster_str = ",".join(args.cluster)
-        oe_safety_cmd += f" --cluster '{cluster_str}'"
-
-    print(f"Running OE safety eval with command: {oe_safety_cmd}")
-    subprocess.Popen(oe_safety_cmd, shell=True)
-
-if args.run_safety_evaluations_reasoning:
-    # if so, run safety-fork on reasoning tasks through oe-eval. We assume oe-eval is cloned in the top-level repo directory.
-    oe_safety_reasoning_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}"
-    if args.upload_to_hf:
-        oe_safety_reasoning_cmd += f" --upload_to_hf {args.upload_to_hf}"
-    ## model location munging: if beaker, use beaker://. If hf, just name
-    if model_info[0].startswith("hf-"):
-        oe_safety_reasoning_cmd += f" --model-location {model_info[1]}"
-    elif model_info[1].startswith("/"):
-        oe_safety_reasoning_cmd += f" --model-location {model_info[1]}"
-    elif model_info[1].startswith("gs://"):
-        oe_safety_reasoning_cmd += f" --model-location {model_info[1]}"
-    else:
-        oe_safety_reasoning_cmd += f" --model-location beaker://{model_info[1]}"
-    if args.hf_revision:
-        oe_safety_reasoning_cmd += f" --revision {args.hf_revision}"
-    if args.evaluate_on_weka:
-        oe_safety_reasoning_cmd += " --evaluate_on_weka"
-    oe_safety_reasoning_cmd += f" --tasks safety_eval_reasoning"
-    if args.run_id:
-        oe_safety_reasoning_cmd += f" --run-id {args.run_id}"
-    if args.step:
-        oe_safety_reasoning_cmd += f" --step {args.step}"
-
-    # pull num gpu logic from old safety eval code
-    task_spec = adjust_gpus(
-        task_spec=task_spec,
-        experiment_group="safety_eval",
-        model_name=model_info[0],
-        gpu_multiplier=args.gpu_multiplier,
-    )
-    # add gpu information.
- # we just assume you want to use all the gpus for one task at a time - if "70B" in model_info[0]: - task_spec['resources']['gpuCount'] = 8 - num_gpus = task_spec['resources']['gpuCount'] - - oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" - # controlled by config file - # if args.oe_eval_max_length: - # oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" - - # add priority - oe_safety_reasoning_cmd += f" --priority {args.priority}" - - # Add stop sequences if provided - if args.oe_eval_stop_sequences: - oe_safety_reasoning_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'" - - # Add process output if provided - if args.process_output: - oe_safety_reasoning_cmd += f" --process-output {args.process_output}" - - # Add beaker image from existing argument - if args.use_alternate_safety_image: - oe_safety_reasoning_cmd += f" --beaker-image {args.use_alternate_safety_image}" - elif args.beaker_image: - oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" - - # Add cluster parameter - use the existing cluster argument - # Join the list with commas since oe-eval.sh expects a comma-separated string - if args.cluster and len(args.cluster) > 0: - cluster_str = ",".join(args.cluster) - oe_safety_reasoning_cmd += f" --cluster '{cluster_str}'" - - print(f"Running OE safety eval with command: {oe_safety_reasoning_cmd}") - subprocess.Popen(oe_safety_reasoning_cmd, shell=True) \ No newline at end of file diff --git a/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py b/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py index c262dfc37..5bcdda2aa 100644 --- a/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py +++ b/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py @@ -47,7 +47,6 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig): --use_hf_tokenizer_template \ --beaker_image nathanl/open_instruct_auto \ --skip_oi_evals \ - --run_safety_evaluations \ --run_oe_eval_experiments \ --upload_to_hf {args.upload_to_hf}""" if args.run_id: @@ -60,6 +59,29 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig): print(f"Beaker evaluation jobs: Stderr:\n{stderr.decode()}") print(f"Beaker evaluation jobs: process return code: {process.returncode}") + safety_command = f""" + python scripts/submit_eval_jobs.py \ + --model_name {args.model_name} \ + --location {beaker_dataset_ids[-1]} \ + --is_tuned \ + --workspace tulu-3-results \ + --preemptible \ + --use_hf_tokenizer_template \ + --beaker_image nathanl/open_instruct_auto \ + --skip_oi_evals \ + --run_oe_eval_experiments \ + --oe_eval_task_suite "SAFETY_EVAL" \ + --upload_to_hf {args.upload_to_hf}""" + if args.run_id: + safety_command += f" --run_id {args.run_id}" + + safety_process = subprocess.Popen(["bash", "-c", safety_command], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + safety_stdout, safety_stderr = safety_process.communicate() + + print(f"Beaker safety evaluation jobs: Stdout:\n{safety_stdout.decode()}") + print(f"Beaker safety evaluation jobs: Stderr:\n{safety_stderr.decode()}") + print(f"Beaker safety evaluation jobs: process return code: {safety_process.returncode}") + return time.sleep(args.check_interval_seconds) From 8a1b5df333309d7d984110f8683d0a41ddef163a Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Thu, 7 Aug 2025 13:41:18 -0700 Subject: [PATCH 6/7] update num_gpu calculation --- scripts/submit_eval_jobs.py | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git 
a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 7e8211ffc..d001d5c8f 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -633,32 +633,18 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_eval_cmd += f" --step {args.step}" # add string with number of gpus - if args.oe_eval_task_suite == 'SAFETY_EVAL' or args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': - # pull num gpu logic from old safety eval code - task_spec = adjust_gpus( - task_spec=task_spec, - experiment_group="safety_eval", - model_name=model_info[0], - gpu_multiplier=args.gpu_multiplier, - ) - # add gpu information. - # we just assume you want to use all the gpus for one task at a time - if "70B" in model_info[0]: - task_spec['resources']['gpuCount'] = 8 - num_gpus = task_spec['resources']['gpuCount'] - # double GPUs for reasoning models - if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': - num_gpus *= 2 - else: - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 - + num_gpus = task_spec['resources']['gpuCount'] + # if num_gpus > 1, double it again for oe-eval configs + # open_instruct GPT adjustment wasn't quite enough + # adjusted here so the GPU configs in open-instruct eval are not impacted by the change + # tested reasonably extensively with 70B + if num_gpus > 1: + num_gpus *= 2 + # double GPUs for reasoning models + if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': + num_gpus *= 2 oe_eval_cmd += f" --num_gpus {num_gpus}" + if args.oe_eval_max_length: oe_eval_cmd += f" --max-length {args.oe_eval_max_length}" # Add task suite parameter From 15f25dcc86636688b8c3593728af48bd379c2d1d Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Fri, 8 Aug 2025 09:28:14 -0700 Subject: [PATCH 7/7] typo --- scripts/eval/oe-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index 6df2996dc..e8f434372 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -319,7 +319,7 @@ for TASK in "${TASKS[@]}"; do # NOTE: For gantry args here and below, random numbers like #42 are added to the env variables because they need to be unique names. The numbers are ignored. 
# Build gantry args if [ "$EVALUATE_ON_WEKA" == "true" ]; then - GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}, \"env-secret#2":"HF_TOKEN=HF_TOKEN\"}" + GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}, \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\"}" else GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"env-secret#43\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\", \"env\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\", \"mount\": \"/mnt/filestore_1:/filestore\", \"env#111\": \"HF_HOME=/filestore/.cache/huggingface\", \"env#112\": \"HF_DATASETS_CACHE=/filestore/.cache/huggingface\", \"env#113\": \"HF_HUB_CACHE=/filestore/.cache/hub\"${MAX_TOKENS_ARG}}" fi
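
The escaping fixed in the final patch is easy to break again, since `GANTRY_ARGS` is a JSON object assembled inside a double-quoted bash string. A minimal sketch of a pre-launch sanity check follows; `MAX_TOKENS_ARG` here is a placeholder standing in for whatever the surrounding script computed, not the actual value from `oe-eval.sh`.

```bash
# Minimal sketch (assumes python3 on PATH): parse the assembled gantry-args
# string as JSON before submitting anything to Beaker.
MAX_TOKENS_ARG=", \"env#200\":\"MAX_TOKENS=8192\""   # placeholder, not from oe-eval.sh
GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\"${MAX_TOKENS_ARG}}"
# json.tool exits non-zero on a mis-escaped quote or backslash, so the error
# surfaces locally instead of inside a failed launch.
echo "$GANTRY_ARGS" | python3 -m json.tool > /dev/null && echo "gantry args are valid JSON"
```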