20 changes: 16 additions & 4 deletions docs/safety-eval/safety.md
@@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b

## Running at Ai2

This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork).
This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set).

An example command would be:
An example command for a reasoning model would be:
```bash
python scripts/submit_eval_jobs.py \
--model_name <model name> \
@@ -17,10 +17,22 @@ python scripts/submit_eval_jobs.py \
--beaker_image nathanl/open_instruct_auto \
--upload_to_hf allenai/tulu-3-evals \
--run_oe_eval_experiments \
--run_safety_evaluations
--oe_eval_task_suite "SAFETY_EVAL_REASONING"
```

Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`.
An example command for a non-reasoning model would be:
```bash
python scripts/submit_eval_jobs.py \
--model_name <model name> \
--location <beaker id> \
--is_tuned --workspace tulu-3-results \
--preemptible \
--use_hf_tokenizer_template \
--beaker_image nathanl/open_instruct_auto \
--upload_to_hf allenai/tulu-3-evals \
--run_oe_eval_experiments \
--oe_eval_task_suite "SAFETY_EVAL"
```
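
The same suites are defined in `scripts/eval/oe-eval.sh`, which `submit_eval_jobs.py` drives under the hood. If you prefer to invoke that launcher directly, a minimal sketch (flag names taken from the script's usage message; the model values are placeholders) would be:
```bash
# Sketch only: run the new safety suite through the oe-eval launcher directly.
bash scripts/eval/oe-eval.sh \
    --model-name <model name> \
    --model-location <beaker id or weka path> \
    --task-suite SAFETY_EVAL
```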

## Running on an interactive session

21 changes: 17 additions & 4 deletions docs/safety.md
@@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b

## Running at Ai2

This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork).
This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set).

An example command would be:
An example command for a reasoning model would be:
```bash
python scripts/submit_eval_jobs.py \
--model_name <model name> \
@@ -17,10 +17,23 @@ python scripts/submit_eval_jobs.py \
--beaker_image nathanl/open_instruct_auto \
--upload_to_hf allenai/tulu-3-evals \
--run_oe_eval_experiments \
--run_safety_evaluations
--oe_eval_task_suite "SAFETY_EVAL_REASONING"
```

An example command for a non-reasoning model would be:
```bash
python scripts/submit_eval_jobs.py \
--model_name <model name> \
--location <beaker id> \
--is_tuned --workspace tulu-3-results \
--preemptible \
--use_hf_tokenizer_template \
--beaker_image nathanl/open_instruct_auto \
--upload_to_hf allenai/tulu-3-evals \
--run_oe_eval_experiments \
--oe_eval_task_suite "SAFETY_EVAL"
```
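
The suites compose with the other oe-eval options that `submit_eval_jobs.py` exposes; reasoning models in particular may need a larger generation budget than the 4096-token default. A sketch (the `--oe_eval_max_length` value is illustrative, not a tuned recommendation):
```bash
python scripts/submit_eval_jobs.py \
    --model_name <model name> \
    --location <beaker id> \
    --is_tuned --workspace tulu-3-results \
    --preemptible \
    --use_hf_tokenizer_template \
    --beaker_image nathanl/open_instruct_auto \
    --upload_to_hf allenai/tulu-3-evals \
    --run_oe_eval_experiments \
    --oe_eval_task_suite "SAFETY_EVAL_REASONING" \
    --oe_eval_max_length 16384
```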

Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`.

## Running on an interactive session

20 changes: 17 additions & 3 deletions scripts/eval/oe-eval.sh
@@ -48,7 +48,7 @@ set -ex
# Function to print usage
usage() {
echo "Usage: $0 --model-name MODEL_NAME --model-location MODEL_LOCATION [--num_gpus GPUS] [--upload_to_hf] [--revision REVISION] [--max-length <max_length>] [--task-suite TASK_SUITE] [--priority priority] [--tasks TASKS] [--evaluate_on_weka] [--stop-sequences <comma_separated_stops>] [--beaker-image <beaker_image>] [--cluster <clusters>] [--process-output <process_output>]"
echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)"
echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)"
echo "TASKS should be a comma-separated list of task specifications (e.g., 'gsm8k::tulu,bbh:cot::tulu')"
echo "STOP_SEQUENCES should be a comma-separated list of strings to stop generation at (e.g., '</answer>,\\n\\n')"
echo "PROCESS_OUTPUT should be a string specifying how to process the model output (e.g., 'r1_style')"
@@ -215,6 +215,14 @@ NEXT_MODEL_UNSEEN=(
"ifbench::tulu"
)

SAFETY_EVAL=(
"safety_eval"
)

SAFETY_EVAL_REASONING=(
"safety_eval_reasoning"
)

# If custom tasks provided, convert comma-separated string to array
if [[ -n "$CUSTOM_TASKS" ]]; then
IFS=',' read -ra TASKS <<< "$CUSTOM_TASKS"
@@ -233,6 +241,12 @@ else
TULU_3_UNSEEN)
TASKS=("${TULU_3_UNSEEN[@]}")
;;
SAFETY_EVAL)
TASKS=("${SAFETY_EVAL[@]}")
;;
SAFETY_EVAL_REASONING)
TASKS=("${SAFETY_EVAL_REASONING[@]}")
;;
*)
echo "Error: Unknown task suite '$TASK_SUITE'"
usage
@@ -305,7 +319,7 @@ for TASK in "${TASKS[@]}"; do
# NOTE: For gantry args here and below, random numbers like #42 are added to the env variables because they need to be unique names. The numbers are ignored.
# Build gantry args
if [ "$EVALUATE_ON_WEKA" == "true" ]; then
GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}}"
GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}, \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\"}"
else
GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"env-secret#43\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\", \"env\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\", \"mount\": \"/mnt/filestore_1:/filestore\", \"env#111\": \"HF_HOME=/filestore/.cache/huggingface\", \"env#112\": \"HF_DATASETS_CACHE=/filestore/.cache/huggingface\", \"env#113\": \"HF_HUB_CACHE=/filestore/.cache/hub\"${MAX_TOKENS_ARG}}"
fi
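# For readability: with the shell escaping removed, the Weka-branch GANTRY_ARGS above is roughly
# the JSON below (a sketch; when ${MAX_TOKENS_ARG} is non-empty it inserts one more entry before
# the HF_TOKEN secret):
#   {"env-secret": "OPENAI_API_KEY=openai_api_key",
#    "weka": "oe-adapt-default:/weka/oe-adapt-default",
#    "weka#44": "oe-training-default:/weka/oe-training-default",
#    "env#132": "VLLM_ALLOW_LONG_MAX_MODEL_LEN=1",
#    "env-secret#42": "AZURE_EVAL_API_KEY=azure_eval_api_key",
#    "env-secret#2": "HF_TOKEN=HF_TOKEN"}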
@@ -330,7 +344,7 @@ for TASK in "${TASKS[@]}"; do
--beaker-image "$BEAKER_IMAGE" \
--beaker-priority "$PRIORITY" \
--push-datalake \
--datalake-tags "$DATALAKE_ARGS"
--datalake-tags "$DATALAKE_ARGS"
else
python oe-eval-internal/oe_eval/launch.py \
--model "$MODEL_NAME" \
76 changes: 6 additions & 70 deletions scripts/submit_eval_jobs.py
@@ -107,11 +107,9 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
parser.add_argument("--upload_to_hf", type=str, default=None, help="If given, upload the eval results to the Hugging Face model hub. Provide the HF dataset and path in form <hf dataset>//<hf path>.")
parser.add_argument("--hf_upload_experiments", type=str, nargs="*", default=None, help="Upload given experiment to the Hugging Face model hub.")
parser.add_argument("--run_oe_eval_experiments", action="store_true", help="Run the OE eval tool and experiments too.")
parser.add_argument("--run_safety_evaluations", action="store_true", help="Run the OE safety evaluations too.")
parser.add_argument("--skip_oi_evals", action="store_true", help="Don't run open instruct evals.")
parser.add_argument("--oe_eval_max_length", type=int, default=4096, help="Max length for OE eval.")
parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)")
parser.add_argument("--use_alternate_safety_image", type=str, default=None, help="Use a different image for safety eval.")
parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)")
parser.add_argument("--evaluate_on_weka", action="store_true", help="Evaluate OE eval on Beaker.")
# NOTE: evaluate on weka is expected to be on by default. If not, the evals will run on the google augusta cluster.
# TODO: fix this logic at a future date
@@ -633,6 +631,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
oe_eval_cmd += f" --run-id {args.run_id}"
if args.step:
oe_eval_cmd += f" --step {args.step}"

# add string with number of gpus
num_gpus = task_spec['resources']['gpuCount']
# if num_gpus > 1, double it again for oe-eval configs
@@ -641,7 +640,11 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
# tested reasonably extensively with 70B
if num_gpus > 1:
num_gpus *= 2
# double GPUs for reasoning models
if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING':
num_gpus *= 2
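# note: this stacks with the >1 doubling above, so a multi-GPU SAFETY_EVAL_REASONING job requests 4x the original gpuCount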
oe_eval_cmd += f" --num_gpus {num_gpus}"

if args.oe_eval_max_length:
oe_eval_cmd += f" --max-length {args.oe_eval_max_length}"
# Add task suite parameter
@@ -670,70 +673,3 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):

print(f"Running OE eval with command: {oe_eval_cmd}")
subprocess.Popen(oe_eval_cmd, shell=True)

# create an experiment that runs the safety eval tasks
if args.run_safety_evaluations:
# just take the original spec we had, modify it for safety eval.
experiment_name = f"oi_safety_{model_name}"
experiment_name = experiment_name.replace('β', '').replace(r"{", "").replace(r"}", "") # hack: remove characters beaker doesn't like
d["description"] = experiment_name
# specific image for safety eval
d["tasks"][0]["image"]["beaker"] = "hamishivi/open-safety"
if args.use_alternate_safety_image:
d["tasks"][0]["image"]["beaker"] = args.use_alternate_safety_image
d["tasks"] = [d["tasks"][0]]
task_spec = d["tasks"][0]
task_spec["name"] = experiment_name
task_spec["arguments"][0] = '''
VLLM_WORKER_MULTIPROC_METHOD=spawn PYTHONPATH=. python evaluation/run_all_generation_benchmarks.py \
--model_name_or_path /model \
--model_input_template_path_or_name hf \
--report_output_path /output/metrics.json \
--save_individual_results_path /output/all.json \
'''
# some copied logic
if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub and delete mount `/model`
task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]} --hf_revision {args.hf_revision}")]
task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")]
del task_spec['datasets'][1]
elif model_info[1].startswith("/"): # if it's a local model, load it from the local directory and delete mount `/model`
assert weka_available, "NFS / Weka is required for path-based models." # to be safe.
task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path "+model_info[1])]
del task_spec['datasets'][1]
else: # if it's a beaker model, mount the beaker dataset to `/model`
task_spec['datasets'][1]['source']['beaker'] = model_info[1]

task_spec = adjust_gpus(
task_spec=task_spec,
experiment_group="safety_eval",
model_name=model_info[0],
gpu_multiplier=args.gpu_multiplier,
)

# add gpu information.
# we just assume you want to use all the gpus for one task at a time
if "70B" in model_info[0]:
task_spec['resources']['gpuCount'] = 8
num_gpus = task_spec['resources']['gpuCount']
task_spec["arguments"][0]+= f" --min_gpus_per_task {num_gpus}"

if args.upload_to_hf:
hf_dataset = args.upload_to_hf
# to match the way oe-eval script works.
# if we prepended hf- to the model name, remove it.
# if model_name.startswith("hf-"):
# model_name = model_name[3:]
# Above is no longer the case, oe-eval includes hf- again
task_spec['arguments'] = [task_spec['arguments'][0] + f" --upload_to_hf {hf_dataset} --hf_upload_name results/{model_name}"]

d["tasks"] = [task_spec]
if not os.path.exists("configs/beaker_configs/auto_created"):
os.makedirs("configs/beaker_configs/auto_created")
fn = "configs/beaker_configs/auto_created/{}.yaml".format(experiment_name)
os.makedirs(os.path.dirname(fn), exist_ok=True)
with open(fn, "w") as file:
yaml.dump(d, file, default_flow_style=True)

cmd = "beaker experiment create {} --workspace ai2/{}".format(fn, workspace)
subprocess.Popen(cmd, shell=True)
24 changes: 23 additions & 1 deletion scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py
@@ -47,7 +47,6 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig):
--use_hf_tokenizer_template \
--beaker_image nathanl/open_instruct_auto \
--skip_oi_evals \
--run_safety_evaluations \
--run_oe_eval_experiments \
--upload_to_hf {args.upload_to_hf}"""
if args.run_id:
@@ -60,6 +59,29 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig):
print(f"Beaker evaluation jobs: Stderr:\n{stderr.decode()}")
print(f"Beaker evaluation jobs: process return code: {process.returncode}")

safety_command = f"""
python scripts/submit_eval_jobs.py \
--model_name {args.model_name} \
--location {beaker_dataset_ids[-1]} \
--is_tuned \
--workspace tulu-3-results \
--preemptible \
--use_hf_tokenizer_template \
--beaker_image nathanl/open_instruct_auto \
--skip_oi_evals \
--run_oe_eval_experiments \
--oe_eval_task_suite "SAFETY_EVAL" \
--upload_to_hf {args.upload_to_hf}"""
if args.run_id:
safety_command += f" --run_id {args.run_id}"

safety_process = subprocess.Popen(["bash", "-c", safety_command], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
safety_stdout, safety_stderr = safety_process.communicate()

print(f"Beaker safety evaluation jobs: Stdout:\n{safety_stdout.decode()}")
print(f"Beaker safety evaluation jobs: Stderr:\n{safety_stderr.decode()}")
print(f"Beaker safety evaluation jobs: process return code: {safety_process.returncode}")


return
time.sleep(args.check_interval_seconds)