From 7479bedd67e8f442a4f3ae5e34e3c5113df1aa39 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Fri, 25 Jul 2025 17:03:05 -0700 Subject: [PATCH 1/7] Update script to call new oe-eval safety evals --- scripts/submit_eval_jobs.py | 172 +++++++++++++++++++++++------------- 1 file changed, 112 insertions(+), 60 deletions(-) diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index b23abeb3c..cb4df3e03 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -108,6 +108,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): parser.add_argument("--hf_upload_experiments", type=str, nargs="*", default=None, help="Upload given experiment to the Hugging Face model hub.") parser.add_argument("--run_oe_eval_experiments", action="store_true", help="Run the OE eval tool and experiments too.") parser.add_argument("--run_safety_evaluations", action="store_true", help="Run the OE safety evaluations too.") +parser.add_argument("--run_safety_evaluations_reasoning", action="store_true", help="Run the OE safety evaluations on a reasoning model too.") parser.add_argument("--skip_oi_evals", action="store_true", help="Don't run open instruct evals.") parser.add_argument("--oe_eval_max_length", type=int, default=4096, help="Max length for OE eval.") parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)") @@ -668,69 +669,120 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): print(f"Running OE eval with command: {oe_eval_cmd}") subprocess.Popen(oe_eval_cmd, shell=True) -# create an experiment that runs the safety eval tasks if args.run_safety_evaluations: - # just take the original spec we had, modify it for safety eval. - experiment_name = f"oi_safety_{model_name}" - experiment_name = experiment_name.replace('β', '').replace(r"{", "").replace(r"}", "") # hack: remove characters beaker doesn't like - d["description"] = experiment_name - # specific image for safety eval - d["tasks"][0]["image"]["beaker"] = "hamishivi/open-safety" - if args.use_alternate_safety_image: - d["tasks"][0]["image"]["beaker"] = args.use_alternate_safety_image - d["tasks"] = [d["tasks"][0]] - task_spec = d["tasks"][0] - task_spec["name"] = experiment_name - task_spec["arguments"][0] = ''' -VLLM_WORKER_MULTIPROC_METHOD=spawn PYTHONPATH=. python evaluation/run_all_generation_benchmarks.py \ - --model_name_or_path /model \ - --model_input_template_path_or_name hf \ - --report_output_path /output/metrics.json \ - --save_individual_results_path /output/all.json \ -''' - # some copied logic - if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub and delete mount `/model` - task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]} --hf_revision {args.hf_revision}")] - task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] - del task_spec['datasets'][1] - elif model_info[1].startswith("/"): # if it's a local model, load it from the local directory and delete mount `/model` - assert weka_available, "NFS / Weka is required for path-based models." # to be safe. 
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path "+model_info[1])]
-        del task_spec['datasets'][1]
-    else: # if it's a beaker model, mount the beaker dataset to `/model`
-        task_spec['datasets'][1]['source']['beaker'] = model_info[1]
+    # if so, run safety-fork through oe-eval. We assume oe-eval is cloned in the top-level repo directory.
+    oe_safety_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}"
+    if args.upload_to_hf:
+        oe_safety_cmd += f" --upload_to_hf {args.upload_to_hf}"
+    ## model location munging: if beaker, use beaker://. If hf, just name
+    if model_info[0].startswith("hf-"):
+        oe_safety_cmd += f" --model-location {model_info[1]}"
+    elif model_info[1].startswith("/"):
+        oe_safety_cmd += f" --model-location {model_info[1]}"
+    elif model_info[1].startswith("gs://"):
+        oe_safety_cmd += f" --model-location {model_info[1]}"
+    else:
+        oe_safety_cmd += f" --model-location beaker://{model_info[1]}"
+    if args.hf_revision:
+        oe_safety_cmd += f" --revision {args.hf_revision}"
+    if args.evaluate_on_weka:
+        oe_safety_cmd += " --evaluate_on_weka"
+    oe_safety_cmd += f" --tasks safety_eval"
+    if args.run_id:
+        oe_safety_cmd += f" --run-id {args.run_id}"
+    if args.step:
+        oe_safety_cmd += f" --step {args.step}"
+    # add string with number of gpus
+    num_gpus = task_spec['resources']['gpuCount']
+    # if num_gpus > 1, double it again for oe-eval configs
+    # open_instruct GPT adjustment wasn't quite enough
+    # adjusted here so the GPU configs in open-instruct eval are not impacted by the change
+    # tested reasonably extensively with 70B
+    if num_gpus > 1:
+        num_gpus *= 2
+    oe_safety_cmd += f" --num_gpus {num_gpus}"
+    if args.oe_eval_max_length:
+        oe_safety_cmd += f" --max-length {args.oe_eval_max_length}"
-    task_spec = adjust_gpus(
-        task_spec=task_spec,
-        experiment_group="safety_eval",
-        model_name=model_info[0],
-        gpu_multiplier=args.gpu_multiplier,
-    )
+    # add priority
+    oe_safety_cmd += f" --priority {args.priority}"
-    # add gpu information.
-    # we just assume you want to use all the gpus for one task at a time
-    if "70B" in model_info[0]:
-        task_spec['resources']['gpuCount'] = 8
-    num_gpus = task_spec['resources']['gpuCount']
-    task_spec["arguments"][0]+= f" --min_gpus_per_task {num_gpus}"
+    # Add stop sequences if provided
+    if args.oe_eval_stop_sequences:
+        oe_safety_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'"
+
+    # Add process output if provided
+    if args.process_output:
+        oe_safety_cmd += f" --process-output {args.process_output}"
+
+    # Add beaker image from existing argument
+    if args.beaker_image:
+        oe_safety_cmd += f" --beaker-image {args.beaker_image}"
+
+    # Add cluster parameter - use the existing cluster argument
+    # Join the list with commas since oe-eval.sh expects a comma-separated string
+    if args.cluster and len(args.cluster) > 0:
+        cluster_str = ",".join(args.cluster)
+        oe_safety_cmd += f" --cluster '{cluster_str}'"
+
+    print(f"Running OE safety eval with command: {oe_safety_cmd}")
+    subprocess.Popen(oe_safety_cmd, shell=True)
+
+if args.run_safety_evaluations_reasoning:
+    # if so, run safety-fork on reasoning tasks through oe-eval. We assume oe-eval is cloned in the top-level repo directory.
+ oe_safety_reasoning_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}" if args.upload_to_hf: - hf_dataset = args.upload_to_hf - # to match the way oe-eval script works. - # if we prepended hf- to the model name, remove it. - # if model_name.startswith("hf-"): - # model_name = model_name[3:] - # Above is no longer the case, oe-eval includes hf- again - task_spec['arguments'] = [task_spec['arguments'][0] + f" --upload_to_hf {hf_dataset} --hf_upload_name results/{model_name}"] - - d["tasks"] = [task_spec] - if not os.path.exists("configs/beaker_configs/auto_created"): - os.makedirs("configs/beaker_configs/auto_created") - fn = "configs/beaker_configs/auto_created/{}.yaml".format(experiment_name) - os.makedirs(os.path.dirname(fn), exist_ok=True) - with open(fn, "w") as file: - yaml.dump(d, file, default_flow_style=True) + oe_safety_reasoning_cmd += f" --upload_to_hf {args.upload_to_hf}" + ## model location munging: if beaker, use beaker://. If hf, just name + if model_info[0].startswith("hf-"): + oe_safety_reasoning_cmd += f" --model-location {model_info[1]}" + elif model_info[1].startswith("/"): + oe_safety_reasoning_cmd += f" --model-location {model_info[1]}" + elif model_info[1].startswith("gs://"): + oe_safety_reasoning_cmd += f" --model-location {model_info[1]}" + else: + oe_safety_reasoning_cmd += f" --model-location beaker://{model_info[1]}" + if args.hf_revision: + oe_safety_reasoning_cmd += f" --revision {args.hf_revision}" + if args.evaluate_on_weka: + oe_safety_reasoning_cmd += " --evaluate_on_weka" + oe_safety_reasoning_cmd += f" --tasks safety_eval_reasoning" + if args.run_id: + oe_safety_reasoning_cmd += f" --run-id {args.run_id}" + if args.step: + oe_safety_reasoning_cmd += f" --step {args.step}" + # add string with number of gpus + num_gpus = task_spec['resources']['gpuCount'] + # if num_gpus > 1, double it again for oe-eval configs + # open_instruct GPT adjustment wasn't quite enough + # adjusted here so the GPU configs in open-instruct eval are not impacted by the change + # tested reasonably extensively with 70B + if num_gpus > 1: + num_gpus *= 2 + oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" + if args.oe_eval_max_length: + oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" - cmd = "beaker experiment create {} --workspace ai2/{}".format(fn, workspace) - subprocess.Popen(cmd, shell=True) + # add priority + oe_safety_reasoning_cmd += f" --priority {args.priority}" + + # Add stop sequences if provided + if args.oe_eval_stop_sequences: + oe_safety_reasoning_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'" + + # Add process output if provided + if args.process_output: + oe_safety_reasoning_cmd += f" --process-output {args.process_output}" + + # Add beaker image from existing argument + if args.beaker_image: + oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" + + # Add cluster parameter - use the existing cluster argument + # Join the list with commas since oe-eval.sh expects a comma-separated string + if args.cluster and len(args.cluster) > 0: + cluster_str = ",".join(args.cluster) + oe_safety_reasoning_cmd += f" --cluster '{cluster_str}'" + + print(f"Running OE safety eval with command: {oe_safety_reasoning_cmd}") + subprocess.Popen(oe_safety_reasoning_cmd, shell=True) \ No newline at end of file From 50ab0decf2ba7906352b6af34752fe30da325de9 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Mon, 28 Jul 2025 16:35:01 -0700 Subject: [PATCH 2/7] Add num gpu constraints --- scripts/submit_eval_jobs.py | 57 
+++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index cb4df3e03..464d1ef2f 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -692,17 +692,25 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_cmd += f" --run-id {args.run_id}" if args.step: oe_safety_cmd += f" --step {args.step}" - # add string with number of gpus - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 + + # pull num gpu logic from old safety eval code + task_spec = adjust_gpus( + task_spec=task_spec, + experiment_group="safety_eval", + model_name=model_info[0], + gpu_multiplier=args.gpu_multiplier, + ) + # add gpu information. + # we just assume you want to use all the gpus for one task at a time + if "70B" in model_info[0]: + task_spec['resources']['gpuCount'] = 8 + num_gpus = task_spec['resources']['gpuCount'] + oe_safety_cmd += f" --num_gpus {num_gpus}" - if args.oe_eval_max_length: - oe_safety_cmd += f" --max-length {args.oe_eval_max_length}" + + # controlled by config file + # if args.oe_eval_max_length: + # oe_safety_cmd += f" --max-length {args.oe_eval_max_length}" # add priority oe_safety_cmd += f" --priority {args.priority}" @@ -751,17 +759,24 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_reasoning_cmd += f" --run-id {args.run_id}" if args.step: oe_safety_reasoning_cmd += f" --step {args.step}" - # add string with number of gpus - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 - oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" - if args.oe_eval_max_length: - oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" + + # pull num gpu logic from old safety eval code + task_spec = adjust_gpus( + task_spec=task_spec, + experiment_group="safety_eval", + model_name=model_info[0], + gpu_multiplier=args.gpu_multiplier, + ) + # add gpu information. 
+ # we just assume you want to use all the gpus for one task at a time + if "70B" in model_info[0]: + task_spec['resources']['gpuCount'] = 8 + num_gpus = task_spec['resources']['gpuCount'] + + oe_safety_cmd += f" --num_gpus {num_gpus}" + # controlled by config file + # if args.oe_eval_max_length: + # oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" # add priority oe_safety_reasoning_cmd += f" --priority {args.priority}" From 0bc92474d5f93cb08a7452b8733a044be901c7b7 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Thu, 31 Jul 2025 14:00:15 -0700 Subject: [PATCH 3/7] Add handling for alternative safety beaker image --- scripts/submit_eval_jobs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 464d1ef2f..347b0b951 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -724,7 +724,9 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_cmd += f" --process-output {args.process_output}" # Add beaker image from existing argument - if args.beaker_image: + if args.use_alternate_safety_image: + oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}" + elif args.beaker_image: oe_safety_cmd += f" --beaker-image {args.beaker_image}" # Add cluster parameter - use the existing cluster argument @@ -790,6 +792,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_safety_reasoning_cmd += f" --process-output {args.process_output}" # Add beaker image from existing argument + if args.use_alternate_safety_image: + oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}" if args.beaker_image: oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" From 460a874c2c955f50a34625ff9f7120fc98e14bb8 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Fri, 1 Aug 2025 13:32:01 -0700 Subject: [PATCH 4/7] typos in script, add hf key to gantry args --- scripts/eval/oe-eval.sh | 2 +- scripts/submit_eval_jobs.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index 5d5e7cd15..73e7aabff 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -277,7 +277,7 @@ for TASK in "${TASKS[@]}"; do --task-args "{ \"generation_kwargs\": { \"max_gen_toks\": ${MAX_LENGTH}, \"truncate_context\": false${STOP_SEQUENCES_JSON} } }" \ ${HF_UPLOAD_ARG} \ --gpus "$GPU_COUNT" \ - --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key", "weka": "oe-adapt-default:/weka/oe-adapt-default", "env#132":"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "env-secret#42": "AZURE_EVAL_API_KEY=azure_eval_api_key"}' \ + --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key", "weka": "oe-adapt-default:/weka/oe-adapt-default", "env#132":"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "env-secret#42": "AZURE_EVAL_API_KEY=azure_eval_api_key", "env-secret#2":"HF_TOKEN=HF_TOKEN"}' \ ${REVISION_ARG} \ --cluster "$CLUSTER" \ --beaker-retries 2 \ diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 347b0b951..0c4ab6ac6 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -775,7 +775,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): task_spec['resources']['gpuCount'] = 8 num_gpus = task_spec['resources']['gpuCount'] - oe_safety_cmd += f" --num_gpus {num_gpus}" + oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" # controlled by config file # if args.oe_eval_max_length: # oe_safety_reasoning_cmd += f" 
--max-length {args.oe_eval_max_length}" @@ -793,8 +793,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): # Add beaker image from existing argument if args.use_alternate_safety_image: - oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}" - if args.beaker_image: + oe_safety_reasoning_cmd += f" --beaker-image {args.use_alternate_safety_image}" + elif args.beaker_image: oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" # Add cluster parameter - use the existing cluster argument From 995028b137dbc77ea6f10f24839fef67b1b57980 Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Thu, 7 Aug 2025 09:56:14 -0700 Subject: [PATCH 5/7] move safety eval call into a task suite --- docs/safety-eval/safety.md | 20 +- docs/safety.md | 21 ++- scripts/eval/oe-eval.sh | 18 +- scripts/submit_eval_jobs.py | 175 +++--------------- ...ataset_model_upload_then_evaluate_model.py | 24 ++- 5 files changed, 99 insertions(+), 159 deletions(-) diff --git a/docs/safety-eval/safety.md b/docs/safety-eval/safety.md index f2be4e5e7..b9ee953da 100644 --- a/docs/safety-eval/safety.md +++ b/docs/safety-eval/safety.md @@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b ## Running at Ai2 -This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork). +This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). -An example command would be: +An example command on a reasoning model would be: ```bash python scripts/submit_eval_jobs.py \ --model_name \ @@ -17,10 +17,22 @@ python scripts/submit_eval_jobs.py \ --beaker_image nathanl/open_instruct_auto \ --upload_to_hf allenai/tulu-3-evals \ --run_oe_eval_experiments \ - --run_safety_evaluations + --oe_eval_task_suite "SAFETY_EVAL_REASONING" ``` -Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`. +An example command on a non-reasoning model would be: +```bash +python scripts/submit_eval_jobs.py \ + --model_name \ + --location \ + --is_tuned --workspace tulu-3-results \ + --preemptible \ + --use_hf_tokenizer_template \ + --beaker_image nathanl/open_instruct_auto \ + --upload_to_hf allenai/tulu-3-evals \ + --run_oe_eval_experiments \ + --oe_eval_task_suite "SAFETY_EVAL" +``` ## Running on an interactive session diff --git a/docs/safety.md b/docs/safety.md index 4f028559e..ff8a8a496 100644 --- a/docs/safety.md +++ b/docs/safety.md @@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b ## Running at Ai2 -This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). 
This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork). +This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). -An example command would be: +An example command on a reasoning model would be: ```bash python scripts/submit_eval_jobs.py \ --model_name \ @@ -17,10 +17,23 @@ python scripts/submit_eval_jobs.py \ --beaker_image nathanl/open_instruct_auto \ --upload_to_hf allenai/tulu-3-evals \ --run_oe_eval_experiments \ - --run_safety_evaluations + --oe_eval_task_suite "SAFETY_EVAL_REASONING" +``` + +An example command on a non-reasoning model would be: +```bash +python scripts/submit_eval_jobs.py \ + --model_name \ + --location \ + --is_tuned --workspace tulu-3-results \ + --preemptible \ + --use_hf_tokenizer_template \ + --beaker_image nathanl/open_instruct_auto \ + --upload_to_hf allenai/tulu-3-evals \ + --run_oe_eval_experiments \ + --oe_eval_task_suite "SAFETY_EVAL" ``` -Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`. ## Running on an interactive session diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index 73e7aabff..ba0383b1c 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -48,7 +48,7 @@ set -ex # Function to print usage usage() { echo "Usage: $0 --model-name MODEL_NAME --model-location MODEL_LOCATION [--num_gpus GPUS] [--upload_to_hf] [--revision REVISION] [--max-length ] [--task-suite TASK_SUITE] [--priority priority] [--tasks TASKS] [--evaluate_on_weka] [--stop-sequences ] [--beaker-image ] [--cluster ] [--process-output ]" - echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)" + echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)" echo "TASKS should be a comma-separated list of task specifications (e.g., 'gsm8k::tulu,bbh:cot::tulu')" echo "STOP_SEQUENCES should be a comma-separated list of strings to stop generation at (e.g., ',\\n\\n')" echo "PROCESS_OUTPUT should be a string specifying how to process the model output (e.g., 'r1_style')" @@ -212,6 +212,14 @@ NEXT_MODEL_UNSEEN=( "ifbench::tulu" ) +SAFETY_EVAL=( + "safety_eval" +) + +SAFETY_EVAL_REASONING=( + "safety_eval_reasoning" +) + # If custom tasks provided, convert comma-separated string to array if [[ -n "$CUSTOM_TASKS" ]]; then IFS=',' read -ra TASKS <<< "$CUSTOM_TASKS" @@ -230,6 +238,12 @@ else TULU_3_UNSEEN) TASKS=("${TULU_3_UNSEEN[@]}") ;; + SAFETY_EVAL) + TASKS=("${SAFETY_EVAL[@]}") + ;; + SAFETY_EVAL_REASONING) + TASKS=("${SAFETY_EVAL_REASONING[@]}") + ;; *) echo "Error: Unknown task suite '$TASK_SUITE'" usage @@ -284,7 +298,7 @@ for TASK in "${TASKS[@]}"; do --beaker-image "$BEAKER_IMAGE" \ --beaker-priority "$PRIORITY" \ --push-datalake \ - --datalake-tags "$DATALAKE_ARGS" + --datalake-tags "$DATALAKE_ARGS" else python oe-eval-internal/oe_eval/launch.py \ --model "$MODEL_NAME" \ diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 0c4ab6ac6..15b752f6d 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -107,12 +107,9 @@ def 
adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): parser.add_argument("--upload_to_hf", type=str, default=None, help="If given, upload the eval results to the Hugging Face model hub. Provide the HF dataset and path in form //.") parser.add_argument("--hf_upload_experiments", type=str, nargs="*", default=None, help="Upload given experiment to the Hugging Face model hub.") parser.add_argument("--run_oe_eval_experiments", action="store_true", help="Run the OE eval tool and experiments too.") -parser.add_argument("--run_safety_evaluations", action="store_true", help="Run the OE safety evaluations too.") -parser.add_argument("--run_safety_evaluations_reasoning", action="store_true", help="Run the OE safety evaluations on a reasoning model too.") parser.add_argument("--skip_oi_evals", action="store_true", help="Don't run open instruct evals.") parser.add_argument("--oe_eval_max_length", type=int, default=4096, help="Max length for OE eval.") -parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)") -parser.add_argument("--use_alternate_safety_image", type=str, default=None, help="Use a different image for safety eval.") +parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)") parser.add_argument("--evaluate_on_weka", action="store_true", help="Evaluate OE eval on Beaker.") # NOTE: evaluate on weka is expected to be on by default. If not, the evals will run on the google augusta cluster. # TODO: fix this logic at a future date @@ -631,14 +628,33 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_eval_cmd += f" --run-id {args.run_id}" if args.step: oe_eval_cmd += f" --step {args.step}" + # add string with number of gpus - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 + if args.oe_eval_task_suite == 'SAFETY_EVAL' or args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': + # pull num gpu logic from old safety eval code + task_spec = adjust_gpus( + task_spec=task_spec, + experiment_group="safety_eval", + model_name=model_info[0], + gpu_multiplier=args.gpu_multiplier, + ) + # add gpu information. 
+ # we just assume you want to use all the gpus for one task at a time + if "70B" in model_info[0]: + task_spec['resources']['gpuCount'] = 8 + num_gpus = task_spec['resources']['gpuCount'] + # double GPUs for reasoning models + if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': + num_gpus *= 2 + else: + num_gpus = task_spec['resources']['gpuCount'] + # if num_gpus > 1, double it again for oe-eval configs + # open_instruct GPT adjustment wasn't quite enough + # adjusted here so the GPU configs in open-instruct eval are not impacted by the change + # tested reasonably extensively with 70B + if num_gpus > 1: + num_gpus *= 2 + oe_eval_cmd += f" --num_gpus {num_gpus}" if args.oe_eval_max_length: oe_eval_cmd += f" --max-length {args.oe_eval_max_length}" @@ -668,140 +684,3 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): print(f"Running OE eval with command: {oe_eval_cmd}") subprocess.Popen(oe_eval_cmd, shell=True) - -if args.run_safety_evaluations: - # if so, run safety-fork through oe-eval. We assume oe-eval is cloned in the top-level repo directory. - oe_safety_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}" - if args.upload_to_hf: - oe_safety_cmd += f" --upload_to_hf {args.upload_to_hf}" - ## model location munging: if beaker, use beaker://. If hf, just name - if model_info[0].startswith("hf-"): - oe_safety_cmd += f" --model-location {model_info[1]}" - elif model_info[1].startswith("/"): - oe_safety_cmd += f" --model-location {model_info[1]}" - elif model_info[1].startswith("gs://"): - oe_safety_cmd += f" --model-location {model_info[1]}" - else: - oe_safety_cmd += f" --model-location beaker://{model_info[1]}" - if args.hf_revision: - oe_safety_cmd += f" --revision {args.hf_revision}" - if args.evaluate_on_weka: - oe_safety_cmd += " --evaluate_on_weka" - oe_safety_cmd += f" --tasks safety_eval" - if args.run_id: - oe_safety_cmd += f" --run-id {args.run_id}" - if args.step: - oe_safety_cmd += f" --step {args.step}" - - # pull num gpu logic from old safety eval code - task_spec = adjust_gpus( - task_spec=task_spec, - experiment_group="safety_eval", - model_name=model_info[0], - gpu_multiplier=args.gpu_multiplier, - ) - # add gpu information. 
-    # we just assume you want to use all the gpus for one task at a time
-    if "70B" in model_info[0]:
-        task_spec['resources']['gpuCount'] = 8
-    num_gpus = task_spec['resources']['gpuCount']
-
-    oe_safety_cmd += f" --num_gpus {num_gpus}"
-
-    # controlled by config file
-    # if args.oe_eval_max_length:
-    #     oe_safety_cmd += f" --max-length {args.oe_eval_max_length}"
-
-    # add priority
-    oe_safety_cmd += f" --priority {args.priority}"
-
-    # Add stop sequences if provided
-    if args.oe_eval_stop_sequences:
-        oe_safety_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'"
-
-    # Add process output if provided
-    if args.process_output:
-        oe_safety_cmd += f" --process-output {args.process_output}"
-
-    # Add beaker image from existing argument
-    if args.use_alternate_safety_image:
-        oe_safety_cmd += f" --beaker-image {args.use_alternate_safety_image}"
-    elif args.beaker_image:
-        oe_safety_cmd += f" --beaker-image {args.beaker_image}"
-
-    # Add cluster parameter - use the existing cluster argument
-    # Join the list with commas since oe-eval.sh expects a comma-separated string
-    if args.cluster and len(args.cluster) > 0:
-        cluster_str = ",".join(args.cluster)
-        oe_safety_cmd += f" --cluster '{cluster_str}'"
-
-    print(f"Running OE safety eval with command: {oe_safety_cmd}")
-    subprocess.Popen(oe_safety_cmd, shell=True)
-
-if args.run_safety_evaluations_reasoning:
-    # if so, run safety-fork on reasoning tasks through oe-eval. We assume oe-eval is cloned in the top-level repo directory.
-    oe_safety_reasoning_cmd = f"scripts/eval/oe-eval.sh --model-name {model_name}"
-    if args.upload_to_hf:
-        oe_safety_reasoning_cmd += f" --upload_to_hf {args.upload_to_hf}"
-    ## model location munging: if beaker, use beaker://. If hf, just name
-    if model_info[0].startswith("hf-"):
-        oe_safety_reasoning_cmd += f" --model-location {model_info[1]}"
-    elif model_info[1].startswith("/"):
-        oe_safety_reasoning_cmd += f" --model-location {model_info[1]}"
-    elif model_info[1].startswith("gs://"):
-        oe_safety_reasoning_cmd += f" --model-location {model_info[1]}"
-    else:
-        oe_safety_reasoning_cmd += f" --model-location beaker://{model_info[1]}"
-    if args.hf_revision:
-        oe_safety_reasoning_cmd += f" --revision {args.hf_revision}"
-    if args.evaluate_on_weka:
-        oe_safety_reasoning_cmd += " --evaluate_on_weka"
-    oe_safety_reasoning_cmd += f" --tasks safety_eval_reasoning"
-    if args.run_id:
-        oe_safety_reasoning_cmd += f" --run-id {args.run_id}"
-    if args.step:
-        oe_safety_reasoning_cmd += f" --step {args.step}"
-
-    # pull num gpu logic from old safety eval code
-    task_spec = adjust_gpus(
-        task_spec=task_spec,
-        experiment_group="safety_eval",
-        model_name=model_info[0],
-        gpu_multiplier=args.gpu_multiplier,
-    )
-    # add gpu information.
- # we just assume you want to use all the gpus for one task at a time - if "70B" in model_info[0]: - task_spec['resources']['gpuCount'] = 8 - num_gpus = task_spec['resources']['gpuCount'] - - oe_safety_reasoning_cmd += f" --num_gpus {num_gpus}" - # controlled by config file - # if args.oe_eval_max_length: - # oe_safety_reasoning_cmd += f" --max-length {args.oe_eval_max_length}" - - # add priority - oe_safety_reasoning_cmd += f" --priority {args.priority}" - - # Add stop sequences if provided - if args.oe_eval_stop_sequences: - oe_safety_reasoning_cmd += f" --stop-sequences '{args.oe_eval_stop_sequences}'" - - # Add process output if provided - if args.process_output: - oe_safety_reasoning_cmd += f" --process-output {args.process_output}" - - # Add beaker image from existing argument - if args.use_alternate_safety_image: - oe_safety_reasoning_cmd += f" --beaker-image {args.use_alternate_safety_image}" - elif args.beaker_image: - oe_safety_reasoning_cmd += f" --beaker-image {args.beaker_image}" - - # Add cluster parameter - use the existing cluster argument - # Join the list with commas since oe-eval.sh expects a comma-separated string - if args.cluster and len(args.cluster) > 0: - cluster_str = ",".join(args.cluster) - oe_safety_reasoning_cmd += f" --cluster '{cluster_str}'" - - print(f"Running OE safety eval with command: {oe_safety_reasoning_cmd}") - subprocess.Popen(oe_safety_reasoning_cmd, shell=True) \ No newline at end of file diff --git a/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py b/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py index c262dfc37..5bcdda2aa 100644 --- a/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py +++ b/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py @@ -47,7 +47,6 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig): --use_hf_tokenizer_template \ --beaker_image nathanl/open_instruct_auto \ --skip_oi_evals \ - --run_safety_evaluations \ --run_oe_eval_experiments \ --upload_to_hf {args.upload_to_hf}""" if args.run_id: @@ -60,6 +59,29 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig): print(f"Beaker evaluation jobs: Stderr:\n{stderr.decode()}") print(f"Beaker evaluation jobs: process return code: {process.returncode}") + safety_command = f""" + python scripts/submit_eval_jobs.py \ + --model_name {args.model_name} \ + --location {beaker_dataset_ids[-1]} \ + --is_tuned \ + --workspace tulu-3-results \ + --preemptible \ + --use_hf_tokenizer_template \ + --beaker_image nathanl/open_instruct_auto \ + --skip_oi_evals \ + --run_oe_eval_experiments \ + --oe_eval_task_suite "SAFETY_EVAL" \ + --upload_to_hf {args.upload_to_hf}""" + if args.run_id: + safety_command += f" --run_id {args.run_id}" + + safety_process = subprocess.Popen(["bash", "-c", safety_command], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + safety_stdout, safety_stderr = safety_process.communicate() + + print(f"Beaker safety evaluation jobs: Stdout:\n{safety_stdout.decode()}") + print(f"Beaker safety evaluation jobs: Stderr:\n{safety_stderr.decode()}") + print(f"Beaker safety evaluation jobs: process return code: {safety_process.returncode}") + return time.sleep(args.check_interval_seconds) From 8a1b5df333309d7d984110f8683d0a41ddef163a Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Thu, 7 Aug 2025 13:41:18 -0700 Subject: [PATCH 6/7] update num_gpu calculation --- scripts/submit_eval_jobs.py | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git 
a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 7e8211ffc..d001d5c8f 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -633,32 +633,18 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): oe_eval_cmd += f" --step {args.step}" # add string with number of gpus - if args.oe_eval_task_suite == 'SAFETY_EVAL' or args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': - # pull num gpu logic from old safety eval code - task_spec = adjust_gpus( - task_spec=task_spec, - experiment_group="safety_eval", - model_name=model_info[0], - gpu_multiplier=args.gpu_multiplier, - ) - # add gpu information. - # we just assume you want to use all the gpus for one task at a time - if "70B" in model_info[0]: - task_spec['resources']['gpuCount'] = 8 - num_gpus = task_spec['resources']['gpuCount'] - # double GPUs for reasoning models - if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': - num_gpus *= 2 - else: - num_gpus = task_spec['resources']['gpuCount'] - # if num_gpus > 1, double it again for oe-eval configs - # open_instruct GPT adjustment wasn't quite enough - # adjusted here so the GPU configs in open-instruct eval are not impacted by the change - # tested reasonably extensively with 70B - if num_gpus > 1: - num_gpus *= 2 - + num_gpus = task_spec['resources']['gpuCount'] + # if num_gpus > 1, double it again for oe-eval configs + # open_instruct GPT adjustment wasn't quite enough + # adjusted here so the GPU configs in open-instruct eval are not impacted by the change + # tested reasonably extensively with 70B + if num_gpus > 1: + num_gpus *= 2 + # double GPUs for reasoning models + if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING': + num_gpus *= 2 oe_eval_cmd += f" --num_gpus {num_gpus}" + if args.oe_eval_max_length: oe_eval_cmd += f" --max-length {args.oe_eval_max_length}" # Add task suite parameter From 15f25dcc86636688b8c3593728af48bd379c2d1d Mon Sep 17 00:00:00 2001 From: mgmorgan23 Date: Fri, 8 Aug 2025 09:28:14 -0700 Subject: [PATCH 7/7] typo --- scripts/eval/oe-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index 6df2996dc..e8f434372 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -319,7 +319,7 @@ for TASK in "${TASKS[@]}"; do # NOTE: For gantry args here and below, random numbers like #42 are added to the env variables because they need to be unique names. The numbers are ignored. 
# Build gantry args if [ "$EVALUATE_ON_WEKA" == "true" ]; then - GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}, \"env-secret#2":"HF_TOKEN=HF_TOKEN\"}" + GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}, \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\"}" else GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"env-secret#43\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\", \"env\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\", \"mount\": \"/mnt/filestore_1:/filestore\", \"env#111\": \"HF_HOME=/filestore/.cache/huggingface\", \"env#112\": \"HF_DATASETS_CACHE=/filestore/.cache/huggingface\", \"env#113\": \"HF_HUB_CACHE=/filestore/.cache/hub\"${MAX_TOKENS_ARG}}" fi
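
The escaping fixed in the final patch is easy to break again, since `GANTRY_ARGS` is a JSON object assembled inside a double-quoted bash string. A minimal sketch of a pre-launch sanity check follows; `MAX_TOKENS_ARG` here is a placeholder standing in for whatever the surrounding script computed, not the actual value from `oe-eval.sh`.

```bash
# Minimal sketch (assumes python3 on PATH): parse the assembled gantry-args
# string as JSON before submitting anything to Beaker.
MAX_TOKENS_ARG=", \"env#200\":\"MAX_TOKENS=8192\""   # placeholder, not from oe-eval.sh
GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\"${MAX_TOKENS_ARG}}"
# json.tool exits non-zero on a mis-escaped quote or backslash, so the error
# surfaces locally instead of inside a failed launch.
echo "$GANTRY_ARGS" | python3 -m json.tool > /dev/null && echo "gantry args are valid JSON"
```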