diff --git a/docs/safety-eval/safety.md b/docs/safety-eval/safety.md
index f2be4e5e7..b9ee953da 100644
--- a/docs/safety-eval/safety.md
+++ b/docs/safety-eval/safety.md
@@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b
 
 ## Running at Ai2
 
-This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork).
+This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set).
 
-An example command would be:
+An example command on a reasoning model would be:
 ```bash
 python scripts/submit_eval_jobs.py \
     --model_name \
@@ -17,10 +17,22 @@ python scripts/submit_eval_jobs.py \
     --beaker_image nathanl/open_instruct_auto \
     --upload_to_hf allenai/tulu-3-evals \
     --run_oe_eval_experiments \
-    --run_safety_evaluations
+    --oe_eval_task_suite "SAFETY_EVAL_REASONING"
 ```
 
-Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`.
+An example command on a non-reasoning model would be:
+```bash
+python scripts/submit_eval_jobs.py \
+    --model_name \
+    --location \
+    --is_tuned --workspace tulu-3-results \
+    --preemptible \
+    --use_hf_tokenizer_template \
+    --beaker_image nathanl/open_instruct_auto \
+    --upload_to_hf allenai/tulu-3-evals \
+    --run_oe_eval_experiments \
+    --oe_eval_task_suite "SAFETY_EVAL"
+```
 
 ## Running on an interactive session
 
diff --git a/docs/safety.md b/docs/safety.md
index 4f028559e..ff8a8a496 100644
--- a/docs/safety.md
+++ b/docs/safety.md
@@ -4,9 +4,9 @@ We are using the Ai2 Safety Evaluation suite for safety evals. This contains a b
 
 ## Running at Ai2
 
-This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, simply add `--run_safety_evaluations` when calling `submit_eval_jobs.py`. This will auto-add a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set). This uses the `hamishivi/safety-eval` image, which is build from [the eval-safety fork](https://github.com/nouhadziri/safety-eval-fork).
+This should be the most relevant thing for internal Ai2 users of open-instruct. To run evals, use the task suite `SAFETY_EVAL` or `SAFETY_EVAL_REASONING` when calling `submit_eval_jobs.py`. This will create a job that uploads and runs the safety evaluations (and uploads to the leaderboard if the appropriate flag is set).
 
-An example command would be:
+An example command on a reasoning model would be:
 ```bash
 python scripts/submit_eval_jobs.py \
     --model_name \
@@ -17,10 +17,23 @@ python scripts/submit_eval_jobs.py \
     --beaker_image nathanl/open_instruct_auto \
     --upload_to_hf allenai/tulu-3-evals \
     --run_oe_eval_experiments \
-    --run_safety_evaluations
+    --oe_eval_task_suite "SAFETY_EVAL_REASONING"
+```
+
+An example command on a non-reasoning model would be:
+```bash
+python scripts/submit_eval_jobs.py \
+    --model_name \
+    --location \
+    --is_tuned --workspace tulu-3-results \
+    --preemptible \
+    --use_hf_tokenizer_template \
+    --beaker_image nathanl/open_instruct_auto \
+    --upload_to_hf allenai/tulu-3-evals \
+    --run_oe_eval_experiments \
+    --oe_eval_task_suite "SAFETY_EVAL"
 ```
 
-Use the `--use_alternate_safety_image` to change the safety image, for example: `--use_alternate_safety_image hamishivi/safety_eval_olmo`.
 
 ## Running on an interactive session
 
diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh
index c06268ca1..e8f434372 100755
--- a/scripts/eval/oe-eval.sh
+++ b/scripts/eval/oe-eval.sh
@@ -48,7 +48,7 @@ set -ex
 # Function to print usage
 usage() {
     echo "Usage: $0 --model-name MODEL_NAME --model-location MODEL_LOCATION [--num_gpus GPUS] [--upload_to_hf] [--revision REVISION] [--max-length ] [--task-suite TASK_SUITE] [--priority priority] [--tasks TASKS] [--evaluate_on_weka] [--stop-sequences ] [--beaker-image ] [--cluster ] [--process-output ]"
-    echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)"
+    echo "TASK_SUITE should be one of: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)"
     echo "TASKS should be a comma-separated list of task specifications (e.g., 'gsm8k::tulu,bbh:cot::tulu')"
     echo "STOP_SEQUENCES should be a comma-separated list of strings to stop generation at (e.g., ',\\n\\n')"
     echo "PROCESS_OUTPUT should be a string specifying how to process the model output (e.g., 'r1_style')"
@@ -215,6 +215,14 @@ NEXT_MODEL_UNSEEN=(
     "ifbench::tulu"
 )
 
+SAFETY_EVAL=(
+    "safety_eval"
+)
+
+SAFETY_EVAL_REASONING=(
+    "safety_eval_reasoning"
+)
+
 # If custom tasks provided, convert comma-separated string to array
 if [[ -n "$CUSTOM_TASKS" ]]; then
     IFS=',' read -ra TASKS <<< "$CUSTOM_TASKS"
@@ -233,6 +241,12 @@ else
         TULU_3_UNSEEN)
             TASKS=("${TULU_3_UNSEEN[@]}")
             ;;
+        SAFETY_EVAL)
+            TASKS=("${SAFETY_EVAL[@]}")
+            ;;
+        SAFETY_EVAL_REASONING)
+            TASKS=("${SAFETY_EVAL_REASONING[@]}")
+            ;;
         *)
             echo "Error: Unknown task suite '$TASK_SUITE'"
             usage
@@ -305,7 +319,7 @@ for TASK in "${TASKS[@]}"; do
     # NOTE: For gantry args here and below, random numbers like #42 are added to the env variables because they need to be unique names. The numbers are ignored.
     # Build gantry args
     if [ "$EVALUATE_ON_WEKA" == "true" ]; then
-        GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}}"
+        GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"weka\": \"oe-adapt-default:/weka/oe-adapt-default\", \"weka#44\": \"oe-training-default:/weka/oe-training-default\", \"env#132\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#42\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\"${MAX_TOKENS_ARG}, \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\"}"
     else
         GANTRY_ARGS="{\"env-secret\": \"OPENAI_API_KEY=openai_api_key\", \"env-secret#43\": \"AZURE_EVAL_API_KEY=azure_eval_api_key\", \"env\":\"VLLM_ALLOW_LONG_MAX_MODEL_LEN=1\", \"env-secret#2\":\"HF_TOKEN=HF_TOKEN\", \"mount\": \"/mnt/filestore_1:/filestore\", \"env#111\": \"HF_HOME=/filestore/.cache/huggingface\", \"env#112\": \"HF_DATASETS_CACHE=/filestore/.cache/huggingface\", \"env#113\": \"HF_HUB_CACHE=/filestore/.cache/hub\"${MAX_TOKENS_ARG}}"
     fi
@@ -330,7 +344,7 @@ for TASK in "${TASKS[@]}"; do
             --beaker-image "$BEAKER_IMAGE" \
             --beaker-priority "$PRIORITY" \
             --push-datalake \
-            --datalake-tags "$DATALAKE_ARGS"
+            --datalake-tags "$DATALAKE_ARGS"
     else
         python oe-eval-internal/oe_eval/launch.py \
             --model "$MODEL_NAME" \
diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py
index d744986c0..f5a57e333 100755
--- a/scripts/submit_eval_jobs.py
+++ b/scripts/submit_eval_jobs.py
@@ -107,11 +107,9 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
 parser.add_argument("--upload_to_hf", type=str, default=None, help="If given, upload the eval results to the Hugging Face model hub. Provide the HF dataset and path in form //.")
 parser.add_argument("--hf_upload_experiments", type=str, nargs="*", default=None, help="Upload given experiment to the Hugging Face model hub.")
 parser.add_argument("--run_oe_eval_experiments", action="store_true", help="Run the OE eval tool and experiments too.")
-parser.add_argument("--run_safety_evaluations", action="store_true", help="Run the OE safety evaluations too.")
 parser.add_argument("--skip_oi_evals", action="store_true", help="Don't run open instruct evals.")
 parser.add_argument("--oe_eval_max_length", type=int, default=4096, help="Max length for OE eval.")
-parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN (default: NEXT_MODEL_DEV)")
-parser.add_argument("--use_alternate_safety_image", type=str, default=None, help="Use a different image for safety eval.")
+parser.add_argument("--oe_eval_task_suite", type=str, default="NEXT_MODEL_DEV", help="Task suite for OE eval: NEXT_MODEL_DEV, NEXT_MODEL_UNSEEN, TULU_3_DEV, TULU_3_UNSEEN, SAFETY_EVAL, SAFETY_EVAL_REASONING (default: NEXT_MODEL_DEV)")
 parser.add_argument("--evaluate_on_weka", action="store_true", help="Evaluate OE eval on Beaker.")
 # NOTE: evaluate on weka is expected to be on by default. If not, the evals will run on the google augusta cluster.
 # TODO: fix this logic at a future date
@@ -633,6 +631,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
         oe_eval_cmd += f" --run-id {args.run_id}"
     if args.step:
         oe_eval_cmd += f" --step {args.step}"
+
     # add string with number of gpus
     num_gpus = task_spec['resources']['gpuCount']
     # if num_gpus > 1, double it again for oe-eval configs
@@ -641,7 +640,11 @@
     # tested reasonably extensively with 70B
     if num_gpus > 1:
         num_gpus *= 2
+    # double GPUs for reasoning models
+    if args.oe_eval_task_suite == 'SAFETY_EVAL_REASONING':
+        num_gpus *= 2
     oe_eval_cmd += f" --num_gpus {num_gpus}"
+
     if args.oe_eval_max_length:
         oe_eval_cmd += f" --max-length {args.oe_eval_max_length}"
     # Add task suite parameter
@@ -670,70 +673,3 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier):
     print(f"Running OE eval with command: {oe_eval_cmd}")
     subprocess.Popen(oe_eval_cmd, shell=True)
-
-# create an experiment that runs the safety eval tasks
-if args.run_safety_evaluations:
-    # just take the original spec we had, modify it for safety eval.
-    experiment_name = f"oi_safety_{model_name}"
-    experiment_name = experiment_name.replace('β', '').replace(r"{", "").replace(r"}", "") # hack: remove characters beaker doesn't like
-    d["description"] = experiment_name
-    # specific image for safety eval
-    d["tasks"][0]["image"]["beaker"] = "hamishivi/open-safety"
-    if args.use_alternate_safety_image:
-        d["tasks"][0]["image"]["beaker"] = args.use_alternate_safety_image
-    d["tasks"] = [d["tasks"][0]]
-    task_spec = d["tasks"][0]
-    task_spec["name"] = experiment_name
-    task_spec["arguments"][0] = '''
-VLLM_WORKER_MULTIPROC_METHOD=spawn PYTHONPATH=. python evaluation/run_all_generation_benchmarks.py \
-    --model_name_or_path /model \
-    --model_input_template_path_or_name hf \
-    --report_output_path /output/metrics.json \
-    --save_individual_results_path /output/all.json \
-'''
-    # some copied logic
-    if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub and delete mount `/model`
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]} --hf_revision {args.hf_revision}")]
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")]
-        del task_spec['datasets'][1]
-    elif model_info[1].startswith("/"): # if it's a local model, load it from the local directory and delete mount `/model`
-        assert weka_available, "NFS / Weka is required for path-based models." # to be safe.
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
-        task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path "+model_info[1])]
-        del task_spec['datasets'][1]
-    else: # if it's a beaker model, mount the beaker dataset to `/model`
-        task_spec['datasets'][1]['source']['beaker'] = model_info[1]
-
-    task_spec = adjust_gpus(
-        task_spec=task_spec,
-        experiment_group="safety_eval",
-        model_name=model_info[0],
-        gpu_multiplier=args.gpu_multiplier,
-    )
-
-    # add gpu information.
-    # we just assume you want to use all the gpus for one task at a time
-    if "70B" in model_info[0]:
-        task_spec['resources']['gpuCount'] = 8
-    num_gpus = task_spec['resources']['gpuCount']
-    task_spec["arguments"][0]+= f" --min_gpus_per_task {num_gpus}"
-
-    if args.upload_to_hf:
-        hf_dataset = args.upload_to_hf
-        # to match the way oe-eval script works.
-        # if we prepended hf- to the model name, remove it.
-        # if model_name.startswith("hf-"):
-        #     model_name = model_name[3:]
-        # Above is no longer the case, oe-eval includes hf- again
-        task_spec['arguments'] = [task_spec['arguments'][0] + f" --upload_to_hf {hf_dataset} --hf_upload_name results/{model_name}"]
-
-    d["tasks"] = [task_spec]
-    if not os.path.exists("configs/beaker_configs/auto_created"):
-        os.makedirs("configs/beaker_configs/auto_created")
-    fn = "configs/beaker_configs/auto_created/{}.yaml".format(experiment_name)
-    os.makedirs(os.path.dirname(fn), exist_ok=True)
-    with open(fn, "w") as file:
-        yaml.dump(d, file, default_flow_style=True)
-
-    cmd = "beaker experiment create {} --workspace ai2/{}".format(fn, workspace)
-    subprocess.Popen(cmd, shell=True)
diff --git a/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py b/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py
index c262dfc37..5bcdda2aa 100644
--- a/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py
+++ b/scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py
@@ -47,7 +47,6 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig):
                 --use_hf_tokenizer_template \
                 --beaker_image nathanl/open_instruct_auto \
                 --skip_oi_evals \
-                --run_safety_evaluations \
                 --run_oe_eval_experiments \
                 --upload_to_hf {args.upload_to_hf}"""
             if args.run_id:
@@ -60,6 +59,29 @@ def main(args: Args, beaker_runtime_config: BeakerRuntimeConfig):
             print(f"Beaker evaluation jobs: Stderr:\n{stderr.decode()}")
             print(f"Beaker evaluation jobs: process return code: {process.returncode}")
 
+            safety_command = f"""
+            python scripts/submit_eval_jobs.py \
+                --model_name {args.model_name} \
+                --location {beaker_dataset_ids[-1]} \
+                --is_tuned \
+                --workspace tulu-3-results \
+                --preemptible \
+                --use_hf_tokenizer_template \
+                --beaker_image nathanl/open_instruct_auto \
+                --skip_oi_evals \
+                --run_oe_eval_experiments \
+                --oe_eval_task_suite "SAFETY_EVAL" \
+                --upload_to_hf {args.upload_to_hf}"""
+            if args.run_id:
+                safety_command += f" --run_id {args.run_id}"
+
+            safety_process = subprocess.Popen(["bash", "-c", safety_command], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            safety_stdout, safety_stderr = safety_process.communicate()
+
+            print(f"Beaker safety evaluation jobs: Stdout:\n{safety_stdout.decode()}")
+            print(f"Beaker safety evaluation jobs: Stderr:\n{safety_stderr.decode()}")
+            print(f"Beaker safety evaluation jobs: process return code: {safety_process.returncode}")
+
             return
 
         time.sleep(args.check_interval_seconds)
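A few notes to make the denser parts of this change easier to scan. First, the `GANTRY_ARGS` strings in `scripts/eval/oe-eval.sh` are easier to read as a JSON object. A minimal sketch of the Weka-path payload after this change, with values copied from the diff; the dict/`json.dumps` form is ours, not the script's, and `${MAX_TOKENS_ARG}` is omitted for brevity:

```python
import json

# Weka-branch gantry args from scripts/eval/oe-eval.sh, written as a dict for readability.
# The "#<number>" suffixes only keep repeated keys unique; per the script's NOTE, the numbers are ignored.
gantry_args = {
    "env-secret": "OPENAI_API_KEY=openai_api_key",
    "weka": "oe-adapt-default:/weka/oe-adapt-default",
    "weka#44": "oe-training-default:/weka/oe-training-default",
    "env#132": "VLLM_ALLOW_LONG_MAX_MODEL_LEN=1",
    "env-secret#42": "AZURE_EVAL_API_KEY=azure_eval_api_key",
    "env-secret#2": "HF_TOKEN=HF_TOKEN",  # newly added to the Weka branch in this diff
}
print(json.dumps(gantry_args))  # corresponds to the escaped string built in the shell script
```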
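Second, the `--num_gpus` arithmetic in `scripts/submit_eval_jobs.py` now doubles the request in two places: once for multi-GPU oe-eval jobs and once more when the suite is `SAFETY_EVAL_REASONING`. A minimal sketch of that logic; the helper name is hypothetical and not part of the patch:

```python
def resolve_num_gpus(base_gpu_count: int, task_suite: str) -> int:
    """Mirror the --num_gpus arithmetic in submit_eval_jobs.py."""
    num_gpus = base_gpu_count
    if num_gpus > 1:
        # multi-GPU jobs are doubled again for oe-eval configs (tested mainly with 70B)
        num_gpus *= 2
    if task_suite == "SAFETY_EVAL_REASONING":
        # reasoning models generate longer outputs, so the reasoning suite doubles once more
        num_gpus *= 2
    return num_gpus

# A 2-GPU task spec running SAFETY_EVAL_REASONING ends up requesting 8 GPUs.
assert resolve_num_gpus(2, "SAFETY_EVAL_REASONING") == 8
assert resolve_num_gpus(1, "SAFETY_EVAL") == 1
```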
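Finally, `scripts/wait_beaker_dataset_model_upload_then_evaluate_model.py` now launches the safety suite as a second `submit_eval_jobs.py` invocation after the main eval submission. A stripped-down sketch of that pattern, with hypothetical placeholder values standing in for `args.model_name`, the watched Beaker dataset id, and `args.upload_to_hf`:

```python
import subprocess

# Hypothetical placeholders; in the script these come from args and the watched Beaker dataset.
model_name = "my-model"
beaker_dataset_id = "01ABCDEXAMPLE"
upload_to_hf = "allenai/tulu-3-evals"

safety_command = f"""
python scripts/submit_eval_jobs.py \\
    --model_name {model_name} \\
    --location {beaker_dataset_id} \\
    --is_tuned \\
    --workspace tulu-3-results \\
    --preemptible \\
    --use_hf_tokenizer_template \\
    --beaker_image nathanl/open_instruct_auto \\
    --skip_oi_evals \\
    --run_oe_eval_experiments \\
    --oe_eval_task_suite "SAFETY_EVAL" \\
    --upload_to_hf {upload_to_hf}"""

# Same launch pattern as the script: run the command through bash and surface its output.
proc = subprocess.Popen(["bash", "-c", safety_command], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
print(stdout.decode())
print(stderr.decode())
print(f"return code: {proc.returncode}")
```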