From c0ed4ff6a4371738766887db877ce0f7d8688c33 Mon Sep 17 00:00:00 2001 From: Damoon Date: Fri, 26 Sep 2025 20:01:21 -0400 Subject: [PATCH 1/2] no need for multiple calls --- environments/simpleqa/simpleqa.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/environments/simpleqa/simpleqa.py b/environments/simpleqa/simpleqa.py index f72b085c9..7e97ddb48 100644 --- a/environments/simpleqa/simpleqa.py +++ b/environments/simpleqa/simpleqa.py @@ -123,16 +123,14 @@ def correct_answer_reward_func( def incorrect_answer_reward_func( prompt, completion, answer, state, **kwargs ) -> float: - judge_response = rubric.judge(prompt, completion, answer, state, **kwargs) - match = re.search(r"(A|B|C)", judge_response) + match = re.search(r"(A|B|C)", state["judge_response"]) result = match.group(0) if match else "C" return 1.0 if result == "B" else 0.0 def not_attempted_answer_reward_func( prompt, completion, answer, state, **kwargs ) -> float: - judge_response = rubric.judge(prompt, completion, answer, state, **kwargs) - match = re.search(r"(A|B|C)", judge_response) + match = re.search(r"(A|B|C)", state["judge_response"]) result = match.group(0) if match else "C" return 1.0 if result == "C" else 0.0 From 3c2572ab231a6b1c3879a15415ec3e3023a797a3 Mon Sep 17 00:00:00 2001 From: Damoon Date: Fri, 26 Sep 2025 20:30:02 -0400 Subject: [PATCH 2/2] last value --- environments/simpleqa/simpleqa.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/environments/simpleqa/simpleqa.py b/environments/simpleqa/simpleqa.py index 7e97ddb48..70e4b4001 100644 --- a/environments/simpleqa/simpleqa.py +++ b/environments/simpleqa/simpleqa.py @@ -123,14 +123,16 @@ def correct_answer_reward_func( def incorrect_answer_reward_func( prompt, completion, answer, state, **kwargs ) -> float: - match = re.search(r"(A|B|C)", state["judge_response"]) + resp = list(state["judge_response"].values())[-1] + match = re.search(r"(A|B|C)", resp) result = match.group(0) if match else "C" return 1.0 if result == "B" else 0.0 def not_attempted_answer_reward_func( prompt, completion, answer, state, **kwargs ) -> float: - match = re.search(r"(A|B|C)", state["judge_response"]) + resp = list(state["judge_response"].values())[-1] + match = re.search(r"(A|B|C)", resp) result = match.group(0) if match else "C" return 1.0 if result == "C" else 0.0