Skip to content

Commit e10aeea

Browse files
committed
Merging changes from main
1 parent 8c7b562 commit e10aeea

40 files changed

+450
-272
lines changed

.github/actions/test-if-changes/action.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@ runs:
3939
else
4040
pip install "debug-gym[dev]==${{ inputs.version }}"
4141
fi
42+
df -h
4243
- name: Run tests
4344
env:
4445
DEBUG_GYM_DEBUG: 1
4546
shell: bash
4647
run: |
48+
free -h
4749
pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
4850
- name: Store coverage report
4951
uses: actions/upload-artifact@v4

debug_gym/agents/base_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,5 +405,5 @@ def create_agent(
405405
if agent_args is None:
406406
raise ValueError("Either agent_args or config must be provided.")
407407

408-
agent = agent_class(args=agent_args, **agent_kwargs)
408+
agent = agent_class(agent_args=agent_args, **agent_kwargs)
409409
return agent

debug_gym/agents/solution_agent.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -39,28 +39,36 @@ def run(self, env, debug=False):
3939
return True
4040

4141
self.logger.info(f"Score: {info.score}/{info.max_score or '-'}")
42-
# Make a simple pdb call to make sure it is working.
43-
action = ToolCall(name="pdb", id="pdb", arguments={"command": "help help"})
44-
pdb_help_info = self.env.step(action, None, None)
45-
assert "h(elp)" in pdb_help_info.step_observation.observation, (
46-
"PDB command did not return expected help message.\n"
47-
f"{pdb_help_info.step_observation.observation}"
48-
)
4942

50-
# Send a pdb continue command, and check the output matches the one from env.reset.
51-
action = ToolCall(name="pdb", id="pdb", arguments={"command": "continue"})
52-
pdb_continue_info = self.env.step(action, None, None)
53-
54-
assert (
55-
"Reached the end of the program. Restarting the debugging session."
56-
in pdb_continue_info.step_observation.observation
57-
) or (
58-
info.step_observation.observation.splitlines()[-1]
59-
in pdb_continue_info.step_observation.observation
60-
), (
61-
"PDB command did not return expected continue message.\n"
62-
f"{pdb_continue_info.step_observation.observation}"
63-
)
43+
if env.has_tool("pdb"):
44+
# Make a simple pdb call to make sure it is working.
45+
action = ToolCall(
46+
name="pdb", id="pdb", arguments={"command": "help help"}
47+
)
48+
pdb_help_info = self.env.step(action, None, None)
49+
assert "h(elp)" in pdb_help_info.step_observation.observation, (
50+
"PDB command did not return expected help message.\n"
51+
f"{pdb_help_info.step_observation.observation}"
52+
)
53+
54+
# Send a pdb continue command, and check the output matches the one from env.reset.
55+
action = ToolCall(
56+
name="pdb", id="pdb", arguments={"command": "continue"}
57+
)
58+
pdb_continue_info = self.env.step(action, None, None)
59+
60+
pdb_observation = pdb_continue_info.step_observation.observation
61+
expected_messages = [
62+
"Reached the end of the program. Restarting the debugging session.",
63+
"Uncaught exception. Entering post mortem debugging",
64+
]
65+
reset_observation = info.step_observation.observation
66+
if reset_observation.splitlines():
67+
expected_messages.append(reset_observation.splitlines()[-1])
68+
69+
assert any(
70+
msg in pdb_observation for msg in expected_messages
71+
), f"PDB command did not return expected continue message.\n{pdb_observation}"
6472

6573
self.env.apply_gold_patch()
6674

debug_gym/agents/utils.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,6 @@ def load_config():
108108
with open(args.config_file) as reader:
109109
config = yaml.safe_load(reader)
110110

111-
# Parse overriden params.
112-
for param in args.params:
113-
fqn_key, value = param.split("=")
114-
entry_to_change = config
115-
keys = fqn_key.split(".")
116-
for k in keys[:-1]:
117-
entry_to_change = entry_to_change[k]
118-
entry_to_change[keys[-1]] = yaml.safe_load(value)
119-
120111
available_agents = [item for item in list(config.keys()) if item != "base"]
121112

122113
if not args.agent:
@@ -130,14 +121,25 @@ def load_config():
130121
if "base" in config:
131122
# base config is specified (shared across agents)
132123
return_config = config["base"]
133-
agent_specific_config = config[args.agent]
134-
for key in agent_specific_config:
135-
# override base config with agent specific config
136-
return_config[key] = agent_specific_config[key]
124+
# Override base config with agent specific config
125+
for key, value in config[args.agent].items():
126+
return_config[key] = value
137127
else:
138128
# base config is not specified
139129
return_config = config[args.agent]
140130

131+
# Parse overridden params.
132+
for param in args.params:
133+
fqn_key, value = param.split("=")
134+
entry_to_change = return_config
135+
keys = fqn_key.split(".")
136+
for k in keys[:-1]:
137+
if k not in entry_to_change:
138+
entry_to_change[k] = {}
139+
140+
entry_to_change = entry_to_change[k]
141+
entry_to_change[keys[-1]] = yaml.safe_load(value)
142+
141143
# assume agent type is the key if not specified by the user
142144
if not return_config.get("agent_type"):
143145
return_config["agent_type"] = args.agent

debug_gym/gym/envs/__init__.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,11 @@
66
from debug_gym.gym.envs.swe_bench import SWEBenchEnv
77
from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
88
from debug_gym.gym.envs.swe_smith import SWESmithEnv
9+
from debug_gym.logger import DebugGymLogger
910

1011

1112
def select_env(env_type: str = None) -> type[RepoEnv]:
1213
match env_type:
13-
case None:
14-
return RepoEnv
1514
case "local":
1615
return LocalEnv
1716
case "aider":
@@ -27,4 +26,20 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
2726
case "r2egym":
2827
return R2EGymEnv
2928
case _:
30-
raise ValueError(f"Unknown benchmark {env_type}")
29+
raise ValueError(f"Unknown environment {env_type}")
30+
31+
32+
def load_dataset(config: dict, logger: DebugGymLogger | None = None) -> dict:
33+
"""Load dataset based on the given config."""
34+
if config.get("type") is None:
35+
raise ValueError("Dataset config must specify 'type' field.")
36+
37+
try:
38+
env = select_env(config.get("type"))
39+
except ValueError as e:
40+
raise ValueError(
41+
f"Unknown environment type '{config.get('type')}' from dataset's config: {config}"
42+
)
43+
44+
dataset = env.load_dataset(logger=logger, **config)
45+
return dataset

debug_gym/gym/envs/aider.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import os
23
import subprocess
34
import tempfile
@@ -8,16 +9,20 @@
89
from debug_gym.constants import DEBUG_GYM_CACHE_DIR
910
from debug_gym.gym.entities import EvalOutput
1011
from debug_gym.gym.envs.env import RepoEnv
12+
from debug_gym.gym.envs.local import LocalEnv
1113
from debug_gym.gym.terminals.docker import DockerTerminal
1214
from debug_gym.gym.terminals.terminal import Terminal
15+
from debug_gym.logger import DebugGymLogger
1316

1417
DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider"
1518

1619

17-
def build_docker_image(logger):
20+
def build_docker_image(logger: logging.Logger | None = None):
1821
"""
1922
Build a Docker image for the Aider environment.
2023
"""
24+
logger = logger or DebugGymLogger("debug-gym")
25+
2126
# Check if Docker image is built.
2227
import docker
2328

@@ -75,8 +80,13 @@ def __init__(
7580
if hasattr(terminal, "base_image") and terminal.base_image is None:
7681
terminal.base_image = DOCKER_AIDER_IMAGE_NAME
7782

78-
self.task_data = task_data
79-
super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
83+
super().__init__(
84+
task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
85+
)
86+
87+
@property
88+
def task_name(self) -> str:
89+
return self.current_task["task_name"]
8090

8191
@property
8292
def instructions(self) -> str:
@@ -95,7 +105,7 @@ def eval(self, **kwargs) -> EvalOutput:
95105
return self.last_eval
96106

97107
def setup_task(self):
98-
pass
108+
self.current_task = self.task_data
99109

100110
def setup_workspace(self):
101111
self.workspace.reset()
@@ -127,8 +137,9 @@ def setup_terminal(self):
127137
def load_dataset(
128138
cls,
129139
problems: str | list[str] | None = None,
130-
build_image: bool = False,
140+
build_image: bool = True,
131141
logger: object = None,
142+
**kwargs,
132143
) -> dict:
133144
if build_image:
134145
build_docker_image(logger)
@@ -167,11 +178,17 @@ def load_dataset(
167178
)
168179

169180
dataset[task_name] = {
181+
"task_name": task_name,
170182
"codebase": directory,
171183
"instructions": instructions,
172184
"filename": task_name + ".py",
173185
}
174186

175187
problems = utils.filter_problems(dataset, problems)
176188
dataset = {id: data for id, data in dataset.items() if id in problems}
189+
190+
# Add env_type to each task_data.
191+
for task_data in dataset.values():
192+
task_data["env_type"] = "aider"
193+
177194
return dataset

debug_gym/gym/envs/local.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,31 @@
11
from debug_gym.gym.envs.env import RepoEnv
2+
from debug_gym.gym.terminals.local import LocalTerminal
3+
from debug_gym.gym.terminals.terminal import Terminal
24

35

46
class LocalEnv(RepoEnv):
57

68
def __init__(
79
self,
810
path: str,
11+
terminal: Terminal | None = None,
912
entrypoint: str = "python -m pytest -sq .",
1013
debug_entrypoint: str | None = None,
1114
**kwargs,
1215
):
1316
task_data = {"path": path}
17+
terminal = terminal or LocalTerminal()
1418
super().__init__(
1519
task_data=task_data,
20+
terminal=terminal,
1621
entrypoint=entrypoint,
1722
debug_entrypoint=debug_entrypoint,
1823
**kwargs,
1924
)
2025

2126
@property
22-
def instruction(self) -> str:
23-
return f"Debug the local codebase at {self.path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue."
27+
def instructions(self) -> str:
28+
return f"Investigate the current repository, run the tests to figure out any issues, then rewrite the code to fix them."
2429

2530
@property
2631
def task(self) -> str:

debug_gym/gym/envs/mini_nightmare.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import tempfile
23
from pathlib import Path
34

@@ -7,14 +8,16 @@
78
from debug_gym.gym.envs.env import RepoEnv
89
from debug_gym.gym.terminals.docker import DockerTerminal
910
from debug_gym.gym.terminals.terminal import Terminal
11+
from debug_gym.logger import DebugGymLogger
1012

1113
DOCKER_MINI_NIGHTMARE_IMAGE_NAME = "debug-gym:mini-nightmare"
1214

1315

14-
def build_docker_image(logger):
16+
def build_docker_image(logger: logging.Logger | None = None):
1517
"""
1618
Build a Docker image for the Mini Nightmare environment.
1719
"""
20+
logger = logger or DebugGymLogger("debug-gym")
1821
# Check if Docker image is built.
1922
import docker
2023

@@ -86,10 +89,9 @@ def __init__(
8689
if hasattr(terminal, "base_image") and terminal.base_image is None:
8790
terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME
8891

89-
self.task_data = task_data
90-
self.task_name = task_data["task_name"]
91-
92-
super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
92+
super().__init__(
93+
task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
94+
)
9395

9496
@property
9597
def instructions(self) -> str:
@@ -99,6 +101,10 @@ def instructions(self) -> str:
99101
" Beaware that the bug may not be in the code you initially see."
100102
)
101103

104+
@property
105+
def task_name(self) -> str:
106+
return self.current_task["task_name"]
107+
102108
def calculate_max_score(self, eval_output: EvalOutput) -> int:
103109
return utils.extract_max_score_from_pytest_output(eval_output.output)
104110

@@ -112,7 +118,7 @@ def eval(self, **kwargs) -> EvalOutput:
112118
return self.last_eval
113119

114120
def setup_task(self):
115-
pass
121+
self.current_task = self.task_data
116122

117123
def setup_workspace(self):
118124
self.workspace.reset()
@@ -144,8 +150,9 @@ def setup_terminal(self):
144150
def load_dataset(
145151
cls,
146152
problems: str | list[str] | None = None,
147-
build_image: bool = False,
153+
build_image: bool = True,
148154
logger: object = None,
155+
**kwargs,
149156
) -> dict:
150157
if build_image:
151158
build_docker_image(logger)
@@ -167,10 +174,16 @@ def load_dataset(
167174
assert (task_path / ".debugreadonly").exists()
168175

169176
dataset[task_name] = {
177+
"task_name": task_name,
170178
"codebase": task_path,
171179
"filename": task_name + "_code.py",
172180
}
173181

174182
problems = utils.filter_problems(dataset, problems)
175183
dataset = {id: data for id, data in dataset.items() if id in problems}
184+
185+
# Add env_type to each task_data.
186+
for task_data in dataset.values():
187+
task_data["env_type"] = "mini_nightmare"
188+
176189
return dataset

0 commit comments

Comments
 (0)