microsoft
diff --git a/‎.github/actions/test-if-changes/action.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/actions/test-if-changes/action.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎debug_gym/agents/base_agent.py‎
Lines changed: 1 addition & 1 deletion b/‎debug_gym/agents/base_agent.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎debug_gym/agents/solution_agent.py‎
Lines changed: 29 additions & 21 deletions b/‎debug_gym/agents/solution_agent.py‎
Lines changed: 29 additions & 21 deletions
diff --git a/‎debug_gym/agents/utils.py‎
Lines changed: 15 additions & 13 deletions b/‎debug_gym/agents/utils.py‎
Lines changed: 15 additions & 13 deletions
diff --git a/‎debug_gym/gym/envs/__init__.py‎
Lines changed: 18 additions & 3 deletions b/‎debug_gym/gym/envs/__init__.py‎
Lines changed: 18 additions & 3 deletions
diff --git a/‎debug_gym/gym/envs/aider.py‎
Lines changed: 22 additions & 5 deletions b/‎debug_gym/gym/envs/aider.py‎
Lines changed: 22 additions & 5 deletions
diff --git a/‎debug_gym/gym/envs/local.py‎
Lines changed: 7 additions & 2 deletions b/‎debug_gym/gym/envs/local.py‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎debug_gym/gym/envs/mini_nightmare.py‎
Lines changed: 20 additions & 7 deletions b/‎debug_gym/gym/envs/mini_nightmare.py‎
Lines changed: 20 additions & 7 deletions
@@ -39,11 +39,13 @@ runs:
         else
           pip install "debug-gym[dev]==${{ inputs.version }}"
         fi
+        df -h
     - name: Run tests
       env:
         DEBUG_GYM_DEBUG: 1
       shell: bash
       run: |
+        free -h
         pytest ${{ inputs.test-files }} -vv -n 16 --timeout=600 --cov=debug_gym --cov-report=term-missing
     - name: Store coverage report
       uses: actions/upload-artifact@v4
 
@@ -405,5 +405,5 @@ def create_agent(
     if agent_args is None:
         raise ValueError("Either agent_args or config must be provided.")
 
-    agent = agent_class(args=agent_args, **agent_kwargs)
+    agent = agent_class(agent_args=agent_args, **agent_kwargs)
     return agent
@@ -39,28 +39,36 @@ def run(self, env, debug=False):
                 return True
 
             self.logger.info(f"Score: {info.score}/{info.max_score or '-'}")
-            # Make a simple pdb call to make sure it is working.
-            action = ToolCall(name="pdb", id="pdb", arguments={"command": "help help"})
-            pdb_help_info = self.env.step(action, None, None)
-            assert "h(elp)" in pdb_help_info.step_observation.observation, (
-                "PDB command did not return expected help message.\n"
-                f"{pdb_help_info.step_observation.observation}"
-            )
 
-            # Send a pdb continue command, and check the output matches the one from env.reset.
-            action = ToolCall(name="pdb", id="pdb", arguments={"command": "continue"})
-            pdb_continue_info = self.env.step(action, None, None)
-
-            assert (
-                "Reached the end of the program. Restarting the debugging session."
-                in pdb_continue_info.step_observation.observation
-            ) or (
-                info.step_observation.observation.splitlines()[-1]
-                in pdb_continue_info.step_observation.observation
-            ), (
-                "PDB command did not return expected continue message.\n"
-                f"{pdb_continue_info.step_observation.observation}"
-            )
+            if env.has_tool("pdb"):
+                # Make a simple pdb call to make sure it is working.
+                action = ToolCall(
+                    name="pdb", id="pdb", arguments={"command": "help help"}
+                )
+                pdb_help_info = self.env.step(action, None, None)
+                assert "h(elp)" in pdb_help_info.step_observation.observation, (
+                    "PDB command did not return expected help message.\n"
+                    f"{pdb_help_info.step_observation.observation}"
+                )
+
+                # Send a pdb continue command, and check the output matches the one from env.reset.
+                action = ToolCall(
+                    name="pdb", id="pdb", arguments={"command": "continue"}
+                )
+                pdb_continue_info = self.env.step(action, None, None)
+
+                pdb_observation = pdb_continue_info.step_observation.observation
+                expected_messages = [
+                    "Reached the end of the program. Restarting the debugging session.",
+                    "Uncaught exception. Entering post mortem debugging",
+                ]
+                reset_observation = info.step_observation.observation
+                if reset_observation.splitlines():
+                    expected_messages.append(reset_observation.splitlines()[-1])
+
+                assert any(
+                    msg in pdb_observation for msg in expected_messages
+                ), f"PDB command did not return expected continue message.\n{pdb_observation}"
 
             self.env.apply_gold_patch()
 
 
@@ -108,15 +108,6 @@ def load_config():
     with open(args.config_file) as reader:
         config = yaml.safe_load(reader)
 
-    # Parse overriden params.
-    for param in args.params:
-        fqn_key, value = param.split("=")
-        entry_to_change = config
-        keys = fqn_key.split(".")
-        for k in keys[:-1]:
-            entry_to_change = entry_to_change[k]
-        entry_to_change[keys[-1]] = yaml.safe_load(value)
-
     available_agents = [item for item in list(config.keys()) if item != "base"]
 
     if not args.agent:
@@ -130,14 +121,25 @@ def load_config():
     if "base" in config:
         # base config is specified (shared across agents)
         return_config = config["base"]
-        agent_specific_config = config[args.agent]
-        for key in agent_specific_config:
-            # override base config with agent specific config
-            return_config[key] = agent_specific_config[key]
+        # Override base config with agent specific config
+        for key, value in config[args.agent].items():
+            return_config[key] = value
     else:
         # base config is not specified
         return_config = config[args.agent]
 
+    # Parse overridden params.
+    for param in args.params:
+        fqn_key, value = param.split("=")
+        entry_to_change = return_config
+        keys = fqn_key.split(".")
+        for k in keys[:-1]:
+            if k not in entry_to_change:
+                entry_to_change[k] = {}
+
+            entry_to_change = entry_to_change[k]
+        entry_to_change[keys[-1]] = yaml.safe_load(value)
+
     # assume agent type is the key if not specified by the user
     if not return_config.get("agent_type"):
         return_config["agent_type"] = args.agent
 
@@ -6,12 +6,11 @@
 from debug_gym.gym.envs.swe_bench import SWEBenchEnv
 from debug_gym.gym.envs.swe_bench_debug import SWEBenchDebugEnv
 from debug_gym.gym.envs.swe_smith import SWESmithEnv
+from debug_gym.logger import DebugGymLogger
 
 
 def select_env(env_type: str = None) -> type[RepoEnv]:
     match env_type:
-        case None:
-            return RepoEnv
         case "local":
             return LocalEnv
         case "aider":
@@ -27,4 +26,20 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
         case "r2egym":
             return R2EGymEnv
         case _:
-            raise ValueError(f"Unknown benchmark {env_type}")
+            raise ValueError(f"Unknown environment {env_type}")
+
+
+def load_dataset(config: dict, logger: DebugGymLogger | None = None) -> dict:
+    """Load the dataset described by ``config``.
+
+    Args:
+        config: Dataset configuration. Its "type" entry selects the
+            environment class through ``select_env``; the whole mapping
+            (including "type") is then forwarded as keyword arguments to
+            that class's ``load_dataset``.
+        logger: Optional logger forwarded to the environment's
+            ``load_dataset``.
+
+    Returns:
+        The value returned by the selected environment's ``load_dataset``.
+
+    Raises:
+        ValueError: If "type" is missing or ``None``, or if ``select_env``
+            does not recognize it.
+    """
+    env_type = config.get("type")
+    if env_type is None:
+        raise ValueError("Dataset config must specify 'type' field.")
+
+    try:
+        env = select_env(env_type)
+    except ValueError as e:
+        # Chain explicitly so the original lookup failure is reported as the
+        # direct cause of this more descriptive error.
+        raise ValueError(
+            f"Unknown environment type '{env_type}' from dataset's config: {config}"
+        ) from e
+
+    return env.load_dataset(logger=logger, **config)
@@ -1,3 +1,4 @@
+import logging
 import os
 import subprocess
 import tempfile
@@ -8,16 +9,20 @@
 from debug_gym.constants import DEBUG_GYM_CACHE_DIR
 from debug_gym.gym.entities import EvalOutput
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.envs.local import LocalEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.logger import DebugGymLogger
 
 DOCKER_AIDER_IMAGE_NAME = "debug-gym:aider"
 
 
-def build_docker_image(logger):
+def build_docker_image(logger: logging.Logger | None = None):
     """
     Build a Docker image for the Aider environment.
     """
+    logger = logger or DebugGymLogger("debug-gym")
+
     # Check if Docker image is built.
     import docker
 
@@ -75,8 +80,13 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_AIDER_IMAGE_NAME
 
-        self.task_data = task_data
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
+
+    @property
+    def task_name(self) -> str:
+        """Return the "task_name" entry of ``self.current_task``."""
+        return self.current_task["task_name"]
 
     @property
     def instructions(self) -> str:
@@ -95,7 +105,7 @@ def eval(self, **kwargs) -> EvalOutput:
         return self.last_eval
 
     def setup_task(self):
-        pass
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -127,8 +137,9 @@ def setup_terminal(self):
     def load_dataset(
         cls,
         problems: str | list[str] | None = None,
-        build_image: bool = False,
+        build_image: bool = True,
         logger: object = None,
+        **kwargs,
     ) -> dict:
         if build_image:
             build_docker_image(logger)
@@ -167,11 +178,17 @@ def load_dataset(
             )
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": directory,
                 "instructions": instructions,
                 "filename": task_name + ".py",
             }
 
         problems = utils.filter_problems(dataset, problems)
         dataset = {id: data for id, data in dataset.items() if id in problems}
+
+        # Add env_type to each task_data.
+        for task_data in dataset.values():
+            task_data["env_type"] = "aider"
+
         return dataset
@@ -1,26 +1,31 @@
 from debug_gym.gym.envs.env import RepoEnv
+from debug_gym.gym.terminals.local import LocalTerminal
+from debug_gym.gym.terminals.terminal import Terminal
 
 
 class LocalEnv(RepoEnv):
 
     def __init__(
         self,
         path: str,
+        terminal: Terminal | None = None,
         entrypoint: str = "python -m pytest -sq .",
         debug_entrypoint: str | None = None,
         **kwargs,
     ):
+        """Initialize the environment from a local ``path``.
+
+        ``path`` is wrapped as ``{"path": path}`` and handed to the parent
+        initializer as ``task_data``; ``entrypoint``, ``debug_entrypoint``
+        and any extra keyword arguments are forwarded unchanged.
+        ``terminal`` falls back to a new ``LocalTerminal`` when not given.
+        """
         task_data = {"path": path}
+        # Fall back to a fresh LocalTerminal when no (truthy) terminal is passed.
+        terminal = terminal or LocalTerminal()
         super().__init__(
             task_data=task_data,
+            terminal=terminal,
             entrypoint=entrypoint,
             debug_entrypoint=debug_entrypoint,
             **kwargs,
         )
 
     @property
-    def instruction(self) -> str:
-        return f"Debug the local codebase at {self.path}. Investigate the repository, figure out the root cause, then rewrite the code to fix the issue."
+    def instructions(self) -> str:
+        return f"Investigate the current repository, run the tests to figure out any issues, then rewrite the code to fix them."
 
     @property
     def task(self) -> str:
 
@@ -1,3 +1,4 @@
+import logging
 import tempfile
 from pathlib import Path
 
@@ -7,14 +8,16 @@
 from debug_gym.gym.envs.env import RepoEnv
 from debug_gym.gym.terminals.docker import DockerTerminal
 from debug_gym.gym.terminals.terminal import Terminal
+from debug_gym.logger import DebugGymLogger
 
 DOCKER_MINI_NIGHTMARE_IMAGE_NAME = "debug-gym:mini-nightmare"
 
 
-def build_docker_image(logger):
+def build_docker_image(logger: logging.Logger | None = None):
     """
     Build a Docker image for the Mini Nightmare environment.
     """
+    logger = logger or DebugGymLogger("debug-gym")
     # Check if Docker image is built.
     import docker
 
@@ -86,10 +89,9 @@ def __init__(
         if hasattr(terminal, "base_image") and terminal.base_image is None:
             terminal.base_image = DOCKER_MINI_NIGHTMARE_IMAGE_NAME
 
-        self.task_data = task_data
-        self.task_name = task_data["task_name"]
-
-        super().__init__(entrypoint=entrypoint, terminal=terminal, **kwargs)
+        super().__init__(
+            task_data=task_data, entrypoint=entrypoint, terminal=terminal, **kwargs
+        )
 
     @property
     def instructions(self) -> str:
@@ -99,6 +101,10 @@ def instructions(self) -> str:
             " Beaware that the bug may not be in the code you initially see."
         )
 
+    @property
+    def task_name(self) -> str:
+        """Return the "task_name" entry of ``self.current_task``."""
+        return self.current_task["task_name"]
+
     def calculate_max_score(self, eval_output: EvalOutput) -> int:
         return utils.extract_max_score_from_pytest_output(eval_output.output)
 
@@ -112,7 +118,7 @@ def eval(self, **kwargs) -> EvalOutput:
         return self.last_eval
 
     def setup_task(self):
-        pass
+        self.current_task = self.task_data
 
     def setup_workspace(self):
         self.workspace.reset()
@@ -144,8 +150,9 @@ def setup_terminal(self):
     def load_dataset(
         cls,
         problems: str | list[str] | None = None,
-        build_image: bool = False,
+        build_image: bool = True,
         logger: object = None,
+        **kwargs,
     ) -> dict:
         if build_image:
             build_docker_image(logger)
@@ -167,10 +174,16 @@ def load_dataset(
             assert (task_path / ".debugreadonly").exists()
 
             dataset[task_name] = {
+                "task_name": task_name,
                 "codebase": task_path,
                 "filename": task_name + "_code.py",
             }
 
         problems = utils.filter_problems(dataset, problems)
         dataset = {id: data for id, data in dataset.items() if id in problems}
+
+        # Add env_type to each task_data.
+        for task_data in dataset.values():
+            task_data["env_type"] = "mini_nightmare"
+
         return dataset