Changes from 14 commits

Commits (21)
cd95918
Enforce UTF-8 for Goose session files.
Aug 29, 2025
a791ce5
Fixes issue #15. Prevents divide by zero errors and cleans up summari…
Aug 29, 2025
49891a3
Cleaned up output by using consistent printing methods.
Aug 29, 2025
46ad344
Fixes Issue #18 by implementing metric downgrades to Claude if OpenAP…
Aug 30, 2025
fc7ba41
Satisfied ruff's bizarre rules.
Aug 30, 2025
54dd3d3
Added extra logging and test for goose UTF-8 handling.
Aug 30, 2025
72f586c
Added metacoder configuration test cases for claude downgrade and no …
Aug 30, 2025
d7beb19
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
d88ca90
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
e7bba40
Added assertion to confirm that ClaudeJudge completed scoring the met…
Aug 30, 2025
d27277b
Added assertion to force test to fail on Exception. Increased logging…
Aug 30, 2025
3f22fc6
Fixed runtime issues related to metric downgrade from CorrectnessMetr…
Aug 30, 2025
d6e1e44
Added test coverage of new evaluation judge functionality. Added test…
Aug 30, 2025
882a3d9
Reduced logging verbosity. Added Anthropic quota check. Added automat…
Sep 2, 2025
c98c9d7
Fixed issue #23. Forced processes to be launched with UTF-8 encoding …
Sep 2, 2025
4761d19
Addressed ruff formatting issue.
Sep 2, 2025
6b64a79
Added output file check to fail if the output file already exists. Ot…
Sep 2, 2025
c436e7f
Modified save_results to append to existing output file rather than o…
Sep 2, 2025
b0b1c8b
Updated ClaudeJudge model to claude-sonnet-4-20250514.
Sep 3, 2025
a7e71e3
Revert "Modified save_results to append to existing output file rathe…
Sep 3, 2025
7e143da
Added UTF-8 encoding to prevent character mangling during YAML export…
Sep 4, 2025
2 changes: 1 addition & 1 deletion src/metacoder/coders/claude.py
@@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
ao.tool_uses = tool_uses

end_time = time.time()
logger.info(f"🤖 Command took {end_time - start_time} seconds")
logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
2 changes: 1 addition & 1 deletion src/metacoder/coders/codex.py
@@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
if "result" in message:
ao.result_text = message["result"]
end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
2 changes: 1 addition & 1 deletion src/metacoder/coders/gemini.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
logger.info(f"💎 Command took {end_time - start_time} seconds")
logger.info(f"💎 Command took {end_time - start_time:.2f} seconds")

# Parse the output
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
4 changes: 2 additions & 2 deletions src/metacoder/coders/goose.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
result = self.run_process(command, env)
end_time = time.time()
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
logger.info(f"🦆 Command took {end_time - start_time} seconds")
logger.info(f"🦆 Command took {end_time - start_time:.2f} seconds")
# look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
session_file: Optional[Path] = None
for line in result.stdout.split("\n"):
@@ -165,7 +165,7 @@ def run(self, input_text: str) -> CoderOutput:
session_file = Path(session_file_str)
break
if session_file and session_file.exists():
with open(session_file, "r") as f:
with open(session_file, "r", encoding="utf-8") as f:
ao.structured_messages = [
json.loads(line) for line in f if line.strip()
]
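Related commits (c98c9d7, cd95918) also force the coder subprocesses themselves to run under UTF-8, which is not shown in this hunk. The following is only a rough sketch of that pattern, using a hypothetical run_with_utf8 helper rather than the project's actual run_process implementation:

import os
import subprocess
from typing import Optional

def run_with_utf8(command: list[str], env: Optional[dict[str, str]] = None) -> subprocess.CompletedProcess:
    """Launch a CLI tool and decode its output as UTF-8 so emoji and other
    non-ASCII characters are not mangled on platforms with a different default codec."""
    merged_env = {**os.environ, **(env or {}), "PYTHONIOENCODING": "utf-8"}
    return subprocess.run(
        command,
        capture_output=True,
        encoding="utf-8",   # decode stdout/stderr as UTF-8 (implies text mode)
        errors="replace",   # substitute, rather than crash on, undecodable bytes
        env=merged_env,
    )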
2 changes: 1 addition & 1 deletion src/metacoder/coders/qwen.py
@@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")

# Create output - Qwen CLI doesn't provide structured output
ao = CoderOutput(
85 changes: 85 additions & 0 deletions src/metacoder/evals/judges.py
@@ -0,0 +1,85 @@
# metacoder/evals/judges.py
import logging
import os

from anthropic import Anthropic
from anthropic.types import MessageParam, TextBlockParam, TextBlock

from deepeval.models.base_model import DeepEvalBaseLLM

logger = logging.getLogger(__name__)


class ClaudeJudge(DeepEvalBaseLLM):
"""
Wraps Anthropic's Claude models so they can be used as
the `model` parameter to DeepEval metrics like GEval.
"""

def __init__(
self,
model_name: str = "claude-3-5-sonnet-20240620",
max_tokens: int = 1024,
temperature: float = 0.0,
):
super().__init__()
api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
raise Exception("ANTHROPIC_API_KEY is not set in environment")
self.client = Anthropic(api_key=api_key)
self.model_name = model_name
self.max_tokens = max_tokens
self.temperature = temperature

def load_model(self):
return self

def generate(self, prompt: str) -> str:
# Build typed content blocks and messages to satisfy the SDK's type hints
content: list[TextBlockParam] = [{"type": "text", "text": prompt}]
messages: list[MessageParam] = [{"role": "user", "content": content}]
resp = self.client.messages.create(
model=self.model_name,
max_tokens=self.max_tokens,
temperature=self.temperature,
messages=messages,
)
# anthropic returns a list of content blocks; collect only the text blocks.
parts: list[str] = []
for block in resp.content:
if isinstance(block, TextBlock):
parts.append(block.text)
return "".join(parts)

async def a_generate(self, prompt: str) -> str:
# for now just call the sync path
return self.generate(prompt)

def get_model_name(self) -> str:
return self.model_name

def has_available_quota(self) -> bool:
"""
Try a very lightweight request to check if quota is available.
Returns True if quota exists, False if Anthropic responds with
quota-related errors.
"""
try:
# Use a minimal "ping" request
content: list[TextBlockParam] = [{"type": "text", "text": "ping"}]
messages: list[MessageParam] = [{"role": "user", "content": content}]
self.client.messages.create(
model=self.model_name,
max_tokens=1, # cheapest possible
temperature=0.0,
messages=messages,
)
return True
except Exception as e:
msg = str(e).lower()
# Check for insufficient quota:
# 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
if "credit balance is too low" in msg or "400" in msg:
logger.warning(f"ClaudeJudge quota check failed: {e}")
return False
raise
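A minimal usage sketch for the new judge (not part of the diff): construct a ClaudeJudge, verify quota, and pass it as the model to a DeepEval GEval metric, mirroring what make_geval() in runner.py does below. The test case values here are placeholders.

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from metacoder.evals.judges import ClaudeJudge

# Assumes ANTHROPIC_API_KEY is set; the model name matches ClaudeJudge's default above.
judge = ClaudeJudge(model_name="claude-3-5-sonnet-20240620")
if judge.has_available_quota():
    metric = GEval(
        name="Correctness",
        criteria="Determine whether the actual output is factually correct based on the expected output.",
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
            LLMTestCaseParams.EXPECTED_OUTPUT,
        ],
        threshold=0.8,
        model=judge,  # Claude scores the metric instead of the OpenAI default
    )
    metric.measure(
        LLMTestCase(
            input="What is the capital of France?",
            actual_output="Paris",
            expected_output="Paris is the capital of France.",
        )
    )
    print(metric.score, metric.reason)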
166 changes: 141 additions & 25 deletions src/metacoder/evals/runner.py
@@ -5,27 +5,32 @@
"""

import copy
import functools
import importlib
import logging
import os
import time
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional, Type
from typing import Any, Dict, List, Optional, Type, cast

from pydantic import BaseModel
import yaml

from deepeval import evaluate
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import BaseMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from openai import APIStatusError
from openai.types.chat import ChatCompletionMessageParam

from metacoder.coders.base_coder import BaseCoder, CoderOutput
from metacoder.registry import AVAILABLE_CODERS
from metacoder.evals.eval_model import EvalCase, EvalDataset
from metacoder.configuration import AIModelConfig, CoderConfig


logger = logging.getLogger(__name__)


@@ -59,24 +64,34 @@ def is_successful(self) -> bool:
return self.success


def get_default_metrics() -> Dict[str, BaseMetric]:
"""Get default metrics. Creates instances lazily to avoid network calls during import."""
def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
"""Creates a GEval instance with the specified model."""
return GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
# NOTE: you can only provide either criteria or evaluation_steps, and not both
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK",
],
threshold=0.8,
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
model=model, # may be None (defaults to OpenAI) or a Claude judge
)


def get_default_metrics(
model: Optional[DeepEvalBaseLLM] = None,
) -> Dict[str, BaseMetric]:
"""Get default metrics with the specified model. Creates instances lazily to avoid network calls during import."""
return {
"CorrectnessMetric": GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
# NOTE: you can only provide either criteria or evaluation_steps, and not both
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK",
],
threshold=0.8,
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
"CorrectnessMetric": make_geval(
model=model # Note: GEval defaults to OpenAI if no model is specified.
),
"DummyMetric": DummyMetric(threshold=0.5),
}
@@ -123,6 +138,8 @@ class EvalRunner:

def __init__(self, verbose: bool = False):
self.verbose = verbose
self.use_openai = True  # GEval defaults to OpenAI; downgrade to another provider or metric if its quota runs out.

if verbose:
logging.basicConfig(level=logging.DEBUG)
else:
@@ -183,6 +200,48 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase:
additional_metadata=case.additional_metadata,
)

@functools.lru_cache(maxsize=1)
def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool:
"""
Preflight: detect “no OpenAI quota” and skip/redirect before calling evaluate.
Fast probe of the /chat/completions endpoint (the one GEval uses).
Returns False on 429 (insufficient_quota) or any exception.
"""
if not os.getenv("OPENAI_API_KEY"):
logger.info("OPENAI_API_KEY is not set.")
return False
try:
from openai import OpenAI

# turn off SDK retries for the check so it returns fast
client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail
raw = [{"role": "user", "content": "ping"}]
messages = cast(List[ChatCompletionMessageParam], raw)
client.chat.completions.create(
model=model,
messages=messages,
max_tokens=1,
temperature=0,
)
return True
except APIStatusError as e:
# 429 insufficient quota or too many requests
if e.status_code == 429:
logger.warning(f"OpenAI API Key has insufficient quota: {e}")
return False
# 401 authentication problem, including invalid API key
if e.status_code == 401:
logger.warning(f"OpenAI API Authentication Error: {e}")
return False
# all other errors
logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}")
return False
except Exception as e:
# includes network issues, etc.
logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}")
return False

def run_single_eval(
self,
model_name: str,
@@ -235,8 +294,65 @@ def run_single_eval(
test_case = self.create_test_case(case, actual_output)

# Evaluate
logger.info(f"Evaluating with {metric_name}")
eval_results = evaluate([test_case], [metric])
logger.info(
f"Evaluating {metric_name} using model {metric.model.model_name}"
)

if isinstance(metric, GEval):
# Assume GEval will use OpenAI until it is disabled.
if self.use_openai and not self._openai_quota_ok():
logger.warning(
"OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
)
self.use_openai = False

# Note: This downgrades the metric, when needed, each time it is about to be used, without modifying the default metrics.
if not self.use_openai:
claude_model = "claude-3-5-sonnet-20240620"
logger.warning(
f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}."
)

try:
# Downgrade metric model to Claude judge.
from metacoder.evals.judges import ClaudeJudge

judge = ClaudeJudge(claude_model)

if not judge.has_available_quota():
raise Exception(
"No Anthropic credits available for ClaudeJudge."
)

metric = make_geval(model=judge)
logger.info(
f"Successfully downgraded {metric_name} model to {metric.model.model_name}."
)
except Exception as e:
# Fallback: if you can't use Claude, downgrade gracefully.
logging.debug(traceback.format_exc())
logger.debug(e)
logger.warning(
f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric."
)
metric = DummyMetric(threshold=0.5)
logger.warning(f"Downgraded {metric_name} to {metric.name}.")

eval_results = evaluate(
[test_case],
[metric],
async_config=AsyncConfig(run_async=False), # disable async
display_config=DisplayConfig(
show_indicator=False, # hide the progress meter
print_results=False,
verbose_mode=self.verbose,
),
cache_config=CacheConfig(use_cache=False, write_cache=False),
error_config=ErrorConfig(
ignore_errors=False, # actually fail on failure
skip_on_missing_params=True,
),
)

# Extract results - the structure varies by deepeval version
test_result = eval_results.test_results[0]
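For readability, the judge-selection order implemented above can be summarized as a small standalone helper (pick_judge_metric is a hypothetical name, and the caching of self.use_openai is omitted), assuming make_geval and DummyMetric are importable from metacoder.evals.runner:

from deepeval.metrics import BaseMetric

from metacoder.evals.runner import DummyMetric, EvalRunner, make_geval


def pick_judge_metric(runner: EvalRunner) -> BaseMetric:
    """Fallback chain: OpenAI-backed GEval -> Claude-backed GEval -> DummyMetric."""
    if runner.use_openai and runner._openai_quota_ok():
        return make_geval()  # GEval's default model (OpenAI)
    try:
        from metacoder.evals.judges import ClaudeJudge

        judge = ClaudeJudge("claude-3-5-sonnet-20240620")
        if judge.has_available_quota():
            return make_geval(model=judge)
    except Exception:
        pass  # no Anthropic key or no credits
    return DummyMetric(threshold=0.5)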