Changes from 14 commits

Commits (21)
cd95918
Enforce UTF-8 for Goose session files.
Aug 29, 2025
a791ce5
Fixes issue #15. Prevents divide by zero errors and cleans up summari…
Aug 29, 2025
49891a3
Cleaned up output by using consistent printing methods.
Aug 29, 2025
46ad344
Fixes Issue #18 by implementing metric downgrades to Claude if OpenAP…
Aug 30, 2025
fc7ba41
Satisfied ruff's bizarre rules.
Aug 30, 2025
54dd3d3
Added extra logging and test for goose UTF-8 handling.
Aug 30, 2025
72f586c
Added metacoder configuration test cases for claude downgrade and no …
Aug 30, 2025
d7beb19
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
d88ca90
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
e7bba40
Added assertion to confirm that ClaudeJudge completed scoring the met…
Aug 30, 2025
d27277b
Added assertion to force test to fail on Exception. Increased logging…
Aug 30, 2025
3f22fc6
Fixed runtime issues related to metric downgrade from CorrectnessMetr…
Aug 30, 2025
d6e1e44
Added test coverage of new evaluation judge functionality. Added test…
Aug 30, 2025
882a3d9
Reduced logging verbosity. Added Anthropic quota check. Added automat…
Sep 2, 2025
c98c9d7
Fixed issue #23. Forced processes to be launched with UTF-8 encoding …
Sep 2, 2025
4761d19
Addressed ruff formatting issue.
Sep 2, 2025
6b64a79
Added output file check to fail if the output file already exists. Ot…
Sep 2, 2025
c436e7f
Modified save_results to append to existing output file rather than o…
Sep 2, 2025
b0b1c8b
Updated ClaudeJudge model to claude-sonnet-4-20250514.
Sep 3, 2025
a7e71e3
Revert "Modified save_results to append to existing output file rathe…
Sep 3, 2025
7e143da
Added UTF-8 encoding to prevent character mangling during YAML export…
Sep 4, 2025
2 changes: 1 addition & 1 deletion src/metacoder/coders/claude.py
@@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
ao.tool_uses = tool_uses

end_time = time.time()
logger.info(f"🤖 Command took {end_time - start_time} seconds")
logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
2 changes: 1 addition & 1 deletion src/metacoder/coders/codex.py
@@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
if "result" in message:
ao.result_text = message["result"]
end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
2 changes: 1 addition & 1 deletion src/metacoder/coders/gemini.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
logger.info(f"💎 Command took {end_time - start_time} seconds")
logger.info(f"💎 Command took {end_time - start_time:.2f} seconds")

# Parse the output
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
4 changes: 2 additions & 2 deletions src/metacoder/coders/goose.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
result = self.run_process(command, env)
end_time = time.time()
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
logger.info(f"🦆 Command took {end_time - start_time} seconds")
logger.info(f"🦆 Command took {end_time - start_time:.2f} seconds")
# look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
session_file: Optional[Path] = None
for line in result.stdout.split("\n"):
@@ -165,7 +165,7 @@ def run(self, input_text: str) -> CoderOutput:
session_file = Path(session_file_str)
break
if session_file and session_file.exists():
with open(session_file, "r") as f:
with open(session_file, "r", encoding="utf-8") as f:
ao.structured_messages = [
json.loads(line) for line in f if line.strip()
]
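Related commits (c98c9d7, cd95918) also force the coder subprocesses themselves to run under UTF-8, which is not shown in this hunk. The following is only a rough sketch of that pattern, using a hypothetical run_with_utf8 helper rather than the project's actual run_process implementation:

import os
import subprocess
from typing import Optional

def run_with_utf8(command: list[str], env: Optional[dict[str, str]] = None) -> subprocess.CompletedProcess:
    """Launch a CLI tool and decode its output as UTF-8 so emoji and other
    non-ASCII characters are not mangled on platforms with a different default codec."""
    merged_env = {**os.environ, **(env or {}), "PYTHONIOENCODING": "utf-8"}
    return subprocess.run(
        command,
        capture_output=True,
        encoding="utf-8",   # decode stdout/stderr as UTF-8 (implies text mode)
        errors="replace",   # substitute, rather than crash on, undecodable bytes
        env=merged_env,
    )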
2 changes: 1 addition & 1 deletion src/metacoder/coders/qwen.py
@@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")

# Create output - Qwen CLI doesn't provide structured output
ao = CoderOutput(
85 changes: 85 additions & 0 deletions src/metacoder/evals/judges.py
@@ -0,0 +1,85 @@
# metacoder/evals/judges.py
import logging
import os

from anthropic import Anthropic
from anthropic.types import MessageParam, TextBlockParam, TextBlock

from deepeval.models.base_model import DeepEvalBaseLLM

logger = logging.getLogger(__name__)


class ClaudeJudge(DeepEvalBaseLLM):
"""
Wraps Anthropic's Claude models so they can be used as
the `model` parameter to DeepEval metrics like GEval.
"""

def __init__(
self,
model_name: str = "claude-3-5-sonnet-20240620",
max_tokens: int = 1024,
temperature: float = 0.0,
):
super().__init__()
api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
raise Exception("ANTHROPIC_API_KEY is not set in environment")
self.client = Anthropic(api_key=api_key)
self.model_name = model_name
self.max_tokens = max_tokens
self.temperature = temperature

def load_model(self):
return self

def generate(self, prompt: str) -> str:
# Build typed content blocks and messages to satisfy the SDK's type hints
content: list[TextBlockParam] = [{"type": "text", "text": prompt}]
messages: list[MessageParam] = [{"role": "user", "content": content}]
resp = self.client.messages.create(
model=self.model_name,
max_tokens=self.max_tokens,
temperature=self.temperature,
messages=messages,
)
# anthropic returns a list of content blocks; collect only the text blocks.
parts: list[str] = []
for block in resp.content:
if isinstance(block, TextBlock):
parts.append(block.text)
return "".join(parts)

async def a_generate(self, prompt: str) -> str:
# for now just call the sync path
return self.generate(prompt)

def get_model_name(self) -> str:
return self.model_name

def has_available_quota(self) -> bool:
"""
Try a very lightweight request to check if quota is available.
Returns True if quota exists, False if Anthropic responds with
quota-related errors.
"""
try:
# Use a minimal "ping" request
content: list[TextBlockParam] = [{"type": "text", "text": "ping"}]
messages: list[MessageParam] = [{"role": "user", "content": content}]
self.client.messages.create(
model=self.model_name,
max_tokens=1, # cheapest possible
temperature=0.0,
messages=messages,
)
return True
except Exception as e:
msg = str(e).lower()
# Check for insufficient quota:
# 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
if "credit balance is too low" in msg or "400" in msg:
logger.warning(f"ClaudeJudge quota check failed: {e}")
return False
raise
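A minimal usage sketch for the new judge (not part of the diff): construct a ClaudeJudge, verify quota, and pass it as the model to a DeepEval GEval metric, mirroring what make_geval() in runner.py does below. The test case values here are placeholders.

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from metacoder.evals.judges import ClaudeJudge

# Assumes ANTHROPIC_API_KEY is set; the model name matches ClaudeJudge's default above.
judge = ClaudeJudge(model_name="claude-3-5-sonnet-20240620")
if judge.has_available_quota():
    metric = GEval(
        name="Correctness",
        criteria="Determine whether the actual output is factually correct based on the expected output.",
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
            LLMTestCaseParams.EXPECTED_OUTPUT,
        ],
        threshold=0.8,
        model=judge,  # Claude scores the metric instead of the OpenAI default
    )
    metric.measure(
        LLMTestCase(
            input="What is the capital of France?",
            actual_output="Paris",
            expected_output="Paris is the capital of France.",
        )
    )
    print(metric.score, metric.reason)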
166 changes: 141 additions & 25 deletions src/metacoder/evals/runner.py
@@ -5,27 +5,32 @@
"""

import copy
import functools
import importlib
import logging
import os
import time
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional, Type
from typing import Any, Dict, List, Optional, Type, cast

from pydantic import BaseModel
import yaml

from deepeval import evaluate
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics import BaseMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from openai import APIStatusError
from openai.types.chat import ChatCompletionMessageParam

from metacoder.coders.base_coder import BaseCoder, CoderOutput
from metacoder.registry import AVAILABLE_CODERS
from metacoder.evals.eval_model import EvalCase, EvalDataset
from metacoder.configuration import AIModelConfig, CoderConfig


logger = logging.getLogger(__name__)


@@ -59,24 +64,34 @@ def is_successful(self) -> bool:
return self.success


def get_default_metrics() -> Dict[str, BaseMetric]:
"""Get default metrics. Creates instances lazily to avoid network calls during import."""
def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval:
"""Creates a GEval instance with the specified model."""
return GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
# NOTE: you can only provide either criteria or evaluation_steps, and not both
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK",
],
threshold=0.8,
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
model=model, # may be None (defaults to OpenAI) or a Claude judge
)


def get_default_metrics(
model: Optional[DeepEvalBaseLLM] = None,
) -> Dict[str, BaseMetric]:
"""Get default metrics with the specified model. Creates instances lazily to avoid network calls during import."""
return {
"CorrectnessMetric": GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
# NOTE: you can only provide either criteria or evaluation_steps, and not both
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK",
],
threshold=0.8,
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT,
],
"CorrectnessMetric": make_geval(
model=model # Note: GEval defaults to OpenAI if no model is specified.
),
"DummyMetric": DummyMetric(threshold=0.5),
}
@@ -123,6 +138,8 @@ class EvalRunner:

def __init__(self, verbose: bool = False):
self.verbose = verbose
self.use_openai = True  # GEval defaults to OpenAI; downgrade to another provider or metric if its quota runs out.

if verbose:
logging.basicConfig(level=logging.DEBUG)
else:
@@ -183,6 +200,48 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase:
additional_metadata=case.additional_metadata,
)

@functools.lru_cache(maxsize=1)
def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool:
"""
Preflight: detect “no OpenAI quota” and skip/redirect before calling evaluate.
Fast probe of the /chat/completions endpoint (the one GEval uses).
Returns False on 429 (insufficient_quota) or any exception.
"""
if not os.getenv("OPENAI_API_KEY"):
logger.info("OPENAI_API_KEY is not set.")
return False
try:
from openai import OpenAI

# turn off SDK retries for the check so it returns fast
client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail
raw = [{"role": "user", "content": "ping"}]
messages = cast(List[ChatCompletionMessageParam], raw)
client.chat.completions.create(
model=model,
messages=messages,
max_tokens=1,
temperature=0,
)
return True
except APIStatusError as e:
# 429 insufficient quota or too many requests
if e.status_code == 429:
logger.warning(f"OpenAI API Key has insufficient quota: {e}")
return False
# 401 authentication problem, including invalid API key
if e.status_code == 401:
logger.warning(f"OpenAI API Authentication Error: {e}")
return False
# all other errors
logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}")
return False
except Exception as e:
# includes network issues, etc.
logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}")
return False

def run_single_eval(
self,
model_name: str,
@@ -235,8 +294,65 @@ def run_single_eval(
test_case = self.create_test_case(case, actual_output)

# Evaluate
logger.info(f"Evaluating with {metric_name}")
eval_results = evaluate([test_case], [metric])
logger.info(
f"Evaluating {metric_name} using model {metric.model.model_name}"
)

if isinstance(metric, GEval):
# Assume GEval will use OpenAI until it is disabled.
if self.use_openai and not self._openai_quota_ok():
logger.warning(
"OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
)
self.use_openai = False

# Note: This downgrades the metric, when needed, each time it is about to be used, without modifying the default metrics.
if not self.use_openai:
claude_model = "claude-3-5-sonnet-20240620"
logger.warning(
f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}."
)

try:
# Downgrade metric model to Claude judge.
from metacoder.evals.judges import ClaudeJudge

judge = ClaudeJudge(claude_model)

if not judge.has_available_quota():
raise Exception(
"No Anthropic credits available for ClaudeJudge."
)

metric = make_geval(model=judge)
logger.info(
f"Successfully downgraded {metric_name} model to {metric.model.model_name}."
)
except Exception as e:
# Fallback: if you can't use Claude, downgrade gracefully.
logging.debug(traceback.format_exc())
logger.debug(e)
logger.warning(
f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric."
)
metric = DummyMetric(threshold=0.5)
logger.warning(f"Downgraded {metric_name} to {metric.name}.")

eval_results = evaluate(
[test_case],
[metric],
async_config=AsyncConfig(run_async=False), # disable async
display_config=DisplayConfig(
show_indicator=False, # hide the progress meter
print_results=False,
verbose_mode=self.verbose,
),
cache_config=CacheConfig(use_cache=False, write_cache=False),
error_config=ErrorConfig(
ignore_errors=False, # actually fail on failure
skip_on_missing_params=True,
),
)

# Extract results - the structure varies by deepeval version
test_result = eval_results.test_results[0]
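For readability, the judge-selection order implemented above can be summarized as a small standalone helper (pick_judge_metric is a hypothetical name, and the caching of self.use_openai is omitted), assuming make_geval and DummyMetric are importable from metacoder.evals.runner:

from deepeval.metrics import BaseMetric

from metacoder.evals.runner import DummyMetric, EvalRunner, make_geval


def pick_judge_metric(runner: EvalRunner) -> BaseMetric:
    """Fallback chain: OpenAI-backed GEval -> Claude-backed GEval -> DummyMetric."""
    if runner.use_openai and runner._openai_quota_ok():
        return make_geval()  # GEval's default model (OpenAI)
    try:
        from metacoder.evals.judges import ClaudeJudge

        judge = ClaudeJudge("claude-3-5-sonnet-20240620")
        if judge.has_available_quota():
            return make_geval(model=judge)
    except Exception:
        pass  # no Anthropic key or no credits
    return DummyMetric(threshold=0.5)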