
Commit 2d7c850

dmontagu authored and KRRT7 committed

Fix pydantic-evals panel rendering with evaluators (pydantic#2274)

1 parent 622ff39 · commit 2d7c850

2 files changed: +139 −4 lines


pydantic_evals/pydantic_evals/dataset.py

Lines changed: 7 additions & 3 deletions
@@ -43,7 +43,7 @@
 from .evaluators.context import EvaluatorContext
 from .otel import SpanTree
 from .otel._context_subtree import context_subtree
-from .reporting import EvaluationReport, ReportCase
+from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
 
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup  # pragma: lax no cover
@@ -83,6 +83,10 @@
 _YAML_SCHEMA_LINE_PREFIX = '# yaml-language-server: $schema='
 
 
+_REPORT_CASES_ADAPTER = TypeAdapter(list[ReportCase])
+_REPORT_CASE_AGGREGATE_ADAPTER = TypeAdapter(ReportCaseAggregate)
+
+
 class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
     """Internal model for a case, used for serialization/deserialization."""
 
@@ -303,9 +307,9 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
                 ),
             )
             # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
-            eval_span.set_attribute('cases', report.cases)
+            eval_span.set_attribute('cases', _REPORT_CASES_ADAPTER.dump_python(report.cases))
             # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
-            eval_span.set_attribute('averages', report.averages())
+            eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(report.averages()))
         return report
 
     def evaluate_sync(

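The fix works because Pydantic model instances like ReportCase are not valid OpenTelemetry span attribute values on their own; dumping them to plain Python containers first lets the attribute serializer encode them as JSON. A minimal sketch of the pattern, using a hypothetical Score model rather than the real ReportCase/ReportCaseAggregate:

from pydantic import BaseModel, TypeAdapter


class Score(BaseModel):  # hypothetical stand-in for ReportCase
    name: str
    value: float


# Module-level adapter, mirroring _REPORT_CASES_ADAPTER above, so the
# serialization schema is built once rather than on every call.
_SCORES_ADAPTER = TypeAdapter(list[Score])

scores = [Score(name='confidence', value=1.0)]

# dump_python converts the models into plain dicts/lists that a span
# attribute serializer can handle.
assert _SCORES_ADAPTER.dump_python(scores) == [{'name': 'confidence', 'value': 1.0}]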
tests/evals/test_dataset.py

Lines changed: 132 additions & 1 deletion
@@ -11,7 +11,7 @@
 from inline_snapshot import snapshot
 from pydantic import BaseModel
 
-from ..conftest import try_import
+from ..conftest import IsStr, try_import
 from .utils import render_table
 
 with try_import() as imports_successful:
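The new IsStr import lets machine-dependent values (absolute file paths) sit inside the otherwise exact span assertions added below. A hedged sketch of the matcher pattern, assuming ..conftest re-exports dirty_equals.IsStr (an assumption, not confirmed by this diff):

from dirty_equals import IsStr  # assumption: ..conftest.IsStr is this matcher

span_attrs = {'code.filepath': '/home/ci/tests/evals/test_dataset.py'}
# IsStr() compares equal to any str, so host-specific paths pass an
# otherwise exact dict comparison.
assert span_attrs == {'code.filepath': IsStr()}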
@@ -1086,3 +1086,134 @@ async def my_task(my_inputs: MyInputs) -> int | str:
 │ Averages │ │ 1.0s │
 └──────────┴────────────────────────────────────────────────────────────────────────────────────┴──────────┘
 """)
+
+
+async def test_evaluate_async_logfire(
+    example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata],
+    simple_evaluator: type[Evaluator[TaskInput, TaskOutput, TaskMetadata]],
+    capfire: CaptureLogfire,
+):
+    """Test evaluating a dataset."""
+    example_dataset.add_evaluator(simple_evaluator())
+
+    async def mock_async_task(inputs: TaskInput) -> TaskOutput:
+        if inputs.query == 'What is 2+2?':
+            return TaskOutput(answer='4')
+        elif inputs.query == 'What is the capital of France?':
+            return TaskOutput(answer='Paris')
+        return TaskOutput(answer='Unknown')  # pragma: no cover
+
+    await example_dataset.evaluate(mock_async_task)
+
+    spans = capfire.exporter.exported_spans_as_dict()
+    spans.sort(key=lambda s: s['start_time'])
+    assert spans == [
+        {
+            'attributes': {
+                'averages': '{"name":"Averages","scores":{"confidence":1.0},"labels":{},"metrics":{},"assertions":1.0,"task_duration":1.0,"total_duration":5.0}',
+                'cases': '[{"name":"case1","inputs":{"query":"What is '
+                '2+2?"},"metadata":{"difficulty":"easy","category":"general"},"expected_output":{"answer":"4","confidence":1.0},"output":{"answer":"4","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":6.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000003"},{"name":"case2","inputs":{"query":"What '
+                'is the capital of '
+                'France?"},"metadata":{"difficulty":"medium","category":"geography"},"expected_output":{"answer":"Paris","confidence":1.0},"output":{"answer":"Paris","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":4.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000007"}]',
+                'code.filepath': 'test_dataset.py',
+                'code.function': 'test_evaluate_async_logfire',
+                'code.lineno': 123,
+                'logfire.json_schema': '{"type":"object","properties":{"name":{},"cases":{"type":"array"},"averages":{"type":"object"}}}',
+                'logfire.msg': 'evaluate mock_async_task',
+                'logfire.msg_template': 'evaluate {name}',
+                'logfire.span_type': 'span',
+                'name': 'mock_async_task',
+            },
+            'context': {'is_remote': False, 'span_id': 1, 'trace_id': 1},
+            'end_time': 10000000000,
+            'name': 'evaluate {name}',
+            'parent': None,
+            'start_time': 1000000000,
+        },
+        {
+            'attributes': {
+                'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
+                'attributes': '{}',
+                'case_name': 'case1',
+                'code.filepath': IsStr(),
+                'code.lineno': 123,
+                'expected_output': '{"answer":"4","confidence":1.0}',
+                'inputs': '{"query":"What is 2+2?"}',
+                'labels': '{}',
+                'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}',
+                'logfire.msg': 'case: case1',
+                'logfire.msg_template': 'case: {case_name}',
+                'logfire.span_type': 'span',
+                'metadata': '{"difficulty":"easy","category":"general"}',
+                'metrics': '{}',
+                'output': '{"answer":"4","confidence":1.0}',
+                'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
+                'task_duration': 1.0,
+                'task_name': 'mock_async_task',
+            },
+            'context': {'is_remote': False, 'span_id': 3, 'trace_id': 1},
+            'end_time': 8000000000,
+            'name': 'case: {case_name}',
+            'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1},
+            'start_time': 2000000000,
+        },
+        {
+            'attributes': {
+                'code.filepath': IsStr(),
+                'code.lineno': 123,
+                'logfire.json_schema': '{"type":"object","properties":{"task":{}}}',
+                'logfire.msg': 'execute mock_async_task',
+                'logfire.msg_template': 'execute {task}',
+                'logfire.span_type': 'span',
+                'task': 'mock_async_task',
+            },
+            'context': {'is_remote': False, 'span_id': 5, 'trace_id': 1},
+            'end_time': 4000000000,
+            'name': 'execute {task}',
+            'parent': {'is_remote': False, 'span_id': 3, 'trace_id': 1},
+            'start_time': 3000000000,
+        },
+        {
+            'attributes': {
+                'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
+                'attributes': '{}',
+                'case_name': 'case2',
+                'code.filepath': IsStr(),
+                'code.lineno': 123,
+                'expected_output': '{"answer":"Paris","confidence":1.0}',
+                'inputs': '{"query":"What is the capital of France?"}',
+                'labels': '{}',
+                'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}',
+                'logfire.msg': 'case: case2',
+                'logfire.msg_template': 'case: {case_name}',
+                'logfire.span_type': 'span',
+                'metadata': '{"difficulty":"medium","category":"geography"}',
+                'metrics': '{}',
+                'output': '{"answer":"Paris","confidence":1.0}',
+                'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
+                'task_duration': 1.0,
+                'task_name': 'mock_async_task',
+            },
+            'context': {'is_remote': False, 'span_id': 7, 'trace_id': 1},
+            'end_time': 9000000000,
+            'name': 'case: {case_name}',
+            'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1},
+            'start_time': 5000000000,
+        },
+        {
+            'attributes': {
+                'code.filepath': IsStr(),
+                'code.lineno': 123,
+                'logfire.json_schema': '{"type":"object","properties":{"task":{}}}',
+                'logfire.msg': 'execute mock_async_task',
+                'logfire.msg_template': 'execute {task}',
+                'logfire.span_type': 'span',
+                'task': 'mock_async_task',
+            },
+            'context': {'is_remote': False, 'span_id': 9, 'trace_id': 1},
+            'end_time': 7000000000,
+            'name': 'execute {task}',
+            'parent': {'is_remote': False, 'span_id': 7, 'trace_id': 1},
+            'start_time': 6000000000,
+        },
+    ]
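For context, a minimal sketch of how the capfire fixture drives a test like the one added above, assuming the logfire.testing pytest plugin is available (names taken from this diff; the simplified span is illustrative only):

import logfire
from logfire.testing import CaptureLogfire


def test_span_capture(capfire: CaptureLogfire) -> None:
    with logfire.span('evaluate {name}', name='demo'):
        pass
    spans = capfire.exporter.exported_spans_as_dict()
    # The test exporter assigns deterministic trace/span ids and nanosecond
    # timestamps, which is what makes the exact list comparison above feasible.
    assert spans[0]['name'] == 'evaluate {name}'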
