|
11 | 11 | from inline_snapshot import snapshot
|
12 | 12 | from pydantic import BaseModel
|
13 | 13 |
|
14 |
| -from ..conftest import try_import |
| 14 | +from ..conftest import IsStr, try_import |
15 | 15 | from .utils import render_table
|
16 | 16 |
|
17 | 17 | with try_import() as imports_successful:
|
@@ -1086,3 +1086,134 @@ async def my_task(my_inputs: MyInputs) -> int | str:
|
1086 | 1086 | │ Averages │ │ 1.0s │
|
1087 | 1087 | └──────────┴────────────────────────────────────────────────────────────────────────────────────┴──────────┘
|
1088 | 1088 | """)
|
| 1089 | + |
| 1090 | + |
| 1091 | +async def test_evaluate_async_logfire( |
| 1092 | + example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata], |
| 1093 | + simple_evaluator: type[Evaluator[TaskInput, TaskOutput, TaskMetadata]], |
| 1094 | + capfire: CaptureLogfire, |
| 1095 | +): |
| 1096 | + """Test that evaluating a dataset emits the expected logfire spans (evaluate, per-case, and task-execution).""" |
| 1097 | + example_dataset.add_evaluator(simple_evaluator()) |
| 1098 | + |
| 1099 | + async def mock_async_task(inputs: TaskInput) -> TaskOutput: |
| 1100 | + if inputs.query == 'What is 2+2?': |
| 1101 | + return TaskOutput(answer='4') |
| 1102 | + elif inputs.query == 'What is the capital of France?': |
| 1103 | + return TaskOutput(answer='Paris') |
| 1104 | + return TaskOutput(answer='Unknown') # pragma: no cover |
| 1105 | + |
| 1106 | + await example_dataset.evaluate(mock_async_task) |
| 1107 | + |
| 1108 | + spans = capfire.exporter.exported_spans_as_dict() |
| 1109 | + spans.sort(key=lambda s: s['start_time']) |
| 1110 | + assert spans == [ |
| 1111 | + { |
| 1112 | + 'attributes': { |
| 1113 | + 'averages': '{"name":"Averages","scores":{"confidence":1.0},"labels":{},"metrics":{},"assertions":1.0,"task_duration":1.0,"total_duration":5.0}', |
| 1114 | + 'cases': '[{"name":"case1","inputs":{"query":"What is ' |
| 1115 | + '2+2?"},"metadata":{"difficulty":"easy","category":"general"},"expected_output":{"answer":"4","confidence":1.0},"output":{"answer":"4","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":6.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000003"},{"name":"case2","inputs":{"query":"What ' |
| 1116 | + 'is the capital of ' |
| 1117 | + 'France?"},"metadata":{"difficulty":"medium","category":"geography"},"expected_output":{"answer":"Paris","confidence":1.0},"output":{"answer":"Paris","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":4.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000007"}]', |
| 1118 | + 'code.filepath': 'test_dataset.py', |
| 1119 | + 'code.function': 'test_evaluate_async_logfire', |
| 1120 | + 'code.lineno': 123, |
| 1121 | + 'logfire.json_schema': '{"type":"object","properties":{"name":{},"cases":{"type":"array"},"averages":{"type":"object"}}}', |
| 1122 | + 'logfire.msg': 'evaluate mock_async_task', |
| 1123 | + 'logfire.msg_template': 'evaluate {name}', |
| 1124 | + 'logfire.span_type': 'span', |
| 1125 | + 'name': 'mock_async_task', |
| 1126 | + }, |
| 1127 | + 'context': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, |
| 1128 | + 'end_time': 10000000000, |
| 1129 | + 'name': 'evaluate {name}', |
| 1130 | + 'parent': None, |
| 1131 | + 'start_time': 1000000000, |
| 1132 | + }, |
| 1133 | + { |
| 1134 | + 'attributes': { |
| 1135 | + 'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
| 1136 | + 'attributes': '{}', |
| 1137 | + 'case_name': 'case1', |
| 1138 | + 'code.filepath': IsStr(), |
| 1139 | + 'code.lineno': 123, |
| 1140 | + 'expected_output': '{"answer":"4","confidence":1.0}', |
| 1141 | + 'inputs': '{"query":"What is 2+2?"}', |
| 1142 | + 'labels': '{}', |
| 1143 | + 'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}', |
| 1144 | + 'logfire.msg': 'case: case1', |
| 1145 | + 'logfire.msg_template': 'case: {case_name}', |
| 1146 | + 'logfire.span_type': 'span', |
| 1147 | + 'metadata': '{"difficulty":"easy","category":"general"}', |
| 1148 | + 'metrics': '{}', |
| 1149 | + 'output': '{"answer":"4","confidence":1.0}', |
| 1150 | + 'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
| 1151 | + 'task_duration': 1.0, |
| 1152 | + 'task_name': 'mock_async_task', |
| 1153 | + }, |
| 1154 | + 'context': {'is_remote': False, 'span_id': 3, 'trace_id': 1}, |
| 1155 | + 'end_time': 8000000000, |
| 1156 | + 'name': 'case: {case_name}', |
| 1157 | + 'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, |
| 1158 | + 'start_time': 2000000000, |
| 1159 | + }, |
| 1160 | + { |
| 1161 | + 'attributes': { |
| 1162 | + 'code.filepath': IsStr(), |
| 1163 | + 'code.lineno': 123, |
| 1164 | + 'logfire.json_schema': '{"type":"object","properties":{"task":{}}}', |
| 1165 | + 'logfire.msg': 'execute mock_async_task', |
| 1166 | + 'logfire.msg_template': 'execute {task}', |
| 1167 | + 'logfire.span_type': 'span', |
| 1168 | + 'task': 'mock_async_task', |
| 1169 | + }, |
| 1170 | + 'context': {'is_remote': False, 'span_id': 5, 'trace_id': 1}, |
| 1171 | + 'end_time': 4000000000, |
| 1172 | + 'name': 'execute {task}', |
| 1173 | + 'parent': {'is_remote': False, 'span_id': 3, 'trace_id': 1}, |
| 1174 | + 'start_time': 3000000000, |
| 1175 | + }, |
| 1176 | + { |
| 1177 | + 'attributes': { |
| 1178 | + 'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
| 1179 | + 'attributes': '{}', |
| 1180 | + 'case_name': 'case2', |
| 1181 | + 'code.filepath': IsStr(), |
| 1182 | + 'code.lineno': 123, |
| 1183 | + 'expected_output': '{"answer":"Paris","confidence":1.0}', |
| 1184 | + 'inputs': '{"query":"What is the capital of France?"}', |
| 1185 | + 'labels': '{}', |
| 1186 | + 'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}', |
| 1187 | + 'logfire.msg': 'case: case2', |
| 1188 | + 'logfire.msg_template': 'case: {case_name}', |
| 1189 | + 'logfire.span_type': 'span', |
| 1190 | + 'metadata': '{"difficulty":"medium","category":"geography"}', |
| 1191 | + 'metrics': '{}', |
| 1192 | + 'output': '{"answer":"Paris","confidence":1.0}', |
| 1193 | + 'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
| 1194 | + 'task_duration': 1.0, |
| 1195 | + 'task_name': 'mock_async_task', |
| 1196 | + }, |
| 1197 | + 'context': {'is_remote': False, 'span_id': 7, 'trace_id': 1}, |
| 1198 | + 'end_time': 9000000000, |
| 1199 | + 'name': 'case: {case_name}', |
| 1200 | + 'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, |
| 1201 | + 'start_time': 5000000000, |
| 1202 | + }, |
| 1203 | + { |
| 1204 | + 'attributes': { |
| 1205 | + 'code.filepath': IsStr(), |
| 1206 | + 'code.lineno': 123, |
| 1207 | + 'logfire.json_schema': '{"type":"object","properties":{"task":{}}}', |
| 1208 | + 'logfire.msg': 'execute mock_async_task', |
| 1209 | + 'logfire.msg_template': 'execute {task}', |
| 1210 | + 'logfire.span_type': 'span', |
| 1211 | + 'task': 'mock_async_task', |
| 1212 | + }, |
| 1213 | + 'context': {'is_remote': False, 'span_id': 9, 'trace_id': 1}, |
| 1214 | + 'end_time': 7000000000, |
| 1215 | + 'name': 'execute {task}', |
| 1216 | + 'parent': {'is_remote': False, 'span_id': 7, 'trace_id': 1}, |
| 1217 | + 'start_time': 6000000000, |
| 1218 | + }, |
| 1219 | + ] |
0 commit comments