From b3819b413300596c8668d813d2aa043c3f20ad91 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:32:01 -0700 Subject: [PATCH 1/5] Support including reasons in the console output --- .../pydantic_evals/reporting/__init__.py | 41 ++++++++++++++----- tests/evals/test_reporting.py | 4 ++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index 65dae81448..fffda30edd 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -4,7 +4,7 @@ from collections.abc import Mapping from dataclasses import dataclass from io import StringIO -from typing import Any, Callable, Generic, Literal, Protocol +from typing import Any, Callable, Generic, Literal, Protocol, cast from pydantic import BaseModel, TypeAdapter from rich.console import Console @@ -168,6 +168,7 @@ def print( self, width: int | None = None, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None, + *, include_input: bool = False, include_metadata: bool = False, include_expected_output: bool = False, @@ -183,6 +184,7 @@ def print( label_configs: dict[str, RenderValueConfig] | None = None, metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, + include_reasons: bool = False, ): # pragma: no cover """Print this report to the console, optionally comparing it to a baseline report. @@ -205,12 +207,14 @@ def print( label_configs=label_configs, metric_configs=metric_configs, duration_config=duration_config, + include_reasons=include_reasons, ) Console(width=width).print(table) def console_table( self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None, + *, include_input: bool = False, include_metadata: bool = False, include_expected_output: bool = False, @@ -226,6 +230,7 @@ def console_table( label_configs: dict[str, RenderValueConfig] | None = None, metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, + include_reasons: bool = False, ) -> Table: """Return a table containing the data from this report, or the diff between this report and a baseline report. 
@@ -247,6 +252,7 @@ def console_table( label_configs=label_configs or {}, metric_configs=metric_configs or {}, duration_config=duration_config or _DEFAULT_DURATION_CONFIG, + include_reasons=include_reasons, ) if baseline is None: return renderer.build_table(self) @@ -529,15 +535,16 @@ class ReportCaseRenderer: include_labels: bool include_metrics: bool include_assertions: bool + include_reasons: bool include_durations: bool include_total_duration: bool input_renderer: _ValueRenderer metadata_renderer: _ValueRenderer output_renderer: _ValueRenderer - score_renderers: dict[str, _NumberRenderer] - label_renderers: dict[str, _ValueRenderer] - metric_renderers: dict[str, _NumberRenderer] + score_renderers: Mapping[str, _NumberRenderer] + label_renderers: Mapping[str, _ValueRenderer] + metric_renderers: Mapping[str, _NumberRenderer] duration_renderer: _NumberRenderer def build_base_table(self, title: str) -> Table: @@ -581,10 +588,10 @@ def build_row(self, case: ReportCase) -> list[str]: row.append(self.output_renderer.render_value(None, case.output) or EMPTY_CELL_STR) if self.include_scores: - row.append(self._render_dict({k: v.value for k, v in case.scores.items()}, self.score_renderers)) + row.append(self._render_dict({k: v for k, v in case.scores.items()}, self.score_renderers)) if self.include_labels: - row.append(self._render_dict({k: v.value for k, v in case.labels.items()}, self.label_renderers)) + row.append(self._render_dict({k: v for k, v in case.labels.items()}, self.label_renderers)) if self.include_metrics: row.append(self._render_dict(case.metrics, self.metric_renderers)) @@ -781,24 +788,33 @@ def _render_dicts_diff( @staticmethod def _render_dict( - case_dict: dict[str, T], + case_dict: Mapping[str, EvaluationResult[T] | T], renderers: Mapping[str, _AbstractRenderer[T]], *, include_names: bool = True, ) -> str: diff_lines: list[str] = [] for key, val in case_dict.items(): - rendered = renderers[key].render_value(key if include_names else None, val) + value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val + rendered = renderers[key].render_value(key if include_names else None, value) + if isinstance(val, EvaluationResult) and (reason := val.reason): + rendered += f'\nReason: {reason}\n' diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR - @staticmethod def _render_assertions( + self, assertions: list[EvaluationResult[bool]], ) -> str: if not assertions: return EMPTY_CELL_STR - return ''.join(['[green]✔[/]' if a.value else '[red]✗[/]' for a in assertions]) + lines: list[str] = [] + for a in assertions: + line = '[green]✔[/]' if a.value else '[red]✗[/]' + if self.include_reasons: + line = f'{a.name}: {line}\nReason: {a.reason}\n\n' + lines.append(line) + return ''.join(lines) @staticmethod def _render_aggregate_assertions( @@ -859,6 +875,10 @@ class EvaluationRenderer: metric_configs: dict[str, RenderNumberConfig] duration_config: RenderNumberConfig + # TODO: Make this class kw-only so we can reorder the kwargs + # Data to include + include_reasons: bool # only applies to reports, not to diffs + def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None): return any(case.scores for case in self._all_cases(report, baseline)) @@ -905,6 +925,7 @@ def _get_case_renderer( include_labels=self.include_labels(report, baseline), include_metrics=self.include_metrics(report, baseline), include_assertions=self.include_assertions(report, baseline), + include_reasons=self.include_reasons, 
include_durations=self.include_durations, include_total_duration=self.include_total_duration, input_renderer=input_renderer, diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 50c93a2c54..7d716884a0 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -120,6 +120,7 @@ async def test_evaluation_renderer_basic(sample_report: EvaluationReport): label_configs={}, metric_configs={}, duration_config={}, + include_reasons=False, ) table = renderer.build_table(sample_report) @@ -191,6 +192,7 @@ async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport label_configs={}, metric_configs={}, duration_config={}, + include_reasons=False, ) table = renderer.build_diff_table(sample_report, baseline_report) @@ -248,6 +250,7 @@ async def test_evaluation_renderer_with_removed_cases(sample_report: EvaluationR label_configs={}, metric_configs={}, duration_config={}, + include_reasons=False, ) table = renderer.build_diff_table(sample_report, baseline_report) @@ -311,6 +314,7 @@ async def test_evaluation_renderer_with_custom_configs(sample_report: Evaluation 'diff_increase_style': 'bold red', 'diff_decrease_style': 'bold green', }, + include_reasons=False, ) table = renderer.build_table(sample_report) From 795a9beeb29516b3ecdfc4dd0029de879bf750bd Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:48:14 -0700 Subject: [PATCH 2/5] Fix reason rendering --- pydantic_evals/pydantic_evals/reporting/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index fffda30edd..b4466af216 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -798,7 +798,7 @@ def _render_dict( value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val rendered = renderers[key].render_value(key if include_names else None, value) if isinstance(val, EvaluationResult) and (reason := val.reason): - rendered += f'\nReason: {reason}\n' + rendered += f'\n Reason: {reason}\n' diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR @@ -812,7 +812,8 @@ def _render_assertions( for a in assertions: line = '[green]✔[/]' if a.value else '[red]✗[/]' if self.include_reasons: - line = f'{a.name}: {line}\nReason: {a.reason}\n\n' + line = f'{a.name}: {line}\n' + line = f'{line} Reason: {a.reason}\n\n' if a.reason else line lines.append(line) return ''.join(lines) From fbba591e6c34102468f48f7f41b225a306af98f3 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Thu, 10 Jul 2025 11:31:34 -0700 Subject: [PATCH 3/5] Fix unconditional inclusion of reasons --- pydantic_evals/pydantic_evals/reporting/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index b4466af216..83be382086 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -786,8 +786,8 @@ def _render_dicts_diff( diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR - @staticmethod def _render_dict( + self, case_dict: Mapping[str, EvaluationResult[T] | T], renderers: Mapping[str, _AbstractRenderer[T]], *, @@ -797,7 
+797,7 @@ def _render_dict( for key, val in case_dict.items(): value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val rendered = renderers[key].render_value(key if include_names else None, value) - if isinstance(val, EvaluationResult) and (reason := val.reason): + if self.include_reasons and isinstance(val, EvaluationResult) and (reason := val.reason): rendered += f'\n Reason: {reason}\n' diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR From cc2a4b1cd6e8d4d132dd6d355db5978ec7935031 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:06:34 -0600 Subject: [PATCH 4/5] Add test --- tests/evals/test_reporting.py | 39 ++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 7d716884a0..c92e54407f 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -57,7 +57,7 @@ def sample_score(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) return EvaluationResult( name='MockEvaluator', value=2.5, - reason=None, + reason='my reason', source=mock_evaluator, ) @@ -138,6 +138,43 @@ async def test_evaluation_renderer_basic(sample_report: EvaluationReport): """) +async def test_evaluation_renderer_with_reasons(sample_report: EvaluationReport): + """Test basic functionality of EvaluationRenderer.""" + renderer = EvaluationRenderer( + include_input=True, + include_output=True, + include_metadata=True, + include_expected_output=True, + include_durations=True, + include_total_duration=True, + include_removed_cases=False, + include_averages=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=True, + ) + + table = renderer.build_table(sample_report) + assert render_table(table) == snapshot("""\ + Evaluation Summary: test_report +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ +┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ +│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ MockEvaluator: ✔ │ task: 0.100 │ +│ │ │ │ │ │ Reason: my reason │ │ │ │ total: 0.200 │ +│ │ │ │ │ │ │ │ │ │ │ +├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼─────────────────────┼────────────────────────┼─────────────────┼──────────────────┼──────────────┤ +│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ task: 0.100 │ +│ │ │ │ │ │ │ │ │ │ total: 0.200 │ +└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴─────────────────────┴────────────────────────┴─────────────────┴──────────────────┴──────────────┘ +""") + + async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport): """Test EvaluationRenderer with baseline comparison.""" baseline_report = EvaluationReport( 
From 6a91fd1bd16615a540d737300f662509ee28a44c Mon Sep 17 00:00:00 2001
From: David Montague <35119617+dmontagu@users.noreply.github.com>
Date: Mon, 4 Aug 2025 16:30:13 -0600
Subject: [PATCH 5/5] Update changelog.md

---
 docs/changelog.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index d0aec66fc7..4aec0c9f93 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -16,6 +16,8 @@ Pydantic AI is still pre-version 1, so breaking changes will occur, however:
 
 See [#2388](https://github.com/pydantic/pydantic-ai/pull/2388) - The `source` field of an `EvaluationResult` is now of type `EvaluatorSpec` rather than the actual source `Evaluator` instance, to help with serialization/deserialization.
 
+See [#2163](https://github.com/pydantic/pydantic-ai/pull/2163) - The `EvaluationReport.print` and `EvaluationReport.console_table` methods now require most arguments to be passed by keyword.
+
 ### v0.4.0 (2025-07-08)
 
 See [#1799](https://github.com/pydantic/pydantic-ai/pull/1799) - Pydantic Evals `EvaluationReport` and `ReportCase` are now generic dataclasses instead of Pydantic models. If you were serializing them using `model_dump()`, you will now need to use the `EvaluationReportAdapter` and `ReportCaseAdapter` type adapters instead.
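
Note: a minimal usage sketch of the new option follows. It is not part of the patches above; it assumes the documented pydantic_evals `Dataset`/`Case`/`IsInstance` API purely for illustration, with the display options passed by keyword as required after this change.

# Illustrative usage sketch (not part of these patches); assumes the public
# pydantic_evals API for defining a dataset and running a task over it.
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance

dataset = Dataset(
    cases=[Case(name='simple', inputs='What is 2+2?', expected_output='4')],
    evaluators=[IsInstance(type_name='str')],
)

async def answer(question: str) -> str:
    # Stand-in task; a real task would call a model or agent.
    return '4'

report = dataset.evaluate_sync(answer)

# Display options are keyword-only after this change; include_reasons adds any
# evaluator reasons to the Scores, Labels, and Assertions columns.
report.print(include_input=True, include_output=True, include_reasons=True)

# Or build the Rich table directly without printing it:
table = report.console_table(include_reasons=True)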