From b3819b413300596c8668d813d2aa043c3f20ad91 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:32:01 -0700 Subject: [PATCH 1/5] Support including reasons in the console output --- .../pydantic_evals/reporting/__init__.py | 41 ++++++++++++++----- tests/evals/test_reporting.py | 4 ++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index 65dae81448..fffda30edd 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -4,7 +4,7 @@ from collections.abc import Mapping from dataclasses import dataclass from io import StringIO -from typing import Any, Callable, Generic, Literal, Protocol +from typing import Any, Callable, Generic, Literal, Protocol, cast from pydantic import BaseModel, TypeAdapter from rich.console import Console @@ -168,6 +168,7 @@ def print( self, width: int | None = None, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None, + *, include_input: bool = False, include_metadata: bool = False, include_expected_output: bool = False, @@ -183,6 +184,7 @@ def print( label_configs: dict[str, RenderValueConfig] | None = None, metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, + include_reasons: bool = False, ): # pragma: no cover """Print this report to the console, optionally comparing it to a baseline report. @@ -205,12 +207,14 @@ def print( label_configs=label_configs, metric_configs=metric_configs, duration_config=duration_config, + include_reasons=include_reasons, ) Console(width=width).print(table) def console_table( self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None, + *, include_input: bool = False, include_metadata: bool = False, include_expected_output: bool = False, @@ -226,6 +230,7 @@ def console_table( label_configs: dict[str, RenderValueConfig] | None = None, metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, + include_reasons: bool = False, ) -> Table: """Return a table containing the data from this report, or the diff between this report and a baseline report. 
@@ -247,6 +252,7 @@ def console_table( label_configs=label_configs or {}, metric_configs=metric_configs or {}, duration_config=duration_config or _DEFAULT_DURATION_CONFIG, + include_reasons=include_reasons, ) if baseline is None: return renderer.build_table(self) @@ -529,15 +535,16 @@ class ReportCaseRenderer: include_labels: bool include_metrics: bool include_assertions: bool + include_reasons: bool include_durations: bool include_total_duration: bool input_renderer: _ValueRenderer metadata_renderer: _ValueRenderer output_renderer: _ValueRenderer - score_renderers: dict[str, _NumberRenderer] - label_renderers: dict[str, _ValueRenderer] - metric_renderers: dict[str, _NumberRenderer] + score_renderers: Mapping[str, _NumberRenderer] + label_renderers: Mapping[str, _ValueRenderer] + metric_renderers: Mapping[str, _NumberRenderer] duration_renderer: _NumberRenderer def build_base_table(self, title: str) -> Table: @@ -581,10 +588,10 @@ def build_row(self, case: ReportCase) -> list[str]: row.append(self.output_renderer.render_value(None, case.output) or EMPTY_CELL_STR) if self.include_scores: - row.append(self._render_dict({k: v.value for k, v in case.scores.items()}, self.score_renderers)) + row.append(self._render_dict({k: v for k, v in case.scores.items()}, self.score_renderers)) if self.include_labels: - row.append(self._render_dict({k: v.value for k, v in case.labels.items()}, self.label_renderers)) + row.append(self._render_dict({k: v for k, v in case.labels.items()}, self.label_renderers)) if self.include_metrics: row.append(self._render_dict(case.metrics, self.metric_renderers)) @@ -781,24 +788,33 @@ def _render_dicts_diff( @staticmethod def _render_dict( - case_dict: dict[str, T], + case_dict: Mapping[str, EvaluationResult[T] | T], renderers: Mapping[str, _AbstractRenderer[T]], *, include_names: bool = True, ) -> str: diff_lines: list[str] = [] for key, val in case_dict.items(): - rendered = renderers[key].render_value(key if include_names else None, val) + value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val + rendered = renderers[key].render_value(key if include_names else None, value) + if isinstance(val, EvaluationResult) and (reason := val.reason): + rendered += f'\nReason: {reason}\n' diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR - @staticmethod def _render_assertions( + self, assertions: list[EvaluationResult[bool]], ) -> str: if not assertions: return EMPTY_CELL_STR - return ''.join(['[green]✔[/]' if a.value else '[red]✗[/]' for a in assertions]) + lines: list[str] = [] + for a in assertions: + line = '[green]✔[/]' if a.value else '[red]✗[/]' + if self.include_reasons: + line = f'{a.name}: {line}\nReason: {a.reason}\n\n' + lines.append(line) + return ''.join(lines) @staticmethod def _render_aggregate_assertions( @@ -859,6 +875,10 @@ class EvaluationRenderer: metric_configs: dict[str, RenderNumberConfig] duration_config: RenderNumberConfig + # TODO: Make this class kw-only so we can reorder the kwargs + # Data to include + include_reasons: bool # only applies to reports, not to diffs + def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None): return any(case.scores for case in self._all_cases(report, baseline)) @@ -905,6 +925,7 @@ def _get_case_renderer( include_labels=self.include_labels(report, baseline), include_metrics=self.include_metrics(report, baseline), include_assertions=self.include_assertions(report, baseline), + include_reasons=self.include_reasons, 
include_durations=self.include_durations, include_total_duration=self.include_total_duration, input_renderer=input_renderer, diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 50c93a2c54..7d716884a0 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -120,6 +120,7 @@ async def test_evaluation_renderer_basic(sample_report: EvaluationReport): label_configs={}, metric_configs={}, duration_config={}, + include_reasons=False, ) table = renderer.build_table(sample_report) @@ -191,6 +192,7 @@ async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport label_configs={}, metric_configs={}, duration_config={}, + include_reasons=False, ) table = renderer.build_diff_table(sample_report, baseline_report) @@ -248,6 +250,7 @@ async def test_evaluation_renderer_with_removed_cases(sample_report: EvaluationR label_configs={}, metric_configs={}, duration_config={}, + include_reasons=False, ) table = renderer.build_diff_table(sample_report, baseline_report) @@ -311,6 +314,7 @@ async def test_evaluation_renderer_with_custom_configs(sample_report: Evaluation 'diff_increase_style': 'bold red', 'diff_decrease_style': 'bold green', }, + include_reasons=False, ) table = renderer.build_table(sample_report) From 795a9beeb29516b3ecdfc4dd0029de879bf750bd Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:48:14 -0700 Subject: [PATCH 2/5] Fix reason rendering --- pydantic_evals/pydantic_evals/reporting/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index fffda30edd..b4466af216 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -798,7 +798,7 @@ def _render_dict( value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val rendered = renderers[key].render_value(key if include_names else None, value) if isinstance(val, EvaluationResult) and (reason := val.reason): - rendered += f'\nReason: {reason}\n' + rendered += f'\n Reason: {reason}\n' diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR @@ -812,7 +812,8 @@ def _render_assertions( for a in assertions: line = '[green]✔[/]' if a.value else '[red]✗[/]' if self.include_reasons: - line = f'{a.name}: {line}\nReason: {a.reason}\n\n' + line = f'{a.name}: {line}\n' + line = f'{line} Reason: {a.reason}\n\n' if a.reason else line lines.append(line) return ''.join(lines) From fbba591e6c34102468f48f7f41b225a306af98f3 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Thu, 10 Jul 2025 11:31:34 -0700 Subject: [PATCH 3/5] Fix unconditional inclusion of reasons --- pydantic_evals/pydantic_evals/reporting/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index b4466af216..83be382086 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -786,8 +786,8 @@ def _render_dicts_diff( diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR - @staticmethod def _render_dict( + self, case_dict: Mapping[str, EvaluationResult[T] | T], renderers: Mapping[str, _AbstractRenderer[T]], *, @@ -797,7 
+797,7 @@ def _render_dict( for key, val in case_dict.items(): value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val rendered = renderers[key].render_value(key if include_names else None, value) - if isinstance(val, EvaluationResult) and (reason := val.reason): + if self.include_reasons and isinstance(val, EvaluationResult) and (reason := val.reason): rendered += f'\n Reason: {reason}\n' diff_lines.append(rendered) return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR From cc2a4b1cd6e8d4d132dd6d355db5978ec7935031 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:06:34 -0600 Subject: [PATCH 4/5] Add test --- tests/evals/test_reporting.py | 39 ++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 7d716884a0..c92e54407f 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -57,7 +57,7 @@ def sample_score(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) return EvaluationResult( name='MockEvaluator', value=2.5, - reason=None, + reason='my reason', source=mock_evaluator, ) @@ -138,6 +138,43 @@ async def test_evaluation_renderer_basic(sample_report: EvaluationReport): """) +async def test_evaluation_renderer_with_reasons(sample_report: EvaluationReport): + """Test basic functionality of EvaluationRenderer.""" + renderer = EvaluationRenderer( + include_input=True, + include_output=True, + include_metadata=True, + include_expected_output=True, + include_durations=True, + include_total_duration=True, + include_removed_cases=False, + include_averages=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=True, + ) + + table = renderer.build_table(sample_report) + assert render_table(table) == snapshot("""\ + Evaluation Summary: test_report +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ +┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ +│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ MockEvaluator: ✔ │ task: 0.100 │ +│ │ │ │ │ │ Reason: my reason │ │ │ │ total: 0.200 │ +│ │ │ │ │ │ │ │ │ │ │ +├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼─────────────────────┼────────────────────────┼─────────────────┼──────────────────┼──────────────┤ +│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ task: 0.100 │ +│ │ │ │ │ │ │ │ │ │ total: 0.200 │ +└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴─────────────────────┴────────────────────────┴─────────────────┴──────────────────┴──────────────┘ +""") + + async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport): """Test EvaluationRenderer with baseline comparison.""" baseline_report = EvaluationReport( 
From 6a91fd1bd16615a540d737300f662509ee28a44c Mon Sep 17 00:00:00 2001
From: David Montague <35119617+dmontagu@users.noreply.github.com>
Date: Mon, 4 Aug 2025 16:30:13 -0600
Subject: [PATCH 5/5] Update changelog.md

---
 docs/changelog.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index d0aec66fc7..4aec0c9f93 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -16,6 +16,8 @@ Pydantic AI is still pre-version 1, so breaking changes will occur, however:
 
 See [#2388](https://github.com/pydantic/pydantic-ai/pull/2388) - The `source` field of an `EvaluationResult` is now of type `EvaluatorSpec` rather than the actual source `Evaluator` instance, to help with serialization/deserialization.
 
+See [#2163](https://github.com/pydantic/pydantic-ai/pull/2163) - The `EvaluationReport.print` and `EvaluationReport.console_table` methods now require most arguments to be passed by keyword.
+
 ### v0.4.0 (2025-07-08)
 
 See [#1799](https://github.com/pydantic/pydantic-ai/pull/1799) - Pydantic Evals `EvaluationReport` and `ReportCase` are now generic dataclasses instead of Pydantic models. If you were serializing them using `model_dump()`, you will now need to use the `EvaluationReportAdapter` and `ReportCaseAdapter` type adapters instead.
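
Note: a minimal usage sketch of the new option follows. It is not part of the patches above; it assumes the documented pydantic_evals `Dataset`/`Case`/`IsInstance` API purely for illustration, with the display options passed by keyword as required after this change.

# Illustrative usage sketch (not part of these patches); assumes the public
# pydantic_evals API for defining a dataset and running a task over it.
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance

dataset = Dataset(
    cases=[Case(name='simple', inputs='What is 2+2?', expected_output='4')],
    evaluators=[IsInstance(type_name='str')],
)

async def answer(question: str) -> str:
    # Stand-in task; a real task would call a model or agent.
    return '4'

report = dataset.evaluate_sync(answer)

# Display options are keyword-only after this change; include_reasons adds any
# evaluator reasons to the Scores, Labels, and Assertions columns.
report.print(include_input=True, include_output=True, include_reasons=True)

# Or build the Rich table directly without printing it:
table = report.console_table(include_reasons=True)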