Commit 55c6814

BREAKING CHANGE: Remove Python evaluator for security reasons (#2808)
1 parent 5f5a9ea commit 55c6814

File tree

    docs/changelog.md
    pydantic_evals/pydantic_evals/evaluators/__init__.py
    pydantic_evals/pydantic_evals/evaluators/common.py
    tests/evals/test_dataset.py
    tests/evals/test_evaluator_common.py
    tests/evals/test_evaluators.py

6 files changed: +73 additions, -118 deletions

docs/changelog.md

Lines changed: 6 additions & 0 deletions

@@ -7,6 +7,12 @@ Once we release V2, in April 2026 at the earliest, we'll continue to provide sec
 
 Here's a filtered list of the breaking changes for each version to help you upgrade Pydantic AI.
 
+### v1.0.1 (2025-09-05)
+
+The following breaking change was accidentally left out of v1.0.0:
+
+- See [#2808](https://github.com/pydantic/pydantic-ai/pull/2808) - Remove `Python` evaluator from `pydantic_evals` for security reasons
+
 ### v1.0.0 (2025-09-04)
 
 - See [#2725](https://github.com/pydantic/pydantic-ai/pull/2725) - Drop support for Python 3.9
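The linked PR points to a workaround: the evaluator was a small, self-contained dataclass, so projects that fully trust their expression strings can copy it into their own code, exactly as the updated test suite below does. A minimal sketch mirroring the removed implementation (the imports are the library's public modules; treat the class itself as your own code):

```python
from dataclasses import dataclass, field

from pydantic_evals.evaluators import Evaluator, EvaluatorContext, EvaluatorOutput


@dataclass(repr=False)
class Python(Evaluator[object, object, object]):
    """Evaluates to the result of the provided Python expression.

    WARNING: this runs arbitrary Python code, so never feed it untrusted expressions.
    """

    expression: str
    evaluation_name: str | None = field(default=None)

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
        # The evaluator context is exposed to the expression as `ctx`.
        return eval(self.expression, {'ctx': ctx})
```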

pydantic_evals/pydantic_evals/evaluators/__init__.py

Lines changed: 8 additions & 2 deletions

@@ -7,7 +7,6 @@
     LLMJudge,
     MaxDuration,
     OutputConfig,
-    Python,
 )
 from .context import EvaluatorContext
 from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorFailure, EvaluatorOutput, EvaluatorSpec
@@ -22,7 +21,6 @@
     'LLMJudge',
     'HasMatchingSpan',
     'OutputConfig',
-    'Python',
     # context
     'EvaluatorContext',
     # evaluator
@@ -34,3 +32,11 @@
     'EvaluationReason',
     'EvaluationResult',
 )
+
+
+def __getattr__(name: str):
+    if name == 'Python':
+        raise ImportError(
+            'The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.'
+        )
+    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
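The `def __getattr__(name: str)` at module scope is the PEP 562 hook (Python 3.7+): it is called only when normal lookup of a module attribute fails, which lets the package turn a silently missing name into an actionable error. A standalone sketch of the pattern, using a hypothetical `mymodule.py`:

```python
# mymodule.py (hypothetical) - PEP 562 module-level __getattr__.


def __getattr__(name: str):
    # Invoked only when `name` is not found in the module's namespace.
    if name == 'OldThing':
        raise ImportError('`OldThing` was removed; see the changelog for a workaround.')
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
```

With this in place, `from mymodule import OldThing` surfaces the custom ImportError, while `from mymodule import Foo` falls through to the AttributeError, which the import machinery reports as "cannot import name 'Foo'". Those are exactly the behaviors the new `test_import_errors` test below pins down.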

pydantic_evals/pydantic_evals/evaluators/common.py

Lines changed: 8 additions & 18 deletions

@@ -21,7 +21,6 @@
     'MaxDuration',
     'LLMJudge',
     'HasMatchingSpan',
-    'Python',
     'OutputConfig',
 )
 
@@ -268,22 +267,6 @@ def evaluate(
         return ctx.span_tree.any(self.query)
 
 
-# TODO: Consider moving this to docs rather than providing it with the library, given the security implications
-@dataclass(repr=False)
-class Python(Evaluator[object, object, object]):
-    """The output of this evaluator is the result of evaluating the provided Python expression.
-
-    ***WARNING***: this evaluator runs arbitrary Python code, so you should ***NEVER*** use it with untrusted inputs.
-    """
-
-    expression: str
-    evaluation_name: str | None = field(default=None)
-
-    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
-        # Evaluate the condition, exposing access to the evaluator context as `ctx`.
-        return eval(self.expression, {'ctx': ctx})
-
-
 DEFAULT_EVALUATORS: tuple[type[Evaluator[object, object, object]], ...] = (
     Equals,
     EqualsExpected,
@@ -292,5 +275,12 @@ def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOu
     MaxDuration,
     LLMJudge,
     HasMatchingSpan,
-    # Python, # not included by default for security reasons
 )
+
+
+def __getattr__(name: str):
+    if name == 'Python':
+        raise ImportError(
+            'The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.'
+        )
+    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
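The removed class's own TODO and docstring spell out the reason for the removal: `eval` on a string loaded from a dataset file is arbitrary code execution, and a restricted-looking globals dict such as `{'ctx': ctx}` does not sandbox anything, because Python injects the real builtins into the globals whenever the `'__builtins__'` key is absent. A small illustration of the failure mode (harmless payload, but the pattern is the attack):

```python
# Anything loadable as an evaluator spec could carry an expression like this:
malicious = "__import__('os').system('echo pwned')"

# The minimal globals dict does not help: since '__builtins__' is not set,
# Python adds the real builtins, so __import__ is reachable and the command runs.
eval(malicious, {'ctx': None})
```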

tests/evals/test_dataset.py

Lines changed: 11 additions & 2 deletions

@@ -2,7 +2,7 @@
 
 import json
 import sys
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
@@ -27,7 +27,6 @@
     EvaluatorOutput,
     EvaluatorSpec,
     LLMJudge,
-    Python,
 )
 from pydantic_evals.evaluators.context import EvaluatorContext
 from pydantic_evals.reporting import EvaluationReport, ReportCase, ReportCaseAdapter, ReportCaseFailure
@@ -41,6 +40,15 @@ class MockEvaluator(Evaluator[object, object, object]):
     def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
         return self.output
 
+@dataclass(repr=False)
+class Python(Evaluator[object, object, object]):
+    expression: str
+    evaluation_name: str | None = field(default=None)
+
+    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
+        # Evaluate the condition, exposing access to the evaluator context as `ctx`.
+        return eval(self.expression, {'ctx': ctx})
+
 
 with try_import() as tenacity_import_successful:
     from tenacity import stop_after_attempt
@@ -135,6 +143,7 @@ async def test_add_evaluator(
     simple_evaluator: type[Evaluator[TaskInput, TaskOutput, TaskMetadata]],
 ):
     """Test adding evaluators to a dataset."""
+
     assert len(example_dataset.evaluators) == 0
 
     example_dataset.add_evaluator(simple_evaluator())
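Since the test module now defines its own `Python` dataclass, the rest of the suite can keep exercising expression-based evaluation through the normal dataset flow. A minimal usage sketch, assuming the public `Dataset`/`Case` API and the locally defined `Python` evaluator above (the task and case names are made up for illustration):

```python
from pydantic_evals import Case, Dataset

# `Python` here is the local copy defined in the test module, not a library export.
dataset = Dataset(
    cases=[Case(name='doubling', inputs=21, expected_output=42)],
    evaluators=[Python(expression='ctx.output == ctx.inputs * 2')],
)


async def double(x: int) -> int:
    return x * 2


# Runs the task and all evaluators over every case, collecting results.
report = dataset.evaluate_sync(double)
report.print()
```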

tests/evals/test_evaluator_common.py

Lines changed: 0 additions & 64 deletions

@@ -18,7 +18,6 @@
 
 from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext
 from pydantic_evals.evaluators.common import (
-    DEFAULT_EVALUATORS,
     Contains,
     Equals,
     EqualsExpected,
@@ -27,7 +26,6 @@
     LLMJudge,
     MaxDuration,
     OutputConfig,
-    Python,
 )
 from pydantic_evals.otel._context_in_memory_span_exporter import context_subtree
 from pydantic_evals.otel._errors import SpanTreeRecordingError
@@ -395,68 +393,6 @@ async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
     )
 
 
-async def test_python():
-    """Test Python evaluator."""
-    evaluator = Python(expression='ctx.output > 0')
-
-    # Test with valid expression
-    assert evaluator.evaluate(MockContext(output=42)) is True
-    assert evaluator.evaluate(MockContext(output=-1)) is False
-
-    # Test with invalid expression
-    evaluator_invalid = Python(expression='invalid syntax')
-    with pytest.raises(SyntaxError):
-        evaluator_invalid.evaluate(MockContext(output=42))
-
-
-async def test_python_evaluator():
-    """Test Python evaluator."""
-    ctx = EvaluatorContext(
-        name='test',
-        inputs={'x': 42},
-        metadata=None,
-        expected_output=None,
-        output={'y': 84},
-        duration=0.0,
-        _span_tree=SpanTreeRecordingError('did not record spans'),
-        attributes={},
-        metrics={},
-    )
-
-    # Test simple expression
-    evaluator = Python(expression='ctx.output["y"] == 84')
-    assert evaluator.evaluate(ctx) is True
-
-    # Test accessing inputs
-    evaluator = Python(expression='ctx.inputs["x"] * 2 == ctx.output["y"]')
-    assert evaluator.evaluate(ctx) is True
-
-    # Test complex expression
-    evaluator = Python(expression='all(k in ctx.output for k in ["y"])')
-    assert evaluator.evaluate(ctx) is True
-
-    # Test invalid expression
-    evaluator = Python(expression='invalid syntax')
-    with pytest.raises(SyntaxError):
-        evaluator.evaluate(ctx)
-
-    # Test expression with undefined variables
-    evaluator = Python(expression='undefined_var')
-    with pytest.raises(NameError):
-        evaluator.evaluate(ctx)
-
-    # Test expression with type error
-    evaluator = Python(expression='ctx.output + 1')  # Can't add dict and int
-    with pytest.raises(TypeError):
-        evaluator.evaluate(ctx)
-
-
-def test_default_evaluators():
-    """Test DEFAULT_EVALUATORS tuple."""
-    # Verify that Python evaluator is not included for security reasons
-    assert Python not in DEFAULT_EVALUATORS
-
-
 async def test_span_query_evaluator(capfire: CaptureLogfire):
     """Test HasMatchingSpan evaluator."""
     # Create a span tree with a known structure

tests/evals/test_evaluators.py

Lines changed: 40 additions & 32 deletions

@@ -27,7 +27,6 @@
     IsInstance,
     LLMJudge,
     MaxDuration,
-    Python,
 )
 from pydantic_evals.evaluators.context import EvaluatorContext
 from pydantic_evals.evaluators.evaluator import (
@@ -579,34 +578,43 @@ async def test_span_query_evaluator(
     assert result is False
 
 
-async def test_python_evaluator(test_context: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]):
-    """Test the python evaluator."""
-    # Test with a simple condition
-    evaluator = Python(expression="ctx.output.answer == '4'")
-    assert evaluator.evaluate(test_context) == snapshot(True)
-
-    # Test type sensitivity
-    evaluator = Python(expression='ctx.output.answer == 4')
-    assert evaluator.evaluate(test_context) == snapshot(False)
-
-    # Test with a named condition
-    evaluator = Python(expression="{'correct_answer': ctx.output.answer == '4'}")
-    assert evaluator.evaluate(test_context) == snapshot({'correct_answer': True})
-
-    # Test with a condition that returns false
-    evaluator = Python(expression="ctx.output.answer == '5'")
-    assert evaluator.evaluate(test_context) == snapshot(False)
-
-    # Test with a condition that accesses context properties
-    evaluator = Python(expression="ctx.output.answer == '4' and ctx.metadata.difficulty == 'easy'")
-    assert evaluator.evaluate(test_context) == snapshot(True)
-
-    # Test reason rendering for strings
-    evaluator = Python(expression='ctx.output.answer')
-    assert evaluator.evaluate(test_context) == snapshot('4')
-
-    # Test with a condition that returns a dict
-    evaluator = Python(
-        expression="{'is_correct': ctx.output.answer == '4', 'is_easy': ctx.metadata.difficulty == 'easy'}"
-    )
-    assert evaluator.evaluate(test_context) == snapshot({'is_correct': True, 'is_easy': True})
+async def test_import_errors():
+    with pytest.raises(
+        ImportError,
+        match='The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.',
+    ):
+        from pydantic_evals.evaluators import Python  # pyright: ignore[reportUnusedImport]
+
+    with pytest.raises(
+        ImportError,
+        match='The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.',
+    ):
+        from pydantic_evals.evaluators.common import Python  # pyright: ignore[reportUnusedImport] # noqa: F401
+
+    with pytest.raises(
+        ImportError,
+        match="cannot import name 'Foo' from 'pydantic_evals.evaluators'",
+    ):
+        from pydantic_evals.evaluators import Foo  # pyright: ignore[reportUnusedImport]
+
+    with pytest.raises(
+        ImportError,
+        match="cannot import name 'Foo' from 'pydantic_evals.evaluators.common'",
+    ):
+        from pydantic_evals.evaluators.common import Foo  # pyright: ignore[reportUnusedImport] # noqa: F401
+
+    with pytest.raises(
+        AttributeError,
+        match="module 'pydantic_evals.evaluators' has no attribute 'Foo'",
+    ):
+        import pydantic_evals.evaluators as _evaluators
+
+        _evaluators.Foo
+
+    with pytest.raises(
+        AttributeError,
+        match="module 'pydantic_evals.evaluators.common' has no attribute 'Foo'",
+    ):
+        import pydantic_evals.evaluators.common as _common
+
+        _common.Foo
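A side note on these assertions: the `match` argument to `pytest.raises` is a regular expression applied with `re.search`, so the dots in the removal message and its URL act as wildcards rather than literals; wrapping the expected text in `re.escape` is the usual way to assert it exactly. A short sketch of that pattern (the test name and message are illustrative, not from the commit):

```python
import re

import pytest


def test_literal_removal_message():
    msg = 'The `Python` evaluator has been removed for security reasons.'
    # re.escape makes every regex metacharacter in the expected text literal,
    # and re.search still finds it inside the longer raised message.
    with pytest.raises(ImportError, match=re.escape(msg)):
        raise ImportError(msg + ' See the changelog for a workaround.')
```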
