
Commit 1936c1f

feat: wiring up trajectory evals
1 parent 746c8a7 commit 1936c1f

17 files changed: +1023 −923 lines changed


samples/calculator/evals/eval-sets/default.json

Lines changed: 7 additions & 2 deletions
@@ -7,7 +7,8 @@
     "ExactMatchEvaluator",
     "JsonSimilarityEvaluator",
     "LLMJudgeOutputEvaluator",
-    "LLMJudgeStrictJSONSimilarityOutputEvaluator"
+    "LLMJudgeStrictJSONSimilarityOutputEvaluator",
+    "TrajectoryEvaluator"
   ],
   "evaluations": [
     {
@@ -23,7 +24,8 @@
         "ExactMatchEvaluator": null,
         "JsonSimilarityEvaluator": null,
         "LLMJudgeOutputEvaluator": null,
-        "LLMJudgeStrictJSONSimilarityOutputEvaluator": null
+        "LLMJudgeStrictJSONSimilarityOutputEvaluator": null,
+        "TrajectoryEvaluator": null
       }
     },
     {
@@ -57,6 +59,9 @@
         "expectedOutput": {
           "result": 8.0
         }
+      },
+      "TrajectoryEvaluator": {
+        "expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
       }
     }
   },
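For context, the new per-evaluation "TrajectoryEvaluator" criteria block can be pulled out of an eval set with a few lines of Python. This is a minimal illustrative sketch, not code from this commit; it walks the document recursively because the diff does not show the keys that surround each criteria object.

import json

def find_trajectory_criteria(node):
    """Recursively yield every "TrajectoryEvaluator" criteria dict in an eval set."""
    if isinstance(node, dict):
        candidate = node.get("TrajectoryEvaluator")
        if isinstance(candidate, dict):
            yield candidate
        for value in node.values():
            yield from find_trajectory_criteria(value)
    elif isinstance(node, list):
        for item in node:
            yield from find_trajectory_criteria(item)

with open("samples/calculator/evals/eval-sets/default.json") as f:
    eval_set = json.load(f)

for criteria in find_trajectory_criteria(eval_set):
    print(criteria.get("expectedAgentBehavior"))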

samples/calculator/evals/eval-sets/legacy.json

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@
   "evaluatorRefs": [
     "equality",
     "llm-as-a-judge",
-    "json-similarity"
+    "json-similarity",
+    "trajectory"
   ],
   "evaluations": [
     {
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+{
+  "fileName": "trajectory.json",
+  "id": "trajectory",
+  "name": "Trajectory Evaluator",
+  "description": "An evaluator that analyzes the execution trajectory and decision sequence taken by the agent.",
+  "category": 3,
+  "type": 7,
+  "prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.",
+  "model": "gpt-4o-mini",
+  "targetOutputKey": "*",
+  "createdAt": "2025-06-26T17:45:39.651Z",
+  "updatedAt": "2025-06-26T17:45:39.651Z"
+}
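The prompt above uses {{ExpectedAgentBehavior}} and {{AgentRunHistory}} placeholders. A minimal sketch of how such a template could be rendered before the judge model is invoked; the render_prompt helper is hypothetical, not the repository's actual implementation.

def render_prompt(template: str, expected_behavior: str, run_history: str) -> str:
    # Straight string substitution of the two placeholders used by the prompt.
    return (
        template.replace("{{ExpectedAgentBehavior}}", expected_behavior)
        .replace("{{AgentRunHistory}}", run_history)
    )

prompt = render_prompt(
    template=(
        "Expected Agent Behavior: {{ExpectedAgentBehavior}}\n"
        "Agent Run History: {{AgentRunHistory}}"
    ),
    expected_behavior="Multiply 2 by 4 and return 8.0",
    run_history="multiply(a=2, b=4) -> 8.0",
)
print(prompt)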
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+{
+  "version": "1.0",
+  "id": "TrajectoryEvaluator",
+  "description": "Evaluates the agent's execution trajectory and decision sequence.",
+  "evaluatorTypeId": "uipath-llm-judge-trajectory-similarity",
+  "evaluatorConfig": {
+    "name": "TrajectoryEvaluator",
+    "model": "gpt-4.1-2025-04-14",
+    "prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.",
+    "temperature": 0.0,
+    "defaultEvaluationCriteria": {
+      "expectedAgentBehavior": "The agent should correctly perform the calculation and return the result."
+    }
+  }
+}
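The shape implied by this file can be modeled with Pydantic for ad-hoc validation. This is a rough sketch assuming the fields shown are the full schema; the real config class is LLMJudgeTrajectoryEvaluatorConfig in uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator and may differ.

from pydantic import BaseModel

class TrajectoryCriteriaSketch(BaseModel):
    # Mirrors "defaultEvaluationCriteria" above; hypothetical model, not the SDK's.
    expectedAgentBehavior: str

class TrajectoryEvaluatorConfigSketch(BaseModel):
    # Mirrors "evaluatorConfig" above.
    name: str
    model: str
    prompt: str
    temperature: float = 0.0
    defaultEvaluationCriteria: TrajectoryCriteriaSketch

cfg = TrajectoryEvaluatorConfigSketch.model_validate({
    "name": "TrajectoryEvaluator",
    "model": "gpt-4.1-2025-04-14",
    "prompt": "Evaluate the agent's execution trajectory ...",
    "temperature": 0.0,
    "defaultEvaluationCriteria": {
        "expectedAgentBehavior": "The agent should correctly perform the calculation and return the result."
    },
})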

src/uipath/_cli/_evals/_evaluator_factory.py

Lines changed: 35 additions & 3 deletions
@@ -32,12 +32,18 @@
     LLMJudgeStrictJSONSimilarityOutputEvaluator,
     LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
 )
+from uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator import (
+    LLMJudgeTrajectoryEvaluator,
+    LLMJudgeTrajectoryEvaluatorConfig,
+    LLMJudgeTrajectorySimulationEvaluator,
+    LLMJudgeTrajectorySimulationEvaluatorConfig,
+)
 from uipath.eval.evaluators import (
     LegacyBaseEvaluator,
     LegacyExactMatchEvaluator,
     LegacyJsonSimilarityEvaluator,
     LegacyLlmAsAJudgeEvaluator,
-    TrajectoryEvaluator,
+    LegacyTrajectoryEvaluator,
 )

@@ -70,6 +76,14 @@ def _create_evaluator_internal(
                 return EvaluatorFactory._create_llm_judge_strict_json_similarity_output_evaluator(
                     data
                 )
+            case LLMJudgeTrajectoryEvaluatorConfig():
+                return EvaluatorFactory._create_trajectory_evaluator(data)
+            case LLMJudgeTrajectorySimulationEvaluatorConfig():
+                return (
+                    EvaluatorFactory._create_llm_judge_simulation_trajectory_evaluator(
+                        data
+                    )
+                )
             case _:
                 raise ValueError(f"Unknown evaluator configuration: {config}")

@@ -116,6 +130,24 @@ def _create_llm_judge_strict_json_similarity_output_evaluator(
             config=data.get("evaluatorConfig"),
         )  # type: ignore

+    @staticmethod
+    def _create_trajectory_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeTrajectoryEvaluator:
+        return LLMJudgeTrajectoryEvaluator(
+            id=data.get("id"),
+            config=data.get("evaluatorConfig"),
+        )  # type: ignore
+
+    @staticmethod
+    def _create_llm_judge_simulation_trajectory_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeTrajectorySimulationEvaluator:
+        return LLMJudgeTrajectorySimulationEvaluator(
+            id=data.get("id"),
+            config=data.get("evaluatorConfig"),
+        )  # type: ignore
+
     @staticmethod
     def _create_legacy_evaluator_internal(
         data: Dict[str, Any],
@@ -179,7 +211,7 @@ def _create_legacy_llm_as_judge_evaluator(
     @staticmethod
     def _create_legacy_trajectory_evaluator(
         params: TrajectoryEvaluatorParams,
-    ) -> TrajectoryEvaluator:
+    ) -> LegacyTrajectoryEvaluator:
         """Create a trajectory evaluator."""
         if not params.prompt:
             raise ValueError("Trajectory evaluator must include 'prompt' field")
@@ -191,4 +223,4 @@ def _create_legacy_trajectory_evaluator(
                 "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
             )

-        return TrajectoryEvaluator(**params.model_dump())
+        return LegacyTrajectoryEvaluator(**params.model_dump())

src/uipath/_cli/_evals/_models/_evaluator.py

Lines changed: 25 additions & 1 deletion
@@ -12,7 +12,11 @@
     LLMJudgeOutputEvaluatorConfig,
     LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
 )
-from uipath.eval.models.models import (
+from uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator import (
+    LLMJudgeTrajectoryEvaluatorConfig,
+    LLMJudgeTrajectorySimulationEvaluatorConfig,
+)
+from uipath.eval.models import (
     EvaluatorType,
     LegacyEvaluatorCategory,
     LegacyEvaluatorType,
@@ -165,6 +169,18 @@ def evaluator_config_discriminator(data: Any) -> str:
                 return "LLMJudgeOutputEvaluatorConfig"
             case EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY:
                 return "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY:
+                return "LLMJudgeTrajectoryEvaluatorConfig"
+            case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION:
+                return "LLMJudgeTrajectorySimulationEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_ARGS:
+                return "ToolCallArgsEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_COUNT:
+                return "ToolCallCountEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_ORDER:
+                return "ToolCallOrderEvaluatorConfig"
+            case EvaluatorType.TOOL_CALL_OUTPUT:
+                return "ToolCallOutputEvaluatorConfig"
             case _:
                 return "UnknownEvaluatorConfig"
     else:
@@ -219,6 +235,14 @@ def evaluator_config_discriminator(data: Any) -> str:
         Annotated[
             LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
             Tag("LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"),
         ],
+        Annotated[
+            LLMJudgeTrajectoryEvaluatorConfig,
+            Tag("LLMJudgeTrajectoryEvaluatorConfig"),
+        ],
+        Annotated[
+            LLMJudgeTrajectorySimulationEvaluatorConfig,
+            Tag("LLMJudgeTrajectorySimulationEvaluatorConfig"),
+        ],
         Annotated[
             UnknownEvaluatorConfig,
             Tag("UnknownEvaluatorConfig"),

src/uipath/_cli/_evals/_runtime.py

Lines changed: 0 additions & 1 deletion
@@ -475,7 +475,6 @@ async def execute_runtime(

     if result is None:
         raise ValueError("Execution result cannot be None for eval runs")
-
     return UiPathEvalRunExecutionOutput(
         execution_time=end_time - start_time,
         spans=spans,
Binary file not shown.

src/uipath/eval/coded_evaluators/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -13,8 +13,8 @@
 )
 from .llm_judge_trajectory_evaluator import (
     BaseLLMTrajectoryEvaluator,
-    LLMJudgeSimulationTrajectoryEvaluator,
     LLMJudgeTrajectoryEvaluator,
+    LLMJudgeTrajectorySimulationEvaluator,
 )
 from .tool_call_args_evaluator import ToolCallArgsEvaluator
 from .tool_call_count_evaluator import ToolCallCountEvaluator
@@ -28,7 +28,7 @@
     LLMJudgeOutputEvaluator,
     LLMJudgeStrictJSONSimilarityOutputEvaluator,
     LLMJudgeTrajectoryEvaluator,
-    LLMJudgeSimulationTrajectoryEvaluator,
+    LLMJudgeTrajectorySimulationEvaluator,
     ToolCallOrderEvaluator,
     ToolCallArgsEvaluator,
     ToolCallCountEvaluator,
@@ -45,7 +45,7 @@
     "LLMJudgeStrictJSONSimilarityOutputEvaluator",
     "BaseLLMTrajectoryEvaluator",
     "LLMJudgeTrajectoryEvaluator",
-    "LLMJudgeSimulationTrajectoryEvaluator",
+    "LLMJudgeTrajectorySimulationEvaluator",
     "ToolCallOrderEvaluator",
     "ToolCallArgsEvaluator",
     "ToolCallCountEvaluator",

src/uipath/eval/coded_evaluators/contains_evaluator.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 """Contains evaluator for agent outputs."""

-from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
-from ..models.models import EvaluatorType
+from ..models import AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult
 from .base_evaluator import BaseEvaluationCriteria
 from .output_evaluator import (
     OutputEvaluator,
