9 changes: 7 additions & 2 deletions samples/calculator/evals/eval-sets/default.json
@@ -7,7 +7,8 @@
"ExactMatchEvaluator",
"JsonSimilarityEvaluator",
"LLMJudgeOutputEvaluator",
"LLMJudgeStrictJSONSimilarityOutputEvaluator"
"LLMJudgeStrictJSONSimilarityOutputEvaluator",
"TrajectoryEvaluator"
],
"evaluations": [
{
@@ -23,7 +24,8 @@
"ExactMatchEvaluator": null,
"JsonSimilarityEvaluator": null,
"LLMJudgeOutputEvaluator": null,
"LLMJudgeStrictJSONSimilarityOutputEvaluator": null
"LLMJudgeStrictJSONSimilarityOutputEvaluator": null,
"TrajectoryEvaluator": null
}
},
{
@@ -57,6 +59,9 @@
"expectedOutput": {
"result": 8.0
}
},
"TrajectoryEvaluator": {
"expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
}
}
},
3 changes: 2 additions & 1 deletion samples/calculator/evals/eval-sets/legacy.json
@@ -6,7 +6,8 @@
"evaluatorRefs": [
"equality",
"llm-as-a-judge",
"json-similarity"
"json-similarity",
"trajectory"
],
"evaluations": [
{
13 changes: 13 additions & 0 deletions samples/calculator/evals/evaluators/legacy-trajectory.json
@@ -0,0 +1,13 @@
{
"fileName": "trajectory.json",
"id": "trajectory",
"name": "Trajectory Evaluator",
"description": "An evaluator that analyzes the execution trajectory and decision sequence taken by the agent.",
"category": 3,
"type": 7,
"prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.",
"model": "gpt-4o-mini",
"targetOutputKey": "*",
"createdAt": "2025-06-26T17:45:39.651Z",
"updatedAt": "2025-06-26T17:45:39.651Z"
}
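
The {{ExpectedAgentBehavior}} and {{AgentRunHistory}} placeholders in the legacy prompt above are substituted at run time by the legacy trajectory evaluator (see trajectory_evaluator.py further down). A minimal, self-contained sketch of that substitution pattern; the render helper here is illustrative, not the SDK's actual implementation:

prompt_template = (
    "Evaluate the agent's execution trajectory based on the expected behavior.\n\n"
    "Expected Agent Behavior: {{ExpectedAgentBehavior}}\n"
    "Agent Run History: {{AgentRunHistory}}\n\n"
    "Provide a score from 0-100 based on how well the agent followed the expected trajectory."
)


def render(template: str, **values: str) -> str:
    """Replace each {{Key}} token with the matching keyword argument."""
    for key, value in values.items():
        template = template.replace("{{" + key + "}}", value)
    return template


print(render(
    prompt_template,
    ExpectedAgentBehavior="The agent should multiply 2 * 4 and return 8.0",
    AgentRunHistory="[tool call] multiply(a=2, b=4) -> 8.0",
))
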
15 changes: 15 additions & 0 deletions samples/calculator/evals/evaluators/trajectory.json
@@ -0,0 +1,15 @@
{
"version": "1.0",
"id": "TrajectoryEvaluator",
"description": "Evaluates the agent's execution trajectory and decision sequence.",
"evaluatorTypeId": "uipath-llm-judge-trajectory-similarity",
"evaluatorConfig": {
"name": "TrajectoryEvaluator",
"model": "gpt-4.1-2025-04-14",
"prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.",
"temperature": 0.0,
"defaultEvaluationCriteria": {
"expectedAgentBehavior": "The agent should correctly perform the calculation and return the result."
}
}
}
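
The evaluatorConfig block above lines up with the LLMJudgeTrajectoryEvaluatorConfig model added later in this PR. A minimal construction sketch, assuming the model and temperature fields are inherited from BaseLLMJudgeEvaluatorConfig (only the prompt default is visible in this diff):

from uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator import (
    LLMJudgeTrajectoryEvaluatorConfig,
)

# model/temperature are assumed to come from BaseLLMJudgeEvaluatorConfig;
# the prompt default is LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT per the
# evaluator module further down.
config = LLMJudgeTrajectoryEvaluatorConfig(
    name="TrajectoryEvaluator",
    model="gpt-4.1-2025-04-14",
    temperature=0.0,
)
print(config.prompt)
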
38 changes: 35 additions & 3 deletions src/uipath/_cli/_evals/_evaluator_factory.py
@@ -32,12 +32,18 @@
LLMJudgeStrictJSONSimilarityOutputEvaluator,
LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
)
from uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator import (
LLMJudgeTrajectoryEvaluator,
LLMJudgeTrajectoryEvaluatorConfig,
LLMJudgeTrajectorySimulationEvaluator,
LLMJudgeTrajectorySimulationEvaluatorConfig,
)
from uipath.eval.evaluators import (
LegacyBaseEvaluator,
LegacyExactMatchEvaluator,
LegacyJsonSimilarityEvaluator,
LegacyLlmAsAJudgeEvaluator,
TrajectoryEvaluator,
LegacyTrajectoryEvaluator,
)


@@ -70,6 +76,14 @@ def _create_evaluator_internal(
return EvaluatorFactory._create_llm_judge_strict_json_similarity_output_evaluator(
data
)
case LLMJudgeTrajectoryEvaluatorConfig():
return EvaluatorFactory._create_trajectory_evaluator(data)
case LLMJudgeTrajectorySimulationEvaluatorConfig():
return (
EvaluatorFactory._create_llm_judge_simulation_trajectory_evaluator(
data
)
)
case _:
raise ValueError(f"Unknown evaluator configuration: {config}")

@@ -116,6 +130,24 @@ def _create_llm_judge_strict_json_similarity_output_evaluator(
config=data.get("evaluatorConfig"),
) # type: ignore

@staticmethod
def _create_trajectory_evaluator(
data: Dict[str, Any],
) -> LLMJudgeTrajectoryEvaluator:
return LLMJudgeTrajectoryEvaluator(
id=data.get("id"),
config=data.get("evaluatorConfig"),
) # type: ignore

@staticmethod
def _create_llm_judge_simulation_trajectory_evaluator(
data: Dict[str, Any],
) -> LLMJudgeTrajectorySimulationEvaluator:
return LLMJudgeTrajectorySimulationEvaluator(
id=data.get("id"),
config=data.get("evaluatorConfig"),
) # type: ignore

@staticmethod
def _create_legacy_evaluator_internal(
data: Dict[str, Any],
@@ -179,7 +211,7 @@ def _create_legacy_llm_as_judge_evaluator(
@staticmethod
def _create_legacy_trajectory_evaluator(
params: TrajectoryEvaluatorParams,
) -> TrajectoryEvaluator:
) -> LegacyTrajectoryEvaluator:
"""Create a trajectory evaluator."""
if not params.prompt:
raise ValueError("Trajectory evaluator must include 'prompt' field")
@@ -191,4 +223,4 @@ def _create_legacy_trajectory_evaluator(
"'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
)

return TrajectoryEvaluator(**params.model_dump())
return LegacyTrajectoryEvaluator(**params.model_dump())
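
A usage sketch for the new trajectory helper, with data shaped like samples/calculator/evals/evaluators/trajectory.json. Whether LLMJudgeTrajectoryEvaluator accepts the raw evaluatorConfig dict as passed here is an assumption; the factory itself forwards it under a type: ignore.

from uipath._cli._evals._evaluator_factory import EvaluatorFactory

data = {
    "id": "TrajectoryEvaluator",
    "evaluatorConfig": {
        "name": "TrajectoryEvaluator",
        "model": "gpt-4.1-2025-04-14",
        "temperature": 0.0,
    },
}

# Dispatch normally happens in _create_evaluator_internal once the config is
# parsed; calling the helper directly keeps the sketch small.
evaluator = EvaluatorFactory._create_trajectory_evaluator(data)
print(type(evaluator).__name__)  # LLMJudgeTrajectoryEvaluator
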
26 changes: 25 additions & 1 deletion src/uipath/_cli/_evals/_models/_evaluator.py
@@ -12,7 +12,11 @@
LLMJudgeOutputEvaluatorConfig,
LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
)
from uipath.eval.models.models import (
from uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator import (
LLMJudgeTrajectoryEvaluatorConfig,
LLMJudgeTrajectorySimulationEvaluatorConfig,
)
from uipath.eval.models import (
EvaluatorType,
LegacyEvaluatorCategory,
LegacyEvaluatorType,
@@ -165,6 +169,18 @@ def evaluator_config_discriminator(data: Any) -> str:
return "LLMJudgeOutputEvaluatorConfig"
case EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY:
return "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"
case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY:
return "LLMJudgeTrajectoryEvaluatorConfig"
case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION:
return "LLMJudgeTrajectorySimulationEvaluatorConfig"
case EvaluatorType.TOOL_CALL_ARGS:
return "ToolCallArgsEvaluatorConfig"
case EvaluatorType.TOOL_CALL_COUNT:
return "ToolCallCountEvaluatorConfig"
case EvaluatorType.TOOL_CALL_ORDER:
return "ToolCallOrderEvaluatorConfig"
case EvaluatorType.TOOL_CALL_OUTPUT:
return "ToolCallOutputEvaluatorConfig"
case _:
return "UnknownEvaluatorConfig"
else:
@@ -219,6 +235,14 @@ def evaluator_config_discriminator(data: Any) -> str:
LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
Tag("LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"),
],
Annotated[
LLMJudgeTrajectoryEvaluatorConfig,
Tag("LLMJudgeTrajectoryEvaluatorConfig"),
],
Annotated[
LLMJudgeTrajectorySimulationEvaluatorConfig,
Tag("LLMJudgeTrajectorySimulationEvaluatorConfig"),
],
Annotated[
UnknownEvaluatorConfig,
Tag("UnknownEvaluatorConfig"),
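
The Annotated/Tag entries above extend a pydantic discriminated union whose tag is produced by evaluator_config_discriminator. A self-contained sketch of that pattern with illustrative config models (not the actual uipath classes):

from typing import Annotated, Any, Union

from pydantic import BaseModel, Discriminator, Tag, TypeAdapter


class TrajectoryConfig(BaseModel):
    prompt: str


class UnknownConfig(BaseModel):
    pass


def pick_config(data: Any) -> str:
    # Stand-in for evaluator_config_discriminator: map raw data to a tag name.
    if isinstance(data, dict) and "prompt" in data:
        return "TrajectoryConfig"
    return "UnknownConfig"


ConfigUnion = Annotated[
    Union[
        Annotated[TrajectoryConfig, Tag("TrajectoryConfig")],
        Annotated[UnknownConfig, Tag("UnknownConfig")],
    ],
    Discriminator(pick_config),
]

parsed = TypeAdapter(ConfigUnion).validate_python({"prompt": "Evaluate the trajectory"})
print(type(parsed).__name__)  # TrajectoryConfig
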
1 change: 0 additions & 1 deletion src/uipath/_cli/_evals/_runtime.py
@@ -475,7 +475,6 @@ async def execute_runtime(

if result is None:
raise ValueError("Execution result cannot be None for eval runs")

return UiPathEvalRunExecutionOutput(
execution_time=end_time - start_time,
spans=spans,
6 changes: 3 additions & 3 deletions src/uipath/eval/coded_evaluators/__init__.py
@@ -13,8 +13,8 @@
)
from .llm_judge_trajectory_evaluator import (
BaseLLMTrajectoryEvaluator,
LLMJudgeSimulationTrajectoryEvaluator,
LLMJudgeTrajectoryEvaluator,
LLMJudgeTrajectorySimulationEvaluator,
)
from .tool_call_args_evaluator import ToolCallArgsEvaluator
from .tool_call_count_evaluator import ToolCallCountEvaluator
@@ -28,7 +28,7 @@
LLMJudgeOutputEvaluator,
LLMJudgeStrictJSONSimilarityOutputEvaluator,
LLMJudgeTrajectoryEvaluator,
LLMJudgeSimulationTrajectoryEvaluator,
LLMJudgeTrajectorySimulationEvaluator,
ToolCallOrderEvaluator,
ToolCallArgsEvaluator,
ToolCallCountEvaluator,
@@ -45,7 +45,7 @@
"LLMJudgeStrictJSONSimilarityOutputEvaluator",
"BaseLLMTrajectoryEvaluator",
"LLMJudgeTrajectoryEvaluator",
"LLMJudgeSimulationTrajectoryEvaluator",
"LLMJudgeTrajectorySimulationEvaluator",
"ToolCallOrderEvaluator",
"ToolCallArgsEvaluator",
"ToolCallCountEvaluator",
3 changes: 1 addition & 2 deletions src/uipath/eval/coded_evaluators/contains_evaluator.py
@@ -1,7 +1,6 @@
"""Contains evaluator for agent outputs."""

from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
from ..models.models import EvaluatorType
from ..models import AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult
from .base_evaluator import BaseEvaluationCriteria
from .output_evaluator import (
OutputEvaluator,
3 changes: 1 addition & 2 deletions src/uipath/eval/coded_evaluators/exact_match_evaluator.py
@@ -1,7 +1,6 @@
"""Exact match evaluator for agent outputs."""

from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
from ..models.models import EvaluatorType
from ..models import AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult
from .output_evaluator import (
OutputEvaluationCriteria,
OutputEvaluator,
3 changes: 1 addition & 2 deletions src/uipath/eval/coded_evaluators/json_similarity_evaluator.py
@@ -3,8 +3,7 @@
import math
from typing import Any, Tuple

from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
from ..models.models import EvaluatorType
from ..models import AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult
from .output_evaluator import (
OutputEvaluationCriteria,
OutputEvaluator,
18 changes: 14 additions & 4 deletions src/uipath/eval/coded_evaluators/llm_judge_trajectory_evaluator.py
@@ -7,6 +7,8 @@
from .._helpers.coded_evaluators_helpers import trace_to_str
from ..models import (
AgentExecution,
EvaluationResult,
EvaluatorType,
)
from ..models.llm_judge_types import (
LLMJudgePromptTemplates,
@@ -35,12 +37,12 @@ class LLMJudgeTrajectoryEvaluatorConfig(
prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT


class LLMJudgeSimulationEvaluatorConfig(
class LLMJudgeTrajectorySimulationEvaluatorConfig(
BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria]
):
"""Configuration for the llm judge simulation trajectory evaluator."""

name: str = "LLMJudgeSimulationEvaluator"
name: str = "LLMJudgeTrajectorySimulationEvaluator"
prompt: str = (
LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_DEFAULT_USER_PROMPT
)
@@ -67,6 +69,14 @@ def get_evaluator_id(cls) -> str:
"""Get the evaluator id."""
return EvaluatorType.LLM_JUDGE_TRAJECTORY.value

async def evaluate(
self,
agent_execution: AgentExecution,
evaluation_criteria: TrajectoryEvaluationCriteria,
) -> EvaluationResult:
"""Evaluate using trajectory analysis."""
return await super().evaluate(agent_execution, evaluation_criteria)

def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
"""Get the actual output from the agent execution."""
return trace_to_str(agent_execution.agent_trace)
@@ -114,8 +124,8 @@ def get_evaluator_id(cls) -> str:
return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY.value


class LLMJudgeSimulationTrajectoryEvaluator(
BaseLLMTrajectoryEvaluator[LLMJudgeSimulationEvaluatorConfig]
class LLMJudgeTrajectorySimulationEvaluator(
BaseLLMTrajectoryEvaluator[LLMJudgeTrajectorySimulationEvaluatorConfig]
):
"""Evaluator that uses an LLM to judge the quality of agent trajectory for simulations.

4 changes: 2 additions & 2 deletions src/uipath/eval/evaluators/__init__.py
@@ -4,12 +4,12 @@
from .exact_match_evaluator import LegacyExactMatchEvaluator
from .json_similarity_evaluator import LegacyJsonSimilarityEvaluator
from .llm_as_judge_evaluator import LegacyLlmAsAJudgeEvaluator
from .trajectory_evaluator import TrajectoryEvaluator
from .trajectory_evaluator import LegacyTrajectoryEvaluator

__all__ = [
"LegacyBaseEvaluator",
"LegacyExactMatchEvaluator",
"LegacyJsonSimilarityEvaluator",
"LegacyLlmAsAJudgeEvaluator",
"TrajectoryEvaluator",
"LegacyTrajectoryEvaluator",
]
5 changes: 2 additions & 3 deletions src/uipath/eval/evaluators/trajectory_evaluator.py
@@ -19,8 +19,8 @@
from .base_evaluator import LegacyBaseEvaluator


class TrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
"""Evaluator that analyzes the trajectory/path taken to reach outputs."""
class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
"""Legacy evaluator that analyzes the trajectory/path taken to reach outputs."""

prompt: str
model: str
@@ -76,7 +76,6 @@ async def evaluate(
expected_agent_behavior=agent_execution.expected_agent_behavior,
agent_run_history=agent_execution.agent_trace,
)

llm_response = await self._get_llm_response(evaluation_prompt)

return NumericEvaluationResult(
5 changes: 5 additions & 0 deletions src/uipath/eval/models/__init__.py
@@ -7,6 +7,8 @@
EvalItemResult,
EvaluationResult,
EvaluatorType,
LegacyEvaluatorCategory,
LegacyEvaluatorType,
LLMResponse,
NumericEvaluationResult,
ScoreType,
@@ -18,6 +20,9 @@
"AgentExecution",
"EvaluationResult",
"LLMResponse",
"LegacyEvaluatorCategory",
"LegacyEvaluatorType",
"EvaluatorType",
"ScoreType",
"EvalItemResult",
"BooleanEvaluationResult",