diff --git a/samples/calculator/evals/eval-sets/default.json b/samples/calculator/evals/eval-sets/default.json
index 18e55982a..d594f687f 100644
--- a/samples/calculator/evals/eval-sets/default.json
+++ b/samples/calculator/evals/eval-sets/default.json
@@ -7,7 +7,8 @@
     "ExactMatchEvaluator",
     "JsonSimilarityEvaluator",
     "LLMJudgeOutputEvaluator",
-    "LLMJudgeStrictJSONSimilarityOutputEvaluator"
+    "LLMJudgeStrictJSONSimilarityOutputEvaluator",
+    "TrajectoryEvaluator"
   ],
   "evaluations": [
     {
@@ -23,7 +24,8 @@
         "ExactMatchEvaluator": null,
         "JsonSimilarityEvaluator": null,
         "LLMJudgeOutputEvaluator": null,
-        "LLMJudgeStrictJSONSimilarityOutputEvaluator": null
+        "LLMJudgeStrictJSONSimilarityOutputEvaluator": null,
+        "TrajectoryEvaluator": null
       }
     },
     {
@@ -57,6 +59,9 @@
           "expectedOutput": {
             "result": 8.0
           }
+        },
+        "TrajectoryEvaluator": {
+          "expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
         }
       }
     },
diff --git a/samples/calculator/evals/eval-sets/legacy.json b/samples/calculator/evals/eval-sets/legacy.json
index 2491b803b..e2274e996 100644
--- a/samples/calculator/evals/eval-sets/legacy.json
+++ b/samples/calculator/evals/eval-sets/legacy.json
@@ -6,7 +6,8 @@
   "evaluatorRefs": [
     "equality",
     "llm-as-a-judge",
-    "json-similarity"
+    "json-similarity",
+    "trajectory"
   ],
   "evaluations": [
     {
diff --git a/samples/calculator/evals/evaluators/legacy-trajectory.json b/samples/calculator/evals/evaluators/legacy-trajectory.json
new file mode 100644
index 000000000..0da184260
--- /dev/null
+++ b/samples/calculator/evals/evaluators/legacy-trajectory.json
@@ -0,0 +1,13 @@
+{
+  "fileName": "trajectory.json",
+  "id": "trajectory",
+  "name": "Trajectory Evaluator",
+  "description": "An evaluator that analyzes the execution trajectory and decision sequence taken by the agent.",
+  "category": 3,
+  "type": 7,
+  "prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.",
+  "model": "gpt-4o-mini",
+  "targetOutputKey": "*",
+  "createdAt": "2025-06-26T17:45:39.651Z",
+  "updatedAt": "2025-06-26T17:45:39.651Z"
+}
diff --git a/samples/calculator/evals/evaluators/trajectory.json b/samples/calculator/evals/evaluators/trajectory.json
new file mode 100644
index 000000000..2924d8a41
--- /dev/null
+++ b/samples/calculator/evals/evaluators/trajectory.json
@@ -0,0 +1,15 @@
+{
+  "version": "1.0",
+  "id": "TrajectoryEvaluator",
+  "description": "Evaluates the agent's execution trajectory and decision sequence.",
+  "evaluatorTypeId": "uipath-llm-judge-trajectory-similarity",
+  "evaluatorConfig": {
+    "name": "TrajectoryEvaluator",
+    "model": "gpt-4.1-2025-04-14",
+    "prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.",
+    "temperature": 0.0,
+    "defaultEvaluationCriteria": {
+      "expectedAgentBehavior": "The agent should correctly perform the calculation and return the result."
+    }
+  }
+}
diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py
index c5492c7de..7e1e3ae0f 100644
--- a/src/uipath/_cli/_evals/_evaluator_factory.py
+++ b/src/uipath/_cli/_evals/_evaluator_factory.py
@@ -32,12 +32,18 @@
     LLMJudgeStrictJSONSimilarityOutputEvaluator,
     LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
 )
+from uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator import (
+    LLMJudgeTrajectoryEvaluator,
+    LLMJudgeTrajectoryEvaluatorConfig,
+    LLMJudgeTrajectorySimulationEvaluator,
+    LLMJudgeTrajectorySimulationEvaluatorConfig,
+)
 from uipath.eval.evaluators import (
     LegacyBaseEvaluator,
     LegacyExactMatchEvaluator,
     LegacyJsonSimilarityEvaluator,
     LegacyLlmAsAJudgeEvaluator,
-    TrajectoryEvaluator,
+    LegacyTrajectoryEvaluator,
 )
 
 
@@ -70,6 +76,14 @@ def _create_evaluator_internal(
                 return EvaluatorFactory._create_llm_judge_strict_json_similarity_output_evaluator(
                     data
                 )
+            case LLMJudgeTrajectoryEvaluatorConfig():
+                return EvaluatorFactory._create_trajectory_evaluator(data)
+            case LLMJudgeTrajectorySimulationEvaluatorConfig():
+                return (
+                    EvaluatorFactory._create_llm_judge_simulation_trajectory_evaluator(
+                        data
+                    )
+                )
             case _:
                 raise ValueError(f"Unknown evaluator configuration: {config}")
 
@@ -116,6 +130,24 @@ def _create_llm_judge_strict_json_similarity_output_evaluator(
             config=data.get("evaluatorConfig"),
         )  # type: ignore
 
+    @staticmethod
+    def _create_trajectory_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeTrajectoryEvaluator:
+        return LLMJudgeTrajectoryEvaluator(
+            id=data.get("id"),
+            config=data.get("evaluatorConfig"),
+        )  # type: ignore
+
+    @staticmethod
+    def _create_llm_judge_simulation_trajectory_evaluator(
+        data: Dict[str, Any],
+    ) -> LLMJudgeTrajectorySimulationEvaluator:
+        return LLMJudgeTrajectorySimulationEvaluator(
+            id=data.get("id"),
+            config=data.get("evaluatorConfig"),
+        )  # type: ignore
+
     @staticmethod
     def _create_legacy_evaluator_internal(
         data: Dict[str, Any],
@@ -179,7 +211,7 @@ def _create_legacy_llm_as_judge_evaluator(
     @staticmethod
     def _create_legacy_trajectory_evaluator(
         params: TrajectoryEvaluatorParams,
-    ) -> TrajectoryEvaluator:
+    ) -> LegacyTrajectoryEvaluator:
         """Create a trajectory evaluator."""
         if not params.prompt:
             raise ValueError("Trajectory evaluator must include 'prompt' field")
@@ -191,4 +223,4 @@ def _create_legacy_trajectory_evaluator(
                 "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
             )
 
-        return TrajectoryEvaluator(**params.model_dump())
+        return LegacyTrajectoryEvaluator(**params.model_dump())
diff --git a/src/uipath/_cli/_evals/_models/_evaluator.py b/src/uipath/_cli/_evals/_models/_evaluator.py
index c980b9efe..8f612d50e 100644
--- a/src/uipath/_cli/_evals/_models/_evaluator.py
+++ b/src/uipath/_cli/_evals/_models/_evaluator.py
@@ -12,7 +12,11 @@
     LLMJudgeOutputEvaluatorConfig,
     LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
 )
-from uipath.eval.models.models import (
+from uipath.eval.coded_evaluators.llm_judge_trajectory_evaluator import (
+    LLMJudgeTrajectoryEvaluatorConfig,
+    LLMJudgeTrajectorySimulationEvaluatorConfig,
+)
+from uipath.eval.models import (
     EvaluatorType,
     LegacyEvaluatorCategory,
     LegacyEvaluatorType,
@@ -165,6 +169,18 @@ def evaluator_config_discriminator(data: Any) -> str:
                     return "LLMJudgeOutputEvaluatorConfig"
                 case EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY:
                     return "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"
+                case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY:
+                    return "LLMJudgeTrajectoryEvaluatorConfig"
+                case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION:
+                    return "LLMJudgeTrajectorySimulationEvaluatorConfig"
+                case EvaluatorType.TOOL_CALL_ARGS:
+                    return "ToolCallArgsEvaluatorConfig"
+                case EvaluatorType.TOOL_CALL_COUNT:
+                    return "ToolCallCountEvaluatorConfig"
+                case EvaluatorType.TOOL_CALL_ORDER:
+                    return "ToolCallOrderEvaluatorConfig"
+                case EvaluatorType.TOOL_CALL_OUTPUT:
+                    return "ToolCallOutputEvaluatorConfig"
                 case _:
                     return "UnknownEvaluatorConfig"
     else:
@@ -219,6 +235,14 @@
         LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig,
         Tag("LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"),
     ],
+    Annotated[
+        LLMJudgeTrajectoryEvaluatorConfig,
+        Tag("LLMJudgeTrajectoryEvaluatorConfig"),
+    ],
+    Annotated[
+        LLMJudgeTrajectorySimulationEvaluatorConfig,
+        Tag("LLMJudgeTrajectorySimulationEvaluatorConfig"),
+    ],
     Annotated[
         UnknownEvaluatorConfig,
         Tag("UnknownEvaluatorConfig"),
diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index c2f3b050b..a24391109 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -475,7 +475,6 @@ async def execute_runtime(
 
         if result is None:
             raise ValueError("Execution result cannot be None for eval runs")
-
         return UiPathEvalRunExecutionOutput(
             execution_time=end_time - start_time,
             spans=spans,
diff --git a/src/uipath/eval/coded_evaluators/__init__.py b/src/uipath/eval/coded_evaluators/__init__.py
index 487252e12..75747cba5 100644
--- a/src/uipath/eval/coded_evaluators/__init__.py
+++ b/src/uipath/eval/coded_evaluators/__init__.py
@@ -13,8 +13,8 @@
 )
 from .llm_judge_trajectory_evaluator import (
     BaseLLMTrajectoryEvaluator,
-    LLMJudgeSimulationTrajectoryEvaluator,
     LLMJudgeTrajectoryEvaluator,
+    LLMJudgeTrajectorySimulationEvaluator,
 )
 from .tool_call_args_evaluator import ToolCallArgsEvaluator
 from .tool_call_count_evaluator import ToolCallCountEvaluator
@@ -28,7 +28,7 @@
     LLMJudgeOutputEvaluator,
     LLMJudgeStrictJSONSimilarityOutputEvaluator,
     LLMJudgeTrajectoryEvaluator,
-    LLMJudgeSimulationTrajectoryEvaluator,
+    LLMJudgeTrajectorySimulationEvaluator,
     ToolCallOrderEvaluator,
     ToolCallArgsEvaluator,
     ToolCallCountEvaluator,
@@ -45,7 +45,7 @@
     "LLMJudgeStrictJSONSimilarityOutputEvaluator",
     "BaseLLMTrajectoryEvaluator",
     "LLMJudgeTrajectoryEvaluator",
-    "LLMJudgeSimulationTrajectoryEvaluator",
+    "LLMJudgeTrajectorySimulationEvaluator",
     "ToolCallOrderEvaluator",
     "ToolCallArgsEvaluator",
     "ToolCallCountEvaluator",
diff --git a/src/uipath/eval/coded_evaluators/contains_evaluator.py b/src/uipath/eval/coded_evaluators/contains_evaluator.py
index 69ab004c1..2fed0cfc7 100644
--- a/src/uipath/eval/coded_evaluators/contains_evaluator.py
+++ b/src/uipath/eval/coded_evaluators/contains_evaluator.py
@@ -1,7 +1,6 @@
 """Contains evaluator for agent outputs."""
 
-from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
-from ..models.models import EvaluatorType
+from ..models import AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult
 from .base_evaluator import BaseEvaluationCriteria
 from .output_evaluator import (
     OutputEvaluator,
diff --git a/src/uipath/eval/coded_evaluators/exact_match_evaluator.py b/src/uipath/eval/coded_evaluators/exact_match_evaluator.py
index a4f44b043..60def739f 100644
--- a/src/uipath/eval/coded_evaluators/exact_match_evaluator.py
+++ b/src/uipath/eval/coded_evaluators/exact_match_evaluator.py
@@ -1,7 +1,6 @@
 """Exact match evaluator for agent outputs."""
 
-from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
-from ..models.models import EvaluatorType
+from ..models import AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult
 from .output_evaluator import (
     OutputEvaluationCriteria,
     OutputEvaluator,
diff --git a/src/uipath/eval/coded_evaluators/json_similarity_evaluator.py b/src/uipath/eval/coded_evaluators/json_similarity_evaluator.py
index 32e8bcfed..aecbab32c 100644
--- a/src/uipath/eval/coded_evaluators/json_similarity_evaluator.py
+++ b/src/uipath/eval/coded_evaluators/json_similarity_evaluator.py
@@ -3,8 +3,7 @@
 import math
 from typing import Any, Tuple
 
-from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult
-from ..models.models import EvaluatorType
+from ..models import AgentExecution, EvaluationResult, EvaluatorType, NumericEvaluationResult
 from .output_evaluator import (
     OutputEvaluationCriteria,
     OutputEvaluator,
diff --git a/src/uipath/eval/coded_evaluators/llm_judge_trajectory_evaluator.py b/src/uipath/eval/coded_evaluators/llm_judge_trajectory_evaluator.py
index e474ee965..eb86a74bd 100644
--- a/src/uipath/eval/coded_evaluators/llm_judge_trajectory_evaluator.py
+++ b/src/uipath/eval/coded_evaluators/llm_judge_trajectory_evaluator.py
@@ -7,6 +7,8 @@
 from .._helpers.coded_evaluators_helpers import trace_to_str
 from ..models import (
     AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
 )
 from ..models.llm_judge_types import (
     LLMJudgePromptTemplates,
@@ -35,12 +37,12 @@ class LLMJudgeTrajectoryEvaluatorConfig(
     prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT
 
 
-class LLMJudgeSimulationEvaluatorConfig(
+class LLMJudgeTrajectorySimulationEvaluatorConfig(
     BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria]
 ):
     """Configuration for the llm judge simulation trajectory evaluator."""
 
-    name: str = "LLMJudgeSimulationEvaluator"
+    name: str = "LLMJudgeTrajectorySimulationEvaluator"
     prompt: str = (
         LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_DEFAULT_USER_PROMPT
     )
@@ -67,6 +69,14 @@ def get_evaluator_id(cls) -> str:
         """Get the evaluator id."""
         return EvaluatorType.LLM_JUDGE_TRAJECTORY.value
 
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: TrajectoryEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate using trajectory analysis."""
+        return await super().evaluate(agent_execution, evaluation_criteria)
+
     def _get_actual_output(self, agent_execution: AgentExecution) -> Any:
         """Get the actual output from the agent execution."""
         return trace_to_str(agent_execution.agent_trace)
@@ -114,8 +124,8 @@ def get_evaluator_id(cls) -> str:
         return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY.value
 
 
-class LLMJudgeSimulationTrajectoryEvaluator(
-    BaseLLMTrajectoryEvaluator[LLMJudgeSimulationEvaluatorConfig]
+class LLMJudgeTrajectorySimulationEvaluator(
+    BaseLLMTrajectoryEvaluator[LLMJudgeTrajectorySimulationEvaluatorConfig]
 ):
     """Evaluator that uses an LLM to judge the quality of agent trajectory for simulations.
 
diff --git a/src/uipath/eval/evaluators/__init__.py b/src/uipath/eval/evaluators/__init__.py
index 2891bdf8d..a95982fab 100644
--- a/src/uipath/eval/evaluators/__init__.py
+++ b/src/uipath/eval/evaluators/__init__.py
@@ -4,12 +4,12 @@
 from .exact_match_evaluator import LegacyExactMatchEvaluator
 from .json_similarity_evaluator import LegacyJsonSimilarityEvaluator
 from .llm_as_judge_evaluator import LegacyLlmAsAJudgeEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
+from .trajectory_evaluator import LegacyTrajectoryEvaluator
 
 __all__ = [
     "LegacyBaseEvaluator",
     "LegacyExactMatchEvaluator",
     "LegacyJsonSimilarityEvaluator",
     "LegacyLlmAsAJudgeEvaluator",
-    "TrajectoryEvaluator",
+    "LegacyTrajectoryEvaluator",
 ]
diff --git a/src/uipath/eval/evaluators/trajectory_evaluator.py b/src/uipath/eval/evaluators/trajectory_evaluator.py
index 78988d2e0..8018fbd7b 100644
--- a/src/uipath/eval/evaluators/trajectory_evaluator.py
+++ b/src/uipath/eval/evaluators/trajectory_evaluator.py
@@ -19,8 +19,8 @@
 from .base_evaluator import LegacyBaseEvaluator
 
 
-class TrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
-    """Evaluator that analyzes the trajectory/path taken to reach outputs."""
+class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
+    """Legacy evaluator that analyzes the trajectory/path taken to reach outputs."""
 
     prompt: str
     model: str
@@ -76,7 +76,6 @@ async def evaluate(
             expected_agent_behavior=agent_execution.expected_agent_behavior,
             agent_run_history=agent_execution.agent_trace,
         )
-
         llm_response = await self._get_llm_response(evaluation_prompt)
 
         return NumericEvaluationResult(
diff --git a/src/uipath/eval/models/__init__.py b/src/uipath/eval/models/__init__.py
index dd7e521a2..b2defbc87 100644
--- a/src/uipath/eval/models/__init__.py
+++ b/src/uipath/eval/models/__init__.py
@@ -7,6 +7,8 @@
     EvalItemResult,
     EvaluationResult,
     EvaluatorType,
+    LegacyEvaluatorCategory,
+    LegacyEvaluatorType,
     LLMResponse,
     NumericEvaluationResult,
     ScoreType,
@@ -18,6 +20,9 @@
     "AgentExecution",
     "EvaluationResult",
     "LLMResponse",
+    "LegacyEvaluatorCategory",
+    "LegacyEvaluatorType",
+    "EvaluatorType",
     "ScoreType",
     "EvalItemResult",
     "BooleanEvaluationResult",