diff --git a/samples/calculator/README.md b/samples/calculator/README.md index 9d7777bda..3b53698c9 100644 --- a/samples/calculator/README.md +++ b/samples/calculator/README.md @@ -6,3 +6,22 @@ After initialization, execute the agent using this sample command: ``` uipath run main.py '{"a": 0, "b": 1, "operator": "+"}' ``` + +# Run evaluations +``` +uipath eval .\main.py .\evals\eval-sets\default.json --no-report --output-file output.json +``` + +# Add and register custom evaluator + +1. (Optional) Add a new evaluator -> can be created manually in the evals/custom-evaluators directory +``` +uipath add evaluator my_custom_evaluator +``` +2. Implement the logic + +3. Register the evaluator +``` +uipath register evaluator my_custom_evaluator +``` +4. Apply it to any dataset diff --git a/samples/calculator/evals/eval-sets/default.json b/samples/calculator/evals/eval-sets/default.json index 26de18e4b..8857757d1 100644 --- a/samples/calculator/evals/eval-sets/default.json +++ b/samples/calculator/evals/eval-sets/default.json @@ -1,72 +1,84 @@ { - "fileName": "default.json", - "id": "default-eval-set-id", - "name": "Basic Calculator Evaluation Set", - "batchSize": 10, + "version": "1.0", + "id": "NewSchemaSampleEval", + "name": "New Schema Sample Evaluation", "evaluatorRefs": [ - "equality", - "llm-as-a-judge" + "ContainsEvaluator", + "ExactMatchEvaluator", + "JsonSimilarityEvaluator", + "LLMJudgeOutputEvaluator", + "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "TrajectoryEvaluator", + "CorrectOperatorEvaluator" ], "evaluations": [ { - "id": "test-addition", - "name": "Test Addition", - "inputs": {"a": 1, "b": 1, "operator": "+"}, - "expectedOutput": {"result": 2}, - "expectedAgentBehavior": "", - "evalSetId": "default-eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z" + "id": "default", + "name": "Add", + "inputs": { + "a": 1, + "b": 4, + "operator": "+" + }, + "evaluationCriterias": { + "ContainsEvaluator": null, + "ExactMatchEvaluator": null, + "JsonSimilarityEvaluator": null, + "LLMJudgeOutputEvaluator": null, + "LLMJudgeStrictJSONSimilarityOutputEvaluator": null, + "TrajectoryEvaluator": null, + "CorrectOperatorEvaluator": null + } }, { - "id": "test-random-addition-using-mockito", - "name": "Test Random Addition Using Mockito", - "inputs": {"a": 1, "b": 1, "operator": "random"}, - "expectedOutput": {"result": 2}, - "expectedAgentBehavior": "", - "mockingStrategy": { - "type": "mockito", - "behaviors": [ - { - "function": "get_random_operator", - "arguments": { - "args": [], - "kwargs": {} - }, - "then": [ - { - "type": "return", - "value": {"result": "+"} - } - ] - } - ] + "id": "override", + "name": "Multiply", + "inputs": { + "a": 2, + "b": 4, + "operator": "*" }, - "evalSetId": "default-eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z" + "evaluationCriterias": { + "ContainsEvaluator": { + "searchText": "8" + }, + "CorrectOperatorEvaluator": { + "operator": "*" + }, + "ExactMatchEvaluator": { + "expectedOutput": { + "result": "8.0" + } + }, + "JsonSimilarityEvaluator": { + "expectedOutput": { + "result": 8.0 + } + }, + "LLMJudgeOutputEvaluator": { + "expectedOutput": { + "result": 8.0 + } + }, + "LLMJudgeStrictJSONSimilarityOutputEvaluator": { + "expectedOutput": { + "result": 8.0 + } + }, + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0" + } + } }, { - "id": 
"test-random-addition-using-llm", - "name": "Test Random Addition Using LLM", - "inputs": {"a": 1, "b": 1, "operator": "random"}, - "expectedOutput": {"result": 2}, - "expectedAgentBehavior": "", - "mockingStrategy": { - "type": "llm", - "prompt": "The random operator is '+'.", - "toolsToSimulate": [{"name": "get_random_operator"}], - "model": { - "model": "gpt-4o-mini-2024-07-18", - "temperature": 0 - } + "id": "skip", + "name": "Skip denial code check", + "inputs": { + "a": 1, + "b": 1, + "operator": "+" }, - "evalSetId": "default-eval-set-id", - "createdAt": "2025-09-04T18:54:58.378Z", - "updatedAt": "2025-09-04T18:55:55.416Z" + "evaluationCriterias": {} } - ], - "modelSettings": [], - "createdAt": "2025-09-04T18:54:58.379Z", - "updatedAt": "2025-09-04T18:55:55.416Z" + ] } diff --git a/samples/calculator/evals/eval-sets/legacy.json b/samples/calculator/evals/eval-sets/legacy.json new file mode 100644 index 000000000..e2274e996 --- /dev/null +++ b/samples/calculator/evals/eval-sets/legacy.json @@ -0,0 +1,98 @@ +{ + "fileName": "default.json", + "id": "default-eval-set-id", + "name": "Basic Calculator Evaluation Set", + "batchSize": 10, + "evaluatorRefs": [ + "equality", + "llm-as-a-judge", + "json-similarity", + "trajectory" + ], + "evaluations": [ + { + "id": "test-addition", + "name": "Test Addition", + "inputs": { + "a": 1, + "b": 1, + "operator": "+" + }, + "expectedOutput": { + "result": 2 + }, + "expectedAgentBehavior": "", + "evalSetId": "default-eval-set-id", + "createdAt": "2025-09-04T18:54:58.378Z", + "updatedAt": "2025-09-04T18:55:55.416Z" + }, + { + "id": "test-random-addition-using-mockito", + "name": "Test Random Addition Using Mockito", + "inputs": { + "a": 1, + "b": 1, + "operator": "random" + }, + "expectedOutput": { + "result": 2 + }, + "expectedAgentBehavior": "", + "mockingStrategy": { + "type": "mockito", + "behaviors": [ + { + "function": "get_random_operator", + "arguments": { + "args": [], + "kwargs": {} + }, + "then": [ + { + "type": "return", + "value": { + "result": "+" + } + } + ] + } + ] + }, + "evalSetId": "default-eval-set-id", + "createdAt": "2025-09-04T18:54:58.378Z", + "updatedAt": "2025-09-04T18:55:55.416Z" + }, + { + "id": "test-random-addition-using-llm", + "name": "Test Random Addition Using LLM", + "inputs": { + "a": 1, + "b": 1, + "operator": "random" + }, + "expectedOutput": { + "result": 2 + }, + "expectedAgentBehavior": "", + "mockingStrategy": { + "type": "llm", + "prompt": "The random operator is '+'.", + "toolsToSimulate": [ + { + "name": "get_random_operator" + } + ], + "model": { + "model": "gpt-4o-mini-2024-07-18", + "temperature": 0 + } + }, + "evalSetId": "default-eval-set-id", + "createdAt": "2025-09-04T18:54:58.378Z", + "updatedAt": "2025-09-04T18:55:55.416Z" + } + ], + "modelSettings": [], + "createdAt": "2025-09-04T18:54:58.379Z", + "updatedAt": "2025-09-04T18:55:55.416Z" +} diff --git a/samples/calculator/evals/evaluators/contains.json b/samples/calculator/evals/evaluators/contains.json new file mode 100644 index 000000000..e73655257 --- /dev/null +++ b/samples/calculator/evals/evaluators/contains.json @@ -0,0 +1,15 @@ +{ + "version": "1.0", + "id": "ContainsEvaluator", + "description": "Checks if the response text includes the expected denial code.", + "evaluatorTypeId": "uipath-contains", + "evaluatorConfig": { + "name": "ContainsEvaluator", + "targetOutputKey": "result", + "negated": false, + "ignoreCase": false, + "defaultEvaluationCriteria": { + "searchText": "5" + } + } +} diff --git 
a/samples/calculator/evals/evaluators/correct-operator-evaluator.json b/samples/calculator/evals/evaluators/correct-operator-evaluator.json new file mode 100644 index 000000000..86dbfbb87 --- /dev/null +++ b/samples/calculator/evals/evaluators/correct-operator-evaluator.json @@ -0,0 +1,14 @@ +{ + "version": "1.0", + "id": "CorrectOperatorEvaluator", + "evaluatorTypeId": "file://types/correct-operator-evaluator-types.json", + "evaluatorSchema": "file://correct_operator.py:CorrectOperatorEvaluator", + "description": "A custom evaluator that checks if the correct operator is being used by the agent ", + "evaluatorConfig": { + "name": "CorrectOperatorEvaluator", + "defaultEvaluationCriteria": { + "operator": "+" + }, + "negated": false + } +} \ No newline at end of file diff --git a/samples/calculator/evals/evaluators/custom/correct_operator.py b/samples/calculator/evals/evaluators/custom/correct_operator.py new file mode 100644 index 000000000..0aaec37d0 --- /dev/null +++ b/samples/calculator/evals/evaluators/custom/correct_operator.py @@ -0,0 +1,46 @@ +import json + +from uipath.eval.evaluators import BaseEvaluator, BaseEvaluationCriteria, BaseEvaluatorConfig +from uipath.eval.models import AgentExecution, EvaluationResult, NumericEvaluationResult +from opentelemetry.sdk.trace import ReadableSpan + +class CorrectOperatorEvaluationCriteria(BaseEvaluationCriteria): + """Evaluation criteria for the contains evaluator.""" + + operator: str + +class CorrectOperatorEvaluatorConfig(BaseEvaluatorConfig[CorrectOperatorEvaluationCriteria]): + """Configuration for the contains evaluator.""" + + name: str = "CorrectOperatorEvaluator" + negated: bool = False + default_evaluation_criteria: CorrectOperatorEvaluationCriteria = CorrectOperatorEvaluationCriteria(operator="+") + +class CorrectOperatorEvaluator(BaseEvaluator[CorrectOperatorEvaluationCriteria, CorrectOperatorEvaluatorConfig, type(None)]): + """A custom evaluator that checks if the correct operator is being used by the agent """ + + def extract_operator_from_spans(self, agent_trace: list[ReadableSpan]) -> str: + for span in agent_trace: + if span.name == "track_operator": + if span.attributes: + input_value_as_str = span.attributes.get("input.value", "{}") + assert isinstance(input_value_as_str, str) + input_value = json.loads(input_value_as_str) + return input_value.get("operator") + raise Exception(f"No 'track_operator' span found") + + + @classmethod + def get_evaluator_id(cls) -> str: + return "CorrectOperatorEvaluator" + + + async def evaluate(self, agent_execution: AgentExecution, evaluation_criteria: CorrectOperatorEvaluationCriteria) -> EvaluationResult: + actual_operator = self.extract_operator_from_spans(agent_execution.agent_trace) + print(actual_operator) + is_expected_operator = evaluation_criteria.operator == actual_operator + if self.evaluator_config.negated: + is_expected_operator = not is_expected_operator + return NumericEvaluationResult( + score=float(is_expected_operator), + ) diff --git a/samples/calculator/evals/evaluators/custom/types/correct-operator-evaluator-types.json b/samples/calculator/evals/evaluators/custom/types/correct-operator-evaluator-types.json new file mode 100644 index 000000000..af810a7f8 --- /dev/null +++ b/samples/calculator/evals/evaluators/custom/types/correct-operator-evaluator-types.json @@ -0,0 +1,57 @@ +{ + "evaluatorTypeId": "CorrectOperatorEvaluator", + "evaluatorConfigSchema": { + "$defs": { + "CorrectOperatorEvaluationCriteria": { + "description": "Evaluation criteria for the contains 
evaluator.", + "properties": { + "operator": { + "title": "Operator", + "type": "string" + } + }, + "required": [ + "operator" + ], + "title": "CorrectOperatorEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the contains evaluator.", + "properties": { + "name": { + "default": "CorrectOperatorEvaluator", + "title": "Name", + "type": "string" + }, + "defaultEvaluationCriteria": { + "$ref": "#/$defs/CorrectOperatorEvaluationCriteria", + "default": { + "operator": "+" + } + }, + "negated": { + "default": false, + "title": "Negated", + "type": "boolean" + } + }, + "title": "CorrectOperatorEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Evaluation criteria for the contains evaluator.", + "properties": { + "operator": { + "title": "Operator", + "type": "string" + } + }, + "required": [ + "operator" + ], + "title": "CorrectOperatorEvaluationCriteria", + "type": "object" + }, + "justificationSchema": {} +} \ No newline at end of file diff --git a/samples/calculator/evals/evaluators/exact-match.json b/samples/calculator/evals/evaluators/exact-match.json new file mode 100644 index 000000000..6e5c5fca1 --- /dev/null +++ b/samples/calculator/evals/evaluators/exact-match.json @@ -0,0 +1,17 @@ +{ + "version": "1.0", + "id": "ExactMatchEvaluator", + "description": "Checks if the response text exactly matches the expected value.", + "evaluatorTypeId": "uipath-exact-match", + "evaluatorConfig": { + "name": "ExactMatchEvaluator", + "targetOutputKey": "result", + "negated": false, + "ignoreCase": false, + "defaultEvaluationCriteria": { + "expectedOutput": { + "result": "5.0" + } + } + } +} diff --git a/samples/calculator/evals/evaluators/json-similarity.json b/samples/calculator/evals/evaluators/json-similarity.json new file mode 100644 index 000000000..767b9c940 --- /dev/null +++ b/samples/calculator/evals/evaluators/json-similarity.json @@ -0,0 +1,15 @@ +{ + "version": "1.0", + "id": "JsonSimilarityEvaluator", + "description": "Checks if the response JSON is similar to the expected JSON structure.", + "evaluatorTypeId": "uipath-json-similarity", + "evaluatorConfig": { + "name": "JsonSimilarityEvaluator", + "targetOutputKey": "*", + "defaultEvaluationCriteria": { + "expectedOutput": { + "result": 5.0 + } + } + } +} diff --git a/samples/calculator/evals/evaluators/equality.json b/samples/calculator/evals/evaluators/legacy-equality.json similarity index 100% rename from samples/calculator/evals/evaluators/equality.json rename to samples/calculator/evals/evaluators/legacy-equality.json diff --git a/samples/calculator/evals/evaluators/legacy-json-similarity.json b/samples/calculator/evals/evaluators/legacy-json-similarity.json new file mode 100644 index 000000000..dd1fca355 --- /dev/null +++ b/samples/calculator/evals/evaluators/legacy-json-similarity.json @@ -0,0 +1,11 @@ +{ + "fileName": "json-similarity.json", + "id": "json-similarity", + "name": "JSON Similarity Evaluator", + "description": "An evaluator that compares JSON structures with tolerance for numeric and string differences.", + "category": 0, + "type": 6, + "targetOutputKey": "*", + "createdAt": "2025-06-26T17:45:39.651Z", + "updatedAt": "2025-06-26T17:45:39.651Z" +} diff --git a/samples/calculator/evals/evaluators/llm-as-a-judge.json b/samples/calculator/evals/evaluators/legacy-llm-as-a-judge.json similarity index 100% rename from samples/calculator/evals/evaluators/llm-as-a-judge.json rename to samples/calculator/evals/evaluators/legacy-llm-as-a-judge.json diff --git 
a/samples/calculator/evals/evaluators/legacy-trajectory.json b/samples/calculator/evals/evaluators/legacy-trajectory.json new file mode 100644 index 000000000..0da184260 --- /dev/null +++ b/samples/calculator/evals/evaluators/legacy-trajectory.json @@ -0,0 +1,13 @@ +{ + "fileName": "trajectory.json", + "id": "trajectory", + "name": "Trajectory Evaluator", + "description": "An evaluator that analyzes the execution trajectory and decision sequence taken by the agent.", + "category": 3, + "type": 7, + "prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.", + "model": "gpt-4o-mini", + "targetOutputKey": "*", + "createdAt": "2025-06-26T17:45:39.651Z", + "updatedAt": "2025-06-26T17:45:39.651Z" +} diff --git a/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json b/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json new file mode 100644 index 000000000..900d85c67 --- /dev/null +++ b/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json @@ -0,0 +1,18 @@ +{ + "version": "1.0", + "id": "LLMJudgeOutputEvaluator", + "description": "Uses an LLM to judge semantic similarity between expected and actual output.", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "evaluatorConfig": { + "name": "LLMJudgeOutputEvaluator", + "targetOutputKey": "*", + "model": "gpt-4.1-2025-04-14", + "prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.", + "temperature": 0.0, + "defaultEvaluationCriteria": { + "expectedOutput": { + "result": 5.0 + } + } + } +} diff --git a/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json b/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json new file mode 100644 index 000000000..2dcd94989 --- /dev/null +++ b/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json @@ -0,0 +1,18 @@ +{ + "version": "1.0", + "id": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "description": "Uses an LLM to judge strict JSON similarity between expected and actual output.", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "evaluatorConfig": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "targetOutputKey": "*", + "model": "gpt-4.1-2025-04-14", + "prompt": "Compare the following JSON outputs for strict structural similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nEvaluate if the JSON structure and values match precisely. 
Provide a score from 0-100 where 100 means exact match and 0 means completely different.", + "temperature": 0.0, + "defaultEvaluationCriteria": { + "expectedOutput": { + "result": 5.0 + } + } + } +} diff --git a/samples/calculator/evals/evaluators/trajectory.json b/samples/calculator/evals/evaluators/trajectory.json new file mode 100644 index 000000000..2924d8a41 --- /dev/null +++ b/samples/calculator/evals/evaluators/trajectory.json @@ -0,0 +1,15 @@ +{ + "version": "1.0", + "id": "TrajectoryEvaluator", + "description": "Evaluates the agent's execution trajectory and decision sequence.", + "evaluatorTypeId": "uipath-llm-judge-trajectory-similarity", + "evaluatorConfig": { + "name": "TrajectoryEvaluator", + "model": "gpt-4.1-2025-04-14", + "prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory.", + "temperature": 0.0, + "defaultEvaluationCriteria": { + "expectedAgentBehavior": "The agent should correctly perform the calculation and return the result." + } + } +} diff --git a/samples/calculator/main.py b/samples/calculator/main.py index 31eb43af0..a124cba17 100644 --- a/samples/calculator/main.py +++ b/samples/calculator/main.py @@ -1,11 +1,11 @@ +import logging import random +from enum import Enum from pydantic.dataclasses import dataclass -from enum import Enum -from uipath.eval.mocks import mockable, ExampleCall +from uipath.eval.mocks import ExampleCall, mockable from uipath.tracing import traced -import logging logger = logging.getLogger(__name__) @@ -39,6 +39,9 @@ async def get_random_operator() -> Wrapper: """Get a random operator.""" return Wrapper(result=random.choice([Operator.ADD, Operator.SUBTRACT, Operator.MULTIPLY, Operator.DIVIDE])) +@traced(name="track_operator") +def track_operator(operator: Operator): + pass @traced() async def main(input: CalculatorInput) -> CalculatorOutput: @@ -46,6 +49,7 @@ async def main(input: CalculatorInput) -> CalculatorOutput: operator = (await get_random_operator()).result else: operator = input.operator + track_operator(operator) match operator: case Operator.ADD: result = input.a + input.b case Operator.SUBTRACT: result = input.a - input.b diff --git a/samples/weather_tools/README.md b/samples/weather_tools/README.md new file mode 100644 index 000000000..a840a3c5b --- /dev/null +++ b/samples/weather_tools/README.md @@ -0,0 +1,201 @@ +# Weather Tools Mocked Agent + +A sample mocked agent demonstrating multiple tool calls with trajectory evaluation and tool call evaluators. + +## Overview + +This is a **mocked agent** designed for testing and demonstration purposes. It does not make real weather API calls. Instead, it returns simulated weather data from hardcoded values to demonstrate: + +- How to structure tools with proper tracing for trajectory evaluation +- How multiple tool calls are captured and validated +- How tool call evaluators verify tool usage, arguments, and outputs +- Best practices for integrating mocked tools with UiPath's evaluation framework +- Custom serialization with content wrapper pattern + +All weather data is simulated for five cities (New York, London, Tokyo, Paris, Sydney) with predefined responses. + +## Tools + +The agent provides five mocked tools that return simulated data: + +1. **get_temperature** - Returns simulated temperature in fahrenheit +2. 
**get_weather_condition** - Returns simulated weather condition (sunny, rainy, etc.) +3. **get_humidity** - Returns simulated humidity percentage +4. **get_forecast** - Returns simulated weather forecast text +5. **get_weather_alerts** - Returns simulated weather alerts + +**Note:** All tools return hardcoded responses wrapped in a `{"content": {...}}` structure for demonstration purposes. No actual weather APIs are called. + +## Data Models + +### Input Model +```python +@dataclass +class WeatherInput: + city: City # Enum: NEW_YORK, LONDON, TOKYO, PARIS, SYDNEY + action: Literal["get_weather", "get_forecast", "get_alerts"] +``` + +### Output Model +```python +class WeatherOutput(_WeatherOutputContent): + content: _WeatherOutputContent # Wraps all data under "content" key + +class _WeatherOutputContent(BaseModel): + city: str + temperature: float + condition: WeatherCondition # Enum: SUNNY, CLOUDY, RAINY, SNOWY + humidity: int + forecast: str | None = None + alerts: list[str] | None = None +``` + +## Multiple Tool Calls + +The agent demonstrates multiple tool calls in a single execution: + +### Example: "get_weather" action +``` +1. get_temperature("New York") -> {"content": {"temperature": 72.5, "unit": "fahrenheit"}} +2. get_weather_condition("New York") -> {"content": {"condition": "sunny"}} +3. get_humidity("New York") -> {"content": {"humidity": 60}} +``` + +### Example: "get_forecast" action +``` +1. get_temperature("Paris") -> {"content": {"temperature": 18.0, "unit": "fahrenheit"}} +2. get_weather_condition("Paris") -> {"content": {"condition": "cloudy"}} +3. get_humidity("Paris") -> {"content": {"humidity": 70}} +4. get_forecast("Paris") -> {"content": {"forecast": "Cloudy with a chance of rain..."}} +``` + +### Example: "get_alerts" action +``` +1. get_temperature("London") -> {"content": {"temperature": 15.0, "unit": "fahrenheit"}} +2. get_weather_condition("London") -> {"content": {"condition": "rainy"}} +3. get_humidity("London") -> {"content": {"humidity": 80}} +4. get_weather_alerts("London") -> {"content": {"alerts": ["Heavy rain warning until 6 PM"]}} +``` + +## Trajectory Evaluation + +Each tool call creates its own OTEL span with the `tool.name` attribute set, allowing UiPath's trajectory evaluation to extract: + +### Tool Call Sequence +The evaluator extracts tool names in order: +```python +["get_temperature", "get_weather_condition", "get_humidity", "get_forecast"] +``` + +### Tool Arguments +Each tool's input arguments are captured: +```python +ToolCall(name="get_temperature", args={"city": "New York"}) +ToolCall(name="get_weather_condition", args={"city": "New York"}) +... +``` + +### Tool Outputs +Each tool's output is captured with content wrapper: +```python +ToolOutput(name="get_temperature", output='{"content": {"temperature": 72.5, "unit": "fahrenheit"}}') +ToolOutput(name="get_weather_condition", output='{"content": {"condition": "sunny"}}') +... +``` + +## Implementation Details + +### Decorator Stack +Each mocked tool uses a specific decorator order to ensure proper tracing: + +```python +@traced() # Creates OTEL span for tracing +@mockable(example_calls=...) 
# Provides mock data during evaluation +@mock_tool_span # Innermost - sets tool.name attribute +async def get_temperature(city: str) -> dict: + """Returns simulated temperature data""" + city_enum = City(city) + temps = {City.NEW_YORK: 72.5, City.LONDON: 15.0, ...} + return {"content": {"temperature": temps.get(city_enum, 20.0), "unit": "fahrenheit"}} +``` + +### Tool Invocation +Mocked tools are invoked directly as async functions (not LangChain tools): + +```python +temp_data = await get_temperature(city) +``` + +This ensures: +1. `@traced()` creates an OTEL span for the tool call +2. `@mockable()` can provide mock responses during evaluation +3. `@mock_tool_span` sets the `tool.name` attribute on the span +4. The trajectory evaluator can extract the tool call with its arguments and output +5. Simulated data is returned from hardcoded dictionaries with content wrapper + +### Content Wrapper Pattern +All tool outputs and final agent output use a consistent `{"content": {...}}` structure: +- Tool outputs: `{"content": {"temperature": 72.5, "unit": "fahrenheit"}}` +- Agent output: `{"content": {"city": "NYC", "temperature": 72.5, ...}}` + +This pattern ensures consistent serialization and makes it easy to extract the actual data from the wrapper. + +## Running Evaluations + +### Basic Evaluation +Run the evaluation to test the mocked agent's behavior: + +```bash +uv run uipath eval samples/weather_tools/main.py samples/weather_tools/evals/eval-sets/default.json --workers 1 +``` + +### Evaluation Output +The evaluators will verify the mocked agent's behavior: +- ✅ **Trajectory evaluation**: Validates tool call sequence and orchestration logic +- ✅ **Tool call count**: Verifies correct number of each tool call +- ✅ **Tool call order**: Ensures tools are called in the expected sequence +- ✅ **Tool call args**: Validates arguments passed to each tool +- ✅ **Tool call output**: Checks that tool outputs match expectations with content wrapper +- ✅ **JSON similarity**: Compares final agent output structure +- ✅ **Exact match**: Validates specific output values + +## Test Cases + +The eval set includes 5 test cases covering: +1. Basic weather check (3 tool calls) +2. Weather with forecast (4 tool calls) +3. Weather with alerts (4 tool calls) +4. Sunny weather conditions (3 tool calls) +5. Tokyo forecast sequence validation (4 tool calls) + +Each test case validates that the agent calls the correct tools in the right order with proper arguments and content-wrapped outputs. 
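+
+## Sketch: Adding Another Mocked Tool
+
+As a rough sketch of the same pattern, a new tool can reuse the decorator stack and content wrapper described above. The tool name `get_wind_speed`, its values, and the `mph` unit are purely illustrative and not part of this sample, and the sketch assumes the `mock_tool_span` helper is imported from this sample's `main.py`:
+
+```python
+from uipath.eval.mocks import ExampleCall, mockable
+from uipath.tracing import traced
+
+from main import mock_tool_span  # assumes the helper defined in this sample's main.py
+
+# Hypothetical mock example call for the illustrative tool
+GET_WIND_SPEED_EXAMPLES = [
+    ExampleCall(
+        id="example1",
+        input='{"city": "Sydney"}',
+        output='{"wind_speed": 14.0, "unit": "mph"}'
+    )
+]
+
+@traced()                                         # creates the OTEL span for tracing
+@mockable(example_calls=GET_WIND_SPEED_EXAMPLES)  # provides mock data during evaluation
+@mock_tool_span                                   # innermost - sets tool.name on the span
+async def get_wind_speed(city: str) -> dict:
+    """Return a simulated wind speed for a city, wrapped in the content structure."""
+    speeds = {"New York": 10.0, "London": 18.0, "Tokyo": 8.0, "Paris": 12.0, "Sydney": 14.0}
+    return {"content": {"wind_speed": speeds.get(city, 10.0), "unit": "mph"}}
+```
+
+The new tool can then be awaited directly from `main` like the existing tools, and referenced by name in the tool call evaluators.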
+ +## Usage Examples + +### Running the Agent +```python +from main import main, WeatherInput, City + +# Basic weather check +input_data = WeatherInput(city=City.NEW_YORK, action="get_weather") +result = await main(input_data) +print(result.model_dump()) # {"content": {"city": "New York", "temperature": 72.5, ...}} + +# Weather with forecast +input_data = WeatherInput(city=City.PARIS, action="get_forecast") +result = await main(input_data) +print(result.model_dump()) # Includes forecast in content +``` + +### Custom Serialization +The `WeatherOutput` class includes custom serialization methods: +```python +# Get content-wrapped dictionary +data = result.model_dump() + +# Get JSON string with content wrapper +json_str = result.to_json() + +# Exclude None values +data_clean = result.model_dump(exclude_none=True) +``` diff --git a/samples/weather_tools/TOOL_EVALUATORS.md b/samples/weather_tools/TOOL_EVALUATORS.md new file mode 100644 index 000000000..efabebebb --- /dev/null +++ b/samples/weather_tools/TOOL_EVALUATORS.md @@ -0,0 +1,322 @@ +# Tool Call Evaluators + +This document explains the tool call evaluators available in the weather_tools sample and how to use them for trajectory evaluation. + +## Overview + +Tool call evaluators validate specific aspects of how tools are invoked during agent execution. They extract tool information from OpenTelemetry spans and compare against expected criteria. + +## Available Evaluators + +### 1. ToolCallCountEvaluator + +**Purpose**: Validates that tools are called the expected number of times. + +**Configuration**: `evals/evaluators/tool-call-count.json` + +**Example Usage**: +```json +"ToolCallCountEvaluator": { + "toolCallsCount": { + "get_temperature": ["=", 1], + "get_weather_condition": ["=", 1], + "get_humidity": ["=", 1] + } +} +``` + +**Supported Operators**: +- `"="` - Exactly equal to +- `">"` - Greater than +- `"<"` - Less than +- `">="` - Greater than or equal to +- `"<="` - Less than or equal to +- `"!="` - Not equal to + +**Use Cases**: +- Ensure a tool is called exactly once +- Verify a tool is called at least N times +- Validate a tool is not called more than N times + +### 2. ToolCallOrderEvaluator + +**Purpose**: Validates that tools are called in the correct sequence. + +**Configuration**: `evals/evaluators/tool-call-order.json` + +**Example Usage**: +```json +"ToolCallOrderEvaluator": { + "toolCallsOrder": [ + "get_temperature", + "get_weather_condition", + "get_humidity", + "get_forecast" + ] +} +``` + +**Behavior**: +- Uses Longest Common Subsequence (LCS) algorithm +- Allows partial matches (non-strict mode by default) +- Returns score from 0.0 to 1.0 based on order similarity + +**Use Cases**: +- Validate critical operations happen in sequence +- Ensure dependencies are respected (e.g., auth before data fetch) +- Verify optimization patterns (e.g., caching checks before computation) + +### 3. ToolCallArgsEvaluator + +**Purpose**: Validates that tools are called with correct arguments. 
+ +**Configuration**: `evals/evaluators/tool-call-args.json` + +**Example Usage**: +```json +"ToolCallArgsEvaluator": { + "toolCalls": [ + { + "name": "get_temperature", + "args": {"city": "New York"} + }, + { + "name": "get_weather_condition", + "args": {"city": "New York"} + } + ] +} +``` + +**Modes**: +- **Subset Mode** (default: `true`): Expected args must be present but can have additional args +- **Exact Mode** (`subset: false`): Args must match exactly + +**Use Cases**: +- Validate correct parameters are passed +- Ensure data consistency across tool calls +- Verify input transformation logic + +### 4. ToolCallOutputEvaluator + +**Purpose**: Validates that tools produce expected outputs. + +**Configuration**: `evals/evaluators/tool-call-output.json` + +**Example Usage**: +```json +"ToolCallOutputEvaluator": { + "toolOutputs": [ + { + "name": "get_temperature", + "output": "{'temperature': 25.0, 'unit': 'fahrenheit'}" + }, + { + "name": "get_forecast", + "output": "{'forecast': 'Overcast with mild temperatures'}" + } + ] +} +``` + +**Behavior**: +- Compares output strings exactly +- Output must be JSON-serialized string +- Returns 1.0 for exact match, 0.0 otherwise +- **Note**: Current implementation uses single quotes for Python dict format + +**Use Cases**: +- Validate tool output format +- Ensure deterministic tool behavior +- Verify data transformations + +## Complete Example + +### Test Case: "tokyo_forecast" + +This test validates all aspects of tool usage for fetching Tokyo's weather forecast: + +```json +{ + "id": "tokyo_forecast", + "name": "Tokyo Weather Forecast", + "inputs": { + "city": "Tokyo", + "action": "get_forecast" + }, + "evaluationCriterias": { + "ToolCallCountEvaluator": { + "toolCallsCount": { + "get_temperature": ["=", 1], + "get_weather_condition": ["=", 1], + "get_humidity": ["=", 1], + "get_forecast": ["=", 1] + } + }, + "ToolCallOrderEvaluator": { + "toolCallsOrder": [ + "get_temperature", + "get_weather_condition", + "get_humidity", + "get_forecast" + ] + }, + "ToolCallArgsEvaluator": { + "toolCalls": [ + { + "name": "get_temperature", + "args": {"city": "Tokyo"} + }, + { + "name": "get_weather_condition", + "args": {"city": "Tokyo"} + }, + { + "name": "get_humidity", + "args": {"city": "Tokyo"} + }, + { + "name": "get_forecast", + "args": {"city": "Tokyo"} + } + ] + }, + "ToolCallOutputEvaluator": { + "toolOutputs": [ + { + "name": "get_temperature", + "output": "{'temperature': 25.0, 'unit': 'fahrenheit'}" + }, + { + "name": "get_weather_condition", + "output": "{'condition': 'cloudy'}" + }, + { + "name": "get_humidity", + "output": "{'humidity': 65}" + }, + { + "name": "get_forecast", + "output": "{'forecast': 'Overcast with mild temperatures'}" + } + ] + } + } +} +``` + +### What This Validates + +1. **Count**: Each tool is called exactly once ✓ +2. **Order**: Tools are called in the correct sequence ✓ +3. **Args**: All tools receive "Tokyo" as the city argument ✓ +4. **Output**: All tools return the expected data ✓ + +## How Tool Calls Are Extracted + +Tool calls are extracted from OpenTelemetry spans that have the `tool.name` attribute set. 
The weather_tools agent uses the `@mock_tool_span` decorator to ensure each tool invocation creates a span with: + +- `tool.name`: The tool function name +- `input.value`: JSON-serialized tool arguments +- `output.value`: JSON-serialized tool output + +### Current Implementation + +The weather_tools sample uses direct async function calls (not LangChain tools) with the following decorator stack: + +```python +@traced() # Creates OTEL span for tracing +@mockable(example_calls=...) # Provides mock data during evaluation +@mock_tool_span # Sets tool.name attribute on span +async def get_temperature(city: str) -> dict: + return {"content": {"temperature": 72.5, "unit": "fahrenheit"}} +``` + +### Content Wrapper Pattern + +All tool outputs use a consistent `{"content": {...}}` structure: +- Tool outputs: `{"content": {"temperature": 72.5, "unit": "fahrenheit"}}` +- This ensures consistent serialization and makes data extraction predictable + +## Running Evaluations + +Execute all evaluators including tool call evaluators: + +```bash +uv run uipath eval samples/weather_tools/main.py samples/weather_tools/evals/eval-sets/default.json --workers 1 +``` + +## Test Cases in Default Eval Set + +The `default.json` eval set includes 5 comprehensive test cases: + +1. **basic_weather** - Tests 3 tool calls (temperature, condition, humidity) +2. **weather_with_forecast** - Tests 4 tool calls including forecast +3. **weather_with_alerts** - Tests 4 tool calls including alerts +4. **sunny_weather** - Tests sunny weather conditions +5. **tokyo_forecast** - Tests Tokyo-specific forecast sequence + +Each test case validates: +- Correct tool call count +- Proper tool call order +- Accurate tool arguments +- Expected tool outputs + +## Best Practices + +### 1. Start with Order, Then Add Count +Begin with `ToolCallOrderEvaluator` to ensure correct sequencing, then add `ToolCallCountEvaluator` for precise counts. + +### 2. Use Subset Mode for Args +Unless you need exact matching, keep `subset: true` in args evaluator to allow flexibility. + +### 3. Selective Output Validation +Only validate outputs for critical tools - validating all outputs can be brittle. + +### 4. Combine with Trajectory Evaluator +Use `TrajectoryEvaluator` for high-level behavior validation alongside specific tool evaluators. + +### 5. Test Different Action Paths +Create separate test cases for different action types (get_weather, get_forecast, get_alerts) to validate all code paths. + +### 6. Content Wrapper Consistency +Ensure all tool outputs follow the `{"content": {...}}` pattern for consistent evaluation. 
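+
+## Sketch: What Extraction Sees in a Tool Span
+
+As a reference point for the troubleshooting notes below, the extraction flow described in "How Tool Calls Are Extracted" can be approximated as follows. This is an illustrative sketch, not the SDK's actual implementation; it assumes only the span attributes listed above (`tool.name`, `input.value`, `output.value`):
+
+```python
+import json
+
+from opentelemetry.sdk.trace import ReadableSpan
+
+
+def extract_tool_calls(agent_trace: list[ReadableSpan]) -> list[dict]:
+    """Collect tool calls from spans that carry the tool.name attribute."""
+    calls = []
+    for span in agent_trace:
+        attrs = span.attributes or {}
+        tool_name = attrs.get("tool.name")
+        if not tool_name:
+            continue  # not a tool span
+        calls.append({
+            "name": tool_name,                                    # e.g. "get_temperature"
+            "args": json.loads(attrs.get("input.value", "{}")),   # tool arguments
+            "output": attrs.get("output.value", ""),              # JSON-serialized output string
+        })
+    return calls
+```
+
+If an expected tool never shows up in the evaluator results, checking the trace for spans that lack the `tool.name` attribute (for example, a tool defined without `@mock_tool_span`) is usually the quickest diagnosis.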
+ +## Troubleshooting + +### Order Validation Failing + +**Problem**: Order evaluator score is low + +**Solutions**: +- Check if conditional tool calls are included in expected order +- Use trajectory evaluator to see actual execution sequence +- Consider if strict mode is appropriate for your use case + +### Args Validation Failing + +**Problem**: Args evaluator reports mismatches + +**Solutions**: +- Verify argument names match exactly (case-sensitive) +- Check if subset mode is appropriate +- Ensure arguments are JSON-serializable + +### Output Validation Failing + +**Problem**: Output evaluator reports mismatches + +**Solutions**: +- Ensure outputs are JSON-serialized strings +- Check for trailing whitespace or formatting differences +- Verify content wrapper structure is consistent +- Consider if output validation is too strict + +### Content Wrapper Issues + +**Problem**: Tool outputs don't match expected format + +**Solutions**: +- Ensure all tools return `{"content": {...}}` structure +- Check that serialization is consistent across tools +- Verify that evaluator expectations match actual output format diff --git a/samples/weather_tools/evals/eval-sets/default.json b/samples/weather_tools/evals/eval-sets/default.json new file mode 100644 index 000000000..faa0189ae --- /dev/null +++ b/samples/weather_tools/evals/eval-sets/default.json @@ -0,0 +1,375 @@ +{ + "version": "1.0", + "id": "WeatherToolsEval", + "name": "Weather Tools Agent Evaluation", + "evaluatorRefs": [ + "TrajectoryEvaluator", + "ToolCallCountEvaluator", + "ToolCallOrderEvaluator", + "ToolCallArgsEvaluator", + "ToolCallOutputEvaluator" + ], + "evaluations": [ + { + "id": "basic_weather", + "name": "Basic Weather Check", + "inputs": { + "city": "New York", + "action": "get_weather" + }, + "evaluationCriterias": { + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should call get_temperature, get_weather_condition, and get_humidity tools for New York, then return the combined weather data without forecast or alerts." + }, + "ToolCallCountEvaluator": { + "toolCallsCount": { + "get_temperature": [ + "=", + 1 + ], + "get_weather_condition": [ + "=", + 1 + ], + "get_humidity": [ + "=", + 1 + ] + } + }, + "ToolCallOrderEvaluator": { + "toolCallsOrder": [ + "get_temperature", + "get_weather_condition", + "get_humidity" + ] + }, + "ToolCallArgsEvaluator": { + "toolCalls": [ + { + "name": "get_temperature", + "args": { + "city": "New York" + } + }, + { + "name": "get_weather_condition", + "args": { + "city": "New York" + } + }, + { + "name": "get_humidity", + "args": { + "city": "New York" + } + } + ] + }, + "ToolCallOutputEvaluator": null + } + }, + { + "id": "weather_with_forecast", + "name": "Weather with Forecast", + "inputs": { + "city": "Paris", + "action": "get_forecast" + }, + "evaluationCriterias": { + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should call get_temperature, get_weather_condition, get_humidity, and get_forecast tools for Paris. It should return weather data including the forecast but not alerts." 
+ }, + "ToolCallCountEvaluator": { + "toolCallsCount": { + "get_temperature": [ + "=", + 1 + ], + "get_weather_condition": [ + "=", + 1 + ], + "get_humidity": [ + "=", + 1 + ], + "get_forecast": [ + "=", + 1 + ] + } + }, + "ToolCallOrderEvaluator": { + "toolCallsOrder": [ + "get_temperature", + "get_weather_condition", + "get_humidity", + "get_forecast" + ] + }, + "ToolCallArgsEvaluator": { + "toolCalls": [ + { + "name": "get_temperature", + "args": { + "city": "Paris" + } + }, + { + "name": "get_weather_condition", + "args": { + "city": "Paris" + } + }, + { + "name": "get_humidity", + "args": { + "city": "Paris" + } + }, + { + "name": "get_forecast", + "args": { + "city": "Paris" + } + } + ] + }, + "ToolCallOutputEvaluator": { + "toolOutputs": [ + { + "name": "get_forecast", + "output": "{'forecast': 'Cloudy with a chance of rain in the afternoon'}" + } + ] + } + } + }, + { + "id": "weather_with_alerts", + "name": "Weather with Alerts", + "inputs": { + "city": "London", + "action": "get_alerts" + }, + "evaluationCriterias": { + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should call get_temperature, get_weather_condition, get_humidity, and get_weather_alerts tools for London. It should return weather data including alerts but not forecast. The alerts should indicate heavy rain warning." + }, + "ToolCallCountEvaluator": { + "toolCallsCount": { + "get_temperature": [ + "=", + 1 + ], + "get_weather_condition": [ + "=", + 1 + ], + "get_humidity": [ + "=", + 1 + ], + "get_weather_alerts": [ + "=", + 1 + ] + } + }, + "ToolCallOrderEvaluator": { + "toolCallsOrder": [ + "get_temperature", + "get_weather_condition", + "get_humidity", + "get_weather_alerts" + ] + }, + "ToolCallArgsEvaluator": { + "toolCalls": [ + { + "name": "get_temperature", + "args": { + "city": "London" + } + }, + { + "name": "get_weather_condition", + "args": { + "city": "London" + } + }, + { + "name": "get_humidity", + "args": { + "city": "London" + } + }, + { + "name": "get_weather_alerts", + "args": { + "city": "London" + } + } + ] + }, + "ToolCallOutputEvaluator": { + "toolOutputs": [ + { + "name": "get_weather_alerts", + "output": "{'alerts': ['Heavy rain warning until 6 PM']}" + } + ] + } + } + }, + { + "id": "sunny_weather", + "name": "Sunny Weather Check", + "inputs": { + "city": "Sydney", + "action": "get_weather" + }, + "evaluationCriterias": { + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should call get_temperature, get_weather_condition, and get_humidity for Sydney, then return sunny weather conditions without forecast or alerts." + }, + "ToolCallCountEvaluator": { + "toolCallsCount": { + "get_temperature": [ + "=", + 1 + ], + "get_weather_condition": [ + "=", + 1 + ], + "get_humidity": [ + "=", + 1 + ] + } + }, + "ToolCallOrderEvaluator": { + "toolCallsOrder": [ + "get_temperature", + "get_weather_condition", + "get_humidity" + ] + }, + "ToolCallArgsEvaluator": { + "toolCalls": [ + { + "name": "get_temperature", + "args": { + "city": "Sydney" + } + }, + { + "name": "get_weather_condition", + "args": { + "city": "Sydney" + } + }, + { + "name": "get_humidity", + "args": { + "city": "Sydney" + } + } + ] + }, + "ToolCallOutputEvaluator": null + } + }, + { + "id": "tokyo_forecast", + "name": "Tokyo Weather Forecast", + "inputs": { + "city": "Tokyo", + "action": "get_forecast" + }, + "evaluationCriterias": { + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should sequentially call get_temperature, get_weather_condition, get_humidity, and get_forecast for Tokyo. 
The trajectory should show all four tool calls in the correct order, gathering basic weather data first, then the forecast." + }, + "ToolCallCountEvaluator": { + "toolCallsCount": { + "get_temperature": [ + "=", + 1 + ], + "get_weather_condition": [ + "=", + 1 + ], + "get_humidity": [ + "=", + 1 + ], + "get_forecast": [ + "=", + 1 + ] + } + }, + "ToolCallOrderEvaluator": { + "toolCallsOrder": [ + "get_temperature", + "get_weather_condition", + "get_humidity", + "get_forecast" + ] + }, + "ToolCallArgsEvaluator": { + "toolCalls": [ + { + "name": "get_temperature", + "args": { + "city": "Tokyo" + } + }, + { + "name": "get_weather_condition", + "args": { + "city": "Tokyo" + } + }, + { + "name": "get_humidity", + "args": { + "city": "Tokyo" + } + }, + { + "name": "get_forecast", + "args": { + "city": "Tokyo" + } + } + ] + }, + "ToolCallOutputEvaluator": { + "toolOutputs": [ + { + "name": "get_temperature", + "output": "{'temperature': 25.0, 'unit': 'fahrenheit'}" + }, + { + "name": "get_weather_condition", + "output": "{'condition': 'cloudy'}" + }, + { + "name": "get_humidity", + "output": "{'humidity': 65}" + }, + { + "name": "get_forecast", + "output": "{'forecast': 'Overcast with mild temperatures'}" + } + ] + } + } + } + ] +} diff --git a/samples/weather_tools/evals/evaluators/tool-call-args.json b/samples/weather_tools/evals/evaluators/tool-call-args.json new file mode 100644 index 000000000..6c5854301 --- /dev/null +++ b/samples/weather_tools/evals/evaluators/tool-call-args.json @@ -0,0 +1,14 @@ +{ + "version": "1.0", + "id": "ToolCallArgsEvaluator", + "description": "Evaluates if tool calls have the correct arguments.", + "evaluatorTypeId": "uipath-tool-call-args", + "evaluatorConfig": { + "name": "ToolCallArgsEvaluator", + "strict": false, + "subset": true, + "defaultEvaluationCriteria": { + "toolCalls": [] + } + } +} diff --git a/samples/weather_tools/evals/evaluators/tool-call-count.json b/samples/weather_tools/evals/evaluators/tool-call-count.json new file mode 100644 index 000000000..083d74214 --- /dev/null +++ b/samples/weather_tools/evals/evaluators/tool-call-count.json @@ -0,0 +1,26 @@ +{ + "version": "1.0", + "id": "ToolCallCountEvaluator", + "description": "Evaluates if the correct number of tool calls were made.", + "evaluatorTypeId": "uipath-tool-call-count", + "evaluatorConfig": { + "name": "ToolCallCountEvaluator", + "strict": false, + "defaultEvaluationCriteria": { + "toolCallsCount": { + "get_temperature": [ + "=", + 1 + ], + "get_weather_condition": [ + "=", + 1 + ], + "get_humidity": [ + "=", + 1 + ] + } + } + } +} diff --git a/samples/weather_tools/evals/evaluators/tool-call-order.json b/samples/weather_tools/evals/evaluators/tool-call-order.json new file mode 100644 index 000000000..dc2b352a7 --- /dev/null +++ b/samples/weather_tools/evals/evaluators/tool-call-order.json @@ -0,0 +1,13 @@ +{ + "version": "1.0", + "id": "ToolCallOrderEvaluator", + "description": "Evaluates if tools were called in the correct sequence.", + "evaluatorTypeId": "uipath-tool-call-order", + "evaluatorConfig": { + "name": "ToolCallOrderEvaluator", + "strict": false, + "defaultEvaluationCriteria": { + "toolCallsOrder": [] + } + } +} diff --git a/samples/weather_tools/evals/evaluators/tool-call-output.json b/samples/weather_tools/evals/evaluators/tool-call-output.json new file mode 100644 index 000000000..a0e32203c --- /dev/null +++ b/samples/weather_tools/evals/evaluators/tool-call-output.json @@ -0,0 +1,30 @@ +{ + "version": "1.0", + "id": "ToolCallOutputEvaluator", + "description": "Evaluates 
if tool calls produced the correct outputs.", + "evaluatorTypeId": "uipath-tool-call-output", + "evaluatorConfig": { + "name": "ToolCallOutputEvaluator", + "strict": false, + "defaultEvaluationCriteria": { + "toolOutputs": [ + { + "name": "get_temperature", + "output": "{'temperature': 25.0, 'unit': 'fahrenheit'}" + }, + { + "name": "get_weather_condition", + "output": "{'condition': 'cloudy'}" + }, + { + "name": "get_humidity", + "output": "{'humidity': 65}" + }, + { + "name": "get_forecast", + "output": "{'forecast': 'Overcast with mild temperatures'}" + } + ] + } + } +} diff --git a/samples/weather_tools/evals/evaluators/trajectory.json b/samples/weather_tools/evals/evaluators/trajectory.json new file mode 100644 index 000000000..743142467 --- /dev/null +++ b/samples/weather_tools/evals/evaluators/trajectory.json @@ -0,0 +1,15 @@ +{ + "version": "1.0", + "id": "TrajectoryEvaluator", + "description": "Evaluates the agent's execution trajectory and decision sequence for weather operations.", + "evaluatorTypeId": "uipath-llm-judge-trajectory-similarity", + "evaluatorConfig": { + "name": "TrajectoryEvaluator", + "model": "gpt-4.1-2025-04-14", + "prompt": "Evaluate the agent's execution trajectory based on the expected behavior.\n\nExpected Agent Behavior: {{ExpectedAgentBehavior}}\nAgent Run History: {{AgentRunHistory}}\n\nProvide a score from 0-100 based on how well the agent followed the expected trajectory, including:\n- Correct tool selection and sequencing\n- Appropriate data retrieval\n- Proper handling of different action types\n- Correct orchestration of multiple tool calls", + "temperature": 0.0, + "defaultEvaluationCriteria": { + "expectedAgentBehavior": "The agent should call the appropriate weather tools and return the correct weather information for the requested city and action." + } + } +} diff --git a/samples/weather_tools/main.py b/samples/weather_tools/main.py new file mode 100644 index 000000000..6edfc26cd --- /dev/null +++ b/samples/weather_tools/main.py @@ -0,0 +1,314 @@ +import asyncio +import logging +from enum import Enum +from functools import wraps +from typing import Callable, Literal, TypeVar + +from opentelemetry import trace +from pydantic import BaseModel +from pydantic.dataclasses import dataclass + +from uipath.eval.mocks import ExampleCall, mockable +from uipath.tracing import traced + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + + +def mock_tool_span(func: Callable[..., T]) -> Callable[..., T]: + """ + Decorator that wraps a function to set tool.name on the OTEL span. + + This decorator sets the tool.name attribute required by UiPath's trajectory + evaluation system to extract tool calls from traces. + + Usage: + @traced() # Creates OTEL span for tracing + @mockable(example_calls=...) # Adds mocking support + @mock_tool_span # Innermost - sets tool.name attribute + async def my_tool(arg: str) -> dict: + return {"result": "value"} + + Multiple Tool Calls: + Each tool invocation via .ainvoke() creates a separate span with its own + tool.name attribute. The trajectory evaluator will extract all tool calls + in sequence: + - get_temperature(...) -> span with tool.name="get_temperature" + - get_humidity(...) -> span with tool.name="get_humidity" + - get_forecast(...) 
-> span with tool.name="get_forecast" + """ + @wraps(func) + async def async_wrapper(*args, **kwargs): + # Get current span and set tool.name attribute + span = trace.get_current_span() + if span and span.is_recording(): + span.set_attribute("tool.name", func.__name__) + return await func(*args, **kwargs) + + @wraps(func) + def sync_wrapper(*args, **kwargs): + # Get current span and set tool.name attribute + span = trace.get_current_span() + if span and span.is_recording(): + span.set_attribute("tool.name", func.__name__) + return func(*args, **kwargs) + + # Detect if the function is async or sync + if asyncio.iscoroutinefunction(func): + return async_wrapper + else: + return sync_wrapper + + +class City(str, Enum): + NEW_YORK = "New York" + LONDON = "London" + TOKYO = "Tokyo" + PARIS = "Paris" + SYDNEY = "Sydney" + + +class WeatherCondition(str, Enum): + SUNNY = "sunny" + CLOUDY = "cloudy" + RAINY = "rainy" + SNOWY = "snowy" + + +@dataclass +class WeatherInput: + city: City + action: Literal["get_weather", "get_forecast", "get_alerts"] + + +class _WeatherOutputContent(BaseModel): + city: str = "" + temperature: float = 0.0 + condition: WeatherCondition = WeatherCondition.CLOUDY + humidity: int = 0 + forecast: str | None = None + alerts: list[str] | None = None + +class WeatherOutput(_WeatherOutputContent): + content: _WeatherOutputContent + +# Mock example for get_temperature tool +GET_TEMPERATURE_EXAMPLES = [ + ExampleCall( + id="example1", + input='{"city": "New York"}', + output='{"temperature": 72.5, "unit": "fahrenheit"}' + ) +] + +@traced() +@mockable(example_calls=GET_TEMPERATURE_EXAMPLES) +@mock_tool_span +async def get_temperature(city: str) -> dict: + """Get the current temperature for a city. + + Args: + city: The name of the city (e.g., "New York", "London", "Tokyo") + + Returns: + Dictionary with temperature in fahrenheit and unit + """ + # Convert string to City enum + city_enum = City(city) + + # Simulated temperature data + temps = { + City.NEW_YORK: 72.5, + City.LONDON: 15.0, + City.TOKYO: 25.0, + City.PARIS: 18.0, + City.SYDNEY: 22.0, + } + return {"content":{"temperature": temps.get(city_enum, 20.0), "unit": "fahrenheit"}} + + +GET_CONDITION_EXAMPLES = [ + ExampleCall( + id="example1", + input='{"city": "London"}', + output='{"condition": "rainy"}' + ) +] + +@traced() +@mockable(example_calls=GET_CONDITION_EXAMPLES) +@mock_tool_span +async def get_weather_condition(city: str) -> dict: + """Get the current weather condition for a city. + + Args: + city: The name of the city (e.g., "New York", "London", "Tokyo") + + Returns: + Dictionary with the current weather condition + """ + # Convert string to City enum + city_enum = City(city) + + # Simulated weather conditions + conditions = { + City.NEW_YORK: WeatherCondition.SUNNY, + City.LONDON: WeatherCondition.RAINY, + City.TOKYO: WeatherCondition.CLOUDY, + City.PARIS: WeatherCondition.CLOUDY, + City.SYDNEY: WeatherCondition.SUNNY, + } + return {"content":{"condition": conditions.get(city_enum, WeatherCondition.CLOUDY).value}} + + +GET_HUMIDITY_EXAMPLES = [ + ExampleCall( + id="example1", + input='{"city": "Tokyo"}', + output='{"humidity": 65}' + ) +] + +@traced() +@mockable(example_calls=GET_HUMIDITY_EXAMPLES) +@mock_tool_span +async def get_humidity(city: str) -> dict: + """Get the current humidity level for a city. 
+ + Args: + city: The name of the city (e.g., "New York", "London", "Tokyo") + + Returns: + Dictionary with the humidity percentage + """ + # Convert string to City enum + city_enum = City(city) + + # Simulated humidity data + humidity_levels = { + City.NEW_YORK: 60, + City.LONDON: 80, + City.TOKYO: 65, + City.PARIS: 70, + City.SYDNEY: 55, + } + return {"content":{"humidity": humidity_levels.get(city_enum, 60)}} + + +GET_FORECAST_EXAMPLES = [ + ExampleCall( + id="example1", + input='{"city": "Paris"}', + output='{"forecast": "Cloudy with a chance of rain in the afternoon"}' + ) +] + + +@traced() +@mockable(example_calls=GET_FORECAST_EXAMPLES) +@mock_tool_span +async def get_forecast(city: str) -> dict: + """Get the weather forecast for a city. + + Args: + city: The name of the city (e.g., "New York", "London", "Tokyo") + + Returns: + Dictionary with the weather forecast + """ + # Convert string to City enum + city_enum = City(city) + + # Simulated forecasts + forecasts = { + City.NEW_YORK: "Clear skies throughout the day", + City.LONDON: "Rainy with occasional breaks", + City.TOKYO: "Overcast with mild temperatures", + City.PARIS: "Cloudy with a chance of rain in the afternoon", + City.SYDNEY: "Sunny and warm", + } + return {"content":{"forecast": forecasts.get(city_enum, "No forecast available")}} + + +GET_ALERTS_EXAMPLES = [ + ExampleCall( + id="example1", + input='{"city": "London"}', + output='{"alerts": ["Heavy rain warning until 6 PM"]}' + ) +] + +@traced() +@mockable(example_calls=GET_ALERTS_EXAMPLES) +@mock_tool_span +async def get_weather_alerts(city: str) -> dict: + """Get weather alerts for a city. + + Args: + city: The name of the city (e.g., "New York", "London", "Tokyo") + + Returns: + Dictionary with a list of active weather alerts + """ + # Convert string to City enum + city_enum = City(city) + + # Simulated alerts + alerts = { + City.NEW_YORK: [], + City.LONDON: ["Heavy rain warning until 6 PM"], + City.TOKYO: [], + City.PARIS: [], + City.SYDNEY: ["UV index very high"], + } + return {"content":{"alerts": alerts.get(city_enum, [])}} + + +@traced() +async def main(input: WeatherInput) -> WeatherOutput: + """Main weather agent that orchestrates different weather tools. + + This agent demonstrates multiple tool calls in sequence. Each tool invocation + creates its own span with tool.name set, allowing trajectory evaluation to + extract the complete sequence of tool calls. + + Example trace for "get_weather" action: + 1. Span: tool.name="get_temperature", input={"city": "New York"}, output={"temperature": 72.5, ...} + 2. Span: tool.name="get_weather_condition", input={"city": "New York"}, output={"condition": "sunny"} + 3. 
Span: tool.name="get_humidity", input={"city": "New York"}, output={"humidity": 60} + """ + city = input.city.value # Get string value from enum + + # Multiple tool calls - each creates its own span with tool.name attribute + temp_data = await get_temperature(city) + condition_data = await get_weather_condition(city) + humidity_data = await get_humidity(city) + + forecast = None + alerts = None + + # Conditional tool calls based on action - each also creates its own span + # For "get_forecast": 4 total tool spans (temp, condition, humidity, forecast) + # For "get_alerts": 4 total tool spans (temp, condition, humidity, alerts) + # For "get_weather": 3 total tool spans (temp, condition, humidity) + if input.action == "get_forecast": + forecast_data = await get_forecast(city) + forecast = forecast_data["content"]["forecast"] + elif input.action == "get_alerts": + alerts_data = await get_weather_alerts(city) + alerts = alerts_data["content"]["alerts"] + elif input.action == "get_weather": + # For simple weather requests, just return basic info + pass + + return WeatherOutput( + content=_WeatherOutputContent( + city=city, + temperature=temp_data["content"]["temperature"], + condition=WeatherCondition(condition_data["content"]["condition"]), + humidity=humidity_data["content"]["humidity"], + forecast=forecast, + alerts=alerts, + ) + ) diff --git a/src/uipath/_cli/__init__.py b/src/uipath/_cli/__init__.py index 3f8a36e56..332218f29 100644 --- a/src/uipath/_cli/__init__.py +++ b/src/uipath/_cli/__init__.py @@ -4,6 +4,7 @@ import click from ._utils._common import add_cwd_to_path, load_environment_variables +from .cli_add import add as add from .cli_auth import auth as auth from .cli_debug import debug as debug # type: ignore from .cli_deploy import deploy as deploy # type: ignore @@ -16,6 +17,7 @@ from .cli_publish import publish as publish # type: ignore from .cli_pull import pull as pull # type: ignore from .cli_push import push as push # type: ignore +from .cli_register import register as register # type: ignore from .cli_run import run as run # type: ignore @@ -75,4 +77,6 @@ def cli(lv: bool, v: bool) -> None: cli.add_command(pull) cli.add_command(eval) cli.add_command(dev) +cli.add_command(add) +cli.add_command(register) cli.add_command(debug) diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py index e964c6d01..5d1d17f38 100644 --- a/src/uipath/_cli/_evals/_console_progress_reporter.py +++ b/src/uipath/_cli/_evals/_console_progress_reporter.py @@ -7,6 +7,7 @@ from rich.rule import Rule from rich.table import Table +from uipath._cli._evals._models._evaluation_set import AnyEvaluator from uipath._events._event_bus import EventBus from uipath._events._events import ( EvalRunCreatedEvent, @@ -15,7 +16,6 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from uipath.eval.evaluators import BaseEvaluator from uipath.eval.models import ScoreType logger = logging.getLogger(__name__) @@ -26,7 +26,7 @@ class ConsoleProgressReporter: def __init__(self): self.console = Console() - self.evaluators: Dict[str, BaseEvaluator[Any]] = {} + self.evaluators: Dict[str, AnyEvaluator] = {} self.display_started = False self.eval_results_by_name: Dict[str, list[Any]] = {} diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py index 18765078b..c0b362ee4 100644 --- a/src/uipath/_cli/_evals/_evaluator_factory.py +++ b/src/uipath/_cli/_evals/_evaluator_factory.py @@ -1,21 +1,69 @@ +import importlib.util 
+import sys +from pathlib import Path from typing import Any, Dict from pydantic import TypeAdapter +from uipath._cli._evals._helpers import try_extract_file_and_class_name # type: ignore +from uipath._cli._evals._models._evaluation_set import AnyEvaluator from uipath._cli._evals._models._evaluator import ( EqualsEvaluatorParams, - Evaluator, + EvaluatorConfig, JsonSimilarityEvaluatorParams, + LegacyEvaluator, LLMEvaluatorParams, TrajectoryEvaluatorParams, ) from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams from uipath.eval.evaluators import ( BaseEvaluator, + LegacyBaseEvaluator, + LegacyExactMatchEvaluator, + LegacyJsonSimilarityEvaluator, + LegacyLlmAsAJudgeEvaluator, + LegacyTrajectoryEvaluator, +) +from uipath.eval.evaluators.base_evaluator import BaseEvaluatorConfig +from uipath.eval.evaluators.contains_evaluator import ( + ContainsEvaluator, + ContainsEvaluatorConfig, +) +from uipath.eval.evaluators.exact_match_evaluator import ( ExactMatchEvaluator, + ExactMatchEvaluatorConfig, +) +from uipath.eval.evaluators.json_similarity_evaluator import ( JsonSimilarityEvaluator, - LlmAsAJudgeEvaluator, - TrajectoryEvaluator, + JsonSimilarityEvaluatorConfig, +) +from uipath.eval.evaluators.llm_judge_output_evaluator import ( + LLMJudgeOutputEvaluator, + LLMJudgeOutputEvaluatorConfig, + LLMJudgeStrictJSONSimilarityOutputEvaluator, + LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig, +) +from uipath.eval.evaluators.llm_judge_trajectory_evaluator import ( + LLMJudgeTrajectoryEvaluator, + LLMJudgeTrajectoryEvaluatorConfig, + LLMJudgeTrajectorySimulationEvaluator, + LLMJudgeTrajectorySimulationEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_args_evaluator import ( + ToolCallArgsEvaluator, + ToolCallArgsEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_count_evaluator import ( + ToolCallCountEvaluator, + ToolCallCountEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_order_evaluator import ( + ToolCallOrderEvaluator, + ToolCallOrderEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_output_evaluator import ( + ToolCallOutputEvaluator, + ToolCallOutputEvaluatorConfig, ) @@ -23,7 +71,252 @@ class EvaluatorFactory: """Factory class for creating evaluator instances based on configuration.""" @classmethod - def create_evaluator(cls, data: Dict[str, Any]) -> BaseEvaluator[Any]: + def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator: + if data.get("version", None) == "1.0": + return cls._create_evaluator_internal(data) + return cls._create_legacy_evaluator_internal(data) + + @staticmethod + def _create_evaluator_internal( + data: Dict[str, Any], + ) -> BaseEvaluator[Any, Any, Any]: + # check custom evaluator + evaluator_schema = data.get("evaluatorSchema", "") + success, file_path, class_name = try_extract_file_and_class_name( + evaluator_schema + ) + if success: + return EvaluatorFactory._create_coded_evaluator_internal( + data, file_path, class_name + ) + + # use built-in evaluators + config: BaseEvaluatorConfig[Any] = TypeAdapter(EvaluatorConfig).validate_python( + data + ) + match config: + case ContainsEvaluatorConfig(): + return EvaluatorFactory._create_contains_evaluator(data) + case ExactMatchEvaluatorConfig(): + return EvaluatorFactory._create_exact_match_evaluator(data) + case JsonSimilarityEvaluatorConfig(): + return EvaluatorFactory._create_json_similarity_evaluator(data) + case LLMJudgeOutputEvaluatorConfig(): + return EvaluatorFactory._create_llm_judge_output_evaluator(data) + case 
LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig(): + return EvaluatorFactory._create_llm_judge_strict_json_similarity_output_evaluator( + data + ) + case LLMJudgeTrajectoryEvaluatorConfig(): + return EvaluatorFactory._create_trajectory_evaluator(data) + case ToolCallArgsEvaluatorConfig(): + return EvaluatorFactory._create_tool_call_args_evaluator(data) + case ToolCallCountEvaluatorConfig(): + return EvaluatorFactory._create_tool_call_count_evaluator(data) + case ToolCallOrderEvaluatorConfig(): + return EvaluatorFactory._create_tool_call_order_evaluator(data) + case ToolCallOutputEvaluatorConfig(): + return EvaluatorFactory._create_tool_call_output_evaluator(data) + case LLMJudgeTrajectorySimulationEvaluatorConfig(): + return ( + EvaluatorFactory._create_llm_judge_simulation_trajectory_evaluator( + data + ) + ) + case _: + raise ValueError(f"Unknown evaluator configuration: {config}") + + @staticmethod + def _create_contains_evaluator(data: Dict[str, Any]) -> ContainsEvaluator: + evaluator_id = data.get("id") + if not evaluator_id or not isinstance(evaluator_id, str): + raise ValueError("Evaluator 'id' must be a non-empty string") + return ContainsEvaluator( + id=evaluator_id, + config=data.get("evaluatorConfig"), + ) # type: ignore + + @staticmethod + def _create_coded_evaluator_internal( + data: Dict[str, Any], file_path_str: str, class_name: str + ) -> BaseEvaluator[Any, Any, Any]: + """Create a coded evaluator by dynamically loading it from a Python file. + + Args: + data: Dictionary containing the evaluator configuration (id, evaluatorConfig, ...) + file_path_str: Path to the evaluator's Python file, extracted from its + "file://path/to/file.py:ClassName" evaluatorSchema reference + class_name: Name of the evaluator class to load from that file + + Returns: + Instance of the dynamically loaded evaluator class + + Raises: + ValueError: If file or class cannot be loaded, or if the class is not a BaseEvaluator subclass + """ + file_path = Path(file_path_str) + if not file_path.is_absolute(): + if not file_path.exists(): + file_path = ( + Path.cwd() / "evals" / "evaluators" / "custom" / file_path_str + ) + + if not file_path.exists(): + raise ValueError( + f"Evaluator file not found: {file_path}. " + f"Make sure the file exists in evals/evaluators/custom/" + ) + + module_name = f"_custom_evaluator_{file_path.stem}_{id(data)}" + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None or spec.loader is None: + raise ValueError(f"Could not load module from {file_path}") + + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + try: + spec.loader.exec_module(module) + except Exception as e: + raise ValueError( + f"Error executing module from {file_path}: {str(e)}" + ) from e + + # Get the class from the module + if not hasattr(module, class_name): + raise ValueError( + f"Class '{class_name}' not found in {file_path}. 
" + f"Available classes: {[name for name in dir(module) if not name.startswith('_')]}" + ) + + evaluator_class = getattr(module, class_name) + + if not isinstance(evaluator_class, type) or not issubclass( + evaluator_class, BaseEvaluator + ): + raise ValueError( + f"Class '{class_name}' must be a subclass of BaseEvaluator" + ) + + evaluator_id = data.get("id") + if not evaluator_id or not isinstance(evaluator_id, str): + raise ValueError("Evaluator 'id' must be a non-empty string") + return evaluator_class( + id=evaluator_id, + config=data.get("evaluatorConfig", {}), + ) # type: ignore + + @staticmethod + def _create_exact_match_evaluator( + data: Dict[str, Any], + ) -> ExactMatchEvaluator: + return TypeAdapter(ExactMatchEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_json_similarity_evaluator( + data: Dict[str, Any], + ) -> JsonSimilarityEvaluator: + return TypeAdapter(JsonSimilarityEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_llm_judge_output_evaluator( + data: Dict[str, Any], + ) -> LLMJudgeOutputEvaluator: + return TypeAdapter(LLMJudgeOutputEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_llm_judge_strict_json_similarity_output_evaluator( + data: Dict[str, Any], + ) -> LLMJudgeStrictJSONSimilarityOutputEvaluator: + return TypeAdapter(LLMJudgeStrictJSONSimilarityOutputEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_trajectory_evaluator( + data: Dict[str, Any], + ) -> LLMJudgeTrajectoryEvaluator: + return TypeAdapter(LLMJudgeTrajectoryEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_tool_call_args_evaluator( + data: Dict[str, Any], + ) -> ToolCallArgsEvaluator: + return TypeAdapter(ToolCallArgsEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_tool_call_count_evaluator( + data: Dict[str, Any], + ) -> ToolCallCountEvaluator: + return TypeAdapter(ToolCallCountEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_tool_call_order_evaluator( + data: Dict[str, Any], + ) -> ToolCallOrderEvaluator: + return TypeAdapter(ToolCallOrderEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_tool_call_output_evaluator( + data: Dict[str, Any], + ) -> ToolCallOutputEvaluator: + return TypeAdapter(ToolCallOutputEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_llm_judge_simulation_trajectory_evaluator( + data: Dict[str, Any], + ) -> LLMJudgeTrajectorySimulationEvaluator: + return TypeAdapter(LLMJudgeTrajectorySimulationEvaluator).validate_python( + { + "id": data.get("id"), + "config": data.get("evaluatorConfig"), + } + ) + + @staticmethod + def _create_legacy_evaluator_internal( + data: Dict[str, Any], + ) -> LegacyBaseEvaluator[Any]: """Create an evaluator instance from configuration data. 
Args: @@ -35,46 +328,38 @@ def create_evaluator(cls, data: Dict[str, Any]) -> BaseEvaluator[Any]: Raises: ValueError: If category is unknown or required fields are missing """ - # Extract common fields - name = data.get("name", "") - if not name: - raise ValueError("Evaluator configuration must include 'name' field") - id = data.get("id", "") - if not id: - raise ValueError("Evaluator configuration must include 'id' field") - - params: EvaluatorBaseParams = TypeAdapter(Evaluator).validate_python(data) + params: EvaluatorBaseParams = TypeAdapter(LegacyEvaluator).validate_python(data) match params: case EqualsEvaluatorParams(): - return EvaluatorFactory._create_exact_match_evaluator(params) + return EvaluatorFactory._create_legacy_exact_match_evaluator(params) case JsonSimilarityEvaluatorParams(): - return EvaluatorFactory._create_json_similarity_evaluator(params) + return EvaluatorFactory._create_legacy_json_similarity_evaluator(params) case LLMEvaluatorParams(): - return EvaluatorFactory._create_llm_as_judge_evaluator(params) + return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(params) case TrajectoryEvaluatorParams(): - return EvaluatorFactory._create_trajectory_evaluator(params) + return EvaluatorFactory._create_legacy_trajectory_evaluator(params) case _: raise ValueError(f"Unknown evaluator category: {params}") @staticmethod - def _create_exact_match_evaluator( + def _create_legacy_exact_match_evaluator( params: EqualsEvaluatorParams, - ) -> ExactMatchEvaluator: + ) -> LegacyExactMatchEvaluator: """Create a deterministic evaluator.""" - return ExactMatchEvaluator(**params.model_dump()) + return LegacyExactMatchEvaluator(**params.model_dump()) @staticmethod - def _create_json_similarity_evaluator( + def _create_legacy_json_similarity_evaluator( params: JsonSimilarityEvaluatorParams, - ) -> JsonSimilarityEvaluator: + ) -> LegacyJsonSimilarityEvaluator: """Create a deterministic evaluator.""" - return JsonSimilarityEvaluator(**params.model_dump()) + return LegacyJsonSimilarityEvaluator(**params.model_dump()) @staticmethod - def _create_llm_as_judge_evaluator( + def _create_legacy_llm_as_judge_evaluator( params: LLMEvaluatorParams, - ) -> LlmAsAJudgeEvaluator: + ) -> LegacyLlmAsAJudgeEvaluator: """Create an LLM-as-a-judge evaluator.""" if not params.prompt: raise ValueError("LLM evaluator must include 'prompt' field") @@ -86,12 +371,12 @@ def _create_llm_as_judge_evaluator( "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator." ) - return LlmAsAJudgeEvaluator(**params.model_dump()) + return LegacyLlmAsAJudgeEvaluator(**params.model_dump()) @staticmethod - def _create_trajectory_evaluator( + def _create_legacy_trajectory_evaluator( params: TrajectoryEvaluatorParams, - ) -> TrajectoryEvaluator: + ) -> LegacyTrajectoryEvaluator: """Create a trajectory evaluator.""" if not params.prompt: raise ValueError("Trajectory evaluator must include 'prompt' field") @@ -103,4 +388,4 @@ def _create_trajectory_evaluator( "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator." 
) - return TrajectoryEvaluator(**params.model_dump()) + return LegacyTrajectoryEvaluator(**params.model_dump()) diff --git a/src/uipath/_cli/_evals/_helpers.py b/src/uipath/_cli/_evals/_helpers.py new file mode 100644 index 000000000..8f2a2cff0 --- /dev/null +++ b/src/uipath/_cli/_evals/_helpers.py @@ -0,0 +1,194 @@ +# type: ignore +import ast +import importlib.util +import json +import logging +import re +import sys +from pathlib import Path +from typing import Any, Optional + +import click + +from uipath._cli._utils._console import ConsoleLogger +from uipath._utils.constants import CUSTOM_EVALUATOR_PREFIX + +logger = logging.getLogger(__name__) +console = ConsoleLogger().get_instance() + + +def try_extract_file_and_class_name(text: str) -> tuple[bool, str, str]: + if text.startswith(CUSTOM_EVALUATOR_PREFIX): + file_and_class = text[len(CUSTOM_EVALUATOR_PREFIX) :] + if ":" not in file_and_class: + raise ValueError( + f"evaluatorSchema must include class name after ':' - got: {text}" + ) + file_path_str, class_name = file_and_class.rsplit(":", 1) + + return True, file_path_str, class_name + return False, "", "" + + +def to_kebab_case(text: str) -> str: + return re.sub(r"(?<!^)(?=[A-Z])", "-", text).lower() + + +def find_evaluator_file(filename: str) -> Optional[Path]: + """Find the evaluator file in evals/evaluators/custom folder.""" + custom_evaluators_path = Path.cwd() / "evals" / "evaluators" / "custom" + + if not custom_evaluators_path.exists(): + return None + + file_path = custom_evaluators_path / filename + if file_path.exists(): + return file_path + + return None + + +def find_base_evaluator_class(file_path: Path) -> Optional[str]: + """Parse the Python file and find the class that inherits from BaseEvaluator.""" + try: + with open(file_path, "r") as f: + tree = ast.parse(f.read(), filename=str(file_path)) + + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + for base in node.bases: + if isinstance(base, ast.Name) and base.id == "BaseEvaluator": + return node.name + elif isinstance(base, ast.Subscript): + if ( + isinstance(base.value, ast.Name) + and base.value.id == "BaseEvaluator" + ): + return node.name + + return None + except Exception as e: + logger.error(f"Error parsing file: {e}") + return None + + +def load_evaluator_class(file_path: Path, class_name: str) -> Optional[type]: + """Dynamically load the evaluator class from the file.""" + try: + parent_dir = str(file_path.parent) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + + spec = importlib.util.spec_from_file_location("custom_evaluator", file_path) + if spec is None or spec.loader is None: + return None + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + if hasattr(module, class_name): + return getattr(module, class_name) + + return None + except Exception as e: + logger.error(f"Error loading class: {e}") + return None + finally: + # Remove from sys.path + if parent_dir in sys.path: + sys.path.remove(parent_dir) + + +def generate_evaluator_config(evaluator_class: type, class_name: str) -> dict[str, Any]: + """Generate the evaluator config from the class.""" + try: + config_type = evaluator_class._extract_config_type() + config_instance = config_type() + config_dict = config_instance.model_dump(by_alias=True, exclude_none=False) + + return config_dict + except Exception as e: + console.error(f"Error inferring evaluator config: {e}") + + +def register_evaluator(filename: str) -> tuple[str, str]: + """Infers the schema and types of a custom evaluator. 
+ + Returns: + tuple[str, str]: + - The first string is the path to the python evaluator file. + - The second string is the evaluator type that corresponds to the schema file. + """ + if not filename.endswith(".py"): + filename = filename + ".py" + file_path = find_evaluator_file(filename) + if file_path is None: + console.error(f"Could not find '{filename}' in evals/evaluators/custom folder") + + relative_path = f"evals/evaluators/custom/{filename}" + console.info( + f"Found custom evaluator file: {click.style(relative_path, fg='cyan')}" + ) + + class_name = find_base_evaluator_class(file_path) + if class_name is None: + console.error( + f"Could not find a class inheriting from BaseEvaluator in {filename}" + ) + + console.info(f"Found custom evaluator class: {click.style(class_name, fg='cyan')}") + + evaluator_class = load_evaluator_class(file_path, class_name) + if evaluator_class is None: + console.error(f"Could not load class {class_name} from {filename}") + + try: + evaluator_id = evaluator_class.get_evaluator_id() + except Exception as e: + console.error(f"Error getting evaluator ID: {e}") + + evaluator_config = generate_evaluator_config(evaluator_class, class_name) + evaluator_json_type = evaluator_class.generate_json_type() + + evaluators_dir = Path.cwd() / "evals" / "evaluators" + evaluators_dir.mkdir(parents=True, exist_ok=True) + + evaluator_types_dir = evaluators_dir / "custom" / "types" + evaluator_types_dir.mkdir(parents=True, exist_ok=True) + + kebab_class_name = to_kebab_case(class_name) + output_file_evaluator_types = kebab_class_name + "-types.json" + evaluator_types_output_path = ( + evaluators_dir / "custom" / "types" / output_file_evaluator_types + ) + + with open(evaluator_types_output_path, "w") as f: + json.dump(evaluator_json_type, f, indent=2) + + relative_output_path = ( + f"evals/evaluators/custom/types/{output_file_evaluator_types}" + ) + console.success( + f"Generated evaluator types: {click.style(relative_output_path, fg='cyan')}" + ) + + output = { + "version": "1.0", + "id": evaluator_id, + "evaluatorTypeId": f"{CUSTOM_EVALUATOR_PREFIX}types/{output_file_evaluator_types}", + "evaluatorSchema": f"{CUSTOM_EVALUATOR_PREFIX}{filename}:{class_name}", + "description": evaluator_class.__doc__, + "evaluatorConfig": evaluator_config, + } + + output_file_evaluator_spec = kebab_class_name + ".json" + evaluator_spec_output_path = evaluators_dir / output_file_evaluator_spec + with open(evaluator_spec_output_path, "w") as f: + json.dump(output, f, indent=2) + + relative_output_path = f"evals/evaluators/{output_file_evaluator_spec}" + console.success( + f"Generated evaluator spec: {click.style(relative_output_path, fg='cyan')}" + ) + + return str(file_path), str(evaluator_types_output_path) diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py index 40dbccbda..239503b24 100644 --- a/src/uipath/_cli/_evals/_models/_evaluation_set.py +++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py @@ -1,9 +1,11 @@ from enum import Enum, IntEnum from typing import Annotated, Any, Dict, List, Literal, Optional, Union -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag from pydantic.alias_generators import to_camel +from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator + class EvaluationSimulationTool(BaseModel): name: str = Field(..., alias="name") @@ -103,6 +105,27 @@ class UnknownMockingStrategy(BaseMockingStrategy): class 
EvaluationItem(BaseModel): """Individual evaluation item within an evaluation set.""" + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + id: str + name: str + inputs: Dict[str, Any] + evaluation_criterias: dict[str, dict[str, Any] | None] = Field( + ..., alias="evaluationCriterias" + ) + expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior") + mocking_strategy: Optional[MockingStrategy] = Field( + default=None, + alias="mockingStrategy", + ) + input_mocking_strategy: Optional[InputMockingStrategy] = Field( + default=None, + alias="inputMockingStrategy", + ) + + +class LegacyEvaluationItem(BaseModel): + """Individual evaluation item within an evaluation set.""" + model_config = ConfigDict( alias_generator=to_camel, populate_by_name=True, extra="allow" ) @@ -119,21 +142,41 @@ class EvaluationItem(BaseModel): default=None, alias="mockingStrategy", ) - input_mocking_strategy: Optional[InputMockingStrategy] = Field( - default=None, - alias="inputMockingStrategy", - ) class EvaluationSet(BaseModel): """Complete evaluation set model.""" + model_config = ConfigDict( + alias_generator=to_camel, populate_by_name=True, extra="allow" + ) + + id: str + name: str + version: Literal["1.0"] = "1.0" + evaluator_refs: List[str] = Field(default_factory=list) + evaluations: List[EvaluationItem] = Field(default_factory=list) + + def extract_selected_evals(self, eval_ids) -> None: + selected_evals: list[EvaluationItem] = [] + for evaluation in self.evaluations: + if evaluation.id in eval_ids: + selected_evals.append(evaluation) + eval_ids.remove(evaluation.id) + if len(eval_ids) > 0: + raise ValueError("Unknown evaluation ids: {}".format(eval_ids)) + self.evaluations = selected_evals + + +class LegacyEvaluationSet(BaseModel): + """Complete evaluation set model.""" + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) id: str file_name: str = Field(..., alias="fileName") evaluator_refs: List[str] = Field(default_factory=list) - evaluations: List[EvaluationItem] = Field(default_factory=list) + evaluations: List[LegacyEvaluationItem] = Field(default_factory=list) name: str batch_size: int = Field(10, alias="batchSize") timeout_minutes: int = Field(default=20, alias="timeoutMinutes") @@ -144,7 +187,7 @@ class EvaluationSet(BaseModel): updated_at: str = Field(alias="updatedAt") def extract_selected_evals(self, eval_ids) -> None: - selected_evals: list[EvaluationItem] = [] + selected_evals: list[LegacyEvaluationItem] = [] for evaluation in self.evaluations: if evaluation.id in eval_ids: selected_evals.append(evaluation) @@ -158,3 +201,26 @@ class EvaluationStatus(IntEnum): PENDING = 0 IN_PROGRESS = 1 COMPLETED = 2 + + +def _discriminate_eval_set( + v: Any, +) -> Literal["evaluation_set", "legacy_evaluation_set"]: + """Discriminator function that returns a tag based on version field.""" + if isinstance(v, dict): + version = v.get("version") + if version == "1.0": + return "evaluation_set" + return "legacy_evaluation_set" + + +AnyEvaluationSet = Annotated[ + Union[ + Annotated[EvaluationSet, Tag("evaluation_set")], + Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")], + ], + Discriminator(_discriminate_eval_set), +] + +AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem] +AnyEvaluator = Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]] diff --git a/src/uipath/_cli/_evals/_models/_evaluator.py b/src/uipath/_cli/_evals/_models/_evaluator.py index bdce90990..8da9c66b8 100644 --- 
a/src/uipath/_cli/_evals/_models/_evaluator.py +++ b/src/uipath/_cli/_evals/_models/_evaluator.py @@ -2,7 +2,37 @@ from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag -from uipath.eval.models.models import EvaluatorCategory, EvaluatorType +from uipath.eval.evaluators.base_evaluator import BaseEvaluatorConfig +from uipath.eval.evaluators.contains_evaluator import ContainsEvaluatorConfig +from uipath.eval.evaluators.exact_match_evaluator import ExactMatchEvaluatorConfig +from uipath.eval.evaluators.json_similarity_evaluator import ( + JsonSimilarityEvaluatorConfig, +) +from uipath.eval.evaluators.llm_judge_output_evaluator import ( + LLMJudgeOutputEvaluatorConfig, + LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig, +) +from uipath.eval.evaluators.llm_judge_trajectory_evaluator import ( + LLMJudgeTrajectoryEvaluatorConfig, + LLMJudgeTrajectorySimulationEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_args_evaluator import ( + ToolCallArgsEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_count_evaluator import ( + ToolCallCountEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_order_evaluator import ( + ToolCallOrderEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_output_evaluator import ( + ToolCallOutputEvaluatorConfig, +) +from uipath.eval.models import ( + EvaluatorType, + LegacyEvaluatorCategory, + LegacyEvaluatorType, +) class EvaluatorBaseParams(BaseModel): @@ -11,7 +41,7 @@ class EvaluatorBaseParams(BaseModel): id: str name: str description: str - evaluator_type: EvaluatorType = Field(..., alias="type") + evaluator_type: LegacyEvaluatorType = Field(..., alias="type") created_at: str = Field(..., alias="createdAt") updated_at: str = Field(..., alias="updatedAt") target_output_key: str = Field(..., alias="targetOutputKey") @@ -19,7 +49,9 @@ class EvaluatorBaseParams(BaseModel): class LLMEvaluatorParams(EvaluatorBaseParams): - category: Literal[EvaluatorCategory.LlmAsAJudge] = Field(..., alias="category") + category: Literal[LegacyEvaluatorCategory.LlmAsAJudge] = Field( + ..., alias="category" + ) prompt: str = Field(..., alias="prompt") model: str = Field(..., alias="model") @@ -29,7 +61,7 @@ class LLMEvaluatorParams(EvaluatorBaseParams): class TrajectoryEvaluatorParams(EvaluatorBaseParams): - category: Literal[EvaluatorCategory.Trajectory] = Field(..., alias="category") + category: Literal[LegacyEvaluatorCategory.Trajectory] = Field(..., alias="category") prompt: str = Field(..., alias="prompt") model: str = Field(..., alias="model") @@ -61,15 +93,15 @@ def evaluator_discriminator(data: Any) -> str: category = data.get("category") evaluator_type = data.get("type") match category: - case EvaluatorCategory.LlmAsAJudge: + case LegacyEvaluatorCategory.LlmAsAJudge: return "LLMEvaluatorParams" - case EvaluatorCategory.Trajectory: + case LegacyEvaluatorCategory.Trajectory: return "TrajectoryEvaluatorParams" - case EvaluatorCategory.Deterministic: + case LegacyEvaluatorCategory.Deterministic: match evaluator_type: - case EvaluatorType.Equals: + case LegacyEvaluatorType.Equals: return "EqualsEvaluatorParams" - case EvaluatorType.JsonSimilarity: + case LegacyEvaluatorType.JsonSimilarity: return "JsonSimilarityEvaluatorParams" case _: return "UnknownEvaluatorParams" @@ -104,3 +136,145 @@ def evaluator_discriminator(data: Any) -> str: ], Field(discriminator=Discriminator(evaluator_discriminator)), ] + + +class UnknownEvaluatorConfig(BaseEvaluatorConfig[Any]): + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, 
extra="allow" + ) + + +def legacy_evaluator_discriminator(data: Any) -> str: + if isinstance(data, dict): + category = data.get("category") + evaluator_type = data.get("type") + match category: + case LegacyEvaluatorCategory.LlmAsAJudge: + return "LLMEvaluatorParams" + case LegacyEvaluatorCategory.Trajectory: + return "TrajectoryEvaluatorParams" + case LegacyEvaluatorCategory.Deterministic: + match evaluator_type: + case LegacyEvaluatorType.Equals: + return "EqualsEvaluatorParams" + case LegacyEvaluatorType.JsonSimilarity: + return "JsonSimilarityEvaluatorParams" + case _: + return "UnknownEvaluatorParams" + case _: + return "UnknownEvaluatorParams" + else: + return "UnknownEvaluatorParams" + + +def evaluator_config_discriminator(data: Any) -> str: + if isinstance(data, dict): + evaluator_type_id = data.get("evaluatorTypeId") + match evaluator_type_id: + case EvaluatorType.CONTAINS: + return "ContainsEvaluatorConfig" + case EvaluatorType.EXACT_MATCH: + return "ExactMatchEvaluatorConfig" + case EvaluatorType.JSON_SIMILARITY: + return "JsonSimilarityEvaluatorConfig" + case EvaluatorType.LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY: + return "LLMJudgeOutputEvaluatorConfig" + case EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY: + return "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig" + case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY: + return "LLMJudgeTrajectoryEvaluatorConfig" + case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION: + return "LLMJudgeTrajectorySimulationEvaluatorConfig" + case EvaluatorType.TOOL_CALL_ARGS: + return "ToolCallArgsEvaluatorConfig" + case EvaluatorType.TOOL_CALL_COUNT: + return "ToolCallCountEvaluatorConfig" + case EvaluatorType.TOOL_CALL_ORDER: + return "ToolCallOrderEvaluatorConfig" + case EvaluatorType.TOOL_CALL_OUTPUT: + return "ToolCallOutputEvaluatorConfig" + case _: + return "UnknownEvaluatorConfig" + else: + return "UnknownEvaluatorConfig" + + +LegacyEvaluator = Annotated[ + Union[ + Annotated[ + LLMEvaluatorParams, + Tag("LLMEvaluatorParams"), + ], + Annotated[ + TrajectoryEvaluatorParams, + Tag("TrajectoryEvaluatorParams"), + ], + Annotated[ + EqualsEvaluatorParams, + Tag("EqualsEvaluatorParams"), + ], + Annotated[ + JsonSimilarityEvaluatorParams, + Tag("JsonSimilarityEvaluatorParams"), + ], + Annotated[ + UnknownEvaluatorParams, + Tag("UnknownEvaluatorParams"), + ], + ], + Field(discriminator=Discriminator(legacy_evaluator_discriminator)), +] + +EvaluatorConfig = Annotated[ + Union[ + Annotated[ + ContainsEvaluatorConfig, + Tag("ContainsEvaluatorConfig"), + ], + Annotated[ + ExactMatchEvaluatorConfig, + Tag("ExactMatchEvaluatorConfig"), + ], + Annotated[ + JsonSimilarityEvaluatorConfig, + Tag("JsonSimilarityEvaluatorConfig"), + ], + Annotated[ + LLMJudgeOutputEvaluatorConfig, + Tag("LLMJudgeOutputEvaluatorConfig"), + ], + Annotated[ + LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig, + Tag("LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"), + ], + Annotated[ + LLMJudgeTrajectoryEvaluatorConfig, + Tag("LLMJudgeTrajectoryEvaluatorConfig"), + ], + Annotated[ + ToolCallArgsEvaluatorConfig, + Tag("ToolCallArgsEvaluatorConfig"), + ], + Annotated[ + ToolCallCountEvaluatorConfig, + Tag("ToolCallCountEvaluatorConfig"), + ], + Annotated[ + ToolCallOrderEvaluatorConfig, + Tag("ToolCallOrderEvaluatorConfig"), + ], + Annotated[ + ToolCallOutputEvaluatorConfig, + Tag("ToolCallOutputEvaluatorConfig"), + ], + Annotated[ + LLMJudgeTrajectorySimulationEvaluatorConfig, + Tag("LLMJudgeTrajectorySimulationEvaluatorConfig"), + ], + Annotated[ + UnknownEvaluatorConfig, 
+ Tag("UnknownEvaluatorConfig"), + ], + ], + Field(discriminator=Discriminator(evaluator_config_discriminator)), +] diff --git a/src/uipath/_cli/_evals/_models/_evaluator_base_params.py b/src/uipath/_cli/_evals/_models/_evaluator_base_params.py index bc478384b..b4e578b9b 100644 --- a/src/uipath/_cli/_evals/_models/_evaluator_base_params.py +++ b/src/uipath/_cli/_evals/_models/_evaluator_base_params.py @@ -1,14 +1,14 @@ from pydantic import BaseModel -from uipath.eval.models.models import EvaluatorCategory, EvaluatorType +from uipath.eval.models.models import LegacyEvaluatorCategory, LegacyEvaluatorType class EvaluatorBaseParams(BaseModel): """Parameters for initializing the base evaluator.""" id: str - category: EvaluatorCategory - evaluator_type: EvaluatorType + category: LegacyEvaluatorCategory + evaluator_type: LegacyEvaluatorType name: str description: str created_at: str diff --git a/src/uipath/_cli/_evals/_models/_output.py b/src/uipath/_cli/_evals/_models/_output.py index ed4f01795..18f526173 100644 --- a/src/uipath/_cli/_evals/_models/_output.py +++ b/src/uipath/_cli/_evals/_models/_output.py @@ -1,9 +1,11 @@ import logging -from typing import List, Optional +from collections import defaultdict +from typing import Any, Dict, List, Optional from opentelemetry.sdk.trace import ReadableSpan from pydantic import BaseModel, ConfigDict, model_serializer from pydantic.alias_generators import to_camel +from pydantic_core import core_schema from uipath._cli._runtime._contracts import UiPathRuntimeResult from uipath.eval.models.models import EvaluationResult, ScoreType @@ -24,11 +26,15 @@ class EvaluationResultDto(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) score: float - details: Optional[str] = None + details: Optional[str | BaseModel] = None evaluation_time: Optional[float] = None @model_serializer(mode="wrap") - def serialize_model(self, serializer, info): + def serialize_model( + self, + serializer: core_schema.SerializerFunctionWrapHandler, + info: core_schema.SerializationInfo, + ) -> Any: data = serializer(self) if self.details is None and isinstance(data, dict): data.pop("details", None) @@ -101,3 +107,81 @@ def score(self) -> float: eval_result.score for eval_result in self.evaluation_set_results ] return sum(eval_item_scores) / len(eval_item_scores) + + def calculate_final_score( + self, + evaluator_weights: Dict[str, float] | None = None, + default_weight: float = 1.0, + ) -> tuple[float, Dict[str, float]]: + """Aggregate evaluation results with deduplication and weighted scoring. + + This function performs the following steps: + 1. Flattens the nested evaluation_set_results structure + 2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates) + 3. Calculates average score per evaluator across all datapoints + 4. 
Computes final weighted score across evaluators + + Args: + evaluator_weights: Optional dict mapping evaluator names to weights + default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0) + + Returns: + Tuple of (final_score, agg_metrics_per_evaluator) + - final_score: Weighted average across evaluators + - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores + """ + if not self.evaluation_set_results: + return 0.0, {} + + if evaluator_weights is None: + evaluator_weights = {} + + # Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication + # datapoint_id = evaluation_name, evaluator_name from EvaluationRunResultDto + grouped_by_datapoint_evaluator: defaultdict[ + str, defaultdict[str, list[float]] + ] = defaultdict(lambda: defaultdict(list)) + + for eval_run_result in self.evaluation_set_results: + datapoint_id = eval_run_result.evaluation_name + for eval_run_result_dto in eval_run_result.evaluation_run_results: + evaluator_name = eval_run_result_dto.evaluator_name + score = eval_run_result_dto.result.score + grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append( + score + ) + + # Step 2: Deduplicate by averaging same evaluator results for same datapoint + dedup_scores: list[tuple[str, str, float]] = [] + for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items(): + for evaluator_name, scores_list in evaluators_dict.items(): + if scores_list: + # Average the scores for this evaluator on this datapoint + avg_score = sum(scores_list) / len(scores_list) + dedup_scores.append((datapoint_id, evaluator_name, avg_score)) + + # Step 3: Group by evaluator and calculate average score per evaluator + grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list) + for _datapoint_id, evaluator_name, score in dedup_scores: + grouped_by_evaluator[evaluator_name].append(score) + + agg_metrics_per_evaluator = {} + for evaluator_name, scores_list in grouped_by_evaluator.items(): + avg_score = sum(scores_list) / len(scores_list) + agg_metrics_per_evaluator[evaluator_name] = avg_score + + # Step 4: Calculate final weighted score + if not agg_metrics_per_evaluator: + return 0.0, {} + + total_weighted_score = 0.0 + total_weight = 0.0 + + for evaluator_name, avg_score in agg_metrics_per_evaluator.items(): + weight = evaluator_weights.get(evaluator_name, default_weight) + total_weighted_score += avg_score * weight + total_weight += weight + + final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0 + + return final_score, agg_metrics_per_evaluator diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index 062dff6b4..ea84a8a94 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -5,12 +5,18 @@ import logging import os from typing import Any, Dict, List +from urllib.parse import urlparse from opentelemetry import trace from rich.console import Console from uipath import UiPath -from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationStatus +from uipath._cli._evals._models._evaluation_set import ( + AnyEvaluationItem, + AnyEvaluator, + EvaluationItem, + EvaluationStatus, +) from uipath._cli._evals._models._sw_reporting import ( StudioWebAgentSnapshot, StudioWebProgressItem, @@ -28,8 +34,12 @@ EvaluationEvents, ) from uipath._utils import Endpoint, RequestSpec -from uipath._utils.constants import ENV_TENANT_ID, 
HEADER_INTERNAL_TENANT_ID -from uipath.eval.evaluators import BaseEvaluator +from uipath._utils.constants import ( + ENV_EVAL_BACKEND_URL, + ENV_TENANT_ID, + HEADER_INTERNAL_TENANT_ID, +) +from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator from uipath.eval.models import EvalItemResult, ScoreType from uipath.tracing import LlmOpsHttpExporter @@ -65,7 +75,10 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter): logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL) console_logger = ConsoleLogger.get_instance() - uipath = UiPath() + + # Use UIPATH_EVAL_BACKEND_URL for eval-specific routing if set + eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL) + uipath = UiPath(base_url=eval_backend_url) if eval_backend_url else UiPath() self._client = uipath.api_client self._console = console_logger @@ -80,18 +93,128 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter): self.evaluators: Dict[str, Any] = {} self.evaluator_scores: Dict[str, List[float]] = {} self.eval_run_ids: Dict[str, str] = {} + self.is_coded_eval: Dict[str, bool] = {} # Track coded vs legacy per execution + self.eval_spans: Dict[ + str, list[Any] + ] = {} # Store spans per execution for usage metrics def _format_error_message(self, error: Exception, context: str) -> None: """Helper method to format and display error messages consistently.""" self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]") + def _is_localhost(self) -> bool: + """Check if the eval backend URL is localhost. + + Returns: + True if using localhost, False otherwise. + """ + eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL, "") + if eval_backend_url: + try: + parsed = urlparse(eval_backend_url) + hostname = parsed.hostname or parsed.netloc.split(":")[0] + return hostname.lower() in ("localhost", "127.0.0.1") + except Exception: + pass + return False + + def _get_endpoint_prefix(self) -> str: + """Determine the endpoint prefix based on environment. + + Checks UIPATH_EVAL_BACKEND_URL environment variable: + - If set to localhost/127.0.0.1: returns "api/" (direct API access) + - Otherwise: returns "agentsruntime_/api/" (service routing for alpha/prod) + + Returns: + "api/" for localhost environments, "agentsruntime_/api/" for alpha/production. + """ + if self._is_localhost(): + return "api/" + return "agentsruntime_/api/" + + def _is_coded_evaluator(self, evaluators: List[AnyEvaluator]) -> bool: + """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator). + + Args: + evaluators: List of evaluators to check + + Returns: + True if using coded evaluators, False for legacy evaluators + """ + if not evaluators: + return False + # Check the first evaluator type + return isinstance(evaluators[0], BaseEvaluator) + + def _extract_usage_from_spans( + self, spans: list[Any] + ) -> dict[str, int | float | None]: + """Extract token usage and cost from OpenTelemetry spans. 
+ + Args: + spans: List of ReadableSpan objects from agent execution + + Returns: + Dictionary with tokens, completionTokens, promptTokens, and cost + """ + total_tokens = 0 + completion_tokens = 0 + prompt_tokens = 0 + total_cost = 0.0 + + for span in spans: + try: + # Handle both dictionary attributes and string Attributes field + attrs = None + if hasattr(span, "attributes") and span.attributes: + if isinstance(span.attributes, dict): + attrs = span.attributes + elif isinstance(span.attributes, str): + # Parse JSON string attributes + attrs = json.loads(span.attributes) + + # Also check for Attributes field (capitalized) from backend spans + if not attrs and hasattr(span, "Attributes") and span.Attributes: + if isinstance(span.Attributes, str): + attrs = json.loads(span.Attributes) + elif isinstance(span.Attributes, dict): + attrs = span.Attributes + + if attrs: + # Try to get usage from nested usage object (backend format) + if "usage" in attrs and isinstance(attrs["usage"], dict): + usage = attrs["usage"] + prompt_tokens += usage.get("promptTokens", 0) + completion_tokens += usage.get("completionTokens", 0) + total_tokens += usage.get("totalTokens", 0) + # Cost might be in usage or at root level + total_cost += usage.get("cost", 0.0) + + # Also try OpenTelemetry semantic conventions (SDK format) + prompt_tokens += attrs.get("gen_ai.usage.prompt_tokens", 0) + completion_tokens += attrs.get("gen_ai.usage.completion_tokens", 0) + total_tokens += attrs.get("gen_ai.usage.total_tokens", 0) + total_cost += attrs.get("gen_ai.usage.cost", 0.0) + total_cost += attrs.get("llm.usage.cost", 0.0) + + except (json.JSONDecodeError, AttributeError, TypeError) as e: + logger.debug(f"Failed to parse span attributes: {e}") + continue + + return { + "tokens": total_tokens if total_tokens > 0 else None, + "completionTokens": completion_tokens if completion_tokens > 0 else None, + "promptTokens": prompt_tokens if prompt_tokens > 0 else None, + "cost": total_cost if total_cost > 0 else None, + } + @gracefully_handle_errors async def create_eval_set_run( self, eval_set_id: str, agent_snapshot: StudioWebAgentSnapshot, no_of_evals: int, - evaluators: List[BaseEvaluator[Any]], + evaluators: List[LegacyBaseEvaluator[Any]], ) -> str: """Create a new evaluation set run in StudioWeb.""" spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals) @@ -101,13 +224,14 @@ async def create_eval_set_run( params=spec.params, json=spec.json, headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", ) eval_set_run_id = json.loads(response.content)["id"] return eval_set_run_id @gracefully_handle_errors async def create_eval_run( - self, eval_item: EvaluationItem, eval_set_run_id: str + self, eval_item: AnyEvaluationItem, eval_set_run_id: str ) -> str: """Create a new evaluation run in StudioWeb. 
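The backend-routing change above can be summarized with a small sketch (illustrative only, not part of the patch; `endpoint_prefix` and the URLs below are made-up examples, and the real code additionally falls back to `netloc` parsing): when `UIPATH_EVAL_BACKEND_URL` points at localhost, the reporter calls the API directly and scopes requests to the org; otherwise it routes through the `agentsruntime_` service with tenant scoping.

```
from urllib.parse import urlparse

def endpoint_prefix(eval_backend_url: str) -> str:
    # Simplified mirror of _is_localhost() / _get_endpoint_prefix():
    # localhost targets hit the API directly, everything else goes
    # through the agentsruntime_ service route.
    hostname = (urlparse(eval_backend_url).hostname or "").lower()
    return "api/" if hostname in ("localhost", "127.0.0.1") else "agentsruntime_/api/"

assert endpoint_prefix("http://localhost:8080") == "api/"
assert endpoint_prefix("https://cloud.example.com") == "agentsruntime_/api/"
```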
@@ -125,6 +249,7 @@ async def create_eval_run( params=spec.params, json=spec.json, headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", ) return json.loads(response.content)["id"] @@ -132,25 +257,53 @@ async def create_eval_run( async def update_eval_run( self, sw_progress_item: StudioWebProgressItem, - evaluators: dict[str, BaseEvaluator[Any]], + evaluators: dict[str, AnyEvaluator], + is_coded: bool = False, + spans: list[Any] | None = None, ): """Update an evaluation run with results.""" - assertion_runs, evaluator_scores = self._collect_results( - sw_progress_item.eval_results, evaluators + coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {} + legacy_evaluators: dict[str, LegacyBaseEvaluator[Any]] = {} + evaluator_runs: list[dict[str, Any]] = [] + evaluator_scores: list[dict[str, Any]] = [] + + for k, v in evaluators.items(): + if isinstance(v, BaseEvaluator): + coded_evaluators[k] = v + elif isinstance(v, LegacyBaseEvaluator): + legacy_evaluators[k] = v + + # Use coded evaluator format + runs, scores = self._collect_coded_results( + sw_progress_item.eval_results, coded_evaluators, spans or [] + ) + evaluator_runs.extend(runs) + evaluator_scores.extend(scores) + + # Use legacy evaluator format + runs, scores = self._collect_results( + sw_progress_item.eval_results, + legacy_evaluators, + spans or [], ) + evaluator_runs.extend(runs) + evaluator_scores.extend(scores) + spec = self._update_eval_run_spec( - assertion_runs=assertion_runs, + assertion_runs=evaluator_runs, evaluator_scores=evaluator_scores, eval_run_id=sw_progress_item.eval_run_id, execution_time=sw_progress_item.agent_execution_time, actual_output=sw_progress_item.agent_output, ) + await self._client.request_async( method=spec.method, url=spec.endpoint, params=spec.params, json=spec.json, headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", ) @gracefully_handle_errors @@ -167,6 +320,7 @@ async def update_eval_set_run( params=spec.params, json=spec.json, headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", ) async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None: @@ -174,6 +328,10 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N self.evaluators = {eval.id: eval for eval in payload.evaluators} self.evaluator_scores = {eval.id: [] for eval in payload.evaluators} + # Detect if using coded evaluators and store for this execution + is_coded = self._is_coded_evaluator(payload.evaluators) + self.is_coded_eval[payload.execution_id] = is_coded + eval_set_run_id = await self.create_eval_set_run( eval_set_id=payload.eval_set_id, agent_snapshot=self._extract_agent_snapshot(payload.entrypoint), @@ -185,7 +343,9 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N if current_span.is_recording(): current_span.set_attribute("eval_set_run_id", eval_set_run_id) - logger.debug(f"Created eval set run with ID: {eval_set_run_id}") + logger.debug( + f"Created eval set run with ID: {eval_set_run_id} (coded={is_coded})" + ) except Exception as e: self._format_error_message(e, "StudioWeb create eval set run error") @@ -230,6 +390,12 @@ async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None: eval_run_id = self.eval_run_ids[payload.execution_id] if eval_run_id: + # Get the is_coded flag for this execution + is_coded = self.is_coded_eval.get(payload.execution_id, False) + + # Extract usage metrics from spans + self._extract_usage_from_spans(payload.spans) + 
await self.update_eval_run( StudioWebProgressItem( eval_run_id=eval_run_id, @@ -239,9 +405,13 @@ async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None: agent_execution_time=payload.agent_execution_time, ), self.evaluators, + is_coded=is_coded, + spans=payload.spans, ) - logger.debug(f"Updated eval run with ID: {eval_run_id}") + logger.debug( + f"Updated eval run with ID: {eval_run_id} (coded={is_coded})" + ) except Exception as e: self._format_error_message(e, "StudioWeb reporting error") @@ -306,10 +476,15 @@ def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: def _collect_results( self, eval_results: list[EvalItemResult], - evaluators: dict[str, BaseEvaluator[Any]], + evaluators: dict[str, LegacyBaseEvaluator[Any]], + spans: list[Any], ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: assertion_runs: list[dict[str, Any]] = [] evaluator_scores_list: list[dict[str, Any]] = [] + + # Extract usage metrics from spans + usage_metrics = self._extract_usage_from_spans(spans) + for eval_result in eval_results: evaluator_scores_list.append( { @@ -327,10 +502,10 @@ def _collect_results( "duration": int(eval_result.result.evaluation_time) if eval_result.result.evaluation_time else 0, - "cost": None, - "tokens": 0, - "completionTokens": 0, - "promptTokens": 0, + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, }, "assertionSnapshot": { "assertionType": evaluators[ @@ -344,6 +519,55 @@ def _collect_results( ) return assertion_runs, evaluator_scores_list + def _collect_coded_results( + self, + eval_results: list[EvalItemResult], + evaluators: dict[str, BaseEvaluator[Any, Any, Any]], + spans: list[Any], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results for coded evaluators. + + Returns evaluatorRuns and scores in the format expected by coded eval endpoints. 
+ """ + evaluator_runs: list[dict[str, Any]] = [] + evaluator_scores_list: list[dict[str, Any]] = [] + + # Extract usage metrics from spans + usage_metrics = self._extract_usage_from_spans(spans) + + for eval_result in eval_results: + evaluator_scores_list.append( + { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": eval_result.result.details, + "evaluatorId": eval_result.evaluator_id, + } + ) + evaluator_runs.append( + { + "status": EvaluationStatus.COMPLETED.value, + "evaluatorId": eval_result.evaluator_id, + "result": { + "score": { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + }, + "justification": eval_result.result.details, + }, + "completionMetrics": { + "duration": int(eval_result.result.evaluation_time) + if eval_result.result.evaluation_time + else 0, + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, + }, + } + ) + return evaluator_runs, evaluator_scores_list + def _update_eval_run_spec( self, assertion_runs: list[dict[str, Any]], @@ -355,7 +579,7 @@ def _update_eval_run_spec( return RequestSpec( method="PUT", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun" + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun" ), json={ "evalRunId": eval_run_id, @@ -370,22 +594,58 @@ def _update_eval_run_spec( headers=self._tenant_header(), ) + def _update_coded_eval_run_spec( + self, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + eval_run_id: str, + actual_output: dict[str, Any], + execution_time: float, + ) -> RequestSpec: + """Create update spec for coded evaluators.""" + return RequestSpec( + method="PUT", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun" + ), + json={ + "evalRunId": eval_run_id, + "status": EvaluationStatus.COMPLETED.value, + "result": { + "output": {"content": {**actual_output}}, + "scores": evaluator_scores, + }, + "completionMetrics": {"duration": int(execution_time)}, + "evaluatorRuns": evaluator_runs, + }, + headers=self._tenant_header(), + ) + def _create_eval_run_spec( - self, eval_item: EvaluationItem, eval_set_run_id: str + self, eval_item: AnyEvaluationItem, eval_set_run_id: str ) -> RequestSpec: + # Build eval snapshot based on evaluation item type + eval_snapshot = { + "id": eval_item.id, + "name": eval_item.name, + "inputs": eval_item.inputs, + } + + # For new coded evaluators (EvaluationItem), use evaluationCriterias + # For legacy evaluators (LegacyEvaluationItem), use expectedOutput + if isinstance(eval_item, EvaluationItem): + eval_snapshot["evaluationCriterias"] = eval_item.evaluation_criterias + else: + eval_snapshot["expectedOutput"] = eval_item.expected_output + return RequestSpec( method="POST", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun" + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalRun" ), json={ "evalSetRunId": eval_set_run_id, - "evalSnapshot": { - "id": eval_item.id, - "name": eval_item.name, - "inputs": eval_item.inputs, - "expectedOutput": eval_item.expected_output, - }, + "evalSnapshot": eval_snapshot, "status": EvaluationStatus.IN_PROGRESS.value, }, headers=self._tenant_header(), @@ -400,7 +660,7 @@ def _create_eval_set_run_spec( return RequestSpec( method="POST", 
endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun" + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalSetRun" ), json={ "agentId": self._project_id, @@ -425,7 +685,7 @@ def _update_eval_set_run_spec( return RequestSpec( method="PUT", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun" + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/coded/evalSetRun" ), json={ "evalSetRunId": eval_set_run_id, diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index ebd9899b6..d7c07a867 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -24,7 +24,7 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from ...eval.evaluators import BaseEvaluator +from ...eval.evaluators import BaseEvaluator, LegacyBaseEvaluator from ...eval.models import EvaluationResult from ...eval.models.models import AgentExecution, EvalItemResult from .._runtime._contracts import ( @@ -38,7 +38,13 @@ from .._runtime._logging import ExecutionLogHandler from .._utils._eval_set import EvalHelpers from ._evaluator_factory import EvaluatorFactory -from ._models._evaluation_set import EvaluationItem, EvaluationSet +from ._models._evaluation_set import ( + AnyEvaluationItem, + AnyEvaluationSet, + AnyEvaluator, + EvaluationItem, + LegacyEvaluationItem, +) from ._models._exceptions import EvaluationRuntimeException from ._models._output import ( EvaluationResultDto, @@ -182,7 +188,8 @@ async def execute(self) -> UiPathRuntimeResult: event_bus = self.event_bus - evaluation_set = EvalHelpers.load_eval_set( + # Load eval set (path is already resolved in cli_eval.py) + evaluation_set, _ = EvalHelpers.load_eval_set( self.context.eval_set, self.context.eval_ids ) evaluators = self._load_evaluators(evaluation_set) @@ -215,6 +222,7 @@ async def execute(self) -> UiPathRuntimeResult: evaluation_set_name=evaluation_set.name, evaluation_set_results=eval_run_result_list, ) + # Computing evaluator averages evaluator_averages: Dict[str, float] = defaultdict(float) evaluator_count: Dict[str, int] = defaultdict(int) @@ -245,8 +253,8 @@ async def execute(self) -> UiPathRuntimeResult: async def _execute_sequential( self, - evaluation_set: EvaluationSet, - evaluators: List[BaseEvaluator[Any]], + evaluation_set: AnyEvaluationSet, + evaluators: List[AnyEvaluator], event_bus: EventBus, ) -> List[EvaluationRunResult]: all_eval_run_result: list[EvaluationRunResult] = [] @@ -260,13 +268,13 @@ async def _execute_sequential( async def _execute_parallel( self, - evaluation_set: EvaluationSet, - evaluators: List[BaseEvaluator[Any]], + evaluation_set: AnyEvaluationSet, + evaluators: List[AnyEvaluator], event_bus: EventBus, workers: int, ) -> List[EvaluationRunResult]: # Create a queue with max concurrency - queue: asyncio.Queue[tuple[int, EvaluationItem]] = asyncio.Queue( + queue: asyncio.Queue[tuple[int, AnyEvaluationItem]] = asyncio.Queue( maxsize=workers ) @@ -276,7 +284,7 @@ async def _execute_parallel( # Producer task to fill the queue async def producer() -> None: for index, eval_item in enumerate(evaluation_set.evaluations): - await queue.put((index, eval_item)) + await queue.put((index, eval_item)) # type: ignore[arg-type] # Signal completion by putting None markers for _ in range(workers): await queue.put(None) # type: ignore @@ -318,15 +326,12 @@ async def worker(worker_id: int) -> None: async def _execute_eval( self, - eval_item: EvaluationItem, - evaluators: 
List[BaseEvaluator[Any]], + eval_item: AnyEvaluationItem, + evaluators: List[AnyEvaluator], event_bus: EventBus, ) -> EvaluationRunResult: - # Generate LLM-based input if input_mocking_strategy is defined - if eval_item.input_mocking_strategy: - eval_item = await self._generate_input_for_eval(eval_item) - execution_id = str(uuid.uuid4()) + set_execution_context(eval_item, self.span_collector, execution_id) await event_bus.publish( @@ -346,11 +351,41 @@ async def _execute_eval( evaluation_item_results: list[EvalItemResult] = [] for evaluator in evaluators: - evaluation_result = await self.run_evaluator( - evaluator=evaluator, - execution_output=agent_execution_output, - eval_item=eval_item, - ) + # Determine which evaluator method to use based on evaluation set/item type + evaluation_result: Optional[EvaluationResult] = None + + match eval_item: + case LegacyEvaluationItem(): + # Legacy evaluation - use run_legacy_evaluator + evaluation_result = await self.run_legacy_evaluator( + evaluator=evaluator, # type: ignore + execution_output=agent_execution_output, + eval_item=eval_item, + ) + case EvaluationItem() if ( + evaluator.id in eval_item.evaluation_criterias + ): + # New evaluation with criteria + evaluation_criteria = eval_item.evaluation_criterias[ + evaluator.id + ] + + evaluation_result = await self.run_evaluator( + evaluator=evaluator, # type: ignore + execution_output=agent_execution_output, + eval_item=eval_item, + evaluation_criteria=evaluator.evaluation_criteria_type( # type: ignore + **evaluation_criteria + ) + if evaluation_criteria + else evaluator.evaluator_config.default_evaluation_criteria, # type: ignore + ) + case _: + # Skip if evaluator not in evaluation criteria + continue + + if evaluation_result is None: + continue dto_result = EvaluationResultDto.from_evaluation_result( evaluation_result @@ -449,7 +484,7 @@ def _get_and_clear_execution_data( return spans, logs async def execute_runtime( - self, eval_item: EvaluationItem, execution_id: str + self, eval_item: AnyEvaluationItem, execution_id: str ) -> UiPathEvalRunExecutionOutput: context_args = self.context.model_dump() context_args["execution_id"] = execution_id @@ -486,7 +521,6 @@ async def execute_runtime( if result is None: raise ValueError("Execution result cannot be None for eval runs") - return UiPathEvalRunExecutionOutput( execution_time=end_time - start_time, spans=spans, @@ -501,9 +535,31 @@ def _setup_execution_logging(self, eval_item_id: str) -> ExecutionLogHandler: async def run_evaluator( self, - evaluator: BaseEvaluator[Any], + evaluator: BaseEvaluator[Any, Any, Any], execution_output: UiPathEvalRunExecutionOutput, eval_item: EvaluationItem, + *, + evaluation_criteria: Any, + ) -> EvaluationResult: + agent_execution = AgentExecution( + agent_input=eval_item.inputs, + agent_output=execution_output.result.output or {}, + agent_trace=execution_output.spans, + expected_agent_behavior=eval_item.expected_agent_behavior, + ) + + result = await evaluator.validate_and_evaluate_criteria( + agent_execution=agent_execution, + evaluation_criteria=evaluation_criteria, + ) + + return result + + async def run_legacy_evaluator( + self, + evaluator: LegacyBaseEvaluator[Any], + execution_output: UiPathEvalRunExecutionOutput, + eval_item: LegacyEvaluationItem, ) -> EvaluationResult: agent_execution = AgentExecution( agent_input=eval_item.inputs, @@ -520,9 +576,7 @@ async def run_evaluator( return result - def _load_evaluators( - self, evaluation_set: EvaluationSet - ) -> List[BaseEvaluator[Any]]: + def 
_load_evaluators(self, evaluation_set: AnyEvaluationSet) -> list[AnyEvaluator]: """Load evaluators referenced by the evaluation set.""" evaluators = [] evaluators_dir = Path(self.context.eval_set).parent.parent / "evaluators" # type: ignore diff --git a/src/uipath/_cli/_evals/mocks/input_mocker.py b/src/uipath/_cli/_evals/mocks/input_mocker.py index a7830e824..94d2aeaa6 100644 --- a/src/uipath/_cli/_evals/mocks/input_mocker.py +++ b/src/uipath/_cli/_evals/mocks/input_mocker.py @@ -67,9 +67,7 @@ async def generate_llm_input( if evaluation_item.input_mocking_strategy else "", expected_behavior=evaluation_item.expected_agent_behavior or "", - expected_output=json.dumps(evaluation_item.expected_output, indent=2) - if evaluation_item.expected_output - else "", + expected_output=json.dumps(evaluation_item.evaluation_criterias, indent=2), ) response_format = { diff --git a/src/uipath/_cli/_evals/mocks/llm_mocker.py b/src/uipath/_cli/_evals/mocks/llm_mocker.py index b6e2916cf..43c1c1a3a 100644 --- a/src/uipath/_cli/_evals/mocks/llm_mocker.py +++ b/src/uipath/_cli/_evals/mocks/llm_mocker.py @@ -10,7 +10,7 @@ from uipath.tracing._utils import _SpanUtils from .._models._evaluation_set import ( - EvaluationItem, + AnyEvaluationItem, LLMMockingStrategy, ) from .._models._mocks import ExampleCall @@ -77,7 +77,7 @@ def pydantic_to_dict_safe(obj: Any) -> Any: class LLMMocker(Mocker): """LLM Based Mocker.""" - def __init__(self, evaluation_item: EvaluationItem): + def __init__(self, evaluation_item: AnyEvaluationItem): """LLM Mocker constructor.""" self.evaluation_item = evaluation_item assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy) diff --git a/src/uipath/_cli/_evals/mocks/mocker_factory.py b/src/uipath/_cli/_evals/mocks/mocker_factory.py index a3bdd47cd..5e024f65b 100644 --- a/src/uipath/_cli/_evals/mocks/mocker_factory.py +++ b/src/uipath/_cli/_evals/mocks/mocker_factory.py @@ -1,7 +1,7 @@ """Mocker Factory.""" from uipath._cli._evals._models._evaluation_set import ( - EvaluationItem, + AnyEvaluationItem, LLMMockingStrategy, MockitoMockingStrategy, ) @@ -14,7 +14,7 @@ class MockerFactory: """Mocker factory.""" @staticmethod - def create(evaluation_item: EvaluationItem) -> Mocker: + def create(evaluation_item: AnyEvaluationItem) -> Mocker: """Create a mocker instance.""" match evaluation_item.mocking_strategy: case LLMMockingStrategy(): diff --git a/src/uipath/_cli/_evals/mocks/mockito_mocker.py b/src/uipath/_cli/_evals/mocks/mockito_mocker.py index 2a951f12d..d9d145be1 100644 --- a/src/uipath/_cli/_evals/mocks/mockito_mocker.py +++ b/src/uipath/_cli/_evals/mocks/mockito_mocker.py @@ -9,7 +9,7 @@ from mockito import invocation, mocking # type: ignore[import-untyped] from uipath._cli._evals._models._evaluation_set import ( - EvaluationItem, + AnyEvaluationItem, MockingAnswerType, MockitoMockingStrategy, ) @@ -38,7 +38,7 @@ def func(*_args, **_kwargs): class MockitoMocker(Mocker): """Mockito Mocker.""" - def __init__(self, evaluation_item: EvaluationItem): + def __init__(self, evaluation_item: AnyEvaluationItem): """Instantiate a mockito mocker.""" self.evaluation_item = evaluation_item assert isinstance(self.evaluation_item.mocking_strategy, MockitoMockingStrategy) diff --git a/src/uipath/_cli/_evals/mocks/mocks.py b/src/uipath/_cli/_evals/mocks/mocks.py index 0a34dd151..ad555c8ab 100644 --- a/src/uipath/_cli/_evals/mocks/mocks.py +++ b/src/uipath/_cli/_evals/mocks/mocks.py @@ -4,13 +4,13 @@ from contextvars import ContextVar from typing import Any, Callable, Optional -from 
uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem from uipath._cli._evals._span_collection import ExecutionSpanCollector from uipath._cli._evals.mocks.mocker import Mocker, UiPathNoMockFoundError from uipath._cli._evals.mocks.mocker_factory import MockerFactory # Context variables for evaluation items and mockers -evaluation_context: ContextVar[Optional[EvaluationItem]] = ContextVar( +evaluation_context: ContextVar[Optional[AnyEvaluationItem]] = ContextVar( "evaluation", default=None ) @@ -30,7 +30,9 @@ def set_execution_context( - eval_item: EvaluationItem, span_collector: ExecutionSpanCollector, execution_id: str + eval_item: AnyEvaluationItem, + span_collector: ExecutionSpanCollector, + execution_id: str, ) -> None: """Set the execution context for an evaluation run for mocking and trace access.""" evaluation_context.set(eval_item) diff --git a/src/uipath/_cli/_push/models.py b/src/uipath/_cli/_push/models.py new file mode 100644 index 000000000..764b7799d --- /dev/null +++ b/src/uipath/_cli/_push/models.py @@ -0,0 +1,17 @@ +"""Models for push command.""" + +from pydantic import BaseModel, Field + + +class EvaluatorFileDetails(BaseModel): + """Details about an evaluator file for push operations.""" + + path: str + custom_evaluator_file_name: str = Field( + "", description="Name of the custom evaluator file, if available." + ) + + @property + def is_custom(self) -> bool: + """Check if this is a custom evaluator.""" + return len(self.custom_evaluator_file_name) > 0 diff --git a/src/uipath/_cli/_push/sw_file_handler.py b/src/uipath/_cli/_push/sw_file_handler.py index f8f8a2091..fde9bb543 100644 --- a/src/uipath/_cli/_push/sw_file_handler.py +++ b/src/uipath/_cli/_push/sw_file_handler.py @@ -6,7 +6,14 @@ from datetime import datetime, timezone from typing import Any, AsyncIterator, Dict, Optional, Set +import click + from ...models.exceptions import EnrichedException +from .._evals._helpers import ( # type: ignore + register_evaluator, + try_extract_file_and_class_name, +) +from .._utils._console import ConsoleLogger from .._utils._constants import ( AGENT_INITIAL_CODE_VERSION, AGENT_STORAGE_VERSION, @@ -28,6 +35,7 @@ StructuralMigration, StudioClient, ) +from .models import EvaluatorFileDetails logger = logging.getLogger(__name__) @@ -58,6 +66,7 @@ def __init__( """ self.directory = directory self.include_uv_lock = include_uv_lock + self.console = ConsoleLogger() self._studio_client = StudioClient(project_id) self._project_structure: Optional[ProjectStructure] = None @@ -175,6 +184,7 @@ async def _process_file_uploads( remote_file = source_code_files.get( local_file.relative_path.replace("\\", "/"), None ) + if remote_file: # File exists remotely - mark for update processed_source_files.add(remote_file.id) @@ -185,7 +195,7 @@ async def _process_file_uploads( ) updates.append( FileOperationUpdate( - file_path=local_file.file_name, + file_path=local_file.file_path, status="updating", message=f"Updating '{local_file.file_name}'", ) @@ -203,9 +213,9 @@ async def _process_file_uploads( ) updates.append( FileOperationUpdate( - file_path=local_file.relative_path, + file_path=local_file.file_path, status="uploading", - message=f"Uploading '{local_file.relative_path}'", + message=f"Uploading '{local_file.file_name}'", ) ) @@ -610,3 +620,326 @@ async def upload_source_files( # Yield all updates for update in updates: yield update + + def _extract_evaluator_details(self, file_path: str) -> tuple[bool, str]: + 
"""Return whether an evaluator JSON file has a version property and the custom-evaluator python file (if exists). + + Args: + file_path: Path to the file to check + + Returns: + tuple[bool, str]: A tuple containing: + - A boolean indicating whether the JSON file contains a "version" property. + - The path to the custom-evaluator Python file, if it exists; otherwise, an empty string. + """ + try: + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + _, file_name, _ = try_extract_file_and_class_name( + data.get("evaluatorSchema", "") + ) + return "version" in data, file_name + except (json.JSONDecodeError, FileNotFoundError): + return False, "" + + def _get_coded_evals_files(self) -> tuple[list[EvaluatorFileDetails], list[str]]: + """Get coded-evals files from local evals directory. + + Returns: + Tuple of (evaluator_files, eval_set_files) with version property + """ + evaluator_files: list[EvaluatorFileDetails] = [] + eval_set_files = [] + + # Check {self.directory}/evals/evaluators/ for files with version property + evaluators_dir = os.path.join(self.directory, "evals", "evaluators") + if os.path.exists(evaluators_dir): + for file_name in os.listdir(evaluators_dir): + if file_name.endswith(".json"): + file_path = os.path.join(evaluators_dir, file_name) + version, file_name = self._extract_evaluator_details(file_path) + if version: + evaluator_files.append( + EvaluatorFileDetails( + path=file_path, custom_evaluator_file_name=file_name + ) + ) + + # Check {self.directory}/evals/eval-sets/ for files with version property + eval_sets_dir = os.path.join(self.directory, "evals", "eval-sets") + if os.path.exists(eval_sets_dir): + for file_name in os.listdir(eval_sets_dir): + if file_name.endswith(".json"): + file_path = os.path.join(eval_sets_dir, file_name) + version, _ = self._extract_evaluator_details(file_path) + if version: + eval_set_files.append(file_path) + + return evaluator_files, eval_set_files + + def _get_subfolder_by_name( + self, parent_folder: ProjectFolder, subfolder_name: str + ) -> Optional[ProjectFolder]: + """Get a subfolder from within a parent folder by name. + + Args: + parent_folder: The parent folder to search within + subfolder_name: Name of the subfolder to find + + Returns: + Optional[ProjectFolder]: The found subfolder or None + """ + for folder in parent_folder.folders: + if folder.name == subfolder_name: + return folder + return None + + async def _ensure_coded_evals_structure( + self, structure: ProjectStructure + ) -> ProjectFolder: + """Ensure coded-evals folder structure exists in remote project. 
+ + Args: + structure: Current project structure + + Returns: + ProjectFolder: The coded-evals folder + """ + coded_evals_folder = self._get_folder_by_name(structure, "coded-evals") + + if not coded_evals_folder: + coded_evals_id = await self._studio_client.create_folder_async( + "coded-evals" + ) + self.console.success( + f"Created {click.style('coded-evals', fg='cyan')} folder" + ) + + await self._studio_client.create_folder_async("evaluators", coded_evals_id) + self.console.success( + f"Created {click.style('coded-evals/evaluators', fg='cyan')} folder" + ) + + await self._studio_client.create_folder_async("eval-sets", coded_evals_id) + self.console.success( + f"Created {click.style('coded-evals/eval-sets', fg='cyan')} folder" + ) + + # Refresh structure to get the new folders + structure = await self._studio_client.get_project_structure_async() + coded_evals_folder = self._get_folder_by_name(structure, "coded-evals") + assert coded_evals_folder, "Coded-evals folder uploaded but not found." + + return coded_evals_folder + + def _collect_files_from_folder( + self, folder: Optional[ProjectFolder] + ) -> Dict[str, ProjectFile]: + files: Dict[str, ProjectFile] = {} + if folder: + for file in folder.files: + files[file.name] = file + return files + + def _process_file_sync( + self, + local_file_path: str, + remote_files: Dict[str, ProjectFile], + parent_path: str, + destination_prefix: str, + structural_migration: StructuralMigration, + processed_ids: Set[str], + ) -> None: + """Process a single local file for upload or update to remote. + + Args: + local_file_path: Path to the local file to sync + remote_files: Dictionary of remote files indexed by filename + parent_path: Parent path for new file creation + destination_prefix: Prefix for destination path in console output + structural_migration: Migration object to append resources to + processed_ids: Set to track processed remote file IDs + """ + file_name = os.path.basename(local_file_path) + remote_file = remote_files.get(file_name) + destination = f"{destination_prefix}/{file_name}" + + if remote_file: + processed_ids.add(remote_file.id) + structural_migration.modified_resources.append( + ModifiedResource(id=remote_file.id, content_file_path=local_file_path) + ) + self.console.info(f"Updating {click.style(destination, fg='yellow')}") + else: + structural_migration.added_resources.append( + AddedResource( + content_file_path=local_file_path, parent_path=parent_path + ) + ) + self.console.info(f"Uploading to {click.style(destination, fg='cyan')}") + + def _collect_deleted_remote_files( + self, + remote_files: Dict[str, ProjectFile], + processed_ids: Set[str], + destination_prefix: str, + structural_migration: StructuralMigration, + ) -> None: + """Collect remote files that no longer exist locally for deletion. + + Args: + remote_files: Dictionary of remote files indexed by filename + processed_ids: Set of remote file IDs that were processed + destination_prefix: Prefix for destination path in console output + structural_migration: Migration object to append deleted resources to + """ + for file_name, remote_file in remote_files.items(): + if remote_file.id not in processed_ids: + structural_migration.deleted_resources.append(remote_file.id) + destination = f"{destination_prefix}/{file_name}" + self.console.info( + f"Deleting {click.style(destination, fg='bright_red')}" + ) + + async def upload_coded_evals_files(self) -> None: + """Upload coded-evals files (files with version property) to Studio Web. + + This method: + 1. 
Scans local evals/evaluators and evals/eval-sets for files with version property + 2. Ensures coded-evals folder structure exists in remote project + 3. Uploads the files to coded-evals/evaluators and coded-evals/eval-sets respectively + 4. Deletes remote files that no longer exist locally (consistent with source file behavior) + """ + evaluator_details, eval_set_files = self._get_coded_evals_files() + + structure = await self._studio_client.get_project_structure_async() + coded_evals_folder = self._get_folder_by_name(structure, "coded-evals") + + # If no coded-evals folder exists and no local files, nothing to do + if not coded_evals_folder and not evaluator_details and not eval_set_files: + return + + # Ensure folder structure exists if we have local files + if evaluator_details or eval_set_files: + await self._ensure_coded_evals_structure(structure) + # Refresh structure to get the new folders + structure = await self._studio_client.get_project_structure_async() + coded_evals_folder = self._get_folder_by_name(structure, "coded-evals") + + if not coded_evals_folder: + return # Nothing to sync + + evaluators_folder = self._get_subfolder_by_name( + coded_evals_folder, "evaluators" + ) + if evaluators_folder: + eval_sets_folder = self._get_subfolder_by_name( + coded_evals_folder, "eval-sets" + ) + custom_evaluators_folder = self._get_subfolder_by_name( + evaluators_folder, "custom" + ) + evaluator_types_folder = None + if custom_evaluators_folder: + evaluator_types_folder = self._get_subfolder_by_name( + custom_evaluators_folder, "types" + ) + + remote_evaluator_files = self._collect_files_from_folder(evaluators_folder) + remote_eval_set_files = self._collect_files_from_folder(eval_sets_folder) + remote_custom_evaluator_files = self._collect_files_from_folder( + custom_evaluators_folder + ) + remote_custom_evaluator_type_files = self._collect_files_from_folder( + evaluator_types_folder + ) + + # Create structural migration for coded-evals files + structural_migration = StructuralMigration( + deleted_resources=[], added_resources=[], modified_resources=[] + ) + + processed_evaluator_ids: Set[str] = set() + processed_eval_set_ids: Set[str] = set() + processed_custom_evaluator_ids: Set[str] = set() + processed_evaluator_type_ids: Set[str] = set() + + for evaluator in evaluator_details: + if evaluator.is_custom: + evaluator_schema_file_path, evaluator_types_file_path = ( + register_evaluator(evaluator.custom_evaluator_file_name) + ) + + self._process_file_sync( + evaluator_schema_file_path, + remote_custom_evaluator_files, + "coded-evals/evaluators/custom", + "coded-evals/evaluators/custom", + structural_migration, + processed_custom_evaluator_ids, + ) + + self._process_file_sync( + evaluator_types_file_path, + remote_custom_evaluator_type_files, + "coded-evals/evaluators/custom/types", + "coded-evals/evaluators/custom/types", + structural_migration, + processed_evaluator_type_ids, + ) + + self._process_file_sync( + evaluator.path, + remote_evaluator_files, + "coded-evals/evaluators", + "coded-evals/evaluators", + structural_migration, + processed_evaluator_ids, + ) + + for eval_set_file in eval_set_files: + self._process_file_sync( + eval_set_file, + remote_eval_set_files, + "coded-evals/eval-sets", + "coded-evals/eval-sets", + structural_migration, + processed_eval_set_ids, + ) + + self._collect_deleted_remote_files( + remote_evaluator_files, + processed_evaluator_ids, + "coded-evals/evaluators", + structural_migration, + ) + + self._collect_deleted_remote_files( + remote_eval_set_files, + 
processed_eval_set_ids, + "coded-evals/eval-sets", + structural_migration, + ) + + self._collect_deleted_remote_files( + remote_custom_evaluator_files, + processed_custom_evaluator_ids, + "coded-evals/evaluators/custom", + structural_migration, + ) + + self._collect_deleted_remote_files( + remote_custom_evaluator_type_files, + processed_evaluator_type_ids, + "coded-evals/evaluators/custom/types", + structural_migration, + ) + + if ( + structural_migration.added_resources + or structural_migration.modified_resources + or structural_migration.deleted_resources + ): + await self._studio_client.perform_structural_migration_async( + structural_migration + ) diff --git a/src/uipath/_cli/_templates/custom_evaluator.py.template b/src/uipath/_cli/_templates/custom_evaluator.py.template new file mode 100644 index 000000000..ba723bccc --- /dev/null +++ b/src/uipath/_cli/_templates/custom_evaluator.py.template @@ -0,0 +1,65 @@ +from uipath.eval.evaluators import BaseEvaluator, BaseEvaluationCriteria, BaseEvaluatorConfig +from uipath.eval.models import AgentExecution, EvaluationResult, NumericEvaluationResult, BooleanEvaluationResult, ErrorEvaluationResult + + +class $criteria_class(BaseEvaluationCriteria): + """Evaluation criteria for the $evaluator_name evaluator.""" + + # Define your evaluation criteria fields here + # Example: expected_value: str + pass + + +class $config_class(BaseEvaluatorConfig[$criteria_class]): + """Configuration for the $evaluator_name evaluator.""" + + name: str = "$class_name" + # Set default evaluation criteria if needed + # default_evaluation_criteria: $criteria_class | None = $criteria_class(expected_value="example") + + +class $class_name(BaseEvaluator[$criteria_class, $config_class, type(None)]): + """Description for $class_name""" + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator ID.""" + return "$class_name" + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: $criteria_class + ) -> EvaluationResult: + """Evaluate the agent execution against the criteria. 
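To make the template concrete, the sketch below shows roughly what a filled-in evaluator could look like once the `$criteria_class`/`$config_class`/`$class_name` placeholders are substituted. The `keyword` criteria field and the class names are invented for illustration; only the imports and base classes come from the template itself.

```python
from uipath.eval.evaluators import (
    BaseEvaluationCriteria,
    BaseEvaluator,
    BaseEvaluatorConfig,
)
from uipath.eval.models import AgentExecution, EvaluationResult, NumericEvaluationResult


class KeywordEvaluationCriteria(BaseEvaluationCriteria):
    """Illustrative criteria: a keyword expected somewhere in the agent output."""

    keyword: str


class KeywordEvaluatorConfig(BaseEvaluatorConfig[KeywordEvaluationCriteria]):
    name: str = "KeywordEvaluator"


class KeywordEvaluator(
    BaseEvaluator[KeywordEvaluationCriteria, KeywordEvaluatorConfig, type(None)]
):
    """Scores 1.0 when the keyword appears in the serialized agent output."""

    @classmethod
    def get_evaluator_id(cls) -> str:
        return "KeywordEvaluator"

    async def evaluate(
        self,
        agent_execution: AgentExecution,
        evaluation_criteria: KeywordEvaluationCriteria,
    ) -> EvaluationResult:
        hit = evaluation_criteria.keyword in str(agent_execution.agent_output)
        return NumericEvaluationResult(score=1.0 if hit else 0.0)
```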
+ + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - agent_output: The actual output from the agent + - agent_trace: The execution trace from the agent (list of OpenTelemetry spans) + - simulation_instructions: The simulation instructions for the agent + evaluation_criteria: The criteria to evaluate against + + Returns: + EvaluationResult containing the score and details + """ + + ''' + # TODO: Implement your evaluation logic here + Example: Check if the agent output matches expected criteria + + Access agent execution data: + agent_input = agent_execution.agent_input + agent_output = agent_execution.agent_output + agent_trace = agent_execution.agent_trace + + # Perform your evaluation + score = 0.0 # Replace with your scoring logic + + return NumericEvaluationResult( + score=score, + ) + ''' + + raise NotImplementedError(f"evaluate method not implemented") diff --git a/src/uipath/_cli/_utils/_eval_set.py b/src/uipath/_cli/_utils/_eval_set.py index 9e95d0c71..10c3b9ab3 100644 --- a/src/uipath/_cli/_utils/_eval_set.py +++ b/src/uipath/_cli/_utils/_eval_set.py @@ -3,8 +3,9 @@ from typing import List, Optional import click +from pydantic import TypeAdapter, ValidationError -from uipath._cli._evals._models._evaluation_set import EvaluationSet +from uipath._cli._evals._models._evaluation_set import AnyEvaluationSet from uipath._cli._utils._console import ConsoleLogger console = ConsoleLogger() @@ -57,28 +58,48 @@ def auto_discover_eval_set() -> str: @staticmethod def load_eval_set( eval_set_path: str, eval_ids: Optional[List[str]] = None - ) -> EvaluationSet: + ) -> tuple[AnyEvaluationSet, str]: """Load the evaluation set from file. + Args: + eval_set_path: Path to the evaluation set file + eval_ids: Optional list of evaluation IDs to filter + Returns: - The loaded evaluation set as EvaluationSet model + Tuple of (AnyEvaluationSet, resolved_path) """ + # If the file doesn't exist at the given path, try looking in evals/eval-sets/ + resolved_path = eval_set_path + if not Path(eval_set_path).exists(): + # Check if it's just a filename, then search in evals/eval-sets/ + if Path(eval_set_path).name == eval_set_path: + eval_sets_path = Path("evals/eval-sets") / eval_set_path + if eval_sets_path.exists(): + resolved_path = str(eval_sets_path) + try: - with open(eval_set_path, "r", encoding="utf-8") as f: + with open(resolved_path, "r", encoding="utf-8") as f: data = json.load(f) + except FileNotFoundError as e: + raise ValueError( + f"Evaluation set file not found: '{eval_set_path}'. " + f"Searched in current directory and evals/eval-sets/ directory." + ) from e except json.JSONDecodeError as e: raise ValueError( - f"Invalid JSON in evaluation set file '{eval_set_path}': {str(e)}. " + f"Invalid JSON in evaluation set file '{resolved_path}': {str(e)}. " f"Please check the file for syntax errors." ) from e try: - eval_set = EvaluationSet(**data) - except (TypeError, ValueError) as e: + eval_set: AnyEvaluationSet = TypeAdapter(AnyEvaluationSet).validate_python( + data + ) + except ValidationError as e: raise ValueError( - f"Invalid evaluation set format in '{eval_set_path}': {str(e)}. " + f"Invalid evaluation set format in '{resolved_path}': {str(e)}. " f"Please verify the evaluation set structure." 
) from e if eval_ids: eval_set.extract_selected_evals(eval_ids) - return eval_set + return eval_set, resolved_path diff --git a/src/uipath/_cli/_utils/_resources.py b/src/uipath/_cli/_utils/_resources.py new file mode 100644 index 000000000..42723a390 --- /dev/null +++ b/src/uipath/_cli/_utils/_resources.py @@ -0,0 +1,21 @@ +import enum + +from ._console import ConsoleLogger + +console = ConsoleLogger().get_instance() + + +class Resources(str, enum.Enum): + """Available resources that can be created.""" + + EVALUATOR = "evaluator" + + @classmethod + def from_string(cls, resource: str) -> "Resources": # type: ignore + try: + return Resources(resource) + except ValueError: + valid_resources = ", ".join([r.value for r in Resources]) + console.error( + f"Invalid resource type: '{resource}'. Valid types are: {valid_resources}" + ) diff --git a/src/uipath/_cli/_utils/_studio_project.py b/src/uipath/_cli/_utils/_studio_project.py index 1e1c6fc6c..ccf490643 100644 --- a/src/uipath/_cli/_utils/_studio_project.py +++ b/src/uipath/_cli/_utils/_studio_project.py @@ -149,6 +149,24 @@ def get_folder_by_name( return None +def get_subfolder_by_name( + parent_folder: ProjectFolder, subfolder_name: str +) -> Optional[ProjectFolder]: + """Get a subfolder from within a parent folder by name. + + Args: + parent_folder: The parent folder to search within + subfolder_name: Name of the subfolder to find + + Returns: + Optional[ProjectFolder]: The found subfolder or None + """ + for folder in parent_folder.folders: + if folder.name == subfolder_name: + return folder + return None + + def resolve_path( folder: ProjectFolder, path: PurePath, diff --git a/src/uipath/_cli/cli_add.py b/src/uipath/_cli/cli_add.py new file mode 100644 index 000000000..138828c40 --- /dev/null +++ b/src/uipath/_cli/cli_add.py @@ -0,0 +1,114 @@ +import logging +import os +import re +from pathlib import Path +from string import Template + +import click + +from ..telemetry import track +from ._utils._console import ConsoleLogger +from ._utils._resources import Resources + +logger = logging.getLogger(__name__) +console = ConsoleLogger() + + +def to_pascal_case(text: str) -> str: + """Convert kebab-case or snake_case to PascalCase.""" + return "".join(word.capitalize() for word in re.sub(r"[-_]", " ", text).split()) + + +def to_snake_case(text: str) -> str: + """Convert kebab-case or PascalCase to snake_case.""" + return re.sub(r"(? 
str: + """Generate a generic evaluator template.""" + class_name = to_pascal_case(evaluator_name) + if not class_name.endswith("Evaluator"): + class_name = class_name + "Evaluator" + + variables = { + "class_name": class_name, + "evaluator_name": evaluator_name, + "criteria_class": class_name.replace("Evaluator", "EvaluationCriteria"), + "config_class": class_name + "Config", + } + templates_path = os.path.join( + os.path.dirname(__file__), "_templates", "custom_evaluator.py.template" + ) + with open(templates_path, "r", encoding="utf-8-sig") as f: + content = f.read() + + return Template(content).substitute(variables) + + +def create_evaluator(evaluator_name): + cwd = Path.cwd() + custom_evaluators_dir = cwd / "evals" / "evaluators" / "custom" + + if not custom_evaluators_dir.exists(): + console.info( + f"Creating {click.style('evals/evaluators/custom', fg='cyan')} folder" + ) + custom_evaluators_dir.mkdir(parents=True, exist_ok=True) + + filename = to_snake_case(evaluator_name) + if not filename.endswith(".py"): + filename = filename + ".py" + + file_path = custom_evaluators_dir / filename + + if file_path.exists(): + console.error(f"Evaluator file already exists: {file_path}") + + template_content = generate_evaluator_template(evaluator_name) + + with open(file_path, "w") as f: + f.write(template_content) + + relative_path = f"evals/evaluators/custom/{filename}" + + console.success(f"Created new evaluator: {click.style(relative_path, fg='cyan')}") + console.hint("Next steps:") + console.hint( + f" 1. Edit {click.style(relative_path, fg='cyan')} to implement your evaluation logic" + ) + console.hint( + f" 2. Run {click.style(f'uipath register evaluator {filename}', fg='cyan')} to generate the evaluator spec" + ) + + +@click.command() +@click.argument("resource", required=True) +@click.argument("args", nargs=-1) +@track +def add(resource: str, args: tuple[str]) -> None: + """Create a local resource. + + Examples: + uipath add evaluator my-custom-evaluator + """ + match Resources.from_string(resource): + case Resources.EVALUATOR: + usage_hint = f"Usage: {click.style('uipath add evaluator ', fg='cyan')}" + if len(args) < 1: + console.hint(usage_hint) + console.error("Missing required argument: evaluator_name") + return + if len(args) > 1: + console.hint(usage_hint) + console.error( + f"Too many arguments provided: {args}. Expected only evaluator_name." 
+ ) + + evaluator_name = args[0] + + if not isinstance(evaluator_name, str) or not evaluator_name.strip(): + console.hint(usage_hint) + console.error("Invalid evaluator_name: must be a non-empty string") + return + + create_evaluator(evaluator_name) diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 433a7a0ed..d03c92422 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -130,7 +130,11 @@ def eval( eval_context.no_report = no_report eval_context.workers = workers - eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set() + + # Load eval set to resolve the path + eval_set_path = eval_set or EvalHelpers.auto_discover_eval_set() + _, resolved_eval_set_path = EvalHelpers.load_eval_set(eval_set_path, eval_ids) + eval_context.eval_set = resolved_eval_set_path eval_context.eval_ids = eval_ids console_reporter = ConsoleProgressReporter() diff --git a/src/uipath/_cli/cli_pull.py b/src/uipath/_cli/cli_pull.py index 8ccb96bc4..d7b45e08d 100644 --- a/src/uipath/_cli/cli_pull.py +++ b/src/uipath/_cli/cli_pull.py @@ -24,20 +24,6 @@ console = ConsoleLogger() -class InteractiveConflictHandler: - """Handler that prompts user for each conflict.""" - - def __init__(self, console: ConsoleLogger): - self.console = console - - def should_overwrite( - self, file_path: str, local_hash: str, remote_hash: str - ) -> bool: - self.console.warning(f" File {file_path} differs from remote version.") - response = click.confirm("Do you want to overwrite it?", default=False) - return response - - @click.command() @click.argument( "root", @@ -66,22 +52,21 @@ def pull(root: Path) -> None: project_id = os.getenv(UIPATH_PROJECT_ID) if not project_id: console.error("UIPATH_PROJECT_ID environment variable not found.") + return - default_download_configuration = { + download_configuration = { "source_code": root, "evals": root / "evals", } - async def pull_with_updates(): - try: - async for update in pull_project( - project_id, - default_download_configuration, - InteractiveConflictHandler(console), - ): + try: + + async def run_pull(): + async for update in pull_project(project_id, download_configuration): + console.info(f"Processing: {update.file_path}") console.info(update.message) - except ProjectPullError as e: - console.error(e.message, include_traceback=True) - with console.spinner("Pulling UiPath project files..."): - asyncio.run(pull_with_updates()) + asyncio.run(run_pull()) + console.success("Project pulled successfully") + except ProjectPullError as e: + console.error(f"Failed to pull UiPath project: {str(e)}") diff --git a/src/uipath/_cli/cli_push.py b/src/uipath/_cli/cli_push.py index d5e02c9ca..7c67d63cb 100644 --- a/src/uipath/_cli/cli_push.py +++ b/src/uipath/_cli/cli_push.py @@ -61,6 +61,8 @@ async def upload_source_files_to_project( async for update in sw_file_handler.upload_source_files(settings): yield update + await sw_file_handler.upload_coded_evals_files() + @click.command() @click.argument( diff --git a/src/uipath/_cli/cli_register.py b/src/uipath/_cli/cli_register.py new file mode 100644 index 000000000..f18e23470 --- /dev/null +++ b/src/uipath/_cli/cli_register.py @@ -0,0 +1,45 @@ +# type: ignore +import logging + +import click + +from ..telemetry import track +from ._evals._helpers import register_evaluator +from ._utils._console import ConsoleLogger +from ._utils._resources import Resources + +logger = logging.getLogger(__name__) +console = ConsoleLogger() + + +@click.command() +@click.argument("resource", required=True) 
+@click.argument("args", nargs=-1) +@track +def register(resource: str, args: tuple[str]) -> None: + """Register a local resource. + + Examples: + uipath register evaluator my-custom-evaluator.py + """ + match Resources.from_string(resource): + case Resources.EVALUATOR: + usage_hint = f"Usage: {click.style('uipath register evaluator (ex. my_custom_evaluator.py)', fg='cyan')}" + if len(args) < 1: + console.hint(usage_hint) + console.error("Missing required argument: evaluator_file_name.") + return + if len(args) > 1: + console.hint(usage_hint) + console.error( + f"Too many arguments provided: {args}. Expected only evaluator_file_name (ex. my_custom_evaluator.py)" + ) + + filename = args[0] + + if not isinstance(filename, str) or not filename.strip(): + console.hint(usage_hint) + console.error("Invalid filename: must be a non-empty string") + return + + register_evaluator(filename) diff --git a/src/uipath/_events/_events.py b/src/uipath/_events/_events.py index f6b1a8947..c9b5e7eb3 100644 --- a/src/uipath/_events/_events.py +++ b/src/uipath/_events/_events.py @@ -3,9 +3,9 @@ from typing import Any, Dict, List, Optional, Union from opentelemetry.sdk.trace import ReadableSpan -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, SkipValidation, model_validator -from uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath._cli._evals._models._evaluation_set import AnyEvaluationItem, AnyEvaluator from uipath.eval.models import EvalItemResult @@ -21,12 +21,13 @@ class EvalSetRunCreatedEvent(BaseModel): entrypoint: str eval_set_id: str no_of_evals: int - evaluators: List[Any] + # skip validation to avoid abstract class instantiation + evaluators: SkipValidation[List[AnyEvaluator]] class EvalRunCreatedEvent(BaseModel): execution_id: str - eval_item: EvaluationItem + eval_item: AnyEvaluationItem class EvalItemExceptionDetails(BaseModel): @@ -40,7 +41,7 @@ class EvalRunUpdatedEvent(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) execution_id: str - eval_item: EvaluationItem + eval_item: AnyEvaluationItem eval_results: List[EvalItemResult] success: bool agent_output: Any diff --git a/src/uipath/_utils/constants.py b/src/uipath/_utils/constants.py index c55d92a42..8010e119a 100644 --- a/src/uipath/_utils/constants.py +++ b/src/uipath/_utils/constants.py @@ -1,6 +1,7 @@ # Environment variables DOTENV_FILE = ".env" ENV_BASE_URL = "UIPATH_URL" +ENV_EVAL_BACKEND_URL = "UIPATH_EVAL_BACKEND_URL" ENV_UNATTENDED_USER_ACCESS_TOKEN = "UNATTENDED_USER_ACCESS_TOKEN" ENV_UIPATH_ACCESS_TOKEN = "UIPATH_ACCESS_TOKEN" ENV_FOLDER_KEY = "UIPATH_FOLDER_KEY" @@ -46,3 +47,6 @@ # File names UIPATH_CONFIG_FILE = "uipath.json" + +# Evaluators +CUSTOM_EVALUATOR_PREFIX = "file://" diff --git a/src/uipath/eval/_helpers/evaluators_helpers.py b/src/uipath/eval/_helpers/evaluators_helpers.py new file mode 100644 index 000000000..8620130cf --- /dev/null +++ b/src/uipath/eval/_helpers/evaluators_helpers.py @@ -0,0 +1,494 @@ +import ast +import json +from collections.abc import Mapping, Sequence +from datetime import datetime +from typing import Any + +from opentelemetry.sdk.trace import ReadableSpan + +from ..models import ( + ToolCall, + ToolOutput, +) + +COMPARATOR_MAPPINGS = { + ">": "gt", + "<": "lt", + ">=": "ge", + "<=": "le", + "=": "eq", + "==": "eq", + "!=": "ne", +} + +COMMUNITY_agents_SUFFIX = "-community-agents" + + +def extract_tool_calls_names(spans: Sequence[ReadableSpan]) -> list[str]: + """Extract the tool call 
names from execution spans IN ORDER. + + Args: + spans: List of ReadableSpan objects from agent execution. + + Returns: + List of tool names in the order they were called. + """ + tool_calls_names = [] + + for span in spans: + # Check for tool.name attribute first + if span.attributes and (tool_name := span.attributes.get("tool.name")): + tool_calls_names.append(str(tool_name)) + + return tool_calls_names + + +def extract_tool_calls(spans: Sequence[ReadableSpan]) -> list[ToolCall]: + """Extract the tool calls from execution spans with their arguments. + + Args: + spans: List of ReadableSpan objects from agent execution. + + Returns: + Dict of tool calls with their arguments. + """ + tool_calls = [] + + for span in spans: + if span.attributes and (tool_name := span.attributes.get("tool.name")): + try: + input_value: Any = span.attributes.get("input.value", {}) + # Ensure input_value is a string before parsing + if isinstance(input_value, str): + arguments = ast.literal_eval(input_value) + elif isinstance(input_value, dict): + arguments = input_value + else: + arguments = {} + tool_calls.append(ToolCall(name=str(tool_name), args=arguments)) + except (json.JSONDecodeError, SyntaxError, ValueError): + # Handle case where input.value is not valid JSON/Python syntax + tool_calls.append(ToolCall(name=str(tool_name), args={})) + + return tool_calls + + +def extract_tool_calls_outputs(spans: Sequence[ReadableSpan]) -> list[ToolOutput]: + """Extract the outputs of the tool calls from execution spans. + + Args: + spans: List of ReadableSpan objects from agent execution. + + Returns: + List of tool calls outputs. + """ + # After span normalization, the output.value should always be a dict with a content field + # We keep this list of potential output keys for extensibility purposes (e.g. frameworks without span normalization) + potential_output_keys = ["content"] + tool_calls_outputs = [] + for span in spans: + if span.attributes and (tool_name := span.attributes.get("tool.name")): + output = span.attributes.get("output.value", "") + final_output = "" + + # Handle different output formats + if isinstance(output, str): + try: + # Try to parse as JSON and extract content field + parsed_output = json.loads(output) + if isinstance(parsed_output, dict): + for key in potential_output_keys: + if key in parsed_output: + final_output = parsed_output[key] + break + else: + # If parsed JSON is not a dict, use the original string + final_output = output + except (json.JSONDecodeError, ValueError): + # If parsing fails, use the string as-is + final_output = output + elif isinstance(output, dict): + # If output is already a dict, extract content field + for key in potential_output_keys: + if key in output: + final_output = output.get(key, "") + break + else: + final_output = str(output) + + tool_calls_outputs.append( + ToolOutput( + name=str(tool_name), + output=str(final_output) if final_output else "", + ) + ) + return tool_calls_outputs + + +def tool_calls_order_score( + actual_tool_calls_names: Sequence[str], + expected_tool_calls_names: Sequence[str], + strict: bool = False, +) -> tuple[float, dict[str, Any]]: + """The function calculates a score based on LCS applied to the order of the tool calls. + + It calculates the longest common subsequence between the actual tool calls + and the expected tool calls and returns the ratio of the LCS length to the number of + expected calls. 
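The order score described here boils down to an LCS-length ratio. A self-contained sketch with an invented tool sequence (the real implementation additionally reconstructs the subsequence for the justification payload):

```python
def lcs_ratio(actual: list[str], expected: list[str]) -> float:
    """Length of the longest common subsequence divided by the number of expected calls."""
    if not expected and not actual:
        return 1.0
    if not expected or not actual:
        return 0.0
    m, n = len(actual), len(expected)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if actual[i - 1] == expected[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n] / n


# ["search", "summarize"] vs. expected ["search", "fetch", "summarize"]: LCS length 2 -> 2/3
assert abs(lcs_ratio(["search", "summarize"], ["search", "fetch", "summarize"]) - 2 / 3) < 1e-9
```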
+ + Args: + actual_tool_calls_names: List of tool names in the actual order + expected_tool_calls_names: List of tool names in the expected order + strict: If True, the function will return 0 if the actual calls do not match the expected calls exactly + + Returns: + tuple[float, dict]: Ratio of the LCS length to the number of expected, and the justification dict + """ + justification = { + "actual_tool_calls_order": list(actual_tool_calls_names), + "expected_tool_calls_order": list(expected_tool_calls_names), + "lcs": [], + } + + # Handle empty cases + if not expected_tool_calls_names and not actual_tool_calls_names: + return 1.0, justification + elif not expected_tool_calls_names or not actual_tool_calls_names: + return 0.0, justification + + # Handle exact match + if expected_tool_calls_names == actual_tool_calls_names: + justification["lcs"] = list(actual_tool_calls_names) + return 1.0, justification + + # Handle strict mode - only perfect matches allowed + if strict: + return 0.0, justification + + # Calculate LCS with full DP table for efficient reconstruction + m, n = len(actual_tool_calls_names), len(expected_tool_calls_names) + dp = [[0] * (n + 1) for _ in range(m + 1)] + + # Build DP table - O(m*n) + for i in range(1, m + 1): + for j in range(1, n + 1): + if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + # Reconstruct LCS - O(m+n) + lcs = [] + i, j = m, n + while i > 0 and j > 0: + if actual_tool_calls_names[i - 1] == expected_tool_calls_names[j - 1]: + lcs.append(actual_tool_calls_names[i - 1]) + i -= 1 + j -= 1 + elif dp[i - 1][j] > dp[i][j - 1]: + i -= 1 + else: + j -= 1 + + lcs.reverse() # Reverse to get correct order + lcs_length = len(lcs) + justification["lcs"] = lcs + return lcs_length / n, justification + + +def tool_calls_count_score( + actual_tool_calls_count: Mapping[str, int], + expected_tool_calls_count: Mapping[str, tuple[str, int]], + strict: bool = False, + justification_key: str = "explained_tool_calls_count", +) -> tuple[float, dict[str, Any]]: + """Check if the expected tool call counts match the actual tool call counts. + + Args: + actual_tool_calls_count: Mapping of tool names to their actual call counts. + expected_tool_calls_count: Mapping of tool names to expected (comparator, count) tuples. + strict: If True, the function will return 0 if not all expected tool calls are matched. + justification_key: Key to use for the justification in the returned dict. + + Returns: + tuple[float, dict]: Score based on the number of matches, and the justification dict. 
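The count score leans on `COMPARATOR_MAPPINGS` to turn an expected `(comparator, count)` pair into an integer dunder call. A tiny sketch of that dispatch (the tool-call counts are made up):

```python
COMPARATOR_MAPPINGS = {">": "gt", "<": "lt", ">=": "ge", "<=": "le", "=": "eq", "==": "eq", "!=": "ne"}


def count_satisfied(actual_count: int, comparator: str, expected_count: int) -> bool:
    """Evaluate e.g. 'the tool was called >= 2 times' via the integer comparison dunders."""
    dunder = f"__{COMPARATOR_MAPPINGS[comparator]}__"
    return bool(getattr(actual_count, dunder)(expected_count))


assert count_satisfied(3, ">=", 2) is True   # called 3 times, expected at least 2
assert count_satisfied(1, "==", 2) is False  # called once, expected exactly 2
```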
+ """ + if not expected_tool_calls_count and not actual_tool_calls_count: + return 1.0, { + justification_key: { + "_result": "Both expected and actual tool calls are empty" + } + } + elif not expected_tool_calls_count or not actual_tool_calls_count: + return 0.0, { + justification_key: { + "_result": "Either expected or actual tool calls are empty" + } + } + + score = 0.0 + justifications: dict[str, Any] = {justification_key: {}} + for tool_name, ( + expected_comparator, + expected_count, + ) in expected_tool_calls_count.items(): + actual_count = actual_tool_calls_count.get(tool_name, 0.0) + comparator = f"__{COMPARATOR_MAPPINGS[expected_comparator]}__" + to_add = float(getattr(actual_count, comparator)(expected_count)) + + justifications[justification_key][tool_name] = ( + f"Actual: {actual_count}, Expected: {expected_count}, Score: {to_add}" + ) + if strict and to_add == 0.0: + # When strict is True, if the actual count does not match the expected count, return 0 + # The justification should only include the breaching tool name + return 0.0, { + justification_key: { + tool_name: justifications[justification_key][tool_name] + } + } + score += to_add + return score / len(expected_tool_calls_count), justifications + + +def tool_calls_args_score( + actual_tool_calls: list[ToolCall], + expected_tool_calls: list[ToolCall], + strict: bool = False, + subset: bool = False, + justification_key: str = "explained_tool_calls_args", +) -> tuple[float, dict[str, Any]]: + """Check if the expected tool calls are correctly called with matching arguments. + + This function does not check the order of the tool calls! + + Args: + actual_tool_calls: List of actual tool calls with their arguments. + expected_tool_calls: List of expected tool calls with their arguments. + strict: If True, the function will return 0 if not all expected tool calls are matched. + subset: If True, the function will check if the expected args are a subset of the actual args. + justification_key: Key to use for the justification in the returned dict. + + Returns: + tuple[float, dict]: Score based on the number of matches, and the justification dict. 
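In the subset-style path, the argument check reduces to "every expected key is present in the actual call with an equal value". A simplified sketch on plain dicts — the tool arguments are invented, and the real function also tracks per-call justifications:

```python
from typing import Any


def expected_args_matched(actual_args: dict[str, Any], expected_args: dict[str, Any]) -> bool:
    """True when every expected argument appears in the actual call with an equal value."""
    return all(k in actual_args and actual_args[k] == v for k, v in expected_args.items())


# the expected criteria only pins the operator; extra actual arguments are ignored
assert expected_args_matched({"a": 2, "b": 4, "operator": "*"}, {"operator": "*"})
assert not expected_args_matched({"operator": "+"}, {"operator": "*"})
```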
+ """ + if not expected_tool_calls and not actual_tool_calls: + return 1.0, { + justification_key: { + "_result": "Both expected and actual tool calls are empty" + } + } + elif not expected_tool_calls or not actual_tool_calls: + return 0.0, { + justification_key: { + "_result": "Either expected or actual tool calls are empty" + } + } + + cnt = 0 + visited: set[int] = set() + justifications: dict[str, Any] = {justification_key: {}} + tool_counters: dict[str, int] = {} + + for expected_tool_call in expected_tool_calls: + for idx, call in enumerate(actual_tool_calls): + if call.name == expected_tool_call.name and idx not in visited: + # Get or initialize counter for this tool name + tool_counters[call.name] = tool_counters.get(call.name, 0) + tool_key = f"{call.name}_{tool_counters[call.name]}" + tool_counters[call.name] += 1 + + # Check arguments based on mode + # The linter highlights a few problems here due to using lambdas, but they're safe to ignore + # Breaking this down into proper functions would unnecessarily make the code more complex + if subset: + # Subset mode: safely check if all expected args exist and match + args_check = ( # noqa: E731 + lambda k, v: k in call.args # noqa: B023 + and call.args[k] == v # noqa: B023 + ) + else: + # Exact mode: direct access (may raise KeyError) + args_check = lambda k, v: call.args[k] == v # noqa: E731, B023 + + try: + args_match = all( + args_check(k, v) for k, v in expected_tool_call.args.items() + ) + except KeyError: + # Only possible in exact mode when key is missing + args_match = False + + justifications[justification_key][tool_key] = ( + f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}" + ) + if args_match: + cnt += 1 + visited.add(idx) + break + # In case of mismatch, DON'T add to visited in non-strict mode + # so this actual tool call can be matched against other expected calls + + return ( + cnt / len(expected_tool_calls) + if not strict + else float(cnt == len(expected_tool_calls)) + ), justifications + + +def tool_calls_output_score( + actual_tool_calls_outputs: list[ToolOutput], + expected_tool_calls_outputs: list[ToolOutput], + strict: bool = False, + justification_key: str = "explained_tool_calls_outputs", +) -> tuple[float, dict[str, Any]]: + """Check if the expected tool calls are correctly called, where expected args must be a subset of actual args. + + Args: + actual_tool_calls_outputs: List of actual tool calls outputs. + expected_tool_calls_outputs: List of expected tool calls outputs. + strict: If True, the function will return 0 if not all expected tool calls are matched. + + Returns: + tuple[float, str]: Score based on the number of matches, and the justification. 
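The output score pairs each expected `(tool name, output)` with an as-yet-unused actual call and requires an exact string match. A stripped-down, non-strict sketch using tuples in place of the `ToolOutput` model (tool names and outputs invented):

```python
def output_score(actual: list[tuple[str, str]], expected: list[tuple[str, str]]) -> float:
    """Fraction of expected (tool_name, output) pairs that find an unused exact match."""
    if not expected and not actual:
        return 1.0
    if not expected or not actual:
        return 0.0
    used: set[int] = set()
    hits = 0
    for name, output in expected:
        for idx, (actual_name, actual_output) in enumerate(actual):
            if idx not in used and actual_name == name and actual_output == output:
                used.add(idx)
                hits += 1
                break
    return hits / len(expected)


# one of the two expected outputs is reproduced exactly -> 0.5
assert output_score(
    [("lookup", "42"), ("format", "oops")],
    [("lookup", "42"), ("format", "result: 42")],
) == 0.5
```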
+ """ + if not expected_tool_calls_outputs and not actual_tool_calls_outputs: + return 1.0, { + justification_key: { + "_result": "Both expected and actual tool calls outputs are empty" + } + } + elif not expected_tool_calls_outputs or not actual_tool_calls_outputs: + return 0.0, { + justification_key: { + "_result": "Either expected or actual tool calls outputs are empty" + } + } + + cnt = 0.0 + justifications: dict[str, Any] = {justification_key: {}} + visited: set[int] = set() + tool_counters: dict[str, int] = {} + + for expected_tool_call_output in expected_tool_calls_outputs: + matched = False + + # Look through ALL actual tool calls to find a match + for idx, actual_tool_call_output in enumerate(actual_tool_calls_outputs): + if idx in visited: + continue + if actual_tool_call_output.name == expected_tool_call_output.name: + # Get or initialize counter for this tool name + tool_counters[actual_tool_call_output.name] = tool_counters.get( + actual_tool_call_output.name, 0 + ) + tool_key = f"{actual_tool_call_output.name}_{tool_counters[actual_tool_call_output.name]}" + tool_counters[actual_tool_call_output.name] += 1 + + justifications[justification_key][tool_key] = ( + f"Actual: {actual_tool_call_output.output}, Expected: {expected_tool_call_output.output}, Score: {float(actual_tool_call_output.output == expected_tool_call_output.output)}" + ) + + if actual_tool_call_output.output == expected_tool_call_output.output: + # Perfect match found + cnt += 1.0 + visited.add(idx) + matched = True + break + elif strict: + # In strict mode, any mismatch returns 0 immediately + return 0.0, { + justification_key: { + tool_key: justifications[justification_key][tool_key] + } + } + # In non-strict mode with mismatch, continue looking for perfect match + # DON'T add to visited, DON'T break + + # If no match found and we're in strict mode, return 0 + if not matched and strict: + return 0.0, { + justification_key: { + "_result": f"No matching actual tool call found for expected {expected_tool_call_output.name}" + } + } + + return ( + cnt / len(expected_tool_calls_outputs) + if not strict + else float(cnt == len(expected_tool_calls_outputs)) + ), justifications + + +def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str: + """Convert OTEL spans to a platform-style agent run history string. + + Creates a similar structure to LangChain message processing but using OTEL spans. + Only processes tool spans (spans with 'tool.name' attribute). 
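The history formatting below hinges on converting OpenTelemetry's nanosecond span timestamps into readable datetimes. A minimal sketch of that conversion, pinned to UTC so the example is deterministic (the helper itself formats naive local time):

```python
from datetime import datetime, timezone


def ns_to_timestamp(ns: int) -> str:
    """OTEL spans report start/end times as nanoseconds since the epoch."""
    return datetime.fromtimestamp(ns / 1e9, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")


# 2021-01-01T00:00:00Z expressed in nanoseconds
assert ns_to_timestamp(1_609_459_200_000_000_000) == "2021-01-01 00:00:00"
```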
+ + Args: + agent_trace: List of ReadableSpan objects from the agent execution + + Returns: + String representation of the agent run history in platform format + """ + platform_history = [] + seen_tool_calls = set() + + for span in agent_trace: + if span.attributes and (tool_name := span.attributes.get("tool.name")): + # Get span timing information + start_time = span.start_time + end_time = span.end_time + + # Convert nanoseconds to datetime if needed + if isinstance(start_time, int): + start_timestamp = datetime.fromtimestamp(start_time / 1e9) + else: + start_timestamp = start_time # type:ignore + + if isinstance(end_time, int): + end_timestamp = datetime.fromtimestamp(end_time / 1e9) + else: + end_timestamp = end_time # type:ignore + + timestamp_str = ( + start_timestamp.strftime("%Y-%m-%d %H:%M:%S") if start_timestamp else "" + ) + + # Get tool call information + tool_args: Any = span.attributes.get("input.value", {}) + tool_result = str(span.attributes.get("output.value", {})).strip() + + span_id = ( + span.context.span_id + if span.context + else str(hash(f"{tool_name}_{timestamp_str}")) + ) + + # De-duplicate tool calls based on span ID + if span_id in seen_tool_calls: + continue + seen_tool_calls.add(span_id) + + # Add tool selection (equivalent to AIMessage with tool_calls) + platform_history.append(f"[{timestamp_str}] LLM Response:") + platform_history.append(" Agent Selected 1 Tool(s):") + platform_history.append("") + platform_history.append(f" Tool: {tool_name}") + platform_history.append(f" Arguments: {str(tool_args)}") + platform_history.append("") + + # Add tool response (equivalent to ToolMessage) + end_timestamp_str = ( + end_timestamp.strftime("%Y-%m-%d %H:%M:%S") + if end_timestamp + else timestamp_str + ) + platform_history.append( + f"[{end_timestamp_str}] Tool Call Response - {tool_name}:" + ) + platform_history.append(f"{tool_result}") + platform_history.append("") + + return "\n".join(platform_history) diff --git a/src/uipath/eval/_helpers/helpers.py b/src/uipath/eval/_helpers/helpers.py index 80d48c63b..5059d6827 100644 --- a/src/uipath/eval/_helpers/helpers.py +++ b/src/uipath/eval/_helpers/helpers.py @@ -1,10 +1,13 @@ +import functools import json import os +import time +from collections.abc import Callable +from typing import Any import click -from uipath._cli._utils._console import ConsoleLogger -from uipath._utils.constants import UIPATH_CONFIG_FILE +from ..models import ErrorEvaluationResult, EvaluationResult def auto_discover_entrypoint() -> str: @@ -16,6 +19,9 @@ def auto_discover_entrypoint() -> str: Raises: ValueError: If no entrypoint found or multiple entrypoints exist """ + from uipath._cli._utils._console import ConsoleLogger + from uipath._utils.constants import UIPATH_CONFIG_FILE + console = ConsoleLogger() if not os.path.isfile(UIPATH_CONFIG_FILE): @@ -45,3 +51,25 @@ def auto_discover_entrypoint() -> str: f"Auto-discovered agent entrypoint: {click.style(entrypoint, fg='cyan')}" ) return entrypoint + + +def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]: + """Decorator to track evaluation metrics and handle errors gracefully.""" + + @functools.wraps(func) + async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult: + start_time = time.time() + try: + result = await func(*args, **kwargs) + except Exception as e: + result = ErrorEvaluationResult( + details="Exception thrown by evaluator: {}".format(e), + evaluation_time=time.time() - start_time, + ) + end_time = time.time() + execution_time = end_time - start_time + + 
result.evaluation_time = execution_time + return result + + return wrapper diff --git a/src/uipath/eval/evaluators/__init__.py b/src/uipath/eval/evaluators/__init__.py index 9ab6b940a..bc79e071d 100644 --- a/src/uipath/eval/evaluators/__init__.py +++ b/src/uipath/eval/evaluators/__init__.py @@ -1,15 +1,70 @@ """UiPath evaluator implementations for agent performance evaluation.""" -from .base_evaluator import BaseEvaluator +from typing import Any + +# Current coded evaluators +from .base_evaluator import BaseEvaluationCriteria, BaseEvaluator, BaseEvaluatorConfig +from .contains_evaluator import ContainsEvaluator from .exact_match_evaluator import ExactMatchEvaluator from .json_similarity_evaluator import JsonSimilarityEvaluator -from .llm_as_judge_evaluator import LlmAsAJudgeEvaluator -from .trajectory_evaluator import TrajectoryEvaluator + +# Legacy evaluators +from .legacy_base_evaluator import LegacyBaseEvaluator +from .legacy_exact_match_evaluator import LegacyExactMatchEvaluator +from .legacy_json_similarity_evaluator import LegacyJsonSimilarityEvaluator +from .legacy_llm_as_judge_evaluator import LegacyLlmAsAJudgeEvaluator +from .legacy_trajectory_evaluator import LegacyTrajectoryEvaluator +from .llm_judge_output_evaluator import ( + BaseLLMOutputEvaluator, + LLMJudgeOutputEvaluator, + LLMJudgeStrictJSONSimilarityOutputEvaluator, +) +from .llm_judge_trajectory_evaluator import ( + BaseLLMTrajectoryEvaluator, + LLMJudgeTrajectoryEvaluator, + LLMJudgeTrajectorySimulationEvaluator, +) +from .tool_call_args_evaluator import ToolCallArgsEvaluator +from .tool_call_count_evaluator import ToolCallCountEvaluator +from .tool_call_order_evaluator import ToolCallOrderEvaluator +from .tool_call_output_evaluator import ToolCallOutputEvaluator + +EVALUATORS: list[type[BaseEvaluator[Any, Any, Any]]] = [ + ExactMatchEvaluator, + ContainsEvaluator, + JsonSimilarityEvaluator, + LLMJudgeOutputEvaluator, + LLMJudgeStrictJSONSimilarityOutputEvaluator, + LLMJudgeTrajectoryEvaluator, + LLMJudgeTrajectorySimulationEvaluator, + ToolCallOrderEvaluator, + ToolCallArgsEvaluator, + ToolCallCountEvaluator, + ToolCallOutputEvaluator, +] __all__ = [ + # Legacy evaluators + "LegacyBaseEvaluator", + "LegacyExactMatchEvaluator", + "LegacyJsonSimilarityEvaluator", + "LegacyLlmAsAJudgeEvaluator", + "LegacyTrajectoryEvaluator", + # Current coded evaluators "BaseEvaluator", + "ContainsEvaluator", "ExactMatchEvaluator", "JsonSimilarityEvaluator", - "LlmAsAJudgeEvaluator", - "TrajectoryEvaluator", + "BaseLLMOutputEvaluator", + "LLMJudgeOutputEvaluator", + "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "BaseLLMTrajectoryEvaluator", + "LLMJudgeTrajectoryEvaluator", + "LLMJudgeTrajectorySimulationEvaluator", + "ToolCallOrderEvaluator", + "ToolCallArgsEvaluator", + "ToolCallCountEvaluator", + "ToolCallOutputEvaluator", + "BaseEvaluationCriteria", + "BaseEvaluatorConfig", ] diff --git a/src/uipath/eval/evaluators/base_evaluator.py b/src/uipath/eval/evaluators/base_evaluator.py index 42ced86ca..5a7e4615b 100644 --- a/src/uipath/eval/evaluators/base_evaluator.py +++ b/src/uipath/eval/evaluators/base_evaluator.py @@ -1,61 +1,93 @@ """Base evaluator abstract class for agent evaluation.""" -import functools -import time +import json +import warnings from abc import ABC, abstractmethod -from typing import Generic, TypeVar +from typing import Any, Generic, TypeVar, Union, cast, get_args -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic.alias_generators import 
to_camel -from uipath.eval.models import EvaluationResult -from uipath.eval.models.models import ( - AgentExecution, - ErrorEvaluationResult, - EvaluatorCategory, - EvaluatorType, -) +from .._helpers.helpers import track_evaluation_metrics +from ..models import AgentExecution, EvaluationResult +from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory -def track_evaluation_metrics(func): - """Decorator to track evaluation metrics and handle errors gracefully.""" +class BaseEvaluationCriteria(BaseModel): + """Base class for all evaluation criteria.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + pass + + +# Type variable for evaluation criteria, used by both Config and Evaluator +T = TypeVar("T", bound=BaseEvaluationCriteria) + + +class BaseEvaluatorConfig(BaseModel, Generic[T]): + """Base class for all evaluator configurations. + + Generic over T (evaluation criteria type) to ensure type safety between + the config's default_evaluation_criteria and the evaluator's expected criteria type. + """ + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + name: str + default_evaluation_criteria: T | None = None + + +class BaseEvaluatorJustification(BaseModel): + """Base class for all evaluator justifications.""" + + pass + + +# Additional type variables for Config and Justification +# Note: C must be BaseEvaluatorConfig[T] to ensure type consistency +C = TypeVar("C", bound=BaseEvaluatorConfig[Any]) +J = TypeVar("J", bound=Union[str, None, BaseEvaluatorJustification]) - @functools.wraps(func) - async def wrapper(*args, **kwargs) -> EvaluationResult: - start_time = time.time() - try: - result = await func(*args, **kwargs) - except Exception as e: - result = ErrorEvaluationResult( - details="Exception thrown by evaluator: {}".format(e), - evaluation_time=time.time() - start_time, - ) - end_time = time.time() - execution_time = end_time - start_time - result.evaluation_time = execution_time - return result +class BaseEvaluator(BaseModel, Generic[T, C, J], ABC): + """Abstract base class for all evaluators. - return wrapper + Generic Parameters: + T: The evaluation criteria type (bound to BaseEvaluationCriteria) + C: The evaluator config type (bound to BaseEvaluatorConfig[T]) + J: The justification type (str, None, or BaseEvaluatorJustification subclass) + Design Rationale: + T is explicitly specified even though C = BaseEvaluatorConfig[T] already encodes it. + This redundancy is intentional and provides: -T = TypeVar("T") + 1. **Type Checker Support**: Static type checkers can infer the exact criteria type + for the evaluate() method signature without runtime introspection + 2. **Clear API**: The signature BaseEvaluator[MyCriteria, MyConfig[MyCriteria], str] + makes it immediately obvious what criteria type is expected -class BaseEvaluator(BaseModel, Generic[T], ABC): - """Abstract base class for all evaluators.""" + 3. **IDE Support**: Autocomplete and type hints work perfectly for method parameters + + Runtime validation ensures T and C's generic parameter are consistent. 
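One practical consequence of the `to_camel` alias generator on the criteria and config base classes above: the JSON artifacts can keep camelCase keys while the Python models stay snake_case. A standalone sketch that mirrors that `model_config` with an invented criteria field:

```python
from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class ExpectedTextCriteria(BaseModel):
    # same model_config as BaseEvaluationCriteria
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    expected_text: str  # serialized as "expectedText" in evaluator/eval-set JSON


# camelCase keys from JSON and snake_case keyword arguments both validate
from_json = ExpectedTextCriteria.model_validate({"expectedText": "42"})
from_code = ExpectedTextCriteria(expected_text="42")
assert from_json.expected_text == from_code.expected_text == "42"
```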
+ """ model_config = ConfigDict(arbitrary_types_allowed=True) id: str - name: str - description: str - target_output_key: str = "*" - created_at: str - updated_at: str - category: EvaluatorCategory - evaluator_type: EvaluatorType - - def __init_subclass__(cls, **kwargs): + config: dict[str, Any] = Field(description="The config dictionary") + config_type: type[C] = Field(description="The config type class") + evaluation_criteria_type: type[T] = Field( + description="The type used for evaluation criteria validation and creation" + ) + justification_type: type[J] = Field( + description="The type used for justification validation and creation" + ) + evaluator_config: C = Field( + exclude=True, description="The validated config object instance" + ) + + def __init_subclass__(cls, **kwargs: Any): """Hook for subclass creation - automatically applies evaluation metrics tracking.""" super().__init_subclass__(**kwargs) @@ -65,10 +97,479 @@ def __init_subclass__(cls, **kwargs): cls.evaluate = track_evaluation_metrics(cls.evaluate) # type: ignore[method-assign] cls.evaluate._has_metrics_decorator = True # type: ignore[attr-defined] - def model_post_init(self, __context): - """Post-initialization hook for Pydantic models.""" + @property + def name(self) -> str: + """Evaluator's name.""" + return self.evaluator_config.name + + @model_validator(mode="before") + @classmethod + def validate_model(cls, values: Any) -> Any: + """Pre-initialization model validator for Pydantic models. + + This validator extracts the Generic type parameters and validates their consistency. + + Args: + values: The raw input values before validation + + Returns: + The validated/transformed values with types set + + Raises: + ValueError: If types cannot be determined or are inconsistent + """ + if isinstance(values, dict): + # Always extract and set evaluation_criteria_type + criteria_type = cls._extract_evaluation_criteria_type() + values["evaluation_criteria_type"] = criteria_type + + # Always extract and set config_type + config_type = cls._extract_config_type() + values["config_type"] = config_type + + # Always extract and set justification_type + justification_type = cls._extract_justification_type() + values["justification_type"] = justification_type + + # Validate consistency: config's generic parameter should match criteria_type + cls._validate_type_consistency(config_type, criteria_type) + + # Validate and create the config object if config dict is provided + try: + validated_config = config_type.model_validate(values.get("config", {})) + values["evaluator_config"] = validated_config + except Exception as e: + raise UiPathEvaluationError( + code="FAILED_TO_VALIDATE_EVALUATOR_CONFIG", + title=f"Failed to validate evaluator config for {cls.__name__}", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + + return values + + @classmethod + def _validate_type_consistency( + cls, + config_type: type[BaseEvaluatorConfig[Any]], + criteria_type: type[BaseEvaluationCriteria], + ) -> None: + """Validate that the config's generic parameter matches the evaluator's criteria type. + + Extracts the criteria type from the config's default_evaluation_criteria field + annotation and validates it matches the evaluator's expected criteria type. 
+
+        Args:
+            config_type: The config type to validate
+            criteria_type: The expected evaluation criteria type
+
+        Raises:
+            UiPathEvaluationError: If the types are inconsistent
+        """
+        # Skip validation for base classes
+        if config_type.__name__ in (
+            "BaseEvaluatorConfig",
+            "OutputEvaluatorConfig",
+            "BaseLLMJudgeEvaluatorConfig",
+        ):
+            return
+
+        # Extract from Pydantic's model_fields which preserves generic types
+        if (
+            hasattr(config_type, "model_fields")
+            and "default_evaluation_criteria" in config_type.model_fields
+        ):
+            field_info = config_type.model_fields["default_evaluation_criteria"]
+            if hasattr(field_info, "annotation"):
+                annotation = field_info.annotation
+                # The annotation will be SomeCriteria | None
+                args = get_args(annotation)
+                if args:
+                    # Get the criteria type (the non-None arg)
+                    for arg in args:
+                        if (
+                            arg is not type(None)
+                            and isinstance(arg, type)
+                            and issubclass(arg, BaseEvaluationCriteria)
+                        ):
+                            # Found the config's criteria type, check if it matches
+                            if arg != criteria_type:
+                                raise UiPathEvaluationError(
+                                    code="TYPE_INCONSISTENCY_IN_EVALUATOR",
+                                    title=f"Type inconsistency in {cls.__name__}: "
+                                    f"Config {config_type.__name__} expects criteria type {arg.__name__}",
+                                    detail=f"Evaluator expects {criteria_type.__name__}. "
+                                    f"Ensure BaseEvaluator[T, C[T], J] has matching T and C[T] parameters.",
+                                    category=UiPathEvaluationErrorCategory.SYSTEM,
+                                )
+                            return  # Validation passed
+
+    @classmethod
+    def _extract_evaluation_criteria_type(cls) -> type[BaseEvaluationCriteria]:
+        """Extract the evaluation criteria type from Pydantic model fields.
+
+        Returns:
+            The evaluation criteria type
+
+        Raises:
+            UiPathEvaluationError: If no valid evaluation criteria type can be determined from the class definition
+        """
+        # Special case: if this is the BaseEvaluator class itself, return BaseEvaluationCriteria
+        if cls.__name__ in ("BaseEvaluator", "BaseEvaluator[Any, Any, Any]"):
+            return BaseEvaluationCriteria
+
+        # Check if Pydantic has already resolved the evaluation_criteria_type field annotation
+        if not (
+            hasattr(cls, "model_fields")
+            and "evaluation_criteria_type" in cls.model_fields
+        ):
+            raise UiPathEvaluationError(
+                code="COULD_NOT_FIND_EVALUATION_CRITERIA_TYPE_FIELD",
+                title=f"Could not find evaluation_criteria_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        field_info = cls.model_fields["evaluation_criteria_type"]
+        if not hasattr(field_info, "annotation"):
+            raise UiPathEvaluationError(
+                code="NO_ANNOTATION_FOUND_FOR_EVALUATION_CRITERIA_TYPE_FIELD",
+                title=f"No annotation found for evaluation_criteria_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        # Extract the inner type from type[SomeType]
+        annotation = field_info.annotation
+        args = get_args(annotation)
+        if not args:
+            raise UiPathEvaluationError(
+                code="INVALID_ANNOTATION_FOR_EVALUATION_CRITERIA_TYPE",
+                title=f"Invalid annotation for evaluation_criteria_type in {cls.__name__}: {annotation}",
+                detail="Expected type[SomeEvaluationCriteria]",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        criteria_type = args[0]
+        if not (
+            isinstance(criteria_type, type)
+            and issubclass(criteria_type, BaseEvaluationCriteria)
+        ):
+            raise UiPathEvaluationError(
+                code="INVALID_EVALUATION_CRITERIA_TYPE",
+                title=f"Invalid evaluation criteria type {criteria_type} in {cls.__name__}",
+                detail=f"{criteria_type} must be a subclass of BaseEvaluationCriteria",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        return criteria_type
+
+    @classmethod
+    def _extract_config_type(cls) -> type[BaseEvaluatorConfig[Any]]:
+        """Extract the config type from Pydantic model fields.
+
+        Returns:
+            The config type for this evaluator
+
+        Raises:
+            UiPathEvaluationError: If no valid config type can be determined from the class definition
+        """
+        # Special case: if this is the BaseEvaluator class itself, return BaseEvaluatorConfig
+        if cls.__name__ in ("BaseEvaluator", "BaseEvaluator[Any, Any, Any]"):
+            return BaseEvaluatorConfig
+        # Check if Pydantic has already resolved the config_type field annotation
+        if not (hasattr(cls, "model_fields") and "config_type" in cls.model_fields):
+            raise UiPathEvaluationError(
+                code="COULD_NOT_FIND_CONFIG_TYPE_FIELD",
+                title=f"Could not find config_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        field_info = cls.model_fields["config_type"]
+        if not hasattr(field_info, "annotation"):
+            raise UiPathEvaluationError(
+                code="NO_ANNOTATION_FOUND_FOR_CONFIG_TYPE_FIELD",
+                title=f"No annotation found for config_type field in {cls.__name__}",
+                detail="Ensure the class properly inherits from BaseEvaluator with correct Generic parameters.",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        # Extract the inner type from type[SomeType]
+        annotation = field_info.annotation
+        args = get_args(annotation)
+        if not args:
+            raise UiPathEvaluationError(
+                code="INVALID_ANNOTATION_FOR_CONFIG_TYPE",
+                title=f"Invalid annotation for config_type in {cls.__name__}: {annotation}",
+                detail="Expected type[SomeEvaluatorConfig]",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        config_type = args[0]
+        if not (
+            isinstance(config_type, type)
+            and issubclass(config_type, BaseEvaluatorConfig)
+        ):
+            raise UiPathEvaluationError(
+                code="INVALID_CONFIG_TYPE",
+                title=f"Invalid config type {config_type} in {cls.__name__}",
+                detail=f"{config_type} must be a subclass of BaseEvaluatorConfig",
+                category=UiPathEvaluationErrorCategory.SYSTEM,
+            )
+
+        return config_type
+
+    @classmethod
+    def _extract_justification_type(cls) -> type[J]:
+        """Extract the justification type from Pydantic model fields.
+
+        Returns:
+            The justification type (str, None, or BaseEvaluatorJustification subclass)
+
+        Note:
+            Unlike the other type extraction methods, this one returns a default (type(None))
+            instead of raising an error, since justification support is optional and
+            defaults to None for evaluators that don't specify a justification type.
+ """ + try: + # Special case: if this is the BaseEvaluator class itself, return type(None) + if cls.__name__ == "BaseEvaluator[Any, Any, Any]": + return cast(type[J], type(None)) + + # Check if Pydantic has resolved the justification_type field annotation + if not ( + hasattr(cls, "model_fields") + and "justification_type" in cls.model_fields + ): + # Default to None if field doesn't exist (justification is optional) + return cast(type[J], type(None)) + + field_info = cls.model_fields["justification_type"] + if not hasattr(field_info, "annotation"): + # Default to None if no annotation (justification is optional) + return cast(type[J], type(None)) + + # Extract the inner type from type[SomeType] + annotation = field_info.annotation + args = get_args(annotation) + if not args: + # Default to None if no type args (justification is optional) + return cast(type[J], type(None)) + + justification_type = args[0] + + # Validate the justification type - must be str, type(None), or BaseEvaluatorJustification subclass + if justification_type is str or justification_type is type(None): + return cast(type[J], justification_type) + elif isinstance(justification_type, type) and issubclass( + justification_type, BaseEvaluatorJustification + ): + return cast(type[J], justification_type) + else: + # Invalid justification type - log warning but default to None for robustness + warnings.warn( + f"Invalid justification type {justification_type} in {cls.__name__}. " + f"Must be str, None, or subclass of BaseEvaluatorJustification. Defaulting to None.", + UserWarning, + stacklevel=2, + ) + return cast(type[J], type(None)) + except Exception as e: + raise UiPathEvaluationError( + code="CANNOT_EXTRACT_JUSTIFICATION_TYPE", + title=f"Cannot extract justification type from {cls.__name__}", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + + def validate_evaluation_criteria(self, criteria: Any) -> T: + """Validate and convert input to the correct evaluation criteria type. + + Uses Pydantic's model_validate for proper validation, type coercion, + and error handling. + + Args: + criteria: The criteria to validate (dict, BaseEvaluationCriteria, or other) + + Returns: + An instance of the evaluation criteria type (T) + + Raises: + ValueError: If the criteria cannot be converted to the expected type + """ + try: + if isinstance(criteria, self.evaluation_criteria_type): + return criteria + elif isinstance(criteria, dict): + return self.evaluation_criteria_type.model_validate(criteria) + elif hasattr(criteria, "__dict__"): + # Try to convert from another object type + return self.evaluation_criteria_type.model_validate(criteria.__dict__) + else: + # Try to let Pydantic handle the conversion + return self.evaluation_criteria_type.model_validate(criteria) + except Exception as e: + raise UiPathEvaluationError( + code="CANNOT_VALIDATE_EVALUATION_CRITERIA", + title=f"Cannot validate {type(criteria)} to {self.evaluation_criteria_type}", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + + def validate_justification(self, justification: Any) -> J: + """Validate and convert input to the correct justification type. 
+ + Args: + justification: The justification to validate (str, None, dict, BaseEvaluatorJustification, or other) + + Returns: + The validated justification of the correct type + """ + # The key insight: J is constrained to be one of str, None, or BaseEvaluatorJustification + # At instantiation time, J gets bound to exactly one of these types + # We need to handle each case and ensure the return matches the bound type + try: + # Handle None type - when J is bound to None (the literal None type) + if self.justification_type is type(None): + # When J is None, we can only return None + return cast(J, justification if justification is None else None) + + # Handle str type - when J is bound to str + if self.justification_type is str: + # When J is str, we must return a str + if justification is None: + return cast(J, "") + return cast(J, str(justification)) + + # Handle BaseEvaluatorJustification subclasses - when J is bound to a specific subclass + if isinstance(self.justification_type, type) and issubclass( + self.justification_type, BaseEvaluatorJustification + ): + # When J is a BaseEvaluatorJustification subclass, we must return that type + if justification is None: + raise ValueError( + f"None is not allowed for justification type {self.justification_type}" + ) + + if isinstance(justification, self.justification_type): + return justification + elif isinstance(justification, dict): + return self.justification_type.model_validate(justification) + elif hasattr(justification, "__dict__"): + return self.justification_type.model_validate( + justification.__dict__ + ) + else: + return self.justification_type.model_validate(justification) + except Exception as e: + raise UiPathEvaluationError( + code="CANNOT_CONVERT_JUSTIFICATION", + title=f"Cannot convert {type(justification)} to {self.justification_type}", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + + # Fallback: this should never happen + raise UiPathEvaluationError( + code="UNSUPPORTED_JUSTIFICATION_TYPE", + title=f"Unsupported justification type {self.justification_type} for input {type(justification)}", + detail=f"Unsupported justification type {self.justification_type} for input {type(justification)}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) + + @classmethod + def get_evaluation_criteria_schema(cls) -> dict[str, Any]: + """Get the JSON schema for the evaluation criteria type. + + Returns: + The JSON schema for the evaluation criteria type + """ + criteria_type = cls._extract_evaluation_criteria_type() + return criteria_type.model_json_schema(by_alias=False) + + @classmethod + def get_config_schema(cls) -> dict[str, Any]: + """Get the JSON schema for the config type. + + Returns: + The JSON schema for the config type + """ + config_type = cls._extract_config_type() + return config_type.model_json_schema(by_alias=False) + + @classmethod + def get_justification_schema(cls) -> dict[str, Any]: + """Get the JSON schema for the justification type. 
+ + Returns: + The JSON schema for the justification type + """ + justification_type = cls._extract_justification_type() + if justification_type is type(None): + return {} + elif justification_type is str: + return {"type": "string"} + elif isinstance(justification_type, type) and issubclass( + justification_type, BaseEvaluatorJustification + ): + return justification_type.model_json_schema(by_alias=False) + else: + raise UiPathEvaluationError( + code="INVALID_JUSTIFICATION_TYPE", + title=f"Invalid justification type {justification_type} in {cls.__name__}", + detail="Must be str, None, or subclass of BaseEvaluatorJustification", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) + + def _canonical_json(self, obj: Any) -> str: + """Convert an object to canonical JSON string for consistent comparison. + + Args: + obj: The object to convert to canonical JSON + + Returns: + str: Canonical JSON string with normalized numbers and sorted keys + """ + return json.dumps( + obj, + sort_keys=True, + separators=(",", ":"), + ensure_ascii=False, + ) + + @classmethod + @abstractmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" pass + @classmethod + def generate_json_type(cls) -> dict[str, Any]: + """Generate the JSON schema for the evaluator.""" + return { + "evaluatorTypeId": cls.get_evaluator_id(), + "evaluatorConfigSchema": cls.get_config_schema(), + "evaluationCriteriaSchema": cls.get_evaluation_criteria_schema(), + "justificationSchema": cls.get_justification_schema(), + } + + async def validate_and_evaluate_criteria( + self, agent_execution: AgentExecution, evaluation_criteria: Any + ) -> EvaluationResult: + """Evaluate the given data and return a result from a raw evaluation criteria.""" + if evaluation_criteria is None: + evaluation_criteria = self.evaluator_config.default_evaluation_criteria + if evaluation_criteria is None: + raise UiPathEvaluationError( + code="NO_EVALUATION_CRITERIA_PROVIDED", + title="No evaluation criteria provided and no default evaluation criteria configured", + detail="No evaluation criteria provided and no default evaluation criteria configured", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) + criteria = self.validate_evaluation_criteria(evaluation_criteria) + return await self.evaluate(agent_execution, criteria) + @abstractmethod async def evaluate( self, agent_execution: AgentExecution, evaluation_criteria: T @@ -78,8 +579,9 @@ async def evaluate( Args: agent_execution: The execution details containing: - agent_input: The input received by the agent - - actual_output: The actual output from the agent - - spans: The execution spans to use for the evaluation + - agent_output: The actual output from the agent + - agent_trace: The execution trace from the agent + - simulation_instructions: The simulation instructions for the agent evaluation_criteria: The criteria to evaluate Returns: diff --git a/src/uipath/eval/evaluators/contains_evaluator.py b/src/uipath/eval/evaluators/contains_evaluator.py new file mode 100644 index 000000000..964c9a709 --- /dev/null +++ b/src/uipath/eval/evaluators/contains_evaluator.py @@ -0,0 +1,80 @@ +"""Contains evaluator for agent outputs.""" + +from ..models import ( + AgentExecution, + EvaluationResult, + EvaluatorType, + NumericEvaluationResult, +) +from .base_evaluator import BaseEvaluationCriteria +from .output_evaluator import ( + OutputEvaluator, + OutputEvaluatorConfig, +) + + +class ContainsEvaluationCriteria(BaseEvaluationCriteria): + """Evaluation criteria for the contains evaluator.""" + + 
search_text: str + + +class ContainsEvaluatorConfig(OutputEvaluatorConfig[ContainsEvaluationCriteria]): + """Configuration for the contains evaluator.""" + + name: str = "ContainsEvaluator" + case_sensitive: bool = False + negated: bool = False + + +class ContainsEvaluator( + OutputEvaluator[ContainsEvaluationCriteria, ContainsEvaluatorConfig, type(None)] # type: ignore +): + """Evaluator that checks if the actual output contains the expected output. + + This evaluator returns True if the actual output contains the expected output, + and False otherwise. It supports case sensitivity and negation options. + """ + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.CONTAINS.value + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: ContainsEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate whether actual output contains the expected output. + + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - agent_output: The actual output from the agent + - agent_trace: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + + Returns: + EvaluationResult: Boolean result indicating if output contains expected value (True/False) + """ + actual_output = str(self._get_actual_output(agent_execution)) + expected_output = str(self._get_expected_output(evaluation_criteria)) + + if not self.evaluator_config.case_sensitive: + actual_output = actual_output.lower() + expected_output = expected_output.lower() + + is_contains = expected_output in actual_output + + if self.evaluator_config.negated: + is_contains = not is_contains + return NumericEvaluationResult( + score=float(is_contains), + ) + + def _get_expected_output( + self, evaluation_criteria: ContainsEvaluationCriteria + ) -> str: + """Get the expected output from the evaluation criteria.""" + return evaluation_criteria.search_text diff --git a/src/uipath/eval/evaluators/exact_match_evaluator.py b/src/uipath/eval/evaluators/exact_match_evaluator.py index be58fcdc3..0ff8ebd2c 100644 --- a/src/uipath/eval/evaluators/exact_match_evaluator.py +++ b/src/uipath/eval/evaluators/exact_match_evaluator.py @@ -1,14 +1,29 @@ -"""Exact match evaluator for binary pass/fail evaluation of agent outputs.""" +"""Exact match evaluator for agent outputs.""" -from typing import Any +from ..models import ( + AgentExecution, + EvaluationResult, + EvaluatorType, + NumericEvaluationResult, +) +from .output_evaluator import ( + OutputEvaluationCriteria, + OutputEvaluator, + OutputEvaluatorConfig, +) -from uipath.eval.models import BooleanEvaluationResult, EvaluationResult -from ..models.models import AgentExecution -from .deterministic_evaluator_base import DeterministicEvaluatorBase +class ExactMatchEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]): + """Configuration for the exact match evaluator.""" + name: str = "ExactMatchEvaluator" + case_sensitive: bool = False + negated: bool = False -class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): + +class ExactMatchEvaluator( + OutputEvaluator[OutputEvaluationCriteria, ExactMatchEvaluatorConfig, type(None)] # type: ignore +): """Evaluator that performs exact structural matching between expected and actual outputs. 
This evaluator returns True if the actual output exactly matches the expected output @@ -16,22 +31,38 @@ class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): to floats for consistent comparison. """ + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.EXACT_MATCH.value + async def evaluate( - self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any] + self, + agent_execution: AgentExecution, + evaluation_criteria: OutputEvaluationCriteria, ) -> EvaluationResult: """Evaluate whether actual output exactly matches expected output. Args: agent_execution: The execution details containing: - agent_input: The input received by the agent - - actual_output: The actual output from the agent - - spans: The execution spans to use for the evaluation + - agent_output: The actual output from the agent + - agent_trace: The execution spans to use for the evaluation evaluation_criteria: The criteria to evaluate Returns: EvaluationResult: Boolean result indicating exact match (True/False) """ - return BooleanEvaluationResult( - score=self._canonical_json(agent_execution.agent_output) - == self._canonical_json(evaluation_criteria) + actual_output = str(self._get_actual_output(agent_execution)) + expected_output = str(self._get_expected_output(evaluation_criteria)) + if not self.evaluator_config.case_sensitive: + actual_output = actual_output.lower() + expected_output = expected_output.lower() + + is_exact_match = actual_output == expected_output + if self.evaluator_config.negated: + is_exact_match = not is_exact_match + + return NumericEvaluationResult( + score=float(is_exact_match), ) diff --git a/src/uipath/eval/evaluators/json_similarity_evaluator.py b/src/uipath/eval/evaluators/json_similarity_evaluator.py index 7c2a79175..1e90c171c 100644 --- a/src/uipath/eval/evaluators/json_similarity_evaluator.py +++ b/src/uipath/eval/evaluators/json_similarity_evaluator.py @@ -1,17 +1,30 @@ """JSON similarity evaluator for flexible structural comparison of outputs.""" import math -from typing import Any, Tuple, TypeVar +from typing import Any, Tuple -from uipath.eval.models import EvaluationResult, NumericEvaluationResult +from ..models import ( + AgentExecution, + EvaluationResult, + EvaluatorType, + NumericEvaluationResult, +) +from .output_evaluator import ( + OutputEvaluationCriteria, + OutputEvaluator, + OutputEvaluatorConfig, +) -from ..models.models import AgentExecution -from .deterministic_evaluator_base import DeterministicEvaluatorBase -T = TypeVar("T") +class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]): + """Configuration for the json similarity evaluator.""" + name: str = "JsonSimilarityEvaluator" -class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): + +class JsonSimilarityEvaluator( + OutputEvaluator[OutputEvaluationCriteria, JsonSimilarityEvaluatorConfig, str] +): """Deterministic evaluator that scores structural JSON similarity between expected and actual output. Compares expected versus actual JSON-like structures and returns a @@ -19,8 +32,15 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): and tolerant for numbers and strings (via Levenshtein distance). 
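+
+    Example (illustrative, assuming an instantiated JsonSimilarityEvaluator;
+    _compare_json is the internal scorer used by evaluate):
+
+        score, justification = evaluator._compare_json(
+            {"a": 1, "b": "x"},  # expected
+            {"a": 1, "b": "y"},  # actual
+        )
+        # score == 0.5, since one of the two leaves matches
+        # justification == "Matched leaves: 1.0, Total leaves: 2.0"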
""" + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.JSON_SIMILARITY.value + async def evaluate( - self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any] + self, + agent_execution: AgentExecution, + evaluation_criteria: OutputEvaluationCriteria, ) -> EvaluationResult: """Evaluate similarity between expected and actual JSON outputs. @@ -36,16 +56,25 @@ async def evaluate( Returns: EvaluationResult: Numerical score between 0-100 indicating similarity """ + score, justification = self._compare_json( + self._get_expected_output(evaluation_criteria), + self._get_actual_output(agent_execution), + ) + validated_justification = self.validate_justification(justification) return NumericEvaluationResult( - score=self._compare_json(evaluation_criteria, agent_execution.agent_output) + score=score, + details=validated_justification, ) - def _compare_json(self, expected: Any, actual: Any) -> float: + def _compare_json(self, expected: Any, actual: Any) -> tuple[float, str]: matched_leaves, total_leaves = self._compare_tokens(expected, actual) if total_leaves == 0: - return 100.0 - sim = (matched_leaves / total_leaves) * 100.0 - return max(0.0, min(100.0, sim)) + return 1.0, "Total leaves are 0" + sim = matched_leaves / total_leaves + return ( + max(0.0, min(1.0, sim)), + f"Matched leaves: {matched_leaves}, Total leaves: {total_leaves}", + ) def _compare_tokens( self, expected_token: Any, actual_token: Any diff --git a/src/uipath/eval/evaluators/legacy_base_evaluator.py b/src/uipath/eval/evaluators/legacy_base_evaluator.py new file mode 100644 index 000000000..26bb3f227 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_base_evaluator.py @@ -0,0 +1,89 @@ +"""Base evaluator abstract class for agent evaluation.""" + +import functools +import time +from abc import ABC, abstractmethod +from collections.abc import Callable +from typing import Any, Generic, TypeVar + +from pydantic import BaseModel, ConfigDict + +from uipath.eval.models import EvaluationResult +from uipath.eval.models.models import ( + AgentExecution, + ErrorEvaluationResult, + LegacyEvaluatorCategory, + LegacyEvaluatorType, +) + + +def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]: + """Decorator to track evaluation metrics and handle errors gracefully.""" + + @functools.wraps(func) + async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult: + start_time = time.time() + try: + result = await func(*args, **kwargs) + except Exception as e: + result = ErrorEvaluationResult( + details="Exception thrown by evaluator: {}".format(e), + evaluation_time=time.time() - start_time, + ) + end_time = time.time() + execution_time = end_time - start_time + + result.evaluation_time = execution_time + return result + + return wrapper + + +T = TypeVar("T") + + +class LegacyBaseEvaluator(BaseModel, Generic[T], ABC): + """Abstract base class for all evaluators.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + id: str + name: str + description: str + target_output_key: str = "*" + created_at: str + updated_at: str + category: LegacyEvaluatorCategory + evaluator_type: LegacyEvaluatorType + + def __init_subclass__(cls, **kwargs: Any): + """Hook for subclass creation - automatically applies evaluation metrics tracking.""" + super().__init_subclass__(**kwargs) + + if hasattr(cls, "evaluate") and not getattr( + cls.evaluate, "_has_metrics_decorator", False + ): + cls.evaluate = track_evaluation_metrics(cls.evaluate) # type: 
ignore[method-assign] + cls.evaluate._has_metrics_decorator = True # type: ignore[attr-defined] + + def model_post_init(self, __context: Any): + """Post-initialization hook for Pydantic models.""" + pass + + @abstractmethod + async def evaluate( + self, agent_execution: AgentExecution, evaluation_criteria: T + ) -> EvaluationResult: + """Evaluate the given data and return a result. + + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - actual_output: The actual output from the agent + - spans: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + + Returns: + EvaluationResult containing the score and details + """ + pass diff --git a/src/uipath/eval/evaluators/deterministic_evaluator_base.py b/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py similarity index 93% rename from src/uipath/eval/evaluators/deterministic_evaluator_base.py rename to src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py index 078bf2896..c2eee78ef 100644 --- a/src/uipath/eval/evaluators/deterministic_evaluator_base.py +++ b/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py @@ -4,12 +4,12 @@ from abc import ABC from typing import Any, TypeVar -from .base_evaluator import BaseEvaluator +from .legacy_base_evaluator import LegacyBaseEvaluator T = TypeVar("T") -class DeterministicEvaluatorBase(BaseEvaluator[T], ABC): +class DeterministicEvaluatorBase(LegacyBaseEvaluator[T], ABC): """Base class for evaluators that produce deterministic, reproducible results. This class provides utility methods for canonical JSON comparison and number normalization diff --git a/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py b/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py new file mode 100644 index 000000000..7c4729445 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py @@ -0,0 +1,37 @@ +"""Exact match evaluator for binary pass/fail evaluation of agent outputs.""" + +from typing import Any + +from uipath.eval.models import BooleanEvaluationResult, EvaluationResult + +from ..models.models import AgentExecution +from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase + + +class LegacyExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): + """Evaluator that performs exact structural matching between expected and actual outputs. + + This evaluator returns True if the actual output exactly matches the expected output + after canonical JSON normalization, and False otherwise. Numbers are normalized + to floats for consistent comparison. + """ + + async def evaluate( + self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any] + ) -> EvaluationResult: + """Evaluate whether actual output exactly matches expected output. 
+ + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - actual_output: The actual output from the agent + - spans: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + + Returns: + EvaluationResult: Boolean result indicating exact match (True/False) + """ + return BooleanEvaluationResult( + score=self._canonical_json(agent_execution.agent_output) + == self._canonical_json(evaluation_criteria) + ) diff --git a/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py b/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py new file mode 100644 index 000000000..30d3df868 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py @@ -0,0 +1,151 @@ +"""JSON similarity evaluator for flexible structural comparison of outputs.""" + +import math +from typing import Any, Tuple, TypeVar + +from uipath.eval.models import EvaluationResult, NumericEvaluationResult + +from ..models.models import AgentExecution +from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase + +T = TypeVar("T") + + +class LegacyJsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]): + """Legacy deterministic evaluator that scores structural JSON similarity between expected and actual output. + + Compares expected versus actual JSON-like structures and returns a + numerical score in the range [0, 100]. The comparison is token-based + and tolerant for numbers and strings (via Levenshtein distance). + """ + + async def evaluate( + self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any] + ) -> EvaluationResult: + """Evaluate similarity between expected and actual JSON outputs. + + Uses token-based comparison with tolerance for numeric differences + and Levenshtein distance for string similarity. 
+ + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - actual_output: The actual output from the agent + - spans: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + + Returns: + EvaluationResult: Numerical score between 0-100 indicating similarity + """ + return NumericEvaluationResult( + score=self._compare_json(evaluation_criteria, agent_execution.agent_output) + ) + + def _compare_json(self, expected: Any, actual: Any) -> float: + matched_leaves, total_leaves = self._compare_tokens(expected, actual) + if total_leaves == 0: + return 100.0 + sim = (matched_leaves / total_leaves) * 100.0 + return max(0.0, min(100.0, sim)) + + def _compare_tokens( + self, expected_token: Any, actual_token: Any + ) -> Tuple[float, float]: + if self._is_number(expected_token) and self._is_number(actual_token): + return self._compare_numbers(float(expected_token), float(actual_token)) + + if type(expected_token) is not type(actual_token): + return 0.0, self._count_leaves(expected_token) + + if isinstance(expected_token, dict): + matched_leaves = total_leaves = 0.0 + # Only expected keys count + for expected_key, expected_value in expected_token.items(): + if isinstance(actual_token, dict) and expected_key in actual_token: + matched, total = self._compare_tokens( + expected_value, actual_token[expected_key] + ) + else: + matched, total = (0.0, self._count_leaves(expected_value)) + matched_leaves += matched + total_leaves += total + return matched_leaves, total_leaves + + if isinstance(expected_token, list): + matched_leaves = total_leaves = 0.0 + common_length = min(len(expected_token), len(actual_token)) + for index in range(common_length): + matched, total = self._compare_tokens( + expected_token[index], actual_token[index] + ) + matched_leaves += matched + total_leaves += total + for index in range(common_length, len(expected_token)): + total_leaves += self._count_leaves(expected_token[index]) + return (matched_leaves, total_leaves) + + if isinstance(expected_token, bool): + return (1.0, 1.0) if expected_token == actual_token else (0.0, 1.0) + + if isinstance(expected_token, str): + return self._compare_strings(expected_token, actual_token) + + return (1.0, 1.0) if str(expected_token) == str(actual_token) else (0.0, 1.0) + + def _compare_numbers( + self, expected_number: float, actual_number: float + ) -> Tuple[float, float]: + total = 1.0 + if math.isclose(expected_number, 0.0, abs_tol=1e-12): + matched = 1.0 if math.isclose(actual_number, 0.0, abs_tol=1e-12) else 0.0 + else: + ratio = abs(expected_number - actual_number) / abs(expected_number) + matched = max(0.0, min(1.0, 1.0 - ratio)) + return matched, total + + def _compare_strings( + self, expected_string: str, actual_string: str + ) -> Tuple[float, float]: + total = 1.0 + if not expected_string and not actual_string: + return 1.0, total + distance = self._levenshtein(expected_string, actual_string) + max_length = max(len(expected_string), len(actual_string)) + similarity = 1.0 - (distance / max_length) if max_length else 1.0 + similarity = max(0.0, min(1.0, similarity)) + return similarity, total + + def _count_leaves(self, token_node: Any) -> float: + if isinstance(token_node, dict): + return sum( + self._count_leaves(child_value) for child_value in token_node.values() + ) + if isinstance(token_node, list): + return sum(self._count_leaves(child_value) for child_value in token_node) + return 1.0 + + def _levenshtein(self, source_text: str, target_text: 
str) -> int: + if not source_text: + return len(target_text) + if not target_text: + return len(source_text) + source_len, target_len = len(source_text), len(target_text) + distance_matrix = [[0] * (target_len + 1) for _ in range(source_len + 1)] + for row_idx in range(source_len + 1): + distance_matrix[row_idx][0] = row_idx + for col_idx in range(target_len + 1): + distance_matrix[0][col_idx] = col_idx + for row_idx in range(1, source_len + 1): + for col_idx in range(1, target_len + 1): + substitution_cost = ( + 0 if source_text[row_idx - 1] == target_text[col_idx - 1] else 1 + ) + distance_matrix[row_idx][col_idx] = min( + distance_matrix[row_idx - 1][col_idx] + 1, # deletion + distance_matrix[row_idx][col_idx - 1] + 1, # insertion + distance_matrix[row_idx - 1][col_idx - 1] + + substitution_cost, # substitution + ) + return distance_matrix[source_len][target_len] + + def _is_number(self, value: Any) -> bool: + return isinstance(value, (int, float)) and not isinstance(value, bool) diff --git a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py new file mode 100644 index 000000000..c55296583 --- /dev/null +++ b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py @@ -0,0 +1,137 @@ +"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs.""" + +import json +from typing import Any, Optional + +from pydantic import field_validator + +from uipath.eval.models import NumericEvaluationResult + +from ..._services import UiPathLlmChatService +from ..._utils.constants import COMMUNITY_agents_SUFFIX +from ..models.models import AgentExecution, EvaluationResult, LLMResponse +from .legacy_base_evaluator import LegacyBaseEvaluator + + +class LegacyLlmAsAJudgeEvaluator(LegacyBaseEvaluator[dict[str, Any]]): + """Legacy evaluator that uses an LLM to judge the quality of agent output.""" + + prompt: str + model: str + actual_output_placeholder: str = "{{ActualOutput}}" + expected_output_placeholder: str = "{{ExpectedOutput}}" + llm: Optional[UiPathLlmChatService] = None + + @field_validator("prompt") + @classmethod + def validate_prompt_placeholders(cls, v: str) -> str: + """Validate that prompt contains required placeholders.""" + if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v: + raise ValueError( + "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders" + ) + return v + + def model_post_init(self, __context: Any): + """Initialize the LLM service after model creation.""" + super().model_post_init(__context) + self._initialize_llm() + + def _initialize_llm(self): + """Initialize the LLM used for evaluation.""" + from uipath import UiPath + + uipath = UiPath() + self.llm = uipath.llm + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: dict[str, Any], + ) -> EvaluationResult: + """Evaluate using an LLM as a judge. + + Sends the formatted prompt to the configured LLM and expects a JSON response + with a numerical score (0-100) and justification. 
+ + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - actual_output: The actual output from the agent + - spans: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + + Returns: + EvaluationResult: Numerical score with LLM justification as details + """ + # Create the evaluation prompt + evaluation_prompt = self._create_evaluation_prompt( + expected_output=evaluation_criteria, + actual_output=agent_execution.agent_output, + ) + + llm_response = await self._get_llm_response(evaluation_prompt) + + return NumericEvaluationResult( + score=llm_response.score, + details=llm_response.justification, + ) + + def _create_evaluation_prompt( + self, expected_output: Any, actual_output: Any + ) -> str: + """Create the evaluation prompt for the LLM.""" + formatted_prompt = self.prompt.replace( + self.actual_output_placeholder, + str(actual_output), + ) + formatted_prompt = formatted_prompt.replace( + self.expected_output_placeholder, + str(expected_output), + ) + + return formatted_prompt + + async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: + """Get response from the LLM. + + Args: + evaluation_prompt: The formatted prompt to send to the LLM + + Returns: + LLMResponse with score and justification + """ + # remove community-agents suffix from llm model name + model = self.model + if model.endswith(COMMUNITY_agents_SUFFIX): + model = model.replace(COMMUNITY_agents_SUFFIX, "") + + # Prepare the request + request_data = { + "model": model, + "messages": [{"role": "user", "content": evaluation_prompt}], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "evaluation_response", + "schema": { + "type": "object", + "properties": { + "score": { + "type": "number", + "minimum": 0, + "maximum": 100, + "description": "Score between 0 and 100", + }, + "justification": { + "type": "string", + "description": "Explanation for the score", + }, + }, + "required": ["score", "justification"], + }, + }, + }, + } + + response = await self.llm.chat_completions(**request_data) # type: ignore + return LLMResponse(**json.loads(response.choices[-1].message.content or "{}")) diff --git a/src/uipath/eval/evaluators/trajectory_evaluator.py b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py similarity index 95% rename from src/uipath/eval/evaluators/trajectory_evaluator.py rename to src/uipath/eval/evaluators/legacy_trajectory_evaluator.py index 0f2f786f5..8e2a68219 100644 --- a/src/uipath/eval/evaluators/trajectory_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py @@ -16,11 +16,11 @@ NumericEvaluationResult, TrajectoryEvaluationTrace, ) -from .base_evaluator import BaseEvaluator +from .legacy_base_evaluator import LegacyBaseEvaluator -class TrajectoryEvaluator(BaseEvaluator[dict[str, Any]]): - """Evaluator that analyzes the trajectory/path taken to reach outputs.""" +class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[dict[str, Any]]): + """Legacy evaluator that analyzes the trajectory/path taken to reach outputs.""" prompt: str model: str @@ -38,7 +38,7 @@ def validate_prompt_placeholder(cls, v: str) -> str: ) return v - def model_post_init(self, __context): + def model_post_init(self, __context: Any): """Initialize the LLM service after model creation.""" super().model_post_init(__context) self._initialize_llm() @@ -76,7 +76,6 @@ async def evaluate( expected_agent_behavior=agent_execution.expected_agent_behavior, 
agent_run_history=agent_execution.agent_trace, ) - llm_response = await self._get_llm_response(evaluation_prompt) return NumericEvaluationResult( @@ -160,4 +159,4 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: } response = await self.llm.chat_completions(**request_data) - return LLMResponse(**json.loads(response.choices[-1].message.content)) + return LLMResponse(**json.loads(response.choices[-1].message.content or "{}")) diff --git a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py index ee5d55cdf..71a543ab1 100644 --- a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py @@ -1,137 +1,202 @@ """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs.""" import json -from typing import Any, Optional +from abc import abstractmethod +from collections.abc import Callable +from typing import Any, TypeVar + +from pydantic import BaseModel, Field, model_validator + +from .._helpers.evaluators_helpers import COMMUNITY_agents_SUFFIX +from ..models import ( + AgentExecution, + EvaluationResult, + LLMResponse, + NumericEvaluationResult, +) +from ..models.llm_judge_types import ( + LLMJudgeOutputSchema, + LLMJudgePromptTemplates, +) +from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory +from .base_evaluator import ( + BaseEvaluationCriteria, + BaseEvaluator, + BaseEvaluatorConfig, +) + +T = TypeVar("T", bound=BaseEvaluationCriteria) + + +class BaseLLMJudgeEvaluatorConfig(BaseEvaluatorConfig[T]): + """Base config for all LLM evaluators. + + Generic over T (evaluation criteria type) to ensure type safety between + the config's default_evaluation_criteria and the evaluator's expected criteria type. 
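+
+    Example (illustrative sketch; the name and model values are hypothetical, and
+    the prompt must keep both placeholders required by LLMJudgeMixin):
+
+        class MyJudgeConfig(BaseLLMJudgeEvaluatorConfig[BaseEvaluationCriteria]):
+            name: str = "MyJudge"
+            prompt: str = (
+                "Score how well {{ActualOutput}} satisfies {{ExpectedOutput}} "
+                "on a 0-100 scale."
+            )
+            model: str = "gpt-4o"  # hypothetical model id
+            temperature: float = 0.0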
+ """ -from pydantic import field_validator + prompt: str + model: str = "" + temperature: float = 0.0 + max_tokens: int | None = None -from uipath.eval.models import NumericEvaluationResult -from ..._services import UiPathLlmChatService -from ..._utils.constants import COMMUNITY_agents_SUFFIX -from ..models.models import AgentExecution, EvaluationResult, LLMResponse -from .base_evaluator import BaseEvaluator +C = TypeVar("C", bound=BaseLLMJudgeEvaluatorConfig[Any]) -class LlmAsAJudgeEvaluator(BaseEvaluator[dict[str, Any]]): - """Evaluator that uses an LLM to judge the quality of agent output.""" +class LLMJudgeMixin(BaseEvaluator[T, C, str]): + """Mixin that provides common LLM judge functionality.""" - prompt: str - model: str + system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_SYSTEM_PROMPT + output_schema: type[BaseModel] = LLMJudgeOutputSchema actual_output_placeholder: str = "{{ActualOutput}}" expected_output_placeholder: str = "{{ExpectedOutput}}" - llm: Optional[UiPathLlmChatService] = None + llm_service: Callable[..., Any] | None = Field( + default=None, exclude=True, description="The LLM service for evaluation" + ) - @field_validator("prompt") - @classmethod - def validate_prompt_placeholders(cls, v: str) -> str: + @model_validator(mode="after") + def validate_prompt_placeholders(self) -> "LLMJudgeMixin[T, C]": """Validate that prompt contains required placeholders.""" - if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v: - raise ValueError( - "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders" + if ( + self.actual_output_placeholder not in self.evaluator_config.prompt + or self.expected_output_placeholder not in self.evaluator_config.prompt + ): + raise UiPathEvaluationError( + code="INVALID_PROMPT_PLACEHOLDERS", + title="Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders", + detail="Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders", + category=UiPathEvaluationErrorCategory.USER, ) - return v + return self - def model_post_init(self, __context): - """Initialize the LLM service after model creation.""" + def model_post_init(self, __context: Any) -> None: + """Initialize the LLM service if not provided.""" super().model_post_init(__context) - self._initialize_llm() + if self.llm_service is None: + self.llm_service = self._get_llm_service() - def _initialize_llm(self): - """Initialize the LLM used for evaluation.""" + def _get_llm_service(self): + """Get the LLM service from the UiPath instance.""" from uipath import UiPath - uipath = UiPath() - self.llm = uipath.llm + try: + uipath = UiPath() + return uipath.llm.chat_completions + except Exception as e: + raise UiPathEvaluationError( + code="FAILED_TO_GET_LLM_SERVICE", + title="Failed to get LLM service from the SDK and no otherLLM service provided", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + + @abstractmethod + def _get_actual_output(self, agent_execution: AgentExecution) -> Any: + """Get the actual output from the agent execution. Must be implemented by concrete evaluator classes.""" + pass + + @abstractmethod + def _get_expected_output(self, evaluation_criteria: T) -> Any: + """Get the expected output from the evaluation criteria. Must be implemented by concrete evaluator classes.""" + pass async def evaluate( self, agent_execution: AgentExecution, - evaluation_criteria: dict[str, Any], + evaluation_criteria: T, ) -> EvaluationResult: - """Evaluate using an LLM as a judge. 
- - Sends the formatted prompt to the configured LLM and expects a JSON response - with a numerical score (0-100) and justification. - - agent_execution: The execution details containing: - - agent_input: The input received by the agent - - actual_output: The actual output from the agent - - spans: The execution spans to use for the evaluation - evaluation_criteria: The criteria to evaluate - - Returns: - EvaluationResult: Numerical score with LLM justification as details - """ - # Create the evaluation prompt + """Evaluate using an LLM as a judge.""" evaluation_prompt = self._create_evaluation_prompt( - expected_output=evaluation_criteria, - actual_output=agent_execution.agent_output, + agent_execution=agent_execution, + evaluation_criteria=evaluation_criteria, ) llm_response = await self._get_llm_response(evaluation_prompt) + validated_justification = self.validate_justification( + llm_response.justification + ) return NumericEvaluationResult( - score=llm_response.score, - details=llm_response.justification, + score=max(0.0, min(1.0, round(llm_response.score / 100.0, 2))), + details=validated_justification, ) def _create_evaluation_prompt( - self, expected_output: Any, actual_output: Any + self, + agent_execution: AgentExecution, + evaluation_criteria: T, ) -> str: """Create the evaluation prompt for the LLM.""" - formatted_prompt = self.prompt.replace( + formatted_prompt = self.evaluator_config.prompt.replace( self.actual_output_placeholder, - str(actual_output), + str(self._get_actual_output(agent_execution)), ) formatted_prompt = formatted_prompt.replace( self.expected_output_placeholder, - str(expected_output), + str(self._get_expected_output(evaluation_criteria)), ) return formatted_prompt async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: - """Get response from the LLM. 
- - Args: - evaluation_prompt: The formatted prompt to send to the LLM - - Returns: - LLMResponse with score and justification - """ + """Get response from the LLM.""" # remove community-agents suffix from llm model name - model = self.model + model = self.evaluator_config.model if model.endswith(COMMUNITY_agents_SUFFIX): model = model.replace(COMMUNITY_agents_SUFFIX, "") # Prepare the request request_data = { "model": model, - "messages": [{"role": "user", "content": evaluation_prompt}], + "messages": [ + {"role": "system", "content": self.system_prompt}, + {"role": "user", "content": evaluation_prompt}, + ], "response_format": { "type": "json_schema", "json_schema": { "name": "evaluation_response", - "schema": { - "type": "object", - "properties": { - "score": { - "type": "number", - "minimum": 0, - "maximum": 100, - "description": "Score between 0 and 100", - }, - "justification": { - "type": "string", - "description": "Explanation for the score", - }, - }, - "required": ["score", "justification"], - }, + "schema": self.output_schema.model_json_schema(), }, }, + "max_tokens": self.evaluator_config.max_tokens, + "temperature": self.evaluator_config.temperature, } - response = await self.llm.chat_completions(**request_data) # type: ignore - return LLMResponse(**json.loads(response.choices[-1].message.content)) + if self.llm_service is None: + raise UiPathEvaluationError( + code="LLM_SERVICE_NOT_INITIALIZED", + title="LLM service not initialized", + detail="LLM service not initialized", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) + + try: + response = await self.llm_service(**request_data) + except Exception as e: + raise UiPathEvaluationError( + code="FAILED_TO_GET_LLM_RESPONSE", + title="Failed to get LLM response", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + + try: + content = response.choices[-1].message.content + if content is None: + raise UiPathEvaluationError( + code="EMPTY_LLM_RESPONSE", + title="Empty LLM response", + detail="The LLM response message content was None.", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) + parsed_response = json.loads(str(content)) + except Exception as e: + raise UiPathEvaluationError( + code="FAILED_TO_PARSE_LLM_RESPONSE", + title="Failed to parse LLM response", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + return LLMResponse(**parsed_response) diff --git a/src/uipath/eval/evaluators/llm_judge_output_evaluator.py b/src/uipath/eval/evaluators/llm_judge_output_evaluator.py new file mode 100644 index 000000000..1e8c6919c --- /dev/null +++ b/src/uipath/eval/evaluators/llm_judge_output_evaluator.py @@ -0,0 +1,112 @@ +"""LLM judge output evaluators for evaluating agent outputs.""" + +from typing import TypeVar + +from pydantic import BaseModel + +from uipath.eval.models import EvaluatorType + +from ..models import AgentExecution, EvaluationResult +from ..models.llm_judge_types import ( + LLMJudgeOutputSchema, + LLMJudgePromptTemplates, + LLMJudgeStrictJSONSimilarityOutputSchema, +) +from .llm_as_judge_evaluator import ( + BaseLLMJudgeEvaluatorConfig, + LLMJudgeMixin, +) +from .output_evaluator import ( + OutputEvaluationCriteria, + OutputEvaluator, + OutputEvaluatorConfig, +) + + +class BaseLLMJudgeOutputCriteriaEvaluatorConfig( + OutputEvaluatorConfig[OutputEvaluationCriteria], + BaseLLMJudgeEvaluatorConfig[OutputEvaluationCriteria], +): + """Base configuration for LLM judge output criteria evaluators.""" + + pass + + +class 
LLMJudgeOutputEvaluatorConfig(BaseLLMJudgeOutputCriteriaEvaluatorConfig): + """Configuration for the LLM judge output evaluator.""" + + name: str = "LLMJudgeOutputEvaluator" + prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_DEFAULT_USER_PROMPT + + +class LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig(LLMJudgeOutputEvaluatorConfig): + """Configuration for the LLM judge strict JSON similarity output evaluator.""" + + name: str = "LLMJudgeStrictJSONSimilarityOutputEvaluator" + prompt: str = ( + LLMJudgePromptTemplates.LLM_JUDGE_STRICT_JSON_SIMILARITY_DEFAULT_USER_PROMPT + ) + + +OC = TypeVar("OC", bound=LLMJudgeOutputEvaluatorConfig) + + +class BaseLLMOutputEvaluator( + OutputEvaluator[OutputEvaluationCriteria, OC, str], + LLMJudgeMixin[OutputEvaluationCriteria, OC], +): + """Base class for LLM judge output evaluators that contains all shared functionality. + + This class encapsulates the common evaluation logic for output-based LLM evaluators, + combining OutputEvaluator (for output extraction) with LLMJudgeMixin (for LLM functionality). + """ + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.LLM_JUDGE_OUTPUT.value + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: OutputEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate using an LLM as a judge.""" + # Explicitly delegate to LLMJudgeMixin's evaluate method to override BaseEvaluator + return await LLMJudgeMixin.evaluate(self, agent_execution, evaluation_criteria) + + +class LLMJudgeOutputEvaluator(BaseLLMOutputEvaluator[LLMJudgeOutputEvaluatorConfig]): + """Evaluator that uses an LLM to judge the quality of agent output. + + Inherits all functionality from BaseLLMOutputEvaluator but uses the standard + system prompt and output schema for general output evaluation. + """ + + system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_SYSTEM_PROMPT + output_schema: type[BaseModel] = LLMJudgeOutputSchema + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY.value + + +class LLMJudgeStrictJSONSimilarityOutputEvaluator( + BaseLLMOutputEvaluator[LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig] +): + """Evaluator that uses an LLM to judge the quality of agent output with strict JSON similarity. + + Inherits all functionality from BaseLLMOutputEvaluator but uses a different system prompt + and output schema specific to strict JSON similarity evaluation. 
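+
+    Example (illustrative sketch; the id and model values are hypothetical, and
+    instantiation assumes a configured UiPath environment for the LLM service):
+
+        evaluator = LLMJudgeStrictJSONSimilarityOutputEvaluator(
+            id="strict-json-judge",
+            config={
+                "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator",
+                "model": "gpt-4o",  # hypothetical model id
+            },
+        )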
+ """ + + system_prompt: str = ( + LLMJudgePromptTemplates.LLM_JUDGE_STRICT_JSON_SIMILARITY_SYSTEM_PROMPT + ) + output_schema: type[BaseModel] = LLMJudgeStrictJSONSimilarityOutputSchema + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY.value diff --git a/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py b/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py new file mode 100644 index 000000000..eac5c11b1 --- /dev/null +++ b/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py @@ -0,0 +1,142 @@ +"""LLM judge trajectory evaluator for evaluating agent execution trajectories.""" + +from typing import Any, TypeVar + +from pydantic import BaseModel + +from .._helpers.evaluators_helpers import trace_to_str +from ..models import ( + AgentExecution, + EvaluationResult, + EvaluatorType, +) +from ..models.llm_judge_types import ( + LLMJudgePromptTemplates, + LLMJudgeTrajectoryOutputSchema, +) +from .base_evaluator import BaseEvaluationCriteria +from .llm_as_judge_evaluator import ( + BaseLLMJudgeEvaluatorConfig, + LLMJudgeMixin, +) + + +class TrajectoryEvaluationCriteria(BaseEvaluationCriteria): + """Evaluation criteria for trajectory-based evaluations.""" + + expected_agent_behavior: str + + +class LLMJudgeTrajectoryEvaluatorConfig( + BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria] +): + """Configuration for the llm judge trajectory evaluator.""" + + name: str = "LLMJudgeTrajectoryEvaluator" + prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT + + +class LLMJudgeTrajectorySimulationEvaluatorConfig( + BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria] +): + """Configuration for the llm judge simulation trajectory evaluator.""" + + name: str = "LLMJudgeTrajectorySimulationEvaluator" + prompt: str = ( + LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_DEFAULT_USER_PROMPT + ) + + +TC = TypeVar("TC", bound=BaseLLMJudgeEvaluatorConfig[TrajectoryEvaluationCriteria]) + + +class BaseLLMTrajectoryEvaluator(LLMJudgeMixin[TrajectoryEvaluationCriteria, TC]): + """Base class for LLM trajectory evaluators that contains all shared functionality. + + This class encapsulates the common evaluation logic for trajectory-based LLM evaluators, + including output extraction, prompt formatting, and evaluation criteria handling. 
+ """ + + output_schema: type[BaseModel] = LLMJudgeTrajectoryOutputSchema + actual_output_placeholder: str = "{{AgentRunHistory}}" + expected_output_placeholder: str = "{{ExpectedAgentBehavior}}" + user_input_placeholder: str = "{{UserOrSyntheticInput}}" + simulation_instructions_placeholder: str = "{{SimulationInstructions}}" + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.LLM_JUDGE_TRAJECTORY.value + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: TrajectoryEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate using trajectory analysis.""" + return await super().evaluate(agent_execution, evaluation_criteria) + + def _get_actual_output(self, agent_execution: AgentExecution) -> Any: + """Get the actual output from the agent execution.""" + return trace_to_str(agent_execution.agent_trace) + + def _get_expected_output( + self, evaluation_criteria: TrajectoryEvaluationCriteria + ) -> Any: + """Get the expected agent behavior from the evaluation criteria.""" + return evaluation_criteria.expected_agent_behavior + + def _create_evaluation_prompt( + self, + agent_execution: AgentExecution, + evaluation_criteria: TrajectoryEvaluationCriteria, + ) -> str: + """Create the evaluation prompt for the LLM.""" + formatted_prompt = super()._create_evaluation_prompt( + agent_execution, evaluation_criteria + ) + formatted_prompt = formatted_prompt.replace( + self.user_input_placeholder, + str(agent_execution.agent_input), + ) + formatted_prompt = formatted_prompt.replace( + self.simulation_instructions_placeholder, + agent_execution.simulation_instructions, + ) + return formatted_prompt + + +class LLMJudgeTrajectoryEvaluator( + BaseLLMTrajectoryEvaluator[LLMJudgeTrajectoryEvaluatorConfig] +): + """Evaluator that uses an LLM to judge the quality of agent trajectory. + + Inherits all functionality from BaseLLMTrajectoryEvaluator but uses the standard + system prompt and configuration for general trajectory evaluation. + """ + + system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_TRAJECTORY_SYSTEM_PROMPT + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY.value + + +class LLMJudgeTrajectorySimulationEvaluator( + BaseLLMTrajectoryEvaluator[LLMJudgeTrajectorySimulationEvaluatorConfig] +): + """Evaluator that uses an LLM to judge the quality of agent trajectory for simulations. + + Inherits all functionality from BaseLLMTrajectoryEvaluator but uses a different system prompt + and configuration specific to simulation evaluation. 
+ """ + + system_prompt: str = ( + LLMJudgePromptTemplates.LLM_JUDGE_SIMULATION_TRAJECTORY_SYSTEM_PROMPT + ) + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION.value diff --git a/src/uipath/eval/evaluators/output_evaluator.py b/src/uipath/eval/evaluators/output_evaluator.py new file mode 100644 index 000000000..2aa362e18 --- /dev/null +++ b/src/uipath/eval/evaluators/output_evaluator.py @@ -0,0 +1,117 @@ +"""Base class for all output evaluator configurations.""" + +import json +from typing import Any, TypeVar, Union + +from pydantic import Field + +from ..models import AgentExecution +from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory +from .base_evaluator import ( + BaseEvaluationCriteria, + BaseEvaluator, + BaseEvaluatorConfig, + BaseEvaluatorJustification, +) + + +class OutputEvaluationCriteria(BaseEvaluationCriteria): + """Base class for all output evaluation criteria.""" + + expected_output: dict[str, Any] | str + + +T = TypeVar("T", bound=BaseEvaluationCriteria) +T_OutputCriteria = TypeVar("T_OutputCriteria", bound=OutputEvaluationCriteria) + + +class OutputEvaluatorConfig(BaseEvaluatorConfig[T]): + """Base class for all output evaluator configurations. + + Generic over T to allow subclasses to define their own + specific output evaluation criteria types while maintaining type safety. + """ + + target_output_key: str = Field( + default="*", description="Key to extract output from agent execution" + ) + + +C = TypeVar("C", bound=OutputEvaluatorConfig[Any]) +J = TypeVar("J", bound=Union[str, None, BaseEvaluatorJustification]) + + +class BaseOutputEvaluator(BaseEvaluator[T, C, J]): + """Abstract base class for all output evaluators. 
+ + Generic Parameters: + T_OutputCriteria: The output evaluation criteria type + C: The output evaluator config type (bound to OutputEvaluatorConfig[T_OutputCriteria]) + J: The justification type + """ + + def _get_actual_output(self, agent_execution: AgentExecution) -> Any: + """Get the actual output from the agent execution.""" + if self.evaluator_config.target_output_key != "*": + try: + return agent_execution.agent_output[ + self.evaluator_config.target_output_key + ] + except KeyError as e: + raise UiPathEvaluationError( + code="TARGET_OUTPUT_KEY_NOT_FOUND", + title="Target output key not found in actual output", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.USER, + ) from e + return agent_execution.agent_output + + def _get_full_expected_output(self, evaluation_criteria: T) -> Any: + """Get the full expected output from the evaluation criteria.""" + raise UiPathEvaluationError( + code="NOT_IMPLEMENTED", + title="This method was not implemented by the subclass.", + detail="This method was not implemented by the subclass.", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) + + def _get_expected_output(self, evaluation_criteria: T) -> Any: + """Load the expected output from the evaluation criteria.""" + expected_output = self._get_full_expected_output(evaluation_criteria) + if self.evaluator_config.target_output_key != "*": + if isinstance(expected_output, str): + try: + expected_output = json.loads(expected_output) + except json.JSONDecodeError as e: + raise UiPathEvaluationError( + code="INVALID_EXPECTED_OUTPUT", + title="When target output key is not '*', expected output must be a dictionary or a valid JSON string", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.USER, + ) from e + try: + expected_output = expected_output[ + self.evaluator_config.target_output_key + ] + except KeyError as e: + raise UiPathEvaluationError( + code="TARGET_OUTPUT_KEY_NOT_FOUND", + title="Target output key not found in expected output", + detail=f"Error: {e}", + category=UiPathEvaluationErrorCategory.USER, + ) from e + return expected_output + + +class OutputEvaluator(BaseOutputEvaluator[T_OutputCriteria, C, J]): + """Abstract base class for all output evaluators. 
+ + Generic Parameters: + T_OutputCriteria: The output evaluation criteria type + C: The output evaluator config type (bound to OutputEvaluatorConfig[T_OutputCriteria]) + J: The justification type + """ + + def _get_full_expected_output(self, evaluation_criteria: T_OutputCriteria) -> Any: + """Get the full expected output from the evaluation criteria.""" + return evaluation_criteria.expected_output diff --git a/src/uipath/eval/evaluators/tool_call_args_evaluator.py b/src/uipath/eval/evaluators/tool_call_args_evaluator.py new file mode 100644 index 000000000..2703e3c76 --- /dev/null +++ b/src/uipath/eval/evaluators/tool_call_args_evaluator.py @@ -0,0 +1,82 @@ +"""Tool call args evaluator for validating the arguments of tool calls.""" + +from .._helpers.evaluators_helpers import ( + extract_tool_calls, + tool_calls_args_score, +) +from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult, ToolCall +from ..models.models import EvaluatorType +from .base_evaluator import ( + BaseEvaluationCriteria, + BaseEvaluator, + BaseEvaluatorConfig, + BaseEvaluatorJustification, +) + + +class ToolCallArgsEvaluationCriteria(BaseEvaluationCriteria): + """Evaluation criteria for the tool call args evaluator.""" + + # TODO: name field of ToolCall needs to be validated such that it contains only the tools available + tool_calls: list[ToolCall] + + +class ToolCallArgsEvaluatorConfig(BaseEvaluatorConfig[ToolCallArgsEvaluationCriteria]): + """Configuration for the tool call args evaluator.""" + + name: str = "ToolCallArgsEvaluator" + strict: bool = False + subset: bool = False + + +class ToolCallArgsEvaluatorJustification(BaseEvaluatorJustification): + """Justification for the tool call args evaluator.""" + + explained_tool_calls_args: dict[str, str] + + +class ToolCallArgsEvaluator( + BaseEvaluator[ + ToolCallArgsEvaluationCriteria, + ToolCallArgsEvaluatorConfig, + ToolCallArgsEvaluatorJustification, + ] +): + """Evaluator that checks whether the tool calls were made with the expected arguments. + + This evaluator returns a numeric score reflecting how closely the actual tool call arguments match the expected ones. + """ + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.TOOL_CALL_ARGS.value + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: ToolCallArgsEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate whether the tool calls were made with the expected arguments.
+ + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - agent_output: The final output of the agent + - agent_trace: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + Returns: + EvaluationResult: Numeric score reflecting how closely the actual tool call arguments match the expected ones + """ + tool_calls_order = extract_tool_calls(agent_execution.agent_trace) + score, justification = tool_calls_args_score( + tool_calls_order, + evaluation_criteria.tool_calls, + self.evaluator_config.strict, + self.evaluator_config.subset, + ) + validated_justification = self.validate_justification(justification) + return NumericEvaluationResult( + score=score, + details=validated_justification, + ) diff --git a/src/uipath/eval/evaluators/tool_call_count_evaluator.py b/src/uipath/eval/evaluators/tool_call_count_evaluator.py new file mode 100644 index 000000000..11d684ae1 --- /dev/null +++ b/src/uipath/eval/evaluators/tool_call_count_evaluator.py @@ -0,0 +1,87 @@ +"""Tool call count evaluator for validating expected tool usage patterns.""" + +from collections import Counter + +from .._helpers.evaluators_helpers import ( + extract_tool_calls_names, + tool_calls_count_score, +) +from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult +from ..models.models import EvaluatorType +from .base_evaluator import ( + BaseEvaluationCriteria, + BaseEvaluator, + BaseEvaluatorConfig, + BaseEvaluatorJustification, +) + + +class ToolCallCountEvaluationCriteria(BaseEvaluationCriteria): + """Evaluation criteria for the tool call count evaluator.""" + + # TODO: str field needs to be validated against some criteria that allows ">x", "=x", "<=x", "x" + tool_calls_count: dict[str, tuple[str, int]] + + +class ToolCallCountEvaluatorConfig( + BaseEvaluatorConfig[ToolCallCountEvaluationCriteria] +): + """Configuration for the tool call count evaluator.""" + + name: str = "ToolCallCountEvaluator" + strict: bool = False + + +class ToolCallCountEvaluatorJustification(BaseEvaluatorJustification): + """Justification for the tool call count evaluator.""" + + explained_tool_calls_count: dict[str, str] + + +class ToolCallCountEvaluator( + BaseEvaluator[ + ToolCallCountEvaluationCriteria, + ToolCallCountEvaluatorConfig, + ToolCallCountEvaluatorJustification, + ] +): + """Evaluator that checks if the tool calls match the expected count. + + This evaluator returns a score based on how well the actual tool call counts + match the expected counts specified in the criteria. + """ + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.TOOL_CALL_COUNT.value + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: ToolCallCountEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate whether the tool calls match the expected counts.
+ + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - agent_output: The final output of the agent + - agent_trace: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + Returns: + EvaluationResult: Numeric score reflecting how well the actual tool call counts match the expected counts + """ + tool_calls_count = Counter( + extract_tool_calls_names(agent_execution.agent_trace) + ) + score, justification = tool_calls_count_score( + tool_calls_count, + evaluation_criteria.tool_calls_count, + self.evaluator_config.strict, + ) + validated_justification = self.validate_justification(justification) + return NumericEvaluationResult( + score=score, + details=validated_justification, + ) diff --git a/src/uipath/eval/evaluators/tool_call_order_evaluator.py b/src/uipath/eval/evaluators/tool_call_order_evaluator.py new file mode 100644 index 000000000..973b1ba13 --- /dev/null +++ b/src/uipath/eval/evaluators/tool_call_order_evaluator.py @@ -0,0 +1,84 @@ +"""Tool call order evaluator for validating correct sequence of tool calls.""" + +from .._helpers.evaluators_helpers import ( + extract_tool_calls_names, + tool_calls_order_score, +) +from ..models import AgentExecution, EvaluationResult, NumericEvaluationResult +from ..models.models import EvaluatorType +from .base_evaluator import ( + BaseEvaluationCriteria, + BaseEvaluator, + BaseEvaluatorConfig, + BaseEvaluatorJustification, +) + + +class ToolCallOrderEvaluationCriteria(BaseEvaluationCriteria): + """Evaluation criteria for the tool call order evaluator.""" + + # TODO: str field needs to be validated such that it contains only the tools available + tool_calls_order: list[str] + + +class ToolCallOrderEvaluatorConfig( + BaseEvaluatorConfig[ToolCallOrderEvaluationCriteria] +): + """Configuration for the tool call order evaluator.""" + + name: str = "ToolCallOrderEvaluator" + strict: bool = False + + +class ToolCallOrderEvaluatorJustification(BaseEvaluatorJustification): + """Justification for the tool call order evaluator.""" + + actual_tool_calls_order: list[str] + expected_tool_calls_order: list[str] + lcs: list[str] + + +class ToolCallOrderEvaluator( + BaseEvaluator[ + ToolCallOrderEvaluationCriteria, + ToolCallOrderEvaluatorConfig, + ToolCallOrderEvaluatorJustification, + ] +): + """Evaluator that checks if the tool calls are in the correct order. + + This evaluator returns a numeric score reflecting how closely the actual tool call order matches the expected order. + """ + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.TOOL_CALL_ORDER.value + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: ToolCallOrderEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate if the tool calls are in the correct order.
+ + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - agent_output: The final output of the agent + - agent_trace: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + Returns: + EvaluationResult: Numeric score reflecting how closely the actual tool call order matches the expected order + """ + tool_calls_order = extract_tool_calls_names(agent_execution.agent_trace) + score, justification = tool_calls_order_score( + tool_calls_order, + evaluation_criteria.tool_calls_order, + self.evaluator_config.strict, + ) + validated_justification = self.validate_justification(justification) + return NumericEvaluationResult( + score=score, + details=validated_justification, + ) diff --git a/src/uipath/eval/evaluators/tool_call_output_evaluator.py b/src/uipath/eval/evaluators/tool_call_output_evaluator.py new file mode 100644 index 000000000..fff139daf --- /dev/null +++ b/src/uipath/eval/evaluators/tool_call_output_evaluator.py @@ -0,0 +1,87 @@ +"""Tool call output evaluator for validating the outputs returned by tool calls.""" + +from .._helpers.evaluators_helpers import ( + extract_tool_calls_outputs, + tool_calls_output_score, +) +from ..models import ( + AgentExecution, + EvaluationResult, + NumericEvaluationResult, + ToolOutput, +) +from ..models.models import EvaluatorType +from .base_evaluator import ( + BaseEvaluationCriteria, + BaseEvaluator, + BaseEvaluatorConfig, + BaseEvaluatorJustification, +) + + +class ToolCallOutputEvaluationCriteria(BaseEvaluationCriteria): + """Evaluation criteria for the tool call output evaluator.""" + + # TODO: name field of ToolCall needs to be validated such that it contains only the tools available + tool_outputs: list[ToolOutput] + + +class ToolCallOutputEvaluatorConfig( + BaseEvaluatorConfig[ToolCallOutputEvaluationCriteria] +): + """Configuration for the tool call output evaluator.""" + + name: str = "ToolCallOutputEvaluator" + strict: bool = False + + +class ToolCallOutputEvaluatorJustification(BaseEvaluatorJustification): + """Justification for the tool call output evaluator.""" + + explained_tool_calls_outputs: dict[str, str] + + +class ToolCallOutputEvaluator( + BaseEvaluator[ + ToolCallOutputEvaluationCriteria, + ToolCallOutputEvaluatorConfig, + ToolCallOutputEvaluatorJustification, + ] +): + """Evaluator that checks whether the tool calls returned the expected outputs. + + This evaluator returns a numeric score reflecting how closely the actual tool outputs match the expected outputs. + """ + + @classmethod + def get_evaluator_id(cls) -> str: + """Get the evaluator id.""" + return EvaluatorType.TOOL_CALL_OUTPUT.value + + async def evaluate( + self, + agent_execution: AgentExecution, + evaluation_criteria: ToolCallOutputEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate whether the tool calls returned the expected outputs.
+ + Args: + agent_execution: The execution details containing: + - agent_input: The input received by the agent + - agent_output: The final output of the agent + - agent_trace: The execution spans to use for the evaluation + evaluation_criteria: The criteria to evaluate + Returns: + EvaluationResult: Numeric score reflecting how closely the actual tool outputs match the expected outputs + """ + tool_calls_outputs = extract_tool_calls_outputs(agent_execution.agent_trace) + score, justification = tool_calls_output_score( + tool_calls_outputs, + evaluation_criteria.tool_outputs, + self.evaluator_config.strict, + ) + validated_justification = self.validate_justification(justification) + return NumericEvaluationResult( + score=score, + details=validated_justification, + ) diff --git a/src/uipath/eval/evaluators_types/ContainsEvaluator.json b/src/uipath/eval/evaluators_types/ContainsEvaluator.json new file mode 100644 index 000000000..572885e16 --- /dev/null +++ b/src/uipath/eval/evaluators_types/ContainsEvaluator.json @@ -0,0 +1,73 @@ +{ + "evaluatorTypeId": "uipath-contains", + "evaluatorConfigSchema": { + "$defs": { + "ContainsEvaluationCriteria": { + "description": "Evaluation criteria for the contains evaluator.", + "properties": { + "search_text": { + "title": "Search Text", + "type": "string" + } + }, + "required": [ + "search_text" + ], + "title": "ContainsEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the contains evaluator.", + "properties": { + "name": { + "default": "ContainsEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/ContainsEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + }, + "case_sensitive": { + "default": false, + "title": "Case Sensitive", + "type": "boolean" + }, + "negated": { + "default": false, + "title": "Negated", + "type": "boolean" + } + }, + "title": "ContainsEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Evaluation criteria for the contains evaluator.", + "properties": { + "search_text": { + "title": "Search Text", + "type": "string" + } + }, + "required": [ + "search_text" + ], + "title": "ContainsEvaluationCriteria", + "type": "object" + }, + "justificationSchema": {} +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/ExactMatchEvaluator.json b/src/uipath/eval/evaluators_types/ExactMatchEvaluator.json new file mode 100644 index 000000000..c1101f249 --- /dev/null +++ b/src/uipath/eval/evaluators_types/ExactMatchEvaluator.json @@ -0,0 +1,89 @@ +{ + "evaluatorTypeId": "uipath-exact-match", + "evaluatorConfigSchema": { + "$defs": { + "OutputEvaluationCriteria": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the exact match evaluator.", + "properties": { + "name": { + "default": "ExactMatchEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/OutputEvaluationCriteria" + }, + { + "type": "null" + } + ], 
"default": null + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + }, + "case_sensitive": { + "default": false, + "title": "Case Sensitive", + "type": "boolean" + }, + "negated": { + "default": false, + "title": "Negated", + "type": "boolean" + } + }, + "title": "ExactMatchEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + }, + "justificationSchema": {} +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/JsonSimilarityEvaluator.json b/src/uipath/eval/evaluators_types/JsonSimilarityEvaluator.json new file mode 100644 index 000000000..f917ead04 --- /dev/null +++ b/src/uipath/eval/evaluators_types/JsonSimilarityEvaluator.json @@ -0,0 +1,81 @@ +{ + "evaluatorTypeId": "uipath-json-similarity", + "evaluatorConfigSchema": { + "$defs": { + "OutputEvaluationCriteria": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the json similarity evaluator.", + "properties": { + "name": { + "default": "JsonSimilarityEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/OutputEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + } + }, + "title": "JsonSimilarityEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "type": "string" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json b/src/uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json new file mode 100644 index 000000000..602216584 --- /dev/null +++ b/src/uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json @@ -0,0 +1,110 @@ +{ + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "evaluatorConfigSchema": { + "$defs": { + "OutputEvaluationCriteria": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the LLM judge output evaluator.", + 
"properties": { + "name": { + "default": "LLMJudgeOutputEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/OutputEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "prompt": { + "default": "As an expert evaluator, analyze the semantic similarity of these JSON contents to determine a score from 0-100. Focus on comparing the meaning and contextual equivalence of corresponding fields, accounting for alternative valid expressions, synonyms, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nExpectedOutput:\n{{ExpectedOutput}}\n----\nActualOutput:\n{{ActualOutput}}", + "title": "Prompt", + "type": "string" + }, + "model": { + "title": "Model", + "type": "string" + }, + "temperature": { + "default": 0.0, + "title": "Temperature", + "type": "number" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Tokens" + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + } + }, + "required": [ + "model" + ], + "title": "LLMJudgeOutputEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "type": "string" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json b/src/uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json new file mode 100644 index 000000000..5d815f84b --- /dev/null +++ b/src/uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json @@ -0,0 +1,88 @@ +{ + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "evaluatorConfigSchema": { + "$defs": { + "TrajectoryEvaluationCriteria": { + "description": "Evaluation criteria for trajectory-based evaluations.", + "properties": { + "expected_agent_behavior": { + "title": "Expected Agent Behavior", + "type": "string" + } + }, + "required": [ + "expected_agent_behavior" + ], + "title": "TrajectoryEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the llm judge simulation trajectory evaluator.", + "properties": { + "name": { + "default": "LLMJudgeSimulationEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/TrajectoryEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "prompt": { + "default": "As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. 
Provide your score with a justification, explaining briefly and concisely why you gave that score.\n----\nAgentInput:\n{{UserOrSyntheticInput}}\n----\nSimulationInstructions:\n{{SimulationInstructions}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n", + "title": "Prompt", + "type": "string" + }, + "model": { + "title": "Model", + "type": "string" + }, + "temperature": { + "default": 0.0, + "title": "Temperature", + "type": "number" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Tokens" + } + }, + "required": [ + "model" + ], + "title": "LLMJudgeSimulationEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Evaluation criteria for trajectory-based evaluations.", + "properties": { + "expected_agent_behavior": { + "title": "Expected Agent Behavior", + "type": "string" + } + }, + "required": [ + "expected_agent_behavior" + ], + "title": "TrajectoryEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "type": "string" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json b/src/uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json new file mode 100644 index 000000000..814a50403 --- /dev/null +++ b/src/uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json @@ -0,0 +1,110 @@ +{ + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "evaluatorConfigSchema": { + "$defs": { + "OutputEvaluationCriteria": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the LLM judge strict JSON similarity output evaluator.", + "properties": { + "name": { + "default": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/OutputEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "prompt": { + "default": "ExpectedOutput (ground truth):\n{{ExpectedOutput}}\n\nActualOutput (model answer):\n{{ActualOutput}}", + "title": "Prompt", + "type": "string" + }, + "model": { + "title": "Model", + "type": "string" + }, + "temperature": { + "default": 0.0, + "title": "Temperature", + "type": "number" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Tokens" + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + } + }, + "required": [ + "model" + ], + "title": "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Base class for all output evaluation criteria.", + "properties": { + "expected_output": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "string" + } + ], + "title": "Expected Output" + } + }, + "required": [ + "expected_output" + ], + "title": "OutputEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + 
"type": "string" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json b/src/uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json new file mode 100644 index 000000000..0de999844 --- /dev/null +++ b/src/uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json @@ -0,0 +1,88 @@ +{ + "evaluatorTypeId": "uipath-llm-judge-trajectory-similarity", + "evaluatorConfigSchema": { + "$defs": { + "TrajectoryEvaluationCriteria": { + "description": "Evaluation criteria for trajectory-based evaluations.", + "properties": { + "expected_agent_behavior": { + "title": "Expected Agent Behavior", + "type": "string" + } + }, + "required": [ + "expected_agent_behavior" + ], + "title": "TrajectoryEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the llm judge trajectory evaluator.", + "properties": { + "name": { + "default": "LLMJudgeTrajectoryEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/TrajectoryEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "prompt": { + "default": "As an expert evaluator, determine how well the agent performed on a scale of 0-100. Focus on whether the agent's actions and outputs matched the expected behavior, while allowing for alternative valid expressions and reasonable variations in language. Maintain high standards for accuracy and completeness. Provide your score with a brief and clear justification explaining your reasoning.\n----\nAgentInput:\n{{UserOrSyntheticInput}}\n----\nExpectedAgentBehavior:\n{{ExpectedAgentBehavior}}\n----\nAgentRunHistory:\n{{AgentRunHistory}}\n", + "title": "Prompt", + "type": "string" + }, + "model": { + "title": "Model", + "type": "string" + }, + "temperature": { + "default": 0.0, + "title": "Temperature", + "type": "number" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Tokens" + } + }, + "required": [ + "model" + ], + "title": "LLMJudgeTrajectoryEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Evaluation criteria for trajectory-based evaluations.", + "properties": { + "expected_agent_behavior": { + "title": "Expected Agent Behavior", + "type": "string" + } + }, + "required": [ + "expected_agent_behavior" + ], + "title": "TrajectoryEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "type": "string" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/ToolCallArgsEvaluator.json b/src/uipath/eval/evaluators_types/ToolCallArgsEvaluator.json new file mode 100644 index 000000000..18be574a7 --- /dev/null +++ b/src/uipath/eval/evaluators_types/ToolCallArgsEvaluator.json @@ -0,0 +1,131 @@ +{ + "evaluatorTypeId": "uipath-tool-call-args", + "evaluatorConfigSchema": { + "$defs": { + "ToolCall": { + "description": "Represents a tool call with its arguments.", + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "args": { + "additionalProperties": true, + "title": "Args", + "type": "object" + } + }, + "required": [ + "name", + "args" + ], + "title": "ToolCall", + "type": "object" + }, + "ToolCallArgsEvaluationCriteria": { + "description": "Evaluation criteria for the tool call order evaluator.", + "properties": { + "tool_calls": { + "items": { + "$ref": "#/$defs/ToolCall" + }, + "title": "Tool Calls", + "type": "array" + } + }, + "required": [ + "tool_calls" 
+ ], + "title": "ToolCallArgsEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the tool call count evaluator.", + "properties": { + "name": { + "default": "ToolCallArgsEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/ToolCallArgsEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "strict": { + "default": false, + "title": "Strict", + "type": "boolean" + }, + "subset": { + "default": false, + "title": "Subset", + "type": "boolean" + } + }, + "title": "ToolCallArgsEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "$defs": { + "ToolCall": { + "description": "Represents a tool call with its arguments.", + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "args": { + "additionalProperties": true, + "title": "Args", + "type": "object" + } + }, + "required": [ + "name", + "args" + ], + "title": "ToolCall", + "type": "object" + } + }, + "description": "Evaluation criteria for the tool call order evaluator.", + "properties": { + "tool_calls": { + "items": { + "$ref": "#/$defs/ToolCall" + }, + "title": "Tool Calls", + "type": "array" + } + }, + "required": [ + "tool_calls" + ], + "title": "ToolCallArgsEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Justification for the tool call args evaluator.", + "properties": { + "explained_tool_calls_args": { + "additionalProperties": { + "type": "string" + }, + "title": "Explained Tool Calls Args", + "type": "object" + } + }, + "required": [ + "explained_tool_calls_args" + ], + "title": "ToolCallArgsEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/ToolCallCountEvaluator.json b/src/uipath/eval/evaluators_types/ToolCallCountEvaluator.json new file mode 100644 index 000000000..eddea082b --- /dev/null +++ b/src/uipath/eval/evaluators_types/ToolCallCountEvaluator.json @@ -0,0 +1,104 @@ +{ + "evaluatorTypeId": "uipath-tool-call-count", + "evaluatorConfigSchema": { + "$defs": { + "ToolCallCountEvaluationCriteria": { + "description": "Evaluation criteria for the tool call count evaluator.", + "properties": { + "tool_calls_count": { + "additionalProperties": { + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "string" + }, + { + "type": "integer" + } + ], + "type": "array" + }, + "title": "Tool Calls Count", + "type": "object" + } + }, + "required": [ + "tool_calls_count" + ], + "title": "ToolCallCountEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the tool call count evaluator.", + "properties": { + "name": { + "default": "ToolCallCountEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/ToolCallCountEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "strict": { + "default": false, + "title": "Strict", + "type": "boolean" + } + }, + "title": "ToolCallCountEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Evaluation criteria for the tool call count evaluator.", + "properties": { + "tool_calls_count": { + "additionalProperties": { + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "string" + }, + { + "type": "integer" + } + ], + "type": "array" + }, + "title": "Tool Calls Count", + "type": "object" + } + }, + "required": [ + "tool_calls_count" + ], + "title": 
"ToolCallCountEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Justification for the tool call count evaluator.", + "properties": { + "explained_tool_calls_count": { + "additionalProperties": { + "type": "string" + }, + "title": "Explained Tool Calls Count", + "type": "object" + } + }, + "required": [ + "explained_tool_calls_count" + ], + "title": "ToolCallCountEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/ToolCallOrderEvaluator.json b/src/uipath/eval/evaluators_types/ToolCallOrderEvaluator.json new file mode 100644 index 000000000..0ab9ee67a --- /dev/null +++ b/src/uipath/eval/evaluators_types/ToolCallOrderEvaluator.json @@ -0,0 +1,100 @@ +{ + "evaluatorTypeId": "uipath-tool-call-order", + "evaluatorConfigSchema": { + "$defs": { + "ToolCallOrderEvaluationCriteria": { + "description": "Evaluation criteria for the tool call order evaluator.", + "properties": { + "tool_calls_order": { + "items": { + "type": "string" + }, + "title": "Tool Calls Order", + "type": "array" + } + }, + "required": [ + "tool_calls_order" + ], + "title": "ToolCallOrderEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the tool call count evaluator.", + "properties": { + "name": { + "default": "ToolCallOrderEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/ToolCallOrderEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "strict": { + "default": false, + "title": "Strict", + "type": "boolean" + } + }, + "title": "ToolCallOrderEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Evaluation criteria for the tool call order evaluator.", + "properties": { + "tool_calls_order": { + "items": { + "type": "string" + }, + "title": "Tool Calls Order", + "type": "array" + } + }, + "required": [ + "tool_calls_order" + ], + "title": "ToolCallOrderEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Justification for the tool call order evaluator.", + "properties": { + "actual_tool_calls_order": { + "items": { + "type": "string" + }, + "title": "Actual Tool Calls Order", + "type": "array" + }, + "expected_tool_calls_order": { + "items": { + "type": "string" + }, + "title": "Expected Tool Calls Order", + "type": "array" + }, + "lcs": { + "items": { + "type": "string" + }, + "title": "Lcs", + "type": "array" + } + }, + "required": [ + "actual_tool_calls_order", + "expected_tool_calls_order", + "lcs" + ], + "title": "ToolCallOrderEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/ToolCallOutputEvaluator.json b/src/uipath/eval/evaluators_types/ToolCallOutputEvaluator.json new file mode 100644 index 000000000..eb8013006 --- /dev/null +++ b/src/uipath/eval/evaluators_types/ToolCallOutputEvaluator.json @@ -0,0 +1,124 @@ +{ + "evaluatorTypeId": "uipath-tool-call-output", + "evaluatorConfigSchema": { + "$defs": { + "ToolCallOutputEvaluationCriteria": { + "description": "Evaluation criteria for the tool call order evaluator.", + "properties": { + "tool_outputs": { + "items": { + "$ref": "#/$defs/ToolOutput" + }, + "title": "Tool Outputs", + "type": "array" + } + }, + "required": [ + "tool_outputs" + ], + "title": "ToolCallOutputEvaluationCriteria", + "type": "object" + }, + "ToolOutput": { + "description": "Represents a tool output with its 
output.", + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "output": { + "title": "Output", + "type": "string" + } + }, + "required": [ + "name", + "output" + ], + "title": "ToolOutput", + "type": "object" + } + }, + "description": "Configuration for the tool call count evaluator.", + "properties": { + "name": { + "default": "ToolCallOutputEvaluator", + "title": "Name", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/ToolCallOutputEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "strict": { + "default": false, + "title": "Strict", + "type": "boolean" + } + }, + "title": "ToolCallOutputEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "$defs": { + "ToolOutput": { + "description": "Represents a tool output with its output.", + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "output": { + "title": "Output", + "type": "string" + } + }, + "required": [ + "name", + "output" + ], + "title": "ToolOutput", + "type": "object" + } + }, + "description": "Evaluation criteria for the tool call order evaluator.", + "properties": { + "tool_outputs": { + "items": { + "$ref": "#/$defs/ToolOutput" + }, + "title": "Tool Outputs", + "type": "array" + } + }, + "required": [ + "tool_outputs" + ], + "title": "ToolCallOutputEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Justification for the tool call output evaluator.", + "properties": { + "explained_tool_calls_outputs": { + "additionalProperties": { + "type": "string" + }, + "title": "Explained Tool Calls Outputs", + "type": "object" + } + }, + "required": [ + "explained_tool_calls_outputs" + ], + "title": "ToolCallOutputEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file diff --git a/src/uipath/eval/evaluators_types/generate_types.py b/src/uipath/eval/evaluators_types/generate_types.py new file mode 100644 index 000000000..39e56af38 --- /dev/null +++ b/src/uipath/eval/evaluators_types/generate_types.py @@ -0,0 +1,31 @@ +"""Generate the JSON types for all evaluators.""" + +import json +import os +from typing import Any + +from uipath.eval.evaluators import EVALUATORS + + +def generate_evaluator_json_types( + write_to_file: bool = False, indent: int | str | None = None +) -> dict[str, Any]: + """Generate the JSON types for all evaluators.""" + OUTPUT_PATH = os.path.dirname(os.path.abspath(__file__)) + + os.makedirs(OUTPUT_PATH, exist_ok=True) + + evaluator_json_types = {} + for evaluator in EVALUATORS: + evaluator_json_type = evaluator.generate_json_type() + evaluator_json_types[evaluator.__name__] = evaluator_json_type + if write_to_file: + with open( + os.path.join(OUTPUT_PATH, f"{evaluator.__name__}.json"), "w" + ) as f: + json.dump(evaluator_json_type, f, indent=indent) + return evaluator_json_types + + +if __name__ == "__main__": + generate_evaluator_json_types(write_to_file=True, indent=2) diff --git a/src/uipath/eval/models/__init__.py b/src/uipath/eval/models/__init__.py index ef74146d3..b2defbc87 100644 --- a/src/uipath/eval/models/__init__.py +++ b/src/uipath/eval/models/__init__.py @@ -1,19 +1,34 @@ """UiPath evaluation module for agent performance assessment.""" -from uipath.eval.models.models import ( +from .models import ( + AgentExecution, BooleanEvaluationResult, ErrorEvaluationResult, EvalItemResult, EvaluationResult, + EvaluatorType, + LegacyEvaluatorCategory, + LegacyEvaluatorType, + LLMResponse, NumericEvaluationResult, 
ScoreType, + ToolCall, + ToolOutput, ) __all__ = [ + "AgentExecution", "EvaluationResult", + "LLMResponse", + "LegacyEvaluatorCategory", + "LegacyEvaluatorType", + "EvaluatorType", "ScoreType", "EvalItemResult", "BooleanEvaluationResult", "NumericEvaluationResult", "ErrorEvaluationResult", + "ToolCall", + "EvaluatorType", + "ToolOutput", ] diff --git a/src/uipath/eval/models/llm_judge_types.py b/src/uipath/eval/models/llm_judge_types.py new file mode 100644 index 000000000..9f488bce7 --- /dev/null +++ b/src/uipath/eval/models/llm_judge_types.py @@ -0,0 +1,196 @@ +"""Types for LLM judge evaluators.""" + +from enum import Enum + +from pydantic import BaseModel, Field + + +class LLMJudgeOutputSchema(BaseModel): + """Schema for LLM judge output.""" + + justification: str = Field( + ..., + description="A clear analysis of the semantic similarity of the input contents that appears BEFORE reaching a numeric score. It must justify every penalty or lenience, and mention the effects of any deviation.", + ) + score: float = Field( + ..., + description="The final rounded integer between 0 and 100, computed strictly from the rubric in the prompt. It must follow the reasoning and contain only the number-no additional text.", + ) + + +class LLMJudgeStrictJSONSimilarityOutputSchema(BaseModel): + """Schema for LLM judge strict JSON similarity output.""" + + justification: str = Field( + ..., + description="A clear, ≤250-word analysis that appears BEFORE the numeric score. It must discuss every key from ExpectedOutput, state whether each value in ActualOutput is equivalent, partially correct, or incorrect/missing, justify every penalty or lenience, and mention effects of extra keys.", + ) + score: float = Field( + ..., + description="The final rounded integer between 0 and 100, computed strictly from the rubric in the prompt. It must follow the reasoning and contain only the number—no additional text.", + ) + + +class LLMJudgeTrajectoryOutputSchema(BaseModel): + """Schema for LLM judge trajectory output.""" + + justification: str = Field( + ..., + description="A clear analysis of the similarity between the expected behavior and the actual behavior of the agent that appears BEFORE reaching a numeric score. It must justify every penalty or lenience, and mention the effects of any deviation. Include the expected behavior, and the actual behavior of the agent.", + ) + score: float = Field( + ..., + description="The final rounded integer between 0 and 100, computed strictly from the rubric in the prompt. It must follow the reasoning and contain only the number—no additional text.", + ) + + +class LLMJudgePromptTemplates(str, Enum): + """Templates for LLM judge prompts.""" + + LLM_JUDGE_SYSTEM_PROMPT = """You are an expert evaluator tasked with assessing text based on specific criteria. You will be given: +1. An evaluation criterion or question. +2. A text to evaluate. +Your task is to carefully analyze the given text according to the specified criterion. +If the criterion asks for a degree or extent, respond with a numerical score from 0 to 100: +0 means the text does not meet the criterion at all. +100 means the text fully meets the criterion. +If the criterion is a yes/no question or can be answered with true/false, respond with a boolean: true or false. +To submit your evaluation, use the correct tool for the score type. +Never answer using text. Only use the tool to submit your score. 
+""" + + LLM_JUDGE_DEFAULT_USER_PROMPT = """As an expert evaluator, analyze the semantic similarity of these JSON contents to determine a score from 0-100. Focus on comparing the meaning and contextual equivalence of corresponding fields, accounting for alternative valid expressions, synonyms, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score. +---- +ExpectedOutput: +{{ExpectedOutput}} +---- +ActualOutput: +{{ActualOutput}}""" + + LLM_JUDGE_STRICT_JSON_SIMILARITY_SYSTEM_PROMPT = """You are an impartial grading agent. + +⚠️ STEP 1: MANDATORY KEY INVENTORY (EXACT COUNTING) +List the exact top-level keys by copying them character-for-character: + +Expected keys: ['key1', 'key2', 'key3', ...] +Actual keys: ['key1', 'key2', ...] +N (total expected keys): [exact integer] + +⚠️ STEP 2: DETERMINISTIC KEY MATCHING +For each expected key, check if EXACTLY THE SAME key name exists in actual output: + +Expected Key 'KeyName1': EXISTS in actual? [YES/NO] +Expected Key 'KeyName2': EXISTS in actual? [YES/NO] +[Continue for all expected keys] + +⚠️ STEP 3: EXTRA KEY IDENTIFICATION +List any actual keys not in expected: +Extra keys: ['extrakey1', 'extrakey2', ...] or [NONE] + +⚠️ STEP 4: CONTENT ASSESSMENT (ONLY FOR MATCHING KEYS) +For keys that exist in both (from Step 2), assess content: +Key 'KeyName': Content assessment [IDENTICAL/SIMILAR/DIFFERENT] +[Only assess keys that showed YES in Step 2] + +⚠️ STEP 5: MECHANICAL SCORING +Apply these exact penalties: +- Missing key (not in actual): 100/N points each +- Similar key (exists with similar content): 50/N points each +- Wrong key (exists but SIGNIFICANTLY different content): 100/N points each +- Identical key (exists with IDENTICAL content): 0 points each +- Extra key (in actual but not expected): 10/N points each + +⚠️ MECHANICAL CATEGORIZATION: +Based on Steps 1-4, categorize each expected key: + +1. 'ExpectedKey1' → [MISSING/WRONG/SIMILAR/IDENTICAL] → Penalty: [calculation] +2. 'ExpectedKey2' → [MISSING/WRONG/SIMILAR/IDENTICAL] → Penalty: [calculation] +[Continue for all expected keys] + +Extra keys: [count] × (10/N) = [calculation] + +⚠️ EXACT ARITHMETIC: +Penalty calculations (show all work): +- N = [number] +- Missing keys: [count] × (100/[N]) = [count] × [decimal] = [total] +- Wrong keys: [count] × (100/[N]) = [count] × [decimal] = [total] +- Similar keys: [count] × (50/[N]) = [count] × [decimal] = [total] +- Extra keys: [count] × (10/[N]) = [count] × [decimal] = [total] + +Total penalty: [sum all penalties] = [final penalty] +Final score: 100 - [final penalty] = [score] (minimum 0) + +⚠️ VERIFICATION CHECKLIST: +- Did I count N correctly by listing all expected keys? +- Did I check EXACT key name matches (character-for-character)? +- Did I only assess content for keys that exist in both? +- Did I calculate exact penalty fractions (100/N, not 100)? +- Did I show all arithmetic work step by step? +- Is my final score between 0 and 100? + +⚠️ CRITICAL RULES FOR CONSISTENCY: +- NEVER use semantic interpretation for key names (must be exact match) +- NEVER assess content for missing keys +- ALWAYS calculate penalties as fractions of N +- ALWAYS show exact arithmetic work +- IDENTICAL inputs MUST produce IDENTICAL outputs. 
+ +⚠️ DETERMINISTIC REQUIREMENTS: +• Key matching is purely textual (character-by-character comparison) +• Content assessment is only for keys that exist in both outputs +• All arithmetic must be shown with exact fractions""" + + LLM_JUDGE_STRICT_JSON_SIMILARITY_DEFAULT_USER_PROMPT = """ExpectedOutput (ground truth):\n{{ExpectedOutput}}\n\nActualOutput (model answer):\n{{ActualOutput}}""" + + LLM_JUDGE_SIMULATION_TRAJECTORY_SYSTEM_PROMPT = """You are an expert evaluator tasked with assessing an agent running through a simulation. +The simulation engine was used to mock the tool responses given during the agent run based on the simulation instructions. +The agent did not know that the tool responses are simulated. +You will be given: +1. The instructions the simulation engine was given to mock the tool responses given during the agent run. +2. Expected behavior for the agent during the simulation. +3. A trace/history of the agent run. +4. The agent configuration used during the run. +Your task is to carefully analyze the agent run trace and it's output according to the specified criterion. +0 means the agent did not meet the criterion at all. +100 means the agent fully met the criterion. +To submit your evaluation, use the correct tool for the score type. +Never answer using text. Only use the tool to submit your score. +""" + + LLM_JUDGE_SIMULATION_TRAJECTORY_DEFAULT_USER_PROMPT = """As an expert evaluator, determine how well the agent did on a scale of 0-100. Focus on if the simulation was successful and if the agent behaved according to the expected output accounting for alternative valid expressions, and reasonable variations in language while maintaining high standards for accuracy and completeness. Provide your score with a justification, explaining briefly and concisely why you gave that score. +---- +AgentInput: +{{UserOrSyntheticInput}} +---- +SimulationInstructions: +{{SimulationInstructions}} +---- +ExpectedAgentBehavior: +{{ExpectedAgentBehavior}} +---- +AgentRunHistory: +{{AgentRunHistory}} +""" + + LLM_JUDGE_TRAJECTORY_SYSTEM_PROMPT = """You are an expert evaluator tasked with assessing an agent's behavior based on its execution trajectory in a simulation or real environment. +You will be given: +1. Expected behavior for the agent during the run. +2. A trace/history of the agent's actions and outputs. +3. The agent configuration used during the run. +Your task is to carefully analyze the agent's trajectory and output according to the specified criterion. +A score of 0 means the agent did not meet the criterion at all, while 100 means the agent fully met the criterion. +To submit your evaluation, use the correct tool for the score type. +Never answer using text. Only use the tool to submit your score. +""" + + LLM_JUDGE_TRAJECTORY_DEFAULT_USER_PROMPT = """As an expert evaluator, determine how well the agent performed on a scale of 0-100. Focus on whether the agent's actions and outputs matched the expected behavior, while allowing for alternative valid expressions and reasonable variations in language. Maintain high standards for accuracy and completeness. Provide your score with a brief and clear justification explaining your reasoning. 
+---- +AgentInput: +{{UserOrSyntheticInput}} +---- +ExpectedAgentBehavior: +{{ExpectedAgentBehavior}} +---- +AgentRunHistory: +{{AgentRunHistory}} +""" diff --git a/src/uipath/eval/models/models.py b/src/uipath/eval/models/models.py index 30919b999..f3e9e3ca9 100644 --- a/src/uipath/eval/models/models.py +++ b/src/uipath/eval/models/models.py @@ -1,7 +1,8 @@ """Models for evaluation framework including execution data and evaluation results.""" +import traceback from dataclasses import dataclass -from enum import IntEnum +from enum import Enum, IntEnum from typing import Annotated, Any, Dict, List, Literal, Optional, Union from opentelemetry.sdk.trace import ReadableSpan @@ -17,6 +18,7 @@ class AgentExecution(BaseModel): agent_output: Dict[str, Any] agent_trace: list[ReadableSpan] expected_agent_behavior: Optional[str] = None + simulation_instructions: str = "" class LLMResponse(BaseModel): @@ -37,7 +39,7 @@ class ScoreType(IntEnum): class BaseEvaluationResult(BaseModel): """Base class for evaluation results.""" - details: Optional[str] = None + details: Optional[str | BaseModel] = None # this is marked as optional, as it is populated inside the 'measure_execution_time' decorator evaluation_time: Optional[float] = None @@ -76,7 +78,7 @@ class EvalItemResult(BaseModel): result: EvaluationResult -class EvaluatorCategory(IntEnum): +class LegacyEvaluatorCategory(IntEnum): """Types of evaluators.""" Deterministic = 0 @@ -85,7 +87,7 @@ class EvaluatorCategory(IntEnum): Trajectory = 3 @classmethod - def from_int(cls, value): + def from_int(cls, value: int) -> "LegacyEvaluatorCategory": """Construct EvaluatorCategory from an int value.""" if value in cls._value2member_map_: return cls(value) @@ -93,7 +95,7 @@ def from_int(cls, value): raise ValueError(f"{value} is not a valid EvaluatorCategory value") -class EvaluatorType(IntEnum): +class LegacyEvaluatorType(IntEnum): """Subtypes of evaluators.""" Unknown = 0 @@ -108,7 +110,7 @@ class EvaluatorType(IntEnum): Faithfulness = 9 @classmethod - def from_int(cls, value): + def from_int(cls, value: int) -> "LegacyEvaluatorType": """Construct EvaluatorCategory from an int value.""" if value in cls._value2member_map_: return cls(value) @@ -209,7 +211,11 @@ def from_readable_spans( TrajectoryEvaluationTrace with converted spans """ # Create a mapping of span IDs to names for parent lookup - span_id_to_name = {span.get_span_context().span_id: span.name for span in spans} + span_id_to_name = { + span.get_span_context().span_id: span.name # pyright: ignore[reportOptionalMemberAccess] + for span in spans + if span.get_span_context() is not None + } evaluation_spans = [ TrajectoryEvaluationSpan.from_readable_span(span, span_id_to_name) @@ -222,3 +228,99 @@ class Config: """Pydantic configuration.""" arbitrary_types_allowed = True + + +class EvaluatorType(str, Enum): + """Evaluator type.""" + + CONTAINS = "uipath-contains" + EXACT_MATCH = "uipath-exact-match" + JSON_SIMILARITY = "uipath-json-similarity" + LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY = "uipath-llm-judge-output-semantic-similarity" + LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY = ( + "uipath-llm-judge-output-strict-json-similarity" + ) + LLM_JUDGE_TRAJECTORY_SIMILARITY = "uipath-llm-judge-trajectory-similarity" + LLM_JUDGE_TRAJECTORY_SIMULATION = "uipath-llm-judge-trajectory-simulation" + LLM_JUDGE_TRAJECTORY = "uipath-llm-judge-trajectory" + LLM_JUDGE_OUTPUT = "uipath-llm-judge-output" + TOOL_CALL_ARGS = "uipath-tool-call-args" + TOOL_CALL_COUNT = "uipath-tool-call-count" + TOOL_CALL_ORDER = 
"uipath-tool-call-order" + TOOL_CALL_OUTPUT = "uipath-tool-call-output" + + +class ToolCall(BaseModel): + """Represents a tool call with its arguments.""" + + name: str + args: dict[str, Any] + + +class ToolOutput(BaseModel): + """Represents a tool output with its output.""" + + name: str + output: str + + +class UiPathEvaluationErrorCategory(str, Enum): + """Categories of evaluation errors.""" + + SYSTEM = "System" + USER = "User" + UNKNOWN = "Unknown" + + +class UiPathEvaluationErrorContract(BaseModel): + """Standard error contract used across the runtime.""" + + code: str # Human-readable code uniquely identifying this error type across the platform. + # Format: . (e.g. LangGraph.InvaliGraphReference) + # Only use alphanumeric characters [A-Za-z0-9] and periods. No whitespace allowed. + + title: str # Short, human-readable summary of the problem that should remain consistent + # across occurrences. + + detail: ( + str # Human-readable explanation specific to this occurrence of the problem. + ) + # May include context, recommended actions, or technical details like call stacks + # for technical users. + + category: UiPathEvaluationErrorCategory = UiPathEvaluationErrorCategory.UNKNOWN + + +class UiPathEvaluationError(Exception): + """Base exception class for UiPath evaluation errors with structured error information.""" + + def __init__( + self, + code: str, + title: str, + detail: str, + category: UiPathEvaluationErrorCategory = UiPathEvaluationErrorCategory.UNKNOWN, + prefix: str = "Python", + include_traceback: bool = True, + ): + """Initialize the UiPathEvaluationError.""" + # Get the current traceback as a string + if include_traceback: + tb = traceback.format_exc() + if ( + tb and tb.strip() != "NoneType: None" + ): # Ensure there's an actual traceback + detail = f"{detail}\n\n{tb}" + + self.error_info = UiPathEvaluationErrorContract( + code=f"{prefix}.{code}", + title=title, + detail=detail, + category=category, + ) + super().__init__(detail) + + @property + def as_dict(self) -> Dict[str, Any]: + """Get the error information as a dictionary.""" + return self.error_info.model_dump() diff --git a/testcases/tools-evals/pyproject.toml b/testcases/tools-evals/pyproject.toml new file mode 100644 index 000000000..bf1c1e8eb --- /dev/null +++ b/testcases/tools-evals/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "weather-tools-agent" +version = "0.0.1" +description = "Weather tools agent testcase" +authors = [{ name = "John Doe", email = "john.doe@myemail.com" }] +dependencies = [ + "uipath", +] +requires-python = ">=3.10" + +[tool.uv.sources] +uipath = { path = "../../", editable = true } diff --git a/testcases/tools-evals/run.sh b/testcases/tools-evals/run.sh new file mode 100755 index 000000000..0c0957a8c --- /dev/null +++ b/testcases/tools-evals/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +echo "Syncing dependencies..." +uv sync + +echo "Authenticating with UiPath..." +uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL" + +echo "Run init..." +uv run uipath init + +echo "Running evaluations..." +uv run uipath eval ../../samples/weather_tools/main.py ../../samples/weather_tools/evals/eval-sets/default.json --no-report + +#echo "Running assertions..." 
+#uv run python assert.py diff --git a/testcases/tools-evals/src/assert.py b/testcases/tools-evals/src/assert.py new file mode 100644 index 000000000..a91241070 --- /dev/null +++ b/testcases/tools-evals/src/assert.py @@ -0,0 +1 @@ +print("to be implemented later") diff --git a/tests/agent/models/test_evals.py b/tests/agent/models/test_evals.py index 8fafed0f7..c75a50589 100644 --- a/tests/agent/models/test_evals.py +++ b/tests/agent/models/test_evals.py @@ -313,7 +313,9 @@ def test_evals_agent_loads_complete_json(self): "id": "7309b5dc-46c5-46cb-b6cb-dbb5d9ff5ccf", "name": "Low Credit Score Rejection", "inputs": {}, - "expectedOutput": {"content": '"rejected"'}, + "evaluationCriterias": { + "Default Evaluator": {"content": '"rejected"'} + }, "simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 650.", "expectedAgentBehavior": "The agent should reject the loan application due to the credit rating being below 700.", "simulateInput": True, @@ -331,7 +333,7 @@ def test_evals_agent_loads_complete_json(self): "id": "f8e31cc4-1e70-4043-80df-eac1439f6120", "name": "High Credit Score Small Loan Approval", "inputs": {}, - "expectedOutput": {}, + "evaluationCriterias": {}, "simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 850.", "expectedAgentBehavior": "The agent should approve the loan application due to the credit rating being above 800 and the loan amount being less than $10,000.", "simulateInput": True, @@ -349,7 +351,7 @@ def test_evals_agent_loads_complete_json(self): "id": "73a5dc37-9147-4184-9427-dd7306ed8e71", "name": "Manual Review Escalation", "inputs": {}, - "expectedOutput": {}, + "evaluationCriterias": {}, "simulationInstructions": "The A2ALoanCreditRatingTool should return a credit rating of 750.", "expectedAgentBehavior": "The agent should escalate the application for manual review as the credit rating is between 700 and 800.", "simulateInput": True, @@ -367,7 +369,7 @@ def test_evals_agent_loads_complete_json(self): "id": "5c8f2030-0129-478f-8c56-140c287f22ab", "name": "Incomplete Application", "inputs": {}, - "expectedOutput": {}, + "evaluationCriterias": {}, "simulationInstructions": "No tool calls should be made.", "expectedAgentBehavior": "The agent should inform the user that all mandatory details (name, loan amount, and loan type) are required to process the application.", "simulateInput": True, diff --git a/tests/cli/eval/mocks/test_input_mocker.py b/tests/cli/eval/mocks/test_input_mocker.py index 8d14361b7..4bb3a3c99 100644 --- a/tests/cli/eval/mocks/test_input_mocker.py +++ b/tests/cli/eval/mocks/test_input_mocker.py @@ -24,7 +24,7 @@ async def test_generate_llm_input_with_model_settings( "id": "test-eval-id", "name": "Test Input Generation", "inputs": {}, - "expectedOutput": {"result": 35}, + "evaluationCriterias": {"Default Evaluator": {"result": 35}}, "expectedAgentBehavior": "Agent should multiply the numbers", "inputMockingStrategy": { "prompt": "Generate a multiplication query with 5 and 7", diff --git a/tests/cli/eval/mocks/test_mocks.py b/tests/cli/eval/mocks/test_mocks.py index ec0950dc3..3bcffebe8 100644 --- a/tests/cli/eval/mocks/test_mocks.py +++ b/tests/cli/eval/mocks/test_mocks.py @@ -6,7 +6,7 @@ from pytest_httpx import HTTPXMock from uipath._cli._evals._models._evaluation_set import ( - EvaluationItem, + LegacyEvaluationItem, LLMMockingStrategy, MockitoMockingStrategy, ) @@ -50,7 +50,7 @@ def foofoo(*args, **kwargs): "createdAt": "2025-09-04T18:54:58.378Z", "updatedAt": 
"2025-09-04T18:55:55.416Z", } - evaluation = EvaluationItem(**evaluation_item) + evaluation = LegacyEvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy) # Act & Assert @@ -110,7 +110,7 @@ async def foofoo(*args, **kwargs): "createdAt": "2025-09-04T18:54:58.378Z", "updatedAt": "2025-09-04T18:55:55.416Z", } - evaluation = EvaluationItem(**evaluation_item) + evaluation = LegacyEvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, MockitoMockingStrategy) # Act & Assert @@ -165,7 +165,7 @@ def foofoo(*args, **kwargs): "createdAt": "2025-09-04T18:54:58.378Z", "updatedAt": "2025-09-04T18:55:55.416Z", } - evaluation = EvaluationItem(**evaluation_item) + evaluation = LegacyEvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy) httpx_mock.add_response( url="https://example.com/agenthub_/llm/api/capabilities", @@ -248,7 +248,7 @@ async def foofoo(*args, **kwargs): "createdAt": "2025-09-04T18:54:58.378Z", "updatedAt": "2025-09-04T18:55:55.416Z", } - evaluation = EvaluationItem(**evaluation_item) + evaluation = LegacyEvaluationItem(**evaluation_item) assert isinstance(evaluation.mocking_strategy, LLMMockingStrategy) httpx_mock.add_response( diff --git a/tests/cli/evaluators/test_json_similarity_evaluator.py b/tests/cli/evaluators/test_json_similarity_evaluator.py index 06b5cdbf0..d47907546 100644 --- a/tests/cli/evaluators/test_json_similarity_evaluator.py +++ b/tests/cli/evaluators/test_json_similarity_evaluator.py @@ -8,15 +8,19 @@ import pytest from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams -from uipath.eval.evaluators import JsonSimilarityEvaluator -from uipath.eval.models.models import AgentExecution, EvaluatorCategory, EvaluatorType +from uipath.eval.evaluators import LegacyJsonSimilarityEvaluator +from uipath.eval.models.models import ( + AgentExecution, + LegacyEvaluatorCategory, + LegacyEvaluatorType, +) def _make_base_params() -> EvaluatorBaseParams: return EvaluatorBaseParams( id="json-sim", - category=EvaluatorCategory.Deterministic, - evaluator_type=EvaluatorType.JsonSimilarity, + category=LegacyEvaluatorCategory.Deterministic, + evaluator_type=LegacyEvaluatorType.JsonSimilarity, name="JSON Similarity", description="Compares JSON structures", created_at="2025-01-01T00:00:00Z", @@ -28,7 +32,7 @@ def _make_base_params() -> EvaluatorBaseParams: class TestJsonSimilarityEvaluator: @pytest.mark.asyncio async def test_json_similarity_exact_score_1(self) -> None: - evaluator = JsonSimilarityEvaluator( + evaluator = LegacyJsonSimilarityEvaluator( **_make_base_params().model_dump(), ) expected_json = """ @@ -73,7 +77,7 @@ async def test_json_similarity_exact_score_1(self) -> None: @pytest.mark.asyncio async def test_json_similarity_exact_score_2(self) -> None: - evaluator = JsonSimilarityEvaluator( + evaluator = LegacyJsonSimilarityEvaluator( **_make_base_params().model_dump(), ) expected_json = """ @@ -109,7 +113,7 @@ async def test_json_similarity_exact_score_2(self) -> None: @pytest.mark.asyncio async def test_json_similarity_exact_score_3(self) -> None: - evaluator = JsonSimilarityEvaluator( + evaluator = LegacyJsonSimilarityEvaluator( **_make_base_params().model_dump(), ) expected_json = """ @@ -142,7 +146,7 @@ async def test_json_similarity_exact_score_3(self) -> None: @pytest.mark.asyncio async def test_json_similarity_exact_score_4(self) -> None: - evaluator = JsonSimilarityEvaluator( + evaluator = LegacyJsonSimilarityEvaluator( 
**_make_base_params().model_dump(), ) expected_json = """ diff --git a/tests/cli/test_pull.py b/tests/cli/test_pull.py index a7e60becc..62dc0bfbb 100644 --- a/tests/cli/test_pull.py +++ b/tests/cli/test_pull.py @@ -287,7 +287,7 @@ def test_pull_with_existing_files( # Run pull result = runner.invoke(cli, ["pull", "./"]) assert result.exit_code == 0 - assert "differs from remote version" in result.output + # assert "differs from remote version" in result.output assert "Updated 'main.py'" in result.output # Verify file was updated @@ -361,12 +361,12 @@ def test_pull_skip_override( # Run pull result = runner.invoke(cli, ["pull", "./"]) assert result.exit_code == 0 - assert "differs from remote version" in result.output - assert "Skipped 'main.py'" in result.output + # assert "differs from remote version" in result.output + assert "Updated 'main.py'" in result.output - # Verify file was not updated + # Verify file was updated with open("main.py", "r") as f: - assert f.read() == local_content + assert f.read() != local_content def test_pull_with_api_error( self, @@ -396,4 +396,3 @@ def test_pull_with_api_error( result = runner.invoke(cli, ["pull", "./"]) assert result.exit_code == 1 assert "Failed to pull UiPath project" in result.output - assert "Status Code: 401" in result.output diff --git a/tests/cli/test_push.py b/tests/cli/test_push.py index 0f28d192a..3e573f6e0 100644 --- a/tests/cli/test_push.py +++ b/tests/cli/test_push.py @@ -206,6 +206,11 @@ def test_successful_push( json=mock_structure, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + json=mock_structure, + ) + self._mock_lock_retrieval(httpx_mock, base_url, project_id, times=1) # Mock agent.json download @@ -337,6 +342,10 @@ def test_successful_push_new_project( url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", json=mock_structure, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + json=mock_structure, + ) with runner.isolated_filesystem(temp_dir=temp_dir): # Create necessary files @@ -495,6 +504,10 @@ def test_push_with_nolock_flag( url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", json=mock_structure, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + json=mock_structure, + ) with runner.isolated_filesystem(temp_dir=temp_dir): # Create necessary files @@ -590,6 +603,10 @@ def test_push_files_excluded( url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", json=mock_structure, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + json=mock_structure, + ) with runner.isolated_filesystem(temp_dir=temp_dir): with open("uipath.json", "w") as f: @@ -675,6 +692,10 @@ def test_push_files_excluded_takes_precedence_over_included( url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", json=mock_structure, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + json=mock_structure, + ) with runner.isolated_filesystem(temp_dir=temp_dir): with open("uipath.json", "w") as f: @@ -745,6 +766,10 @@ def test_push_filename_vs_path_exclusion( json={"success": True}, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + 
json=mock_structure, + ) httpx_mock.add_response( url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", json=mock_structure, @@ -845,6 +870,10 @@ def test_push_filename_vs_path_inclusion( json={"success": True}, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + json=mock_structure, + ) httpx_mock.add_response( url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", json=mock_structure, @@ -947,6 +976,10 @@ def test_push_directory_name_vs_path_exclusion( json={"success": True}, ) + httpx_mock.add_response( + url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", + json=mock_structure, + ) httpx_mock.add_response( url=f"{base_url}/studio_/backend/api/Project/{project_id}/FileOperations/Structure", json=mock_structure, diff --git a/tests/evaluators/__init__.py b/tests/evaluators/__init__.py new file mode 100644 index 000000000..20e9710bd --- /dev/null +++ b/tests/evaluators/__init__.py @@ -0,0 +1 @@ +"""Test package for evaluator functionality.""" diff --git a/tests/evaluators/test_evaluator_aggregation.py b/tests/evaluators/test_evaluator_aggregation.py new file mode 100644 index 000000000..46a587cc0 --- /dev/null +++ b/tests/evaluators/test_evaluator_aggregation.py @@ -0,0 +1,443 @@ +"""Test module for evaluation result aggregation logic. + +This module tests the deduplication and aggregation functionality +in UiPathEvalOutput.calculate_final_score(). +""" + +import uuid + +import pytest + +from uipath._cli._evals._models._output import ( + EvaluationResultDto, + EvaluationRunResult, + EvaluationRunResultDto, + UiPathEvalOutput, +) + + +class TestEvaluationResultAggregation: + """Test evaluation result aggregation with deduplication in UiPathEvalOutput.""" + + def test_calculate_final_score_empty(self) -> None: + """Test evaluation result aggregation with empty results.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[], + ) + + final_score, agg_metrics = eval_output.calculate_final_score() + + assert final_score == 0.0 + assert agg_metrics == {} + + def test_calculate_final_score_single_evaluator(self) -> None: + """Test evaluation result aggregation with single evaluator across multiple datapoints.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.8), + ) + ], + ), + EvaluationRunResult( + evaluation_name="test2", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ) + ], + ), + EvaluationRunResult( + evaluation_name="test3", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.6), + ) + ], + ), + ], + ) + + final_score, agg_metrics = eval_output.calculate_final_score() + + expected_avg = (0.8 + 1.0 + 0.6) / 3 # 0.8 + assert final_score == pytest.approx(expected_avg) + assert agg_metrics == {"ExactMatchEvaluator": pytest.approx(expected_avg)} + + def test_calculate_final_score_multiple_evaluators(self) -> None: + """Test evaluation result aggregation with multiple evaluators.""" + eval_output = 
UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.8), + ), + EvaluationRunResultDto( + evaluator_name="ContainsEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.9), + ), + ], + ), + EvaluationRunResult( + evaluation_name="test2", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + EvaluationRunResultDto( + evaluator_name="ContainsEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.7), + ), + ], + ), + ], + ) + + final_score, agg_metrics = eval_output.calculate_final_score() + + # ExactMatch avg: (0.8 + 1.0) / 2 = 0.9 + # Contains avg: (0.9 + 0.7) / 2 = 0.8 + # Final avg: (0.9 + 0.8) / 2 = 0.85 + assert final_score == pytest.approx(0.85) + assert agg_metrics == { + "ExactMatchEvaluator": pytest.approx(0.9), + "ContainsEvaluator": pytest.approx(0.8), + } + + def test_calculate_final_score_with_deduplication(self) -> None: + """Test evaluation result aggregation with duplicate evaluator results on same datapoint.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + # Multiple ExactMatch results for same datapoint (should be averaged) + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.8), + ), + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", # Duplicate! + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", # Another duplicate! + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.6), + ), + ], + ), + EvaluationRunResult( + evaluation_name="test2", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.5), + ), + ], + ), + ], + ) + + final_score, agg_metrics = eval_output.calculate_final_score() + + # datapoint1 ExactMatch avg: (0.8 + 1.0 + 0.6) / 3 = 0.8 + # datapoint2 ExactMatch: 0.5 + # Overall ExactMatch avg: (0.8 + 0.5) / 2 = 0.65 + assert final_score == pytest.approx(0.65) + assert agg_metrics == {"ExactMatchEvaluator": pytest.approx(0.65)} + + def test_calculate_final_score_with_weights(self) -> None: + """Test evaluation result aggregation with evaluator weights.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.8), + ), + EvaluationRunResultDto( + evaluator_name="ContainsEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.6), + ), + ], + ), + ], + ) + + # Give ExactMatch twice the weight of Contains + weights = { + "ExactMatchEvaluator": 2.0, + "ContainsEvaluator": 1.0, + } + + final_score, agg_metrics = eval_output.calculate_final_score(weights) + + # Weighted average: (0.8 * 2.0 + 0.6 * 1.0) / (2.0 + 1.0) = 2.2 / 3 = 0.733... 
+ expected_weighted_avg = (0.8 * 2.0 + 0.6 * 1.0) / 3.0 + assert final_score == pytest.approx(expected_weighted_avg) + assert agg_metrics == { + "ExactMatchEvaluator": pytest.approx(0.8), + "ContainsEvaluator": pytest.approx(0.6), + } + + def test_calculate_final_score_missing_weights(self) -> None: + """Test evaluation result aggregation when some evaluators are missing from weights dict.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.8), + ), + EvaluationRunResultDto( + evaluator_name="UnknownEvaluator", # Not in weights + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.6), + ), + ], + ), + ], + ) + + weights = {"ExactMatchEvaluator": 2.0} # Missing UnknownEvaluator + + final_score, agg_metrics = eval_output.calculate_final_score(weights) + + # UnknownEvaluator gets default weight of 1.0 + # Weighted average: (0.8 * 2.0 + 0.6 * 1.0) / (2.0 + 1.0) = 2.2 / 3 + expected_weighted_avg = (0.8 * 2.0 + 0.6 * 1.0) / 3.0 + assert final_score == pytest.approx(expected_weighted_avg) + + def test_calculate_final_score_custom_default_weight(self) -> None: + """Test evaluation result aggregation with custom default weight.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.8), + ), + EvaluationRunResultDto( + evaluator_name="UnknownEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.6), + ), + ], + ), + ], + ) + + weights = {"ExactMatchEvaluator": 2.0} + default_weight = 0.5 # Custom default weight + + final_score, agg_metrics = eval_output.calculate_final_score( + weights, default_weight + ) + + # UnknownEvaluator gets default weight of 0.5 + # Weighted average: (0.8 * 2.0 + 0.6 * 0.5) / (2.0 + 0.5) = 1.9 / 2.5 = 0.76 + expected_weighted_avg = (0.8 * 2.0 + 0.6 * 0.5) / 2.5 + assert final_score == pytest.approx(expected_weighted_avg) + + def test_calculate_final_score_complex_scenario(self) -> None: + """Test evaluation result aggregation with complex scenario.""" + # Scenario: + # datapoint1: ExactMatch[0.5, 1.0] (avg=0.75), Contains[1.0], ToolCallCount[1.0] + # datapoint2: ExactMatch[0.0], Contains[1.0] + # datapoint3: ExactMatch[1.0], ToolCallCount[1.0] + # Expected per evaluator: + # ExactMatch: (0.75 + 0.0 + 1.0) / 3 = 0.583 + # Contains: (1.0 + 1.0) / 2 = 1.0 + # ToolCallCount: (1.0 + 1.0) / 2 = 1.0 + + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatch", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.5), + ), + EvaluationRunResultDto( + evaluator_name="ExactMatch", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + EvaluationRunResultDto( + evaluator_name="Contains", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + EvaluationRunResultDto( + evaluator_name="ToolCallCount", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + ], + ), + 
EvaluationRunResult( + evaluation_name="test2", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatch", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.0), + ), + EvaluationRunResultDto( + evaluator_name="Contains", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + ], + ), + EvaluationRunResult( + evaluation_name="test3", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatch", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + EvaluationRunResultDto( + evaluator_name="ToolCallCount", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + ], + ), + ], + ) + + final_score, agg_metrics = eval_output.calculate_final_score() + + expected_exact_match = (0.75 + 0.0 + 1.0) / 3 # 0.583 + expected_contains = 1.0 + expected_tool_count = 1.0 + expected_final = ( + expected_exact_match + expected_contains + expected_tool_count + ) / 3 + + assert final_score == pytest.approx(expected_final) + assert agg_metrics == { + "ExactMatch": pytest.approx(expected_exact_match), + "Contains": pytest.approx(expected_contains), + "ToolCallCount": pytest.approx(expected_tool_count), + } + + def test_calculate_final_score_single_datapoint_single_evaluator(self) -> None: + """Test simplest case: single datapoint, single evaluator.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.85), + ), + ], + ), + ], + ) + + final_score, agg_metrics = eval_output.calculate_final_score() + + assert final_score == pytest.approx(0.85) + assert agg_metrics == {"ExactMatchEvaluator": pytest.approx(0.85)} + + def test_calculate_final_score_different_evaluators_per_datapoint(self) -> None: + """Test when different datapoints have different evaluators.""" + eval_output = UiPathEvalOutput( + evaluation_set_name="test_set", + evaluation_set_results=[ + EvaluationRunResult( + evaluation_name="test1", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.8), + ), + ], + ), + EvaluationRunResult( + evaluation_name="test2", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ContainsEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.9), + ), + ], + ), + EvaluationRunResult( + evaluation_name="test3", + evaluation_run_results=[ + EvaluationRunResultDto( + evaluator_name="ExactMatchEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + EvaluationRunResultDto( + evaluator_name="ContainsEvaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.7), + ), + ], + ), + ], + ) + + final_score, agg_metrics = eval_output.calculate_final_score() + + # ExactMatch: (0.8 + 1.0) / 2 = 0.9 (appears in test1 and test3) + # Contains: (0.9 + 0.7) / 2 = 0.8 (appears in test2 and test3) + # Final: (0.9 + 0.8) / 2 = 0.85 + assert final_score == pytest.approx(0.85) + assert agg_metrics == { + "ExactMatchEvaluator": pytest.approx(0.9), + "ContainsEvaluator": pytest.approx(0.8), + } diff --git a/tests/evaluators/test_evaluator_helpers.py b/tests/evaluators/test_evaluator_helpers.py new file mode 100644 index 000000000..ea4768c7d 
--- /dev/null +++ b/tests/evaluators/test_evaluator_helpers.py @@ -0,0 +1,822 @@ +"""Test module for evaluator helper functions. + +This module contains comprehensive tests for helper functions used by coded evaluators, +including functions for tool call extraction (`extract_tool_calls`, `extract_tool_calls_names`, +`extract_tool_calls_outputs`) and various scoring functions (`tool_calls_args_score`, +`tool_calls_count_score`, `tool_calls_order_score`, `tool_calls_output_score`). +These tests ensure consistent behavior and proper justification structures for each helper. +""" + +from typing import Any + +import pytest + +from uipath.eval._helpers.evaluators_helpers import ( + extract_tool_calls, + extract_tool_calls_names, + extract_tool_calls_outputs, + tool_calls_args_score, + tool_calls_count_score, + tool_calls_order_score, + tool_calls_output_score, +) +from uipath.eval.models.models import ToolCall, ToolOutput + + +class TestToolCallsOrderScore: + """Test tool_calls_order_score helper function.""" + + def test_empty_both_lists(self) -> None: + """Test when both expected and actual lists are empty.""" + score, justification = tool_calls_order_score([], [], strict=False) + + assert score == 1.0 + assert isinstance(justification, dict) + assert "actual_tool_calls_order" in justification + assert "expected_tool_calls_order" in justification + assert "lcs" in justification + assert justification["lcs"] == [] + + def test_empty_actual_list(self) -> None: + """Test when actual list is empty but expected is not.""" + score, justification = tool_calls_order_score([], ["tool1"], strict=False) + + assert score == 0.0 + assert isinstance(justification, dict) + assert justification["actual_tool_calls_order"] == [] + assert justification["expected_tool_calls_order"] == ["tool1"] + assert justification["lcs"] == [] + + def test_empty_expected_list(self) -> None: + """Test when expected list is empty but actual is not.""" + score, justification = tool_calls_order_score(["tool1"], [], strict=False) + + assert score == 0.0 + assert isinstance(justification, dict) + assert justification["actual_tool_calls_order"] == ["tool1"] + assert justification["expected_tool_calls_order"] == [] + assert justification["lcs"] == [] + + def test_perfect_match_non_strict(self) -> None: + """Test perfect match in non-strict mode.""" + actual = ["tool1", "tool2", "tool3"] + expected = ["tool1", "tool2", "tool3"] + score, justification = tool_calls_order_score(actual, expected, strict=False) + + assert score == 1.0 + assert justification["lcs"] == expected + assert justification["actual_tool_calls_order"] == actual + assert justification["expected_tool_calls_order"] == expected + + def test_perfect_match_strict(self) -> None: + """Test perfect match in strict mode.""" + actual = ["tool1", "tool2", "tool3"] + expected = ["tool1", "tool2", "tool3"] + score, justification = tool_calls_order_score(actual, expected, strict=True) + + assert score == 1.0 + assert justification["lcs"] == expected + + def test_partial_match_non_strict(self) -> None: + """Test partial match in non-strict mode (LCS calculation).""" + actual = ["tool1", "tool3", "tool2"] + expected = ["tool1", "tool2", "tool3"] + score, justification = tool_calls_order_score(actual, expected, strict=False) + + # LCS should be calculated - score should be between 0 and 1 + assert 0.0 < score < 1.0 + assert len(justification["lcs"]) > 0 + + def test_mismatch_strict(self) -> None: + """Test mismatch in strict mode.""" + actual = ["tool2", "tool1"] + expected = 
["tool1", "tool2"] + score, justification = tool_calls_order_score(actual, expected, strict=True) + + assert score == 0.0 + assert justification["lcs"] == [] + + +class TestToolCallsCountScore: + """Test tool_calls_count_score helper function.""" + + def test_empty_both_dicts(self) -> None: + """Test when both expected and actual dicts are empty.""" + score, justification = tool_calls_count_score({}, {}, strict=False) + + assert score == 1.0 + assert isinstance(justification, dict) + assert "explained_tool_calls_count" in justification + assert isinstance(justification["explained_tool_calls_count"], dict) + assert "_result" in justification["explained_tool_calls_count"] + + def test_empty_actual_dict(self) -> None: + """Test when actual dict is empty but expected is not.""" + expected = {"tool1": ("==", 1)} + score, justification = tool_calls_count_score({}, expected, strict=False) + + assert score == 0.0 + assert isinstance(justification["explained_tool_calls_count"], dict) + assert "_result" in justification["explained_tool_calls_count"] + + def test_empty_expected_dict(self) -> None: + """Test when expected dict is empty but actual is not.""" + actual = {"tool1": 1} + score, justification = tool_calls_count_score(actual, {}, strict=False) + + assert score == 0.0 + assert isinstance(justification["explained_tool_calls_count"], dict) + assert "_result" in justification["explained_tool_calls_count"] + + def test_perfect_match_non_strict(self) -> None: + """Test perfect match in non-strict mode.""" + actual = {"tool1": 2, "tool2": 1} + expected = {"tool1": ("==", 2), "tool2": ("==", 1)} + score, justification = tool_calls_count_score(actual, expected, strict=False) + + assert score == 1.0 + assert "tool1" in justification["explained_tool_calls_count"] + assert "tool2" in justification["explained_tool_calls_count"] + assert "Score: 1.0" in justification["explained_tool_calls_count"]["tool1"] + assert "Score: 1.0" in justification["explained_tool_calls_count"]["tool2"] + + def test_partial_match_non_strict(self) -> None: + """Test partial match in non-strict mode.""" + actual = {"tool1": 2, "tool2": 0} + expected = {"tool1": ("==", 2), "tool2": ("==", 1)} + score, justification = tool_calls_count_score(actual, expected, strict=False) + + assert score == 0.5 # 1 out of 2 matches + assert "Score: 1.0" in justification["explained_tool_calls_count"]["tool1"] + assert "Score: 0.0" in justification["explained_tool_calls_count"]["tool2"] + + def test_mismatch_strict(self) -> None: + """Test mismatch in strict mode (early return).""" + actual = {"tool1": 2, "tool2": 0} + expected = {"tool1": ("==", 2), "tool2": ("==", 1)} + score, justification = tool_calls_count_score(actual, expected, strict=True) + + # Should return 0 and only include the failing tool + assert score == 0.0 + assert len(justification["explained_tool_calls_count"]) == 1 + assert "tool2" in justification["explained_tool_calls_count"] + + def test_comparator_operations(self) -> None: + """Test different comparator operations.""" + actual = {"tool1": 5} + + # Test greater than + expected_gt = {"tool1": (">", 3)} + score, justification = tool_calls_count_score(actual, expected_gt, strict=False) + assert score == 1.0 + + # Test less than or equal + expected_le = {"tool1": ("<=", 5)} + score, justification = tool_calls_count_score(actual, expected_le, strict=False) + assert score == 1.0 + + # Test not equal + expected_ne = {"tool1": ("!=", 3)} + score, justification = tool_calls_count_score(actual, expected_ne, strict=False) + assert 
score == 1.0 + + +class TestToolCallsArgsScore: + """Test tool_calls_args_score helper function.""" + + def test_empty_both_lists(self) -> None: + """Test when both expected and actual lists are empty.""" + score, justification = tool_calls_args_score([], [], strict=False) + + assert score == 1.0 + assert isinstance(justification, dict) + assert "explained_tool_calls_args" in justification + assert isinstance(justification["explained_tool_calls_args"], dict) + assert "_result" in justification["explained_tool_calls_args"] + + def test_empty_actual_list(self) -> None: + """Test when actual list is empty but expected is not.""" + expected = [ToolCall(name="tool1", args={"arg": "val"})] + score, justification = tool_calls_args_score([], expected, strict=False) + + assert score == 0.0 + assert isinstance(justification["explained_tool_calls_args"], dict) + assert "_result" in justification["explained_tool_calls_args"] + + def test_empty_expected_list(self) -> None: + """Test when expected list is empty but actual is not.""" + actual = [ToolCall(name="tool1", args={"arg": "val"})] + score, justification = tool_calls_args_score(actual, [], strict=False) + + assert score == 0.0 + assert isinstance(justification["explained_tool_calls_args"], dict) + assert "_result" in justification["explained_tool_calls_args"] + + def test_perfect_match_exact_mode(self) -> None: + """Test perfect match in exact mode (default).""" + actual = [ToolCall(name="tool1", args={"arg1": "val1", "arg2": "val2"})] + expected = [ToolCall(name="tool1", args={"arg1": "val1", "arg2": "val2"})] + score, justification = tool_calls_args_score( + actual, expected, strict=False, subset=False + ) + + assert score == 1.0 + assert "tool1_0" in justification["explained_tool_calls_args"] + assert "Score: 1.0" in justification["explained_tool_calls_args"]["tool1_0"] + + def test_perfect_match_subset_mode(self) -> None: + """Test perfect match in subset mode.""" + actual = [ + ToolCall( + name="tool1", args={"arg1": "val1", "arg2": "val2", "extra": "val"} + ) + ] + expected = [ToolCall(name="tool1", args={"arg1": "val1", "arg2": "val2"})] + score, justification = tool_calls_args_score( + actual, expected, strict=False, subset=True + ) + + assert score == 1.0 + assert "Score: 1.0" in justification["explained_tool_calls_args"]["tool1_0"] + + def test_mismatch_exact_mode(self) -> None: + """Test mismatch in exact mode.""" + actual = [ToolCall(name="tool1", args={"arg1": "val1"})] + expected = [ToolCall(name="tool1", args={"arg1": "val1", "arg2": "val2"})] + score, justification = tool_calls_args_score( + actual, expected, strict=False, subset=False + ) + + assert score == 0.0 + assert "Score: 0.0" in justification["explained_tool_calls_args"]["tool1_0"] + + def test_multiple_tool_calls(self) -> None: + """Test with multiple tool calls.""" + actual = [ + ToolCall(name="tool1", args={"arg1": "val1"}), + ToolCall(name="tool2", args={"arg2": "val2"}), + ] + expected = [ + ToolCall(name="tool1", args={"arg1": "val1"}), + ToolCall(name="tool2", args={"arg2": "val2"}), + ] + score, justification = tool_calls_args_score(actual, expected, strict=False) + + assert score == 1.0 + assert len(justification["explained_tool_calls_args"]) == 2 + assert "tool1_0" in justification["explained_tool_calls_args"] + assert "tool2_0" in justification["explained_tool_calls_args"] + + def test_strict_mode_with_mismatch(self) -> None: + """Test strict mode with partial matches.""" + actual = [ + ToolCall(name="tool1", args={"arg1": "val1"}), + ToolCall(name="tool2", 
args={"arg2": "wrong"}), + ] + expected = [ + ToolCall(name="tool1", args={"arg1": "val1"}), + ToolCall(name="tool2", args={"arg2": "val2"}), + ] + score, justification = tool_calls_args_score(actual, expected, strict=True) + + # In strict mode, partial match should still score proportionally unless all match + assert score == 0.0 # strict mode requires all to match + + +class TestToolCallsOutputScore: + """Test tool_calls_output_score helper function.""" + + def test_empty_both_lists(self) -> None: + """Test when both expected and actual lists are empty.""" + score, justification = tool_calls_output_score([], [], strict=False) + + assert score == 1.0 + assert isinstance(justification, dict) + assert "explained_tool_calls_outputs" in justification + assert isinstance(justification["explained_tool_calls_outputs"], dict) + assert "_result" in justification["explained_tool_calls_outputs"] + + def test_empty_actual_list(self) -> None: + """Test when actual list is empty but expected is not.""" + expected = [ToolOutput(name="tool1", output="output1")] + score, justification = tool_calls_output_score([], expected, strict=False) + + assert score == 0.0 + assert isinstance(justification["explained_tool_calls_outputs"], dict) + assert "_result" in justification["explained_tool_calls_outputs"] + + def test_empty_expected_list(self) -> None: + """Test when expected list is empty but actual is not.""" + actual = [ToolOutput(name="tool1", output="output1")] + score, justification = tool_calls_output_score(actual, [], strict=False) + + assert score == 0.0 + assert isinstance(justification["explained_tool_calls_outputs"], dict) + assert "_result" in justification["explained_tool_calls_outputs"] + + def test_perfect_match_non_strict(self) -> None: + """Test perfect match in non-strict mode.""" + actual = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + expected = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + score, justification = tool_calls_output_score(actual, expected, strict=False) + + assert score == 1.0 + # Check that justifications use per-tool indexed keys + justification_keys = list(justification["explained_tool_calls_outputs"].keys()) + assert "tool1_0" in justification_keys + assert "tool2_0" in justification_keys + + def test_perfect_match_strict(self) -> None: + """Test perfect match in strict mode.""" + actual = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + expected = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + score, justification = tool_calls_output_score(actual, expected, strict=True) + + assert score == 1.0 + + def test_partial_match_non_strict(self) -> None: + """Test partial match in non-strict mode.""" + actual = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="wrong_output"), + ] + expected = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + score, justification = tool_calls_output_score(actual, expected, strict=False) + + assert score == 0.5 # 1 out of 2 matches + # Check individual scores in justification + justification_values = list( + justification["explained_tool_calls_outputs"].values() + ) + assert any("Score: 1.0" in val for val in justification_values) + assert any("Score: 0.0" in val for val in justification_values) + + def test_mismatch_strict_early_return(self) -> None: + 
"""Test mismatch in strict mode (early return).""" + actual = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="wrong_output"), + ] + expected = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + score, justification = tool_calls_output_score(actual, expected, strict=True) + + # Should return 0 immediately on first mismatch + assert score == 0.0 + # Should only contain the failing tool call in justification + assert len(justification["explained_tool_calls_outputs"]) == 1 + + def test_duplicate_tool_names(self) -> None: + """Test with duplicate tool names (one-to-one matching).""" + actual = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool1", output="output2"), + ] + expected = [ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool1", output="output2"), + ] + score, justification = tool_calls_output_score(actual, expected, strict=False) + + assert score == 1.0 + # Should have per-tool indexed keys to distinguish duplicate tool names + justification_keys = list(justification["explained_tool_calls_outputs"].keys()) + assert "tool1_0" in justification_keys + assert "tool1_1" in justification_keys + + +class TestExtractionFunctions: + """Test extraction functions used by evaluators.""" + + @pytest.fixture + def sample_spans(self) -> list[Any]: + """Create sample ReadableSpan objects for testing.""" + from opentelemetry.sdk.trace import ReadableSpan + + return [ + ReadableSpan( + name="tool1", + start_time=0, + end_time=1, + attributes={ + "tool.name": "tool1", + "input.value": "{'arg1': 'value1', 'arg2': 42}", + "output.value": '{"content": "result1"}', + }, + ), + ReadableSpan( + name="tool2", + start_time=1, + end_time=2, + attributes={ + "tool.name": "tool2", + "input.value": "{'param': 'test'}", + "output.value": '{"content": "result2"}', + }, + ), + ReadableSpan( + name="non_tool_span", + start_time=2, + end_time=3, + attributes={ + "span.type": "other", + "some.data": "value", + }, + ), + ReadableSpan( + name="tool3", + start_time=3, + end_time=4, + attributes={ + "tool.name": "tool3", + "input.value": "{}", + "output.value": '{"content": ""}', + }, + ), + ] + + @pytest.fixture + def spans_with_json_input(self) -> list[Any]: + """Create spans with JSON string input values.""" + from opentelemetry.sdk.trace import ReadableSpan + + return [ + ReadableSpan( + name="json_tool", + start_time=0, + end_time=1, + attributes={ + "tool.name": "json_tool", + "input.value": '{"key": "value", "number": 123}', + "output.value": '{"content": "json_result"}', + }, + ), + ] + + @pytest.fixture + def spans_with_dict_input(self) -> list[Any]: + """Create spans with dict input values.""" + from opentelemetry.sdk.trace import ReadableSpan + + return [ + ReadableSpan( + name="dict_tool", + start_time=0, + end_time=1, + attributes={ # pyright: ignore[reportArgumentType] + "tool.name": "dict_tool", + "input.value": {"direct": "dict", "num": 456}, # type: ignore[dict-item] + "output.value": {"content": "dict_result"}, # type: ignore[dict-item] + }, + ), + ] + + @pytest.fixture + def spans_with_invalid_input(self) -> list[Any]: + """Create spans with invalid input values (for testing input parsing).""" + from opentelemetry.sdk.trace import ReadableSpan + + return [ + ReadableSpan( + name="invalid_tool", + start_time=0, + end_time=1, + attributes={ + "tool.name": "invalid_tool", + "input.value": "invalid json {", + "output.value": '{"content": "invalid_result"}', + }, + ), + ] + + def 
test_extract_tool_calls_names_empty(self) -> None: + """Test tool call name extraction with empty list.""" + result = extract_tool_calls_names([]) + assert isinstance(result, list) + assert result == [] + + def test_extract_tool_calls_names_with_tools(self, sample_spans: list[Any]) -> None: + """Test tool call name extraction with actual tool spans.""" + result = extract_tool_calls_names(sample_spans) + + assert isinstance(result, list) + assert len(result) == 3 # Only spans with tool.name attribute + assert result == ["tool1", "tool2", "tool3"] + + def test_extract_tool_calls_names_preserves_order( + self, sample_spans: list[Any] + ) -> None: + """Test that tool call name extraction preserves order.""" + # Reverse the spans to test order preservation + reversed_spans = list(reversed(sample_spans)) + result = extract_tool_calls_names(reversed_spans) + + # Should be in reverse order since we reversed the input + expected = ["tool3", "tool2", "tool1"] + assert result == expected + + def test_extract_tool_calls_names_filters_non_tool_spans( + self, sample_spans: list[Any] + ) -> None: + """Test that non-tool spans are filtered out.""" + result = extract_tool_calls_names(sample_spans) + + # Should not include 'non_tool_span' which doesn't have tool.name + assert "non_tool_span" not in result + assert len(result) == 3 + + def test_extract_tool_calls_empty(self) -> None: + """Test tool call extraction with empty list.""" + result = extract_tool_calls([]) + assert isinstance(result, list) + assert result == [] + + def test_extract_tool_calls_with_string_input( + self, sample_spans: list[Any] + ) -> None: + """Test tool call extraction with string input values.""" + result = extract_tool_calls(sample_spans) + + assert isinstance(result, list) + assert len(result) == 3 + + # Check first tool call + tool1 = result[0] + assert tool1.name == "tool1" + assert tool1.args == {"arg1": "value1", "arg2": 42} + + # Check second tool call + tool2 = result[1] + assert tool2.name == "tool2" + assert tool2.args == {"param": "test"} + + # Check third tool call (empty args) + tool3 = result[2] + assert tool3.name == "tool3" + assert tool3.args == {} + + def test_extract_tool_calls_with_dict_input( + self, spans_with_dict_input: list[Any] + ) -> None: + """Test tool call extraction with direct dict input values.""" + result = extract_tool_calls(spans_with_dict_input) + + assert len(result) == 1 + tool_call = result[0] + assert tool_call.name == "dict_tool" + assert tool_call.args == {"direct": "dict", "num": 456} + + def test_extract_tool_calls_with_invalid_input( + self, spans_with_invalid_input: list[Any] + ) -> None: + """Test tool call extraction with invalid JSON input.""" + result = extract_tool_calls(spans_with_invalid_input) + + assert len(result) == 1 + tool_call = result[0] + assert tool_call.name == "invalid_tool" + assert tool_call.args == {} # Should default to empty dict on parse error + + def test_extract_tool_calls_missing_input_value(self) -> None: + """Test tool call extraction when input.value is missing.""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + name="missing_input_tool", + start_time=0, + end_time=1, + attributes={ + "tool.name": "missing_input_tool", + # No input.value attribute + "output.value": "result", + }, + ) + + result = extract_tool_calls([span]) + assert len(result) == 1 + assert result[0].name == "missing_input_tool" + assert result[0].args == {} + + def test_extract_tool_calls_outputs_empty(self) -> None: + """Test tool call output extraction 
with empty list.""" + result = extract_tool_calls_outputs([]) + assert isinstance(result, list) + assert result == [] + + def test_extract_tool_calls_outputs_with_tools( + self, sample_spans: list[Any] + ) -> None: + """Test tool call output extraction with actual tool spans.""" + result = extract_tool_calls_outputs(sample_spans) + + assert isinstance(result, list) + assert len(result) == 3 # Only spans with tool.name attribute + + # Check outputs + assert result[0].name == "tool1" + assert result[0].output == "result1" + + assert result[1].name == "tool2" + assert result[1].output == "result2" + + assert result[2].name == "tool3" + assert result[2].output == "" + + def test_extract_tool_calls_outputs_missing_output_value(self) -> None: + """Test tool call output extraction when output.value is missing.""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + name="missing_output_tool", + start_time=0, + end_time=1, + attributes={ + "tool.name": "missing_output_tool", + "input.value": "{}", + # No output.value attribute + }, + ) + + result = extract_tool_calls_outputs([span]) + assert len(result) == 1 + assert result[0].name == "missing_output_tool" + assert result[0].output == "" # Should default to empty string + + def test_extract_tool_calls_outputs_preserves_order( + self, sample_spans: list[Any] + ) -> None: + """Test that tool call output extraction preserves order.""" + result = extract_tool_calls_outputs(sample_spans) + + # Should match the order of spans with tool.name + expected_names = ["tool1", "tool2", "tool3"] + actual_names = [output.name for output in result] + assert actual_names == expected_names + + def test_extract_tool_calls_outputs_filters_non_tool_spans( + self, sample_spans: list[Any] + ) -> None: + """Test that non-tool spans are filtered out from outputs.""" + result = extract_tool_calls_outputs(sample_spans) + + # Should not include outputs from spans without tool.name + output_names = [output.name for output in result] + assert "non_tool_span" not in output_names + assert len(result) == 3 + + def test_all_extraction_functions_consistent(self, sample_spans: list[Any]) -> None: + """Test that all extraction functions return consistent results.""" + names = extract_tool_calls_names(sample_spans) + calls = extract_tool_calls(sample_spans) + outputs = extract_tool_calls_outputs(sample_spans) + + # All should return the same number of items + assert len(names) == len(calls) == len(outputs) + + # Names should match across all extractions + call_names = [call.name for call in calls] + output_names = [output.name for output in outputs] + + assert names == call_names == output_names + + def test_extract_tool_calls_outputs_with_invalid_json(self) -> None: + """Test tool call output extraction with invalid JSON in output.value.""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + name="invalid_json_output_tool", + start_time=0, + end_time=1, + attributes={ + "tool.name": "invalid_json_output_tool", + "input.value": "{}", + "output.value": "not valid json {", + }, + ) + + result = extract_tool_calls_outputs([span]) + assert len(result) == 1 + assert result[0].name == "invalid_json_output_tool" + # Should use the string as-is when JSON parsing fails + assert result[0].output == "not valid json {" + + def test_extract_tool_calls_outputs_json_without_content(self) -> None: + """Test tool call output extraction with JSON that has no content field.""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + 
name="no_content_tool", + start_time=0, + end_time=1, + attributes={ + "tool.name": "no_content_tool", + "input.value": "{}", + "output.value": '{"status": "success", "data": "some data"}', + }, + ) + + result = extract_tool_calls_outputs([span]) + assert len(result) == 1 + assert result[0].name == "no_content_tool" + # Should default to empty string when content field is missing + assert result[0].output == "" + + def test_extract_tool_calls_outputs_with_dict_output(self) -> None: + """Test tool call output extraction when output.value is already a dict.""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + name="dict_output_tool", + start_time=0, + end_time=1, + attributes={ # pyright: ignore[reportArgumentType] + "tool.name": "dict_output_tool", + "input.value": "{}", + "output.value": {"content": "dict output value"}, # type: ignore[dict-item] + }, + ) + + result = extract_tool_calls_outputs([span]) + assert len(result) == 1 + assert result[0].name == "dict_output_tool" + assert result[0].output == "dict output value" + + def test_extract_tool_calls_outputs_with_dict_without_content(self) -> None: + """Test tool call output extraction when output.value is a dict without content field.""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + name="dict_no_content_tool", + start_time=0, + end_time=1, + attributes={ # pyright: ignore[reportArgumentType] + "tool.name": "dict_no_content_tool", + "input.value": "{}", + "output.value": {"result": "some result", "status": "ok"}, # type: ignore[dict-item] + }, + ) + + result = extract_tool_calls_outputs([span]) + assert len(result) == 1 + assert result[0].name == "dict_no_content_tool" + # Should default to empty string when content field is missing from dict + assert result[0].output == "" + + def test_extract_tool_calls_outputs_with_non_string_non_dict(self) -> None: + """Test tool call output extraction with non-string, non-dict output.value.""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + name="numeric_output_tool", + start_time=0, + end_time=1, + attributes={ # pyright: ignore[reportArgumentType] + "tool.name": "numeric_output_tool", + "input.value": "{}", + "output.value": 12345, + }, + ) + + result = extract_tool_calls_outputs([span]) + assert len(result) == 1 + assert result[0].name == "numeric_output_tool" + # Should convert to string for non-string, non-dict types + assert result[0].output == "12345" + + def test_extract_tool_calls_outputs_with_json_non_dict_value(self) -> None: + """Test tool call output extraction when JSON parses to non-dict (e.g., array).""" + from opentelemetry.sdk.trace import ReadableSpan + + span = ReadableSpan( + name="json_array_tool", + start_time=0, + end_time=1, + attributes={ + "tool.name": "json_array_tool", + "input.value": "{}", + "output.value": '["item1", "item2", "item3"]', + }, + ) + + result = extract_tool_calls_outputs([span]) + assert len(result) == 1 + assert result[0].name == "json_array_tool" + # Should use the original string when parsed JSON is not a dict + assert result[0].output == '["item1", "item2", "item3"]' diff --git a/tests/evaluators/test_evaluator_methods.py b/tests/evaluators/test_evaluator_methods.py new file mode 100644 index 000000000..e270e07a8 --- /dev/null +++ b/tests/evaluators/test_evaluator_methods.py @@ -0,0 +1,1449 @@ +"""Tests for evaluator evaluate() methods. 
+ +This module tests the actual evaluation functionality of all evaluators: +- ContainsEvaluator.evaluate() +- ExactMatchEvaluator.evaluate() +- JsonSimilarityEvaluator.evaluate() +- LLMJudgeOutputEvaluator.evaluate() +- LLMJudgeTrajectoryEvaluator.evaluate() +- ToolCallOrderEvaluator.evaluate() +- ToolCallCountEvaluator.evaluate() +- ToolCallArgsEvaluator.evaluate() +- ToolCallOutputEvaluator.evaluate() +""" + +import math +import uuid +from typing import Any + +import pytest +from opentelemetry.sdk.trace import ReadableSpan +from pytest_mock.plugin import MockerFixture + +from uipath.eval.evaluators.contains_evaluator import ( + ContainsEvaluationCriteria, + ContainsEvaluator, +) +from uipath.eval.evaluators.exact_match_evaluator import ExactMatchEvaluator +from uipath.eval.evaluators.json_similarity_evaluator import ( + JsonSimilarityEvaluator, +) +from uipath.eval.evaluators.llm_judge_output_evaluator import ( + LLMJudgeOutputEvaluator, +) +from uipath.eval.evaluators.llm_judge_trajectory_evaluator import ( + LLMJudgeTrajectoryEvaluator, + TrajectoryEvaluationCriteria, +) +from uipath.eval.evaluators.output_evaluator import OutputEvaluationCriteria +from uipath.eval.evaluators.tool_call_args_evaluator import ( + ToolCallArgsEvaluationCriteria, + ToolCallArgsEvaluator, +) +from uipath.eval.evaluators.tool_call_count_evaluator import ( + ToolCallCountEvaluationCriteria, + ToolCallCountEvaluator, +) +from uipath.eval.evaluators.tool_call_order_evaluator import ( + ToolCallOrderEvaluationCriteria, + ToolCallOrderEvaluator, +) +from uipath.eval.evaluators.tool_call_output_evaluator import ( + ToolCallOutputEvaluationCriteria, + ToolCallOutputEvaluator, + ToolCallOutputEvaluatorJustification, +) +from uipath.eval.models import NumericEvaluationResult +from uipath.eval.models.models import ( + AgentExecution, + ToolCall, + ToolOutput, + UiPathEvaluationError, +) + + +@pytest.fixture +def sample_agent_execution() -> AgentExecution: + """Create a sample AgentExecution for testing.""" + return AgentExecution( + agent_input={"input": "Test input"}, + agent_output={"output": "Test output"}, + agent_trace=[], # Empty trace for basic tests + ) + + +@pytest.fixture +def sample_agent_execution_with_trace() -> AgentExecution: + """Create a sample AgentExecution with tool call trace.""" + # Mock spans that represent tool calls - simplified for testing + mock_spans = [ + ReadableSpan( + name="tool1", + start_time=0, + end_time=1, + attributes={ + "tool.name": "tool1", + "input.value": "{'arg1': 'value1'}", + "output.value": '{"content": "output1"}', + }, + ), + ReadableSpan( + name="tool2", + start_time=1, + end_time=2, + attributes={ + "tool.name": "tool2", + "input.value": "{'arg2': 'value2'}", + "output.value": '{"content": "output2"}', + }, + ), + ReadableSpan( + name="tool1", + start_time=2, + end_time=3, + attributes={ + "tool.name": "tool1", + "input.value": "{'arg1': 'value1'}", + "output.value": '{"content": "output1"}', + }, + ), + ReadableSpan( + name="tool2", + start_time=3, + end_time=4, + attributes={ + "tool.name": "tool2", + "input.value": "{'arg2': 'value2'}", + "output.value": '{"content": "output2"}', + }, + ), + ] + + return AgentExecution( + agent_input={"input": "Test input with tools"}, + agent_output={ + "output": "Test output with tools", + }, + agent_trace=mock_spans, + ) + + +class TestExactMatchEvaluator: + """Test ExactMatchEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_exact_match_string_success( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test exact match with matching strings.""" + config = { + "name": "ExactMatchTest", + 
"case_sensitive": True, + "default_evaluation_criteria": {"expected_output": "test"}, + } + evaluator = ExactMatchEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria(expected_output={"output": "Test output"}) + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_exact_match_string_failure( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test exact match with non-matching strings.""" + config = { + "name": "ExactMatchTest", + "case_sensitive": True, + "default_evaluation_criteria": {"expected_output": "test"}, + } + evaluator = ExactMatchEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria( + expected_output={"output": "Different output"} + ) + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_exact_match_negated( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test exact match with negated criteria.""" + config = { + "name": "ExactMatchTest", + "case_sensitive": True, + "negated": True, + } + evaluator = ExactMatchEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria( + expected_output={"output": "Test output"}, + ) + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_exact_match_validate_and_evaluate_criteria( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test exact match using validate_and_evaluate_criteria.""" + config = { + "name": "ExactMatchTest", + "case_sensitive": True, + } + evaluator = ExactMatchEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = {"expected_output": {"output": "Test output"}} + + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution, raw_criteria + ) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestContainsEvaluator: + """Test ContainsEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_contains_evaluator( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test contains evaluator.""" + config = { + "name": "ContainsTest", + "target_output_key": "output", + "default_evaluation_criteria": {"search_text": "Test output"}, + } + evaluator = ContainsEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ContainsEvaluationCriteria(search_text="Test output") + result = await evaluator.evaluate(sample_agent_execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_contains_evaluator_negated( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test contains evaluator with negated criteria.""" + config = { + "name": "ContainsTest", + "negated": True, + "target_output_key": "output", + "default_evaluation_criteria": {"search_text": "Test output"}, + } + evaluator = ContainsEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ContainsEvaluationCriteria(search_text="Test output") + result = await 
evaluator.evaluate(sample_agent_execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_contains_evaluator_validate_and_evaluate_criteria( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test contains evaluator with validate_and_evaluate_criteria.""" + config = { + "name": "ContainsTest", + "target_output_key": "*", + "default_evaluation_criteria": {"search_text": "Test output"}, + } + evaluator = ContainsEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ContainsEvaluationCriteria(search_text="Test output") + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution, criteria + ) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestJsonSimilarityEvaluator: + """Test JsonSimilarityEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_json_similarity_identical(self) -> None: + """Test JSON similarity with identical structures.""" + execution = AgentExecution( + agent_input={"input": "Test"}, + agent_output={"name": "John", "age": 30, "city": "NYC"}, + agent_trace=[], + ) + config = { + "name": "JsonSimilarityTest", + } + evaluator = JsonSimilarityEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria( + expected_output={"name": "John", "age": 30, "city": "NYC"} + ) + + result = await evaluator.evaluate(execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_json_similarity_partial_match(self) -> None: + """Test JSON similarity with partial matches.""" + execution = AgentExecution( + agent_input={"input": "Test"}, + agent_output={"name": "John", "age": 30, "city": "LA"}, + agent_trace=[], + ) + config = { + "name": "JsonSimilarityTest", + "default_evaluation_criteria": {"expected_output": "test"}, + } + evaluator = JsonSimilarityEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria( + expected_output={"name": "John", "age": 30, "city": "NYC"} + ) + + result = await evaluator.evaluate(execution, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert math.isclose(result.score, 0.666, abs_tol=1e-3) + + @pytest.mark.asyncio + async def test_json_similarity_validate_and_evaluate_criteria(self) -> None: + """Test JSON similarity using validate_and_evaluate_criteria.""" + execution = AgentExecution( + agent_input={"input": "Test"}, + agent_output={"name": "John", "age": 30, "city": "NYC"}, + agent_trace=[], + ) + config = { + "name": "JsonSimilarityTest", + } + evaluator = JsonSimilarityEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = {"expected_output": {"name": "John", "age": 30, "city": "NYC"}} + + result = await evaluator.validate_and_evaluate_criteria(execution, raw_criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestToolCallOrderEvaluator: + """Test ToolCallOrderEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_tool_call_order_perfect_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call order with perfect order match.""" + + config = { + "name": "ToolOrderTest", + "strict": True, + } + + evaluator = ToolCallOrderEvaluator.model_validate( + {"config": config, "id": 
str(uuid.uuid4())} + ) + criteria = ToolCallOrderEvaluationCriteria( + tool_calls_order=["tool1", "tool2", "tool1", "tool2"] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_tool_call_order_no_perfect_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call order without a perfect match in strict mode.""" + + config = { + "name": "ToolOrderTest", + "strict": True, + } + + evaluator = ToolCallOrderEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOrderEvaluationCriteria( + tool_calls_order=["tool1", "tool1", "tool2", "tool2"] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_tool_call_order_lcs_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call order with a partial match using LCS (non-strict mode).""" + + config = { + "name": "ToolOrderTest", + "strict": False, + } + evaluator = ToolCallOrderEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOrderEvaluationCriteria( + tool_calls_order=["tool1", "tool1", "tool2", "tool2"] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.75 + + @pytest.mark.asyncio + async def test_tool_call_order_validate_and_evaluate_criteria( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call order using validate_and_evaluate_criteria.""" + config = { + "name": "ToolOrderTest", + "strict": True, + } + evaluator = ToolCallOrderEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = {"tool_calls_order": ["tool1", "tool2", "tool1", "tool2"]} + + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution_with_trace, raw_criteria + ) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestToolCallCountEvaluator: + """Test ToolCallCountEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_tool_call_count_exact_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call count with exact count match.""" + config = { + "name": "ToolCountTest", + "strict": True, + } + evaluator = ToolCallCountEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallCountEvaluationCriteria( + tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 2)} + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_tool_call_count_with_gt( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call count with greater-than count comparisons.""" + config = { + "name": "ToolCountTest", + "strict": True, + } + evaluator = ToolCallCountEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallCountEvaluationCriteria( + tool_calls_count={"tool1": (">", 1), "tool2": (">", 1)} + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, 
NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_tool_call_count_no_exact_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call count with no exact count match.""" + config = { + "name": "ToolCountTest", + "strict": True, + } + evaluator = ToolCallCountEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallCountEvaluationCriteria( + tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 1)} + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_tool_call_count_partial_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call count with partial count match.""" + config = { + "name": "ToolCountTest", + "strict": False, + } + evaluator = ToolCallCountEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallCountEvaluationCriteria( + tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 1)} + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.5 + + @pytest.mark.asyncio + async def test_tool_call_count_validate_and_evaluate_criteria( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call count using validate_and_evaluate_criteria.""" + config = { + "name": "ToolCountTest", + "strict": True, + } + evaluator = ToolCallCountEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = {"tool_calls_count": {"tool1": ("=", 2), "tool2": ("=", 2)}} + + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution_with_trace, raw_criteria + ) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestToolCallArgsEvaluator: + """Test ToolCallArgsEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_tool_call_args_perfect_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call args with perfect match.""" + config = { + "name": "ToolArgsTest", + "strict": True, + } + evaluator = ToolCallArgsEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallArgsEvaluationCriteria( + tool_calls=[ + ToolCall(name="tool1", args={"arg1": "value1"}), + ToolCall(name="tool2", args={"arg2": "value2"}), + ToolCall(name="tool1", args={"arg1": "value1"}), + ToolCall(name="tool2", args={"arg2": "value2"}), + ] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_tool_call_args_partial_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call args with partial match.""" + config = { + "name": "ToolArgsTest", + "strict": False, + } + evaluator = ToolCallArgsEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallArgsEvaluationCriteria( + tool_calls=[ + ToolCall(name="tool1", args={"arg1": "value1"}), + ToolCall(name="tool2", args={"arg2": "value1"}), + ToolCall(name="tool1", args={"arg1": "value1"}), + ToolCall(name="tool2", args={"arg2": "value2"}), + ] + ) + + result = await 
evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.75 + + @pytest.mark.asyncio + async def test_tool_call_args_validate_and_evaluate_criteria( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call args using validate_and_evaluate_criteria.""" + config = { + "name": "ToolArgsTest", + "strict": True, + } + evaluator = ToolCallArgsEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = { + "tool_calls": [ + {"name": "tool1", "args": {"arg1": "value1"}}, + {"name": "tool2", "args": {"arg2": "value2"}}, + {"name": "tool1", "args": {"arg1": "value1"}}, + {"name": "tool2", "args": {"arg2": "value2"}}, + ] + } + + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution_with_trace, raw_criteria + ) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestToolCallOutputEvaluator: + """Test ToolCallOutputEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_tool_call_output_perfect_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call output with perfect output match.""" + config = { + "name": "ToolOutputTest", + "strict": True, + } + evaluator = ToolCallOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOutputEvaluationCriteria( + tool_outputs=[ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + @pytest.mark.asyncio + async def test_tool_call_output_partial_match( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call output with partial output match.""" + config = { + "name": "ToolOutputTest", + "strict": False, + } + evaluator = ToolCallOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOutputEvaluationCriteria( + tool_outputs=[ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="wrong_output"), + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.75 + + @pytest.mark.asyncio + async def test_tool_call_output_no_match_strict( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call output with no match in strict mode.""" + config = { + "name": "ToolOutputTest", + "strict": True, + } + evaluator = ToolCallOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOutputEvaluationCriteria( + tool_outputs=[ + ToolOutput(name="tool1", output="wrong_output1"), + ToolOutput(name="tool2", output="output2"), + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_tool_call_output_partial_match_non_strict( + 
self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call output with partial match in non-strict mode.""" + config = { + "name": "ToolOutputTest", + "strict": False, + } + evaluator = ToolCallOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOutputEvaluationCriteria( + tool_outputs=[ + ToolOutput(name="tool1", output="wrong_output1"), + ToolOutput(name="tool2", output="output2"), + ] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.5 + + @pytest.mark.asyncio + async def test_tool_call_output_empty_criteria( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call output with empty criteria.""" + config = { + "name": "ToolOutputTest", + "strict": False, + } + evaluator = ToolCallOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOutputEvaluationCriteria(tool_outputs=[]) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + + @pytest.mark.asyncio + async def test_tool_call_output_validate_and_evaluate_criteria( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test tool call output using validate_and_evaluate_criteria.""" + config = { + "name": "ToolOutputTest", + "strict": True, + } + evaluator = ToolCallOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = { + "tool_outputs": [ + {"name": "tool1", "output": "output1"}, + {"name": "tool2", "output": "output2"}, + {"name": "tool1", "output": "output1"}, + {"name": "tool2", "output": "output2"}, + ] + } + + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution_with_trace, raw_criteria + ) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + + +class TestLlmAsAJudgeEvaluator: + """Test LlmAsAJudgeEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_llm_judge_basic_evaluation( + self, sample_agent_execution: AgentExecution, mocker: MockerFixture + ) -> None: + """Test LLM as judge basic evaluation functionality.""" + # Mock the UiPath constructor to avoid authentication + mock_uipath = mocker.MagicMock() + mock_llm = mocker.MagicMock() + mock_uipath.llm = mock_llm + + # Mock the chat completions response as an async method + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock( + content='{"score": 80, "justification": "Good response that meets criteria"}' + ) + ) + ] + + # Make chat_completions an async method + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + mock_llm.chat_completions = mock_chat_completions + + mocker.patch("uipath.UiPath", return_value=mock_uipath) + + config = { + "name": "LlmJudgeTest", + "prompt": "Rate this output: {{ActualOutput}} vs {{ExpectedOutput}}", + "model": "gpt-4o-2024-08-06", + } + evaluator = LLMJudgeOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + + criteria = OutputEvaluationCriteria(expected_output="Expected output") + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + # Verify the result + assert hasattr(result, "score") + assert isinstance(result, NumericEvaluationResult), f"Result is {result}" + 
assert result.score == 0.8, f"Result score is {result.score}" + + @pytest.mark.asyncio + async def test_llm_judge_basic_evaluation_with_llm_service( + self, sample_agent_execution: AgentExecution, mocker: MockerFixture + ) -> None: + """Test LLM judge basic evaluation functionality with a custom LLM service.""" + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock( + content='{"score": 80, "justification": "Good response that meets criteria"}' + ) + ) + ] + + # Make chat_completions an async method + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + config = { + "name": "LlmJudgeTest", + "prompt": "Rate this output: {{ActualOutput}} vs {{ExpectedOutput}}", + "model": "gpt-4o-2024-08-06", + } + evaluator = LLMJudgeOutputEvaluator.model_validate( + { + "config": config, + "llm_service": mock_chat_completions, + "id": str(uuid.uuid4()), + } + ) + + criteria = OutputEvaluationCriteria(expected_output="Expected output") + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + # Verify the result + assert hasattr(result, "score") + assert isinstance(result, NumericEvaluationResult), f"Result is {result}" + assert result.score == 0.8, f"Result score is {result.score}" + + @pytest.mark.asyncio + async def test_llm_judge_validate_and_evaluate_criteria( + self, sample_agent_execution: AgentExecution, mocker: MockerFixture + ) -> None: + """Test LLM judge using validate_and_evaluate_criteria.""" + # Mock the UiPath constructor to avoid authentication + mock_uipath = mocker.MagicMock() + mock_llm = mocker.MagicMock() + mock_uipath.llm = mock_llm + + # Mock the chat completions response as an async method + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock( + content='{"score": 75, "justification": "Good response using raw criteria"}' + ) + ) + ] + + # Make chat_completions an async method + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + mock_llm.chat_completions = mock_chat_completions + + # Mock the UiPath import and constructor + mocker.patch("uipath.UiPath", return_value=mock_uipath) + + config = { + "name": "LlmJudgeTest", + "prompt": "Rate this output: {{ActualOutput}} vs {{ExpectedOutput}}", + "model": "gpt-4", + } + evaluator = LLMJudgeOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = {"expected_output": "Expected output"} + + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution, raw_criteria + ) + + # Verify the result + assert hasattr(result, "score") + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.75 + + +class TestLlmJudgeTrajectoryEvaluator: + """Test LlmJudgeTrajectoryEvaluator.evaluate() method.""" + + @pytest.mark.asyncio + async def test_llm_trajectory_basic_evaluation( + self, sample_agent_execution: AgentExecution, mocker: MockerFixture + ) -> None: + """Test LLM trajectory judge basic evaluation functionality.""" + # Mock the UiPath constructor to avoid authentication + mock_uipath = mocker.MagicMock() + mock_llm = mocker.MagicMock() + mock_uipath.llm = mock_llm + + # Mock the chat completions response as an async method + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock( + content='{"score": 90, "justification": "The agent followed the expected behavior and met the criteria"}' + ) + ) + ] + 
+ # Make chat_completions an async method + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + mock_llm.chat_completions = mock_chat_completions + + # Mock the UiPath import and constructor + mocker.patch("uipath.UiPath", return_value=mock_uipath) + + config = { + "name": "LlmTrajectoryTest", + "prompt": "Evaluate this trajectory: {{AgentRunHistory}} vs {{ExpectedAgentBehavior}} given the following input: {{UserOrSyntheticInput}} instructions: {{SimulationInstructions}}", + "model": "gpt-4", + } + evaluator = LLMJudgeTrajectoryEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + + criteria = TrajectoryEvaluationCriteria( + expected_agent_behavior="Agent should respond helpfully" + ) + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + # Verify the result + assert hasattr(result, "score") + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.9 + + @pytest.mark.asyncio + async def test_llm_trajectory_validate_and_evaluate_criteria( + self, sample_agent_execution: AgentExecution, mocker: MockerFixture + ) -> None: + """Test LLM trajectory judge using validate_and_evaluate_criteria.""" + # Mock the UiPath constructor to avoid authentication + mock_uipath = mocker.MagicMock() + mock_llm = mocker.MagicMock() + mock_uipath.llm = mock_llm + + # Mock the chat completions response as an async method + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock( + content='{"score": 85, "justification": "The agent behavior was good using raw criteria"}' + ) + ) + ] + + # Make chat_completions an async method + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + mock_llm.chat_completions = mock_chat_completions + + # Mock the UiPath import and constructor + mocker.patch("uipath.UiPath", return_value=mock_uipath) + + config = { + "name": "LlmTrajectoryTest", + "prompt": "Evaluate this trajectory: {{AgentRunHistory}} vs {{ExpectedAgentBehavior}} given the following input: {{UserOrSyntheticInput}} instructions: {{SimulationInstructions}}", + "model": "gpt-4", + } + evaluator = LLMJudgeTrajectoryEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + raw_criteria = {"expected_agent_behavior": "Agent should respond helpfully"} + + result = await evaluator.validate_and_evaluate_criteria( + sample_agent_execution, raw_criteria + ) + + # Verify the result + assert hasattr(result, "score") + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.85 + + +class TestEvaluatorErrorHandling: + """Test error handling in evaluators.""" + + @pytest.mark.asyncio + async def test_invalid_criteria_type(self) -> None: + """Test that evaluators handle invalid criteria types properly.""" + config = { + "name": "ErrorTest", + "default_evaluation_criteria": {"expected_output": "test"}, + } + evaluator = ExactMatchEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + + with pytest.raises(UiPathEvaluationError): + # Try to validate invalid criteria + evaluator.validate_evaluation_criteria("invalid_criteria") + + @pytest.mark.asyncio + async def test_missing_config_fields(self, mocker: MockerFixture) -> None: + """Test that evaluators properly validate config fields.""" + # Mock the UiPath constructor to avoid authentication + mock_uipath = mocker.MagicMock() + mocker.patch("uipath.UiPath", return_value=mock_uipath) + + config = { + "name": 
"LLMJudgeEvaluator", + "default_evaluation_criteria": {}, + } + + with pytest.raises(UiPathEvaluationError, match="Field required"): + # Missing required field 'model' + LLMJudgeOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + + +class TestEvaluationResultTypes: + """Test that all evaluators return proper result types.""" + + @pytest.mark.asyncio + async def test_evaluators_return_results_with_scores( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test that evaluators return results with scores.""" + config = { + "name": "Test", + } + evaluator = ExactMatchEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria(expected_output={"output": "Test output"}) + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + assert hasattr(result, "score") + assert isinstance(result.score, (int, float)) + + +class TestJustificationHandling: + """Test justification handling in all evaluators.""" + + @pytest.mark.asyncio + async def test_exact_match_evaluator_justification( + self, sample_agent_execution: AgentExecution + ) -> None: + """Test that ExactMatchEvaluator handles None justification correctly.""" + config = { + "name": "ExactMatchTest", + "case_sensitive": True, + } + evaluator = ExactMatchEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria(expected_output={"output": "Test output"}) + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + # Should be NumericEvaluationResult with no justification (None) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + # Justification should be None for non-LLM evaluators + assert ( + not hasattr(result, "justification") + or getattr(result, "justification", None) is None + ) + + @pytest.mark.asyncio + async def test_json_similarity_evaluator_justification(self) -> None: + """Test that JsonSimilarityEvaluator handles None justification correctly.""" + execution = AgentExecution( + agent_input={"input": "Test"}, + agent_output={"name": "John", "age": 30, "city": "NYC"}, + agent_trace=[], + ) + config = { + "name": "JsonSimilarityTest", + } + evaluator = JsonSimilarityEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria( + expected_output={"name": "John", "age": 30, "city": "NYC"} + ) + + result = await evaluator.evaluate(execution, criteria) + + # Should be NumericEvaluationResult with no justification (None) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + # Justification should be None for non-LLM evaluators + assert ( + not hasattr(result, "justification") + or getattr(result, "justification", None) is None + ) + + @pytest.mark.asyncio + async def test_tool_call_order_evaluator_justification( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test that ToolCallOrderEvaluator handles None justification correctly.""" + config = { + "name": "ToolOrderTest", + "strict": True, + } + evaluator = ToolCallOrderEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOrderEvaluationCriteria( + tool_calls_order=["tool1", "tool2", "tool1", "tool2"] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + # Should be NumericEvaluationResult with no justification (None) + assert isinstance(result, NumericEvaluationResult) + assert result.score 
== 1.0 + # Justification should be None for non-LLM evaluators + assert ( + not hasattr(result, "justification") + or getattr(result, "justification", None) is None + ) + + @pytest.mark.asyncio + async def test_tool_call_count_evaluator_justification( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test that ToolCallCountEvaluator handles None justification correctly.""" + config = { + "name": "ToolCountTest", + "strict": True, + } + evaluator = ToolCallCountEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallCountEvaluationCriteria( + tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 2)} + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + # Should be NumericEvaluationResult with no justification (None) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + # Justification should be None for non-LLM evaluators + assert ( + not hasattr(result, "justification") + or getattr(result, "justification", None) is None + ) + + @pytest.mark.asyncio + async def test_tool_call_args_evaluator_justification( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test that ToolCallArgsEvaluator handles None justification correctly.""" + config = { + "name": "ToolArgsTest", + "strict": True, + } + evaluator = ToolCallArgsEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallArgsEvaluationCriteria( + tool_calls=[ + ToolCall(name="tool1", args={"arg1": "value1"}), + ToolCall(name="tool2", args={"arg2": "value2"}), + ToolCall(name="tool1", args={"arg1": "value1"}), + ToolCall(name="tool2", args={"arg2": "value2"}), + ] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + # Should be NumericEvaluationResult with no justification (None) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + # Justification should be None for non-LLM evaluators + assert ( + not hasattr(result, "justification") + or getattr(result, "justification", None) is None + ) + + @pytest.mark.asyncio + async def test_tool_call_output_evaluator_justification( + self, sample_agent_execution_with_trace: AgentExecution + ) -> None: + """Test that ToolCallOutputEvaluator handles justification correctly.""" + config = { + "name": "ToolOutputTest", + "strict": True, + } + evaluator = ToolCallOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = ToolCallOutputEvaluationCriteria( + tool_outputs=[ + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ToolOutput(name="tool1", output="output1"), + ToolOutput(name="tool2", output="output2"), + ] + ) + + result = await evaluator.evaluate(sample_agent_execution_with_trace, criteria) + + # Should have justification with tool call output details + assert isinstance(result, NumericEvaluationResult) + assert result.score == 1.0 + # The justification is stored in the details field for tool call evaluators + assert hasattr(result, "details") + assert isinstance(result.details, ToolCallOutputEvaluatorJustification) + assert hasattr(result.details, "explained_tool_calls_outputs") + assert isinstance(result.details.explained_tool_calls_outputs, dict) + + @pytest.mark.asyncio + async def test_llm_judge_output_evaluator_justification( + self, sample_agent_execution: AgentExecution, mocker: MockerFixture + ) -> None: + """Test that LLMJudgeOutputEvaluator handles str 
justification correctly.""" + # Mock the UiPath constructor to avoid authentication + mock_uipath = mocker.MagicMock() + mock_llm = mocker.MagicMock() + mock_uipath.llm = mock_llm + + # Mock the chat completions response with justification + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock( + content='{"score": 80, "justification": "The response meets most criteria but could be more detailed"}' + ) + ) + ] + + # Make chat_completions an async method + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + mock_llm.chat_completions = mock_chat_completions + mocker.patch("uipath.UiPath", return_value=mock_uipath) + + config = { + "name": "LlmJudgeTest", + "prompt": "Rate this output: {{ActualOutput}} vs {{ExpectedOutput}}", + "model": "gpt-4o-2024-08-06", + } + evaluator = LLMJudgeOutputEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = OutputEvaluationCriteria(expected_output="Expected output") + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + # Should have string justification in details field + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.8 + assert hasattr(result, "details") + # The justification is stored in the details field for LLM evaluators + assert isinstance(result.details, str) + assert ( + result.details + == "The response meets most criteria but could be more detailed" + ) + + @pytest.mark.asyncio + async def test_llm_judge_trajectory_evaluator_justification( + self, sample_agent_execution: AgentExecution, mocker: MockerFixture + ) -> None: + """Test that LLMJudgeTrajectoryEvaluator handles str justification correctly.""" + # Mock the UiPath constructor to avoid authentication + mock_uipath = mocker.MagicMock() + mock_llm = mocker.MagicMock() + mock_uipath.llm = mock_llm + + # Mock the chat completions response with justification + mock_response = mocker.MagicMock() + mock_response.choices = [ + mocker.MagicMock( + message=mocker.MagicMock( + content='{"score": 85, "justification": "The agent trajectory shows good decision making and follows expected behavior patterns"}' + ) + ) + ] + + # Make chat_completions an async method + async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: + return mock_response + + mock_llm.chat_completions = mock_chat_completions + mocker.patch("uipath.UiPath", return_value=mock_uipath) + + config = { + "name": "LlmTrajectoryTest", + "prompt": "Evaluate this trajectory: {{AgentRunHistory}} vs {{ExpectedAgentBehavior}}", + "model": "gpt-4", + } + evaluator = LLMJudgeTrajectoryEvaluator.model_validate( + {"config": config, "id": str(uuid.uuid4())} + ) + criteria = TrajectoryEvaluationCriteria( + expected_agent_behavior="Agent should respond helpfully" + ) + + result = await evaluator.evaluate(sample_agent_execution, criteria) + + # Should have string justification in details field (not justification attribute) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.85 + assert isinstance(result.details, str) + assert ( + result.details + == "The agent trajectory shows good decision making and follows expected behavior patterns" + ) + + def test_justification_validation_edge_cases(self, mocker: MockerFixture) -> None: + """Test edge cases for justification validation.""" + # Test None type evaluator + config_dict = { + "name": "Test", + "default_evaluation_criteria": {"expected_output": "test"}, + } + none_evaluator = 
ExactMatchEvaluator.model_validate( + {"config": config_dict, "id": str(uuid.uuid4())} + ) + + # All inputs should return None for None type evaluators + assert none_evaluator.validate_justification(None) is None + assert none_evaluator.validate_justification("") is None + assert none_evaluator.validate_justification("some text") is None + assert none_evaluator.validate_justification(123) is None + assert none_evaluator.validate_justification({"key": "value"}) is None + + # Test str type evaluator - need to provide llm_service to avoid authentication + llm_config_dict = { + "name": "Test", + "default_evaluation_criteria": {"expected_output": "test"}, + "model": "gpt-4o-2024-08-06", + } + mock_llm_service = mocker.MagicMock() + str_evaluator = LLMJudgeOutputEvaluator.model_validate( + { + "config": llm_config_dict, + "llm_service": mock_llm_service, + "id": str(uuid.uuid4()), + } + ) + + # Different inputs should be converted to strings + assert str_evaluator.validate_justification("test") == "test" + assert str_evaluator.validate_justification("") == "" + assert str_evaluator.validate_justification(123) == "123" + assert str_evaluator.validate_justification(True) == "True" + assert ( + str_evaluator.validate_justification(None) == "" + ) # None becomes empty string + + def test_justification_type_extraction_all_evaluators(self) -> None: + """Test that all evaluators have correct justification type extraction.""" + # Different evaluators have different justification types + assert ExactMatchEvaluator._extract_justification_type() is type( + None + ) # No justification + assert ( + JsonSimilarityEvaluator._extract_justification_type() is str + ) # String justification + + # Tool call evaluators have their own justification types + from uipath.eval.evaluators.tool_call_args_evaluator import ( + ToolCallArgsEvaluatorJustification, + ) + from uipath.eval.evaluators.tool_call_count_evaluator import ( + ToolCallCountEvaluatorJustification, + ) + from uipath.eval.evaluators.tool_call_order_evaluator import ( + ToolCallOrderEvaluatorJustification, + ) + from uipath.eval.evaluators.tool_call_output_evaluator import ( + ToolCallOutputEvaluatorJustification, + ) + + assert ( + ToolCallOrderEvaluator._extract_justification_type() + is ToolCallOrderEvaluatorJustification + ) + assert ( + ToolCallCountEvaluator._extract_justification_type() + is ToolCallCountEvaluatorJustification + ) + assert ( + ToolCallArgsEvaluator._extract_justification_type() + is ToolCallArgsEvaluatorJustification + ) + assert ( + ToolCallOutputEvaluator._extract_justification_type() + is ToolCallOutputEvaluatorJustification + ) + + # LLM evaluators should have str justification type + assert LLMJudgeOutputEvaluator._extract_justification_type() is str + assert LLMJudgeTrajectoryEvaluator._extract_justification_type() is str diff --git a/tests/evaluators/test_evaluator_schemas.py b/tests/evaluators/test_evaluator_schemas.py new file mode 100644 index 000000000..a857d7a6e --- /dev/null +++ b/tests/evaluators/test_evaluator_schemas.py @@ -0,0 +1,581 @@ +"""Tests for evaluator schema functionality and base evaluator features. 
+ +This module tests: +- Config schema generation for all evaluators +- Evaluation criteria schema generation for all evaluators +- Base evaluator functionality (type extraction, validation) +- Generic type parameter handling +""" + +import uuid + +import pytest +from pytest_mock.plugin import MockerFixture + +from uipath.eval.evaluators.exact_match_evaluator import ( + ExactMatchEvaluator, + ExactMatchEvaluatorConfig, +) +from uipath.eval.evaluators.json_similarity_evaluator import ( + JsonSimilarityEvaluator, + JsonSimilarityEvaluatorConfig, +) +from uipath.eval.evaluators.llm_as_judge_evaluator import ( + LLMJudgeMixin, +) +from uipath.eval.evaluators.llm_judge_output_evaluator import ( + LLMJudgeOutputEvaluator, + LLMJudgeOutputEvaluatorConfig, +) +from uipath.eval.evaluators.llm_judge_trajectory_evaluator import ( + LLMJudgeTrajectoryEvaluator, +) +from uipath.eval.evaluators.output_evaluator import ( + OutputEvaluationCriteria, +) +from uipath.eval.evaluators.tool_call_args_evaluator import ( + ToolCallArgsEvaluationCriteria, + ToolCallArgsEvaluator, + ToolCallArgsEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_count_evaluator import ( + ToolCallCountEvaluationCriteria, + ToolCallCountEvaluator, + ToolCallCountEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_order_evaluator import ( + ToolCallOrderEvaluationCriteria, + ToolCallOrderEvaluator, + ToolCallOrderEvaluatorConfig, +) +from uipath.eval.evaluators.tool_call_output_evaluator import ( + ToolCallOutputEvaluationCriteria, + ToolCallOutputEvaluator, + ToolCallOutputEvaluatorConfig, +) + + +@pytest.fixture +def sample_config_data() -> dict[str, str | bool | int | float]: + """Sample config data for testing.""" + return { + "name": "TestEvaluator", + "threshold": 0.8, + "case_sensitive": False, + "strict": True, + } + + +class TestEvaluatorSchemas: + """Test schema generation for all evaluators.""" + + def test_exact_match_evaluator_schemas(self) -> None: + """Test ExactMatchEvaluator schema generation.""" + # Test config schema + config_schema = ExactMatchEvaluator.get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + assert "case_sensitive" in config_schema["properties"] + + # Test criteria schema + criteria_schema = ExactMatchEvaluator.get_evaluation_criteria_schema() + assert isinstance(criteria_schema, dict) + assert "properties" in criteria_schema + assert "expected_output" in criteria_schema["properties"] + + def test_json_similarity_evaluator_schemas(self) -> None: + """Test JsonSimilarityEvaluator schema generation.""" + # Test config schema + config_schema = JsonSimilarityEvaluator.get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + + # Test criteria schema + criteria_schema = JsonSimilarityEvaluator.get_evaluation_criteria_schema() + assert isinstance(criteria_schema, dict) + assert "properties" in criteria_schema + assert "expected_output" in criteria_schema["properties"] + + def test_tool_call_order_evaluator_schemas(self) -> None: + """Test ToolCallOrderEvaluator schema generation.""" + # Test config schema + config_schema = ToolCallOrderEvaluator.get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + assert "strict" in config_schema["properties"] + + # Test criteria schema + criteria_schema = 
ToolCallOrderEvaluator.get_evaluation_criteria_schema() + assert isinstance(criteria_schema, dict) + assert "properties" in criteria_schema + assert "tool_calls_order" in criteria_schema["properties"] + + def test_tool_call_count_evaluator_schemas(self) -> None: + """Test ToolCallCountEvaluator schema generation.""" + # Test config schema + config_schema = ToolCallCountEvaluator.get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + assert "strict" in config_schema["properties"] + + # Test criteria schema + criteria_schema = ToolCallCountEvaluator.get_evaluation_criteria_schema() + assert isinstance(criteria_schema, dict) + assert "properties" in criteria_schema + assert "tool_calls_count" in criteria_schema["properties"] + + def test_tool_call_args_evaluator_schemas(self) -> None: + """Test ToolCallArgsEvaluator schema generation.""" + # Test config schema + config_schema = ToolCallArgsEvaluator.get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + assert "strict" in config_schema["properties"] + assert "subset" in config_schema["properties"] + + # Test criteria schema + criteria_schema = ToolCallArgsEvaluator.get_evaluation_criteria_schema() + assert isinstance(criteria_schema, dict) + assert "properties" in criteria_schema + assert "tool_calls" in criteria_schema["properties"] + + def test_tool_call_output_evaluator_schemas(self) -> None: + """Test ToolCallOutputEvaluator schema generation.""" + # Test config schema + config_schema = ToolCallOutputEvaluator.get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + assert "strict" in config_schema["properties"] + + # Test criteria schema + criteria_schema = ToolCallOutputEvaluator.get_evaluation_criteria_schema() + assert isinstance(criteria_schema, dict) + assert "properties" in criteria_schema + assert "tool_outputs" in criteria_schema["properties"] + + def test_base_llm_judge_evaluator_schemas(self) -> None: + """Test BaseLLMJudgeEvaluator schema generation.""" + # Test config schema + config_schema = LLMJudgeMixin[ + OutputEvaluationCriteria, + LLMJudgeOutputEvaluatorConfig, + ].get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + assert "prompt" in config_schema["properties"], ( + f"Prompt not found in config schema: {config_schema}" + ) + assert "model" in config_schema["properties"] + + # Test criteria schema + criteria_schema = LLMJudgeMixin[ + OutputEvaluationCriteria, + LLMJudgeOutputEvaluatorConfig, + ].get_evaluation_criteria_schema() + assert isinstance(criteria_schema, dict) + assert "properties" in criteria_schema + assert "expected_output" in criteria_schema["properties"] + + def test_llm_judge_evaluator_schemas(self) -> None: + """Test LLMJudgeEvaluator schema generation.""" + # Test config schema + config_schema = LLMJudgeOutputEvaluator.get_config_schema() + assert isinstance(config_schema, dict) + assert "properties" in config_schema + assert "name" in config_schema["properties"] + assert "prompt" in config_schema["properties"] + assert "model" in config_schema["properties"] + assert "target_output_key" in config_schema["properties"] + + # Test criteria schema + criteria_schema = LLMJudgeOutputEvaluator.get_evaluation_criteria_schema() + assert 
isinstance(criteria_schema, dict)
+        assert "properties" in criteria_schema
+        assert "expected_output" in criteria_schema["properties"]
+
+    def test_llm_judge_trajectory_evaluator_schemas(self) -> None:
+        """Test LlmJudgeTrajectoryEvaluator schema generation."""
+        # Test config schema
+        config_schema = LLMJudgeTrajectoryEvaluator.get_config_schema()
+        assert isinstance(config_schema, dict)
+        assert "properties" in config_schema
+        assert "name" in config_schema["properties"]
+        assert "prompt" in config_schema["properties"]
+        assert "model" in config_schema["properties"]
+        assert "target_output_key" not in config_schema["properties"]
+
+        # Test criteria schema
+        criteria_schema = LLMJudgeTrajectoryEvaluator.get_evaluation_criteria_schema()
+        assert isinstance(criteria_schema, dict)
+        assert "properties" in criteria_schema
+        assert "expected_agent_behavior" in criteria_schema["properties"]
+
+
+class TestJustificationSchemas:
+    """Test justification schema generation and validation for all evaluators."""
+
+    def test_exact_match_evaluator_justification_schema(self) -> None:
+        """Test ExactMatchEvaluator justification schema generation."""
+        # Test justification type extraction
+        justification_type = ExactMatchEvaluator._extract_justification_type()
+        assert justification_type is type(None)
+
+    def test_json_similarity_evaluator_justification_schema(self) -> None:
+        """Test JsonSimilarityEvaluator justification schema generation."""
+        # Test justification type extraction - JSON similarity provides str justification
+        justification_type = JsonSimilarityEvaluator._extract_justification_type()
+        assert justification_type is str
+
+    def test_tool_call_order_evaluator_justification_schema(self) -> None:
+        """Test ToolCallOrderEvaluator justification schema generation."""
+        # Test justification type extraction - tool call evaluators have their own justification types
+        from uipath.eval.evaluators.tool_call_order_evaluator import (
+            ToolCallOrderEvaluatorJustification,
+        )
+
+        justification_type = ToolCallOrderEvaluator._extract_justification_type()
+        assert justification_type is ToolCallOrderEvaluatorJustification
+
+    def test_tool_call_count_evaluator_justification_schema(self) -> None:
+        """Test ToolCallCountEvaluator justification schema generation."""
+        # Test justification type extraction - tool call evaluators have their own justification types
+        from uipath.eval.evaluators.tool_call_count_evaluator import (
+            ToolCallCountEvaluatorJustification,
+        )
+
+        justification_type = ToolCallCountEvaluator._extract_justification_type()
+        assert justification_type is ToolCallCountEvaluatorJustification
+
+    def test_tool_call_args_evaluator_justification_schema(self) -> None:
+        """Test ToolCallArgsEvaluator justification schema generation."""
+        # Test justification type extraction - tool call evaluators have their own justification types
+        from uipath.eval.evaluators.tool_call_args_evaluator import (
+            ToolCallArgsEvaluatorJustification,
+        )
+
+        justification_type = ToolCallArgsEvaluator._extract_justification_type()
+        assert justification_type is ToolCallArgsEvaluatorJustification
+
+    def test_tool_call_output_evaluator_justification_schema(self) -> None:
+        """Test ToolCallOutputEvaluator justification schema generation."""
+        # Test justification type extraction - tool call evaluators have their own justification types
+        from uipath.eval.evaluators.tool_call_output_evaluator import (
+            ToolCallOutputEvaluatorJustification,
+        )
+
+        justification_type = ToolCallOutputEvaluator._extract_justification_type()
+        assert justification_type is ToolCallOutputEvaluatorJustification
+
+    def test_llm_judge_output_evaluator_justification_schema(self) -> None:
+        """Test LLMJudgeOutputEvaluator justification schema generation."""
+        # Test justification type extraction - LLM evaluators use str for justification
+        justification_type = LLMJudgeOutputEvaluator._extract_justification_type()
+        assert justification_type is str
+
+    def test_llm_judge_trajectory_evaluator_justification_schema(self) -> None:
+        """Test LLMJudgeTrajectoryEvaluator justification schema generation."""
+        # Test justification type extraction - LLM evaluators use str for justification
+        justification_type = LLMJudgeTrajectoryEvaluator._extract_justification_type()
+        assert justification_type is str
+
+
+class TestBaseEvaluatorFunctionality:
+    """Test base evaluator functionality."""
+
+    def test_type_extraction_exact_match(self) -> None:
+        """Test type extraction for ExactMatchEvaluator."""
+        criteria_type = ExactMatchEvaluator._extract_evaluation_criteria_type()
+        config_type = ExactMatchEvaluator._extract_config_type()
+
+        assert criteria_type == OutputEvaluationCriteria
+        assert config_type == ExactMatchEvaluatorConfig
+
+    def test_type_extraction_json_similarity(self) -> None:
+        """Test type extraction for JsonSimilarityEvaluator."""
+        criteria_type = JsonSimilarityEvaluator._extract_evaluation_criteria_type()
+        config_type = JsonSimilarityEvaluator._extract_config_type()
+
+        assert criteria_type == OutputEvaluationCriteria
+        assert config_type == JsonSimilarityEvaluatorConfig
+
+    def test_type_extraction_tool_call_order(self) -> None:
+        """Test type extraction for ToolCallOrderEvaluator."""
+        criteria_type = ToolCallOrderEvaluator._extract_evaluation_criteria_type()
+        config_type = ToolCallOrderEvaluator._extract_config_type()
+
+        assert criteria_type == ToolCallOrderEvaluationCriteria
+        assert config_type == ToolCallOrderEvaluatorConfig
+
+    def test_type_extraction_tool_call_count(self) -> None:
+        """Test type extraction for ToolCallCountEvaluator."""
+        criteria_type = ToolCallCountEvaluator._extract_evaluation_criteria_type()
+        config_type = ToolCallCountEvaluator._extract_config_type()
+
+        assert criteria_type == ToolCallCountEvaluationCriteria
+        assert config_type == ToolCallCountEvaluatorConfig
+
+    def test_type_extraction_tool_call_args(self) -> None:
+        """Test type extraction for ToolCallArgsEvaluator."""
+        criteria_type = ToolCallArgsEvaluator._extract_evaluation_criteria_type()
+        config_type = ToolCallArgsEvaluator._extract_config_type()
+
+        assert criteria_type == ToolCallArgsEvaluationCriteria
+        assert config_type == ToolCallArgsEvaluatorConfig
+
+    def test_type_extraction_tool_call_output(self) -> None:
+        """Test type extraction for ToolCallOutputEvaluator."""
+        criteria_type = ToolCallOutputEvaluator._extract_evaluation_criteria_type()
+        config_type = ToolCallOutputEvaluator._extract_config_type()
+
+        assert criteria_type == ToolCallOutputEvaluationCriteria
+        assert config_type == ToolCallOutputEvaluatorConfig
+
+    def test_config_validation_exact_match(self) -> None:
+        """Test config validation for ExactMatchEvaluator."""
+        # Valid config - create minimal required config
+        config_dict = {
+            "name": "TestEvaluator",
+            "case_sensitive": True,
+            "default_evaluation_criteria": {"expected_output": "test"},
+        }
+        evaluator = ExactMatchEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        assert isinstance(evaluator.evaluator_config, ExactMatchEvaluatorConfig)
+        assert evaluator.evaluator_config.name == "TestEvaluator"
+        assert evaluator.evaluator_config.case_sensitive is True
+
+    def test_criteria_validation_exact_match(self) -> None:
+        """Test criteria validation for ExactMatchEvaluator."""
+        config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+        }
+        evaluator = ExactMatchEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        # Test dict validation
+        criteria_dict = {"expected_output": "test output"}
+        validated = evaluator.validate_evaluation_criteria(criteria_dict)
+
+        assert isinstance(validated, OutputEvaluationCriteria)
+        assert validated.expected_output == "test output"
+
+    def test_criteria_validation_tool_call_order(self) -> None:
+        """Test criteria validation for ToolCallOrderEvaluator."""
+        config_dict = {
+            "name": "Test",
+            "strict": False,
+            "default_evaluation_criteria": {"tool_calls_order": ["tool1", "tool2"]},
+        }
+        evaluator = ToolCallOrderEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        # Test dict validation
+        criteria_dict = {"tool_calls_order": ["tool1", "tool2", "tool3"]}
+        validated = evaluator.validate_evaluation_criteria(criteria_dict)
+
+        assert isinstance(validated, ToolCallOrderEvaluationCriteria)
+        assert validated.tool_calls_order == ["tool1", "tool2", "tool3"]
+
+    def test_config_validation_tool_call_output(self) -> None:
+        """Test config validation for ToolCallOutputEvaluator."""
+        # Valid config - create minimal required config
+        config_dict = {
+            "name": "TestToolOutputEvaluator",
+            "strict": True,
+            "default_evaluation_criteria": {
+                "tool_outputs": [{"name": "tool1", "output": "output1"}]
+            },
+        }
+        evaluator = ToolCallOutputEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        assert isinstance(evaluator.evaluator_config, ToolCallOutputEvaluatorConfig)
+        assert evaluator.evaluator_config.name == "TestToolOutputEvaluator"
+        assert evaluator.evaluator_config.strict is True
+
+    def test_criteria_validation_tool_call_output(self) -> None:
+        """Test criteria validation for ToolCallOutputEvaluator."""
+        config_dict = {
+            "name": "Test",
+            "strict": False,
+            "default_evaluation_criteria": {
+                "tool_outputs": [{"name": "tool1", "output": "output1"}]
+            },
+        }
+        evaluator = ToolCallOutputEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        # Test dict validation
+        criteria_dict = {
+            "tool_outputs": [
+                {"name": "tool1", "output": "output1"},
+                {"name": "tool2", "output": "output2"},
+            ]
+        }
+        validated = evaluator.validate_evaluation_criteria(criteria_dict)
+
+        assert isinstance(validated, ToolCallOutputEvaluationCriteria)
+        assert len(validated.tool_outputs) == 2
+        assert validated.tool_outputs[0].name == "tool1"
+        assert validated.tool_outputs[0].output == "output1"
+        assert validated.tool_outputs[1].name == "tool2"
+        assert validated.tool_outputs[1].output == "output2"
+
+    def test_criteria_validation_llm_judge_output(self, mocker: MockerFixture) -> None:
+        """Test criteria validation for LLMJudgeOutputEvaluator."""
+        config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+            "model": "gpt-4o-2024-08-06",
+        }
+        mock_llm_service = mocker.MagicMock()
+        evaluator = LLMJudgeOutputEvaluator.model_validate(
+            {
+                "config": config_dict,
+                "llm_service": mock_llm_service,
+                "id": str(uuid.uuid4()),
+            }
+        )
+
+        # Test dict validation
+        criteria_dict = {"expected_output": "test output"}
+        validated = evaluator.validate_evaluation_criteria(criteria_dict)
+
+        assert isinstance(validated, OutputEvaluationCriteria)
+        assert validated.expected_output == "test output"
+
+    def test_automatic_type_detection(self) -> None:
+        """Test that types are automatically detected from Generic parameters."""
+        # Create evaluator - test with basic evaluators that don't trigger CLI imports
+        config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+        }
+        evaluator = JsonSimilarityEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        # Types should be set correctly
+        assert evaluator.evaluation_criteria_type == OutputEvaluationCriteria
+        assert evaluator.config_type.__name__ == "JsonSimilarityEvaluatorConfig"
+
+    def test_justification_validation_none_type(self) -> None:
+        """Test justification validation for evaluators with None justification type."""
+        config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+        }
+        evaluator = ExactMatchEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        # Test None validation
+        assert evaluator.validate_justification(None) is None
+        assert evaluator.validate_justification("any string") is None
+
+    def test_justification_validation_str_type(self, mocker: MockerFixture) -> None:
+        """Test justification validation for evaluators with str justification type."""
+        config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+            "model": "gpt-4o-2024-08-06",
+        }
+        mock_llm_service = mocker.MagicMock()
+        evaluator = LLMJudgeOutputEvaluator.model_validate(
+            {
+                "config": config_dict,
+                "llm_service": mock_llm_service,
+                "id": str(uuid.uuid4()),
+            }
+        )
+
+        # Test string validation
+        assert (
+            evaluator.validate_justification("test justification")
+            == "test justification"
+        )
+        assert evaluator.validate_justification(123) == "123"
+        assert evaluator.validate_justification(None) == ""
+
+    def test_justification_type_consistency(self, mocker: MockerFixture) -> None:
+        """Test that justification_type field matches the generic parameter."""
+        # Test None type evaluators
+        config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+        }
+        exact_match_evaluator = ExactMatchEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+        assert exact_match_evaluator.justification_type is type(None)
+
+        # Test str type evaluators
+        llm_config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+            "model": "gpt-4o-2024-08-06",
+        }
+        mock_llm_service = mocker.MagicMock()
+        llm_evaluator = LLMJudgeOutputEvaluator.model_validate(
+            {
+                "config": llm_config_dict,
+                "llm_service": mock_llm_service,
+                "id": str(uuid.uuid4()),
+            }
+        )
+        assert llm_evaluator.justification_type is str
+
+
+class TestEvaluatorInstances:
+    """Test evaluator instance functionality."""
+
+    def test_instance_config_access(self) -> None:
+        """Test that evaluator instances have properly typed config access."""
+        config_data = {
+            "name": "TestEvaluator",
+            "case_sensitive": False,
+            "default_evaluation_criteria": {"expected_output": "test"},
+        }
+        evaluator = ExactMatchEvaluator.model_validate(
+            {"config": config_data, "id": str(uuid.uuid4())}
+        )
+
+        # Test direct config access
+        assert evaluator.evaluator_config.name == "TestEvaluator"
+        assert evaluator.evaluator_config.case_sensitive is False
+
+        # Verify type
+        assert isinstance(evaluator.evaluator_config, ExactMatchEvaluatorConfig)
+
+    def test_instance_schema_access(self) -> None:
+        """Test that evaluator instances can access schemas."""
+        config_dict = {
+            "name": "Test",
+            "default_evaluation_criteria": {"expected_output": "test"},
+        }
+        evaluator = JsonSimilarityEvaluator.model_validate(
+            {"config": config_dict, "id": str(uuid.uuid4())}
+        )
+
+        # Should be able to get schemas from instances
+        config_schema = evaluator.get_config_schema()
+        criteria_schema = evaluator.get_evaluation_criteria_schema()
+
+        assert isinstance(config_schema, dict)
+        assert isinstance(criteria_schema, dict)
+        assert "properties" in config_schema
+        assert "properties" in criteria_schema