Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
2e24fe0
add initial version of revamped coded evaluators
andrei-rusu Sep 25, 2025
0b44a7c
fix copilot and linting issues
andrei-rusu Oct 10, 2025
1b016c1
Merge pull request #677 from UiPath/fix/andreiru/coded_evals_copilot_…
radu-mocanu Oct 10, 2025
333821b
feat: new eval schema support + contain evaluator wiring
radu-mocanu Oct 13, 2025
860c88f
feat: wiring ExactMatch evaluator to new schema
mjnovice Oct 14, 2025
fd324b0
Merge pull request #690 from UiPath/mj/wire-exact-match
mjnovice Oct 14, 2025
74bc147
feat: wiring JsonSimilarity evaluator to new schema
mjnovice Oct 14, 2025
ef92ef5
Merge pull request #692 from UiPath/mj/wire-json-similarity
mjnovice Oct 14, 2025
e05bd98
feat: wiring LLM judge evaluators to new schema
mjnovice Oct 15, 2025
a5e4947
feat: implement version-based discriminator for coded-evals push/pull
Chibionos Oct 13, 2025
2205d60
Merge pull request #681 from UiPath/feat/updating-push-pull
Chibionos Oct 15, 2025
7a2937d
feat: progress on parallelization of eval runs
akshaylive Oct 10, 2025
931a4c6
Merge pull request #697 from UiPath/mj/wire-llm-as-a-judge
mjnovice Oct 16, 2025
962f07d
Merge pull request #704 from UiPath/dev/andreiru/cherry_pick_parallel…
andrei-rusu Oct 16, 2025
4964c66
feat: missing changes from llm eval wiring
mjnovice Oct 17, 2025
746c8a7
Merge pull request #724 from UiPath/mj/missed-fixes
mjnovice Oct 20, 2025
0db93c2
feat: wiring up trajectory evals
mjnovice Oct 20, 2025
4553f34
Merge pull request #721 from UiPath/mj/wire-trajectory
mjnovice Oct 20, 2025
32e33fc
feat(evals): add dedicated UIPATH_EVAL_BACKEND_URL for localhost routing
Oct 21, 2025
9af6fa9
feat: wire tool evals, add mocked tool sample agent
mjnovice Oct 20, 2025
a533446
Merge pull request #735 from UiPath/mj/wire-tool
mjnovice Oct 21, 2025
8898a71
refac: coded_evalutors -> evaluators, associated renames/changes
mjnovice Oct 22, 2025
22356c5
Merge pull request #752 from UiPath/mj/refac-names
mjnovice Oct 22, 2025
f0883c6
Merge pull request #740 from UiPath/fix/apply-new-reporting-api-change
Chibionos Oct 22, 2025
88ca0e1
feat: add support for custom evaluators
radu-mocanu Oct 15, 2025
8957b97
Merge pull request #703 from UiPath/feat/custom-evals
radu-mocanu Oct 22, 2025
602854e
Merge branch 'main' into release/revamped-evals
mjnovice Oct 23, 2025
7efe612
Merge branch 'main' into release/revamped-evals
akshaylive Oct 23, 2025
35fc809
Merge branch 'main' into release/revamped-evals
akshaylive Oct 23, 2025
6cb6db5
Merge branch 'main' into release/revamped-evals
akshaylive Oct 23, 2025
07d1c07
fix(TonOfFixes): lots of minor fixes
akshaylive Oct 23, 2025
3fab172
Merge pull request #772 from UiPath/akshaya/revamped-eval-fix
akshaylive Oct 24, 2025
78fd2fb
fix(UnitTest): model default
akshaylive Oct 24, 2025
4095baf
Merge branch 'main' into release/revamped-evals
Oct 24, 2025
dbb414c
Merge branch 'main' into release/revamped-evals
Oct 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions samples/calculator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,22 @@ After initialization, execute the agent using this sample command:
```
uipath run main.py '{"a": 0, "b": 1, "operator": "+"}'
```

# Run evaluations
```
uipath eval .\main.py .\evals\eval-sets\default.json --no-report --output-file output.json
```

# Add and register custom evaluator

1. (Optional) Add a new evaluator -> can be created manually in the evals/custom-evaluators directory
```
uipath add evaluator my_custom_evaluator
```
2. Implement the evaluator logic in the generated Python file under evals/custom-evaluators

3. Register the evaluator
```
uipath register evaluator my_custom_evaluator
```
4. Apply it to any dataset
130 changes: 71 additions & 59 deletions samples/calculator/evals/eval-sets/default.json
Original file line number Diff line number Diff line change
@@ -1,72 +1,84 @@
{
"fileName": "default.json",
"id": "default-eval-set-id",
"name": "Basic Calculator Evaluation Set",
"batchSize": 10,
"version": "1.0",
"id": "NewSchemaSampleEval",
"name": "New Schema Sample Evaluation",
"evaluatorRefs": [
"equality",
"llm-as-a-judge"
"ContainsEvaluator",
"ExactMatchEvaluator",
"JsonSimilarityEvaluator",
"LLMJudgeOutputEvaluator",
"LLMJudgeStrictJSONSimilarityOutputEvaluator",
"TrajectoryEvaluator",
"CorrectOperatorEvaluator"
],
"evaluations": [
{
"id": "test-addition",
"name": "Test Addition",
"inputs": {"a": 1, "b": 1, "operator": "+"},
"expectedOutput": {"result": 2},
"expectedAgentBehavior": "",
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
"id": "default",
"name": "Add",
"inputs": {
"a": 1,
"b": 4,
"operator": "+"
},
"evaluationCriterias": {
"ContainsEvaluator": null,
"ExactMatchEvaluator": null,
"JsonSimilarityEvaluator": null,
"LLMJudgeOutputEvaluator": null,
"LLMJudgeStrictJSONSimilarityOutputEvaluator": null,
"TrajectoryEvaluator": null,
"CorrectOperatorEvaluator": null
}
},
{
"id": "test-random-addition-using-mockito",
"name": "Test Random Addition Using Mockito",
"inputs": {"a": 1, "b": 1, "operator": "random"},
"expectedOutput": {"result": 2},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "mockito",
"behaviors": [
{
"function": "get_random_operator",
"arguments": {
"args": [],
"kwargs": {}
},
"then": [
{
"type": "return",
"value": {"result": "+"}
}
]
}
]
"id": "override",
"name": "Multiply",
"inputs": {
"a": 2,
"b": 4,
"operator": "*"
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
"evaluationCriterias": {
"ContainsEvaluator": {
"searchText": "8"
},
"CorrectOperatorEvaluator": {
"operator": "*"
},
"ExactMatchEvaluator": {
"expectedOutput": {
"result": "8.0"
}
},
"JsonSimilarityEvaluator": {
"expectedOutput": {
"result": 8.0
}
},
"LLMJudgeOutputEvaluator": {
"expectedOutput": {
"result": 8.0
}
},
"LLMJudgeStrictJSONSimilarityOutputEvaluator": {
"expectedOutput": {
"result": 8.0
}
},
"TrajectoryEvaluator": {
"expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
}
}
},
{
"id": "test-random-addition-using-llm",
"name": "Test Random Addition Using LLM",
"inputs": {"a": 1, "b": 1, "operator": "random"},
"expectedOutput": {"result": 2},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "llm",
"prompt": "The random operator is '+'.",
"toolsToSimulate": [{"name": "get_random_operator"}],
"model": {
"model": "gpt-4o-mini-2024-07-18",
"temperature": 0
}
"id": "skip",
"name": "Skip denial code check",
"inputs": {
"a": 1,
"b": 1,
"operator": "+"
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
"evaluationCriterias": {}
}
],
"modelSettings": [],
"createdAt": "2025-09-04T18:54:58.379Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
]
}
98 changes: 98 additions & 0 deletions samples/calculator/evals/eval-sets/legacy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"fileName": "default.json",
"id": "default-eval-set-id",
"name": "Basic Calculator Evaluation Set",
"batchSize": 10,
"evaluatorRefs": [
"equality",
"llm-as-a-judge",
"json-similarity",
"trajectory"
],
"evaluations": [
{
"id": "test-addition",
"name": "Test Addition",
"inputs": {
"a": 1,
"b": 1,
"operator": "+"
},
"expectedOutput": {
"result": 2
},
"expectedAgentBehavior": "",
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
},
{
"id": "test-random-addition-using-mockito",
"name": "Test Random Addition Using Mockito",
"inputs": {
"a": 1,
"b": 1,
"operator": "random"
},
"expectedOutput": {
"result": 2
},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "mockito",
"behaviors": [
{
"function": "get_random_operator",
"arguments": {
"args": [],
"kwargs": {}
},
"then": [
{
"type": "return",
"value": {
"result": "+"
}
}
]
}
]
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
},
{
"id": "test-random-addition-using-llm",
"name": "Test Random Addition Using LLM",
"inputs": {
"a": 1,
"b": 1,
"operator": "random"
},
"expectedOutput": {
"result": 2
},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "llm",
"prompt": "The random operator is '+'.",
"toolsToSimulate": [
{
"name": "get_random_operator"
}
],
"model": {
"model": "gpt-4o-mini-2024-07-18",
"temperature": 0
}
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
}
],
"modelSettings": [],
"createdAt": "2025-09-04T18:54:58.379Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
}
15 changes: 15 additions & 0 deletions samples/calculator/evals/evaluators/contains.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"version": "1.0",
"id": "ContainsEvaluator",
"description": "Checks if the response text includes the expected denial code.",
"evaluatorTypeId": "uipath-contains",
"evaluatorConfig": {
"name": "ContainsEvaluator",
"targetOutputKey": "result",
"negated": false,
"ignoreCase": false,
"defaultEvaluationCriteria": {
"searchText": "5"
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"version": "1.0",
"id": "CorrectOperatorEvaluator",
"evaluatorTypeId": "file://types/correct-operator-evaluator-types.json",
"evaluatorSchema": "file://correct_operator.py:CorrectOperatorEvaluator",
"description": "A custom evaluator that checks if the correct operator is being used by the agent ",
"evaluatorConfig": {
"name": "CorrectOperatorEvaluator",
"defaultEvaluationCriteria": {
"operator": "+"
},
"negated": false
}
}
46 changes: 46 additions & 0 deletions samples/calculator/evals/evaluators/custom/correct_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json

from uipath.eval.evaluators import BaseEvaluator, BaseEvaluationCriteria, BaseEvaluatorConfig
from uipath.eval.models import AgentExecution, EvaluationResult, NumericEvaluationResult
from opentelemetry.sdk.trace import ReadableSpan

class CorrectOperatorEvaluationCriteria(BaseEvaluationCriteria):
    """Evaluation criteria for the correct-operator evaluator."""

    # Operator symbol (e.g. "+", "*") the agent is expected to have used.
    operator: str

class CorrectOperatorEvaluatorConfig(BaseEvaluatorConfig[CorrectOperatorEvaluationCriteria]):
    """Configuration for the correct-operator evaluator."""

    name: str = "CorrectOperatorEvaluator"
    # When True, the score is inverted: the evaluation passes when the
    # operator does NOT match the expected one.
    negated: bool = False
    # Criteria applied when an eval case supplies none (see default.json,
    # where some cases pass null for this evaluator).
    default_evaluation_criteria: CorrectOperatorEvaluationCriteria = CorrectOperatorEvaluationCriteria(operator="+")

class CorrectOperatorEvaluator(BaseEvaluator[CorrectOperatorEvaluationCriteria, CorrectOperatorEvaluatorConfig, type(None)]):
    """A custom evaluator that checks if the correct operator is being used by the agent.

    The expected operator comes from the evaluation criteria; the actual
    operator is recovered from the agent's trace via the ``track_operator``
    span emitted during the run.
    """

    def extract_operator_from_spans(self, agent_trace: list[ReadableSpan]) -> str:
        """Return the operator recorded by the first usable ``track_operator`` span.

        The span's ``input.value`` attribute is expected to be a JSON-encoded
        string containing an ``operator`` key — TODO confirm against the
        tracing instrumentation that emits this span.

        Raises:
            ValueError: if no ``track_operator`` span with attributes is found.
        """
        for span in agent_trace:
            # Spans named "track_operator" but lacking attributes are skipped,
            # matching the search-until-found loop semantics.
            if span.name == "track_operator" and span.attributes:
                input_value_as_str = span.attributes.get("input.value", "{}")
                assert isinstance(input_value_as_str, str)
                input_value = json.loads(input_value_as_str)
                return input_value.get("operator")
        # ValueError subclasses Exception, so pre-existing broad handlers still apply.
        raise ValueError("No 'track_operator' span found in the agent trace")

    @classmethod
    def get_evaluator_id(cls) -> str:
        """Return the unique id used by eval sets to reference this evaluator."""
        return "CorrectOperatorEvaluator"

    async def evaluate(self, agent_execution: AgentExecution, evaluation_criteria: CorrectOperatorEvaluationCriteria) -> EvaluationResult:
        """Score 1.0 when the agent used the expected operator, else 0.0.

        The result is inverted when the config's ``negated`` flag is set.
        """
        actual_operator = self.extract_operator_from_spans(agent_execution.agent_trace)
        is_expected_operator = evaluation_criteria.operator == actual_operator
        if self.evaluator_config.negated:
            is_expected_operator = not is_expected_operator
        return NumericEvaluationResult(
            score=float(is_expected_operator),
        )
Loading
Loading