Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
2e24fe0
add initial version of revamped coded evaluators
andrei-rusu Sep 25, 2025
0b44a7c
fix copilot and linting issues
andrei-rusu Oct 10, 2025
1b016c1
Merge pull request #677 from UiPath/fix/andreiru/coded_evals_copilot_…
radu-mocanu Oct 10, 2025
333821b
feat: new eval schema support + contain evaluator wiring
radu-mocanu Oct 13, 2025
860c88f
feat: wiring ExactMatch evaluator to new schema
mjnovice Oct 14, 2025
fd324b0
Merge pull request #690 from UiPath/mj/wire-exact-match
mjnovice Oct 14, 2025
74bc147
feat: wiring JsonSimilarity evaluator to new schema
mjnovice Oct 14, 2025
ef92ef5
Merge pull request #692 from UiPath/mj/wire-json-similarity
mjnovice Oct 14, 2025
e05bd98
feat: wiring LLM judge evaluators to new schema
mjnovice Oct 15, 2025
a5e4947
feat: implement version-based discriminator for coded-evals push/pull
Chibionos Oct 13, 2025
2205d60
Merge pull request #681 from UiPath/feat/updating-push-pull
Chibionos Oct 15, 2025
7a2937d
feat: progress on parallelization of eval runs
akshaylive Oct 10, 2025
931a4c6
Merge pull request #697 from UiPath/mj/wire-llm-as-a-judge
mjnovice Oct 16, 2025
962f07d
Merge pull request #704 from UiPath/dev/andreiru/cherry_pick_parallel…
andrei-rusu Oct 16, 2025
4964c66
feat: missing changes from llm eval wiring
mjnovice Oct 17, 2025
746c8a7
Merge pull request #724 from UiPath/mj/missed-fixes
mjnovice Oct 20, 2025
0db93c2
feat: wiring up trajectory evals
mjnovice Oct 20, 2025
4553f34
Merge pull request #721 from UiPath/mj/wire-trajectory
mjnovice Oct 20, 2025
32e33fc
feat(evals): add dedicated UIPATH_EVAL_BACKEND_URL for localhost routing
Oct 21, 2025
9af6fa9
feat: wire tool evals, add mocked tool sample agent
mjnovice Oct 20, 2025
a533446
Merge pull request #735 from UiPath/mj/wire-tool
mjnovice Oct 21, 2025
8898a71
refac: coded_evalutors -> evaluators, associated renames/changes
mjnovice Oct 22, 2025
22356c5
Merge pull request #752 from UiPath/mj/refac-names
mjnovice Oct 22, 2025
f0883c6
Merge pull request #740 from UiPath/fix/apply-new-reporting-api-change
Chibionos Oct 22, 2025
88ca0e1
feat: add support for custom evaluators
radu-mocanu Oct 15, 2025
8957b97
Merge pull request #703 from UiPath/feat/custom-evals
radu-mocanu Oct 22, 2025
602854e
Merge branch 'main' into release/revamped-evals
mjnovice Oct 23, 2025
7efe612
Merge branch 'main' into release/revamped-evals
akshaylive Oct 23, 2025
35fc809
Merge branch 'main' into release/revamped-evals
akshaylive Oct 23, 2025
6cb6db5
Merge branch 'main' into release/revamped-evals
akshaylive Oct 23, 2025
07d1c07
fix(TonOfFixes): lots of minor fixes
akshaylive Oct 23, 2025
3fab172
Merge pull request #772 from UiPath/akshaya/revamped-eval-fix
akshaylive Oct 24, 2025
78fd2fb
fix(UnitTest): model default
akshaylive Oct 24, 2025
4095baf
Merge branch 'main' into release/revamped-evals
Oct 24, 2025
dbb414c
Merge branch 'main' into release/revamped-evals
Oct 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions samples/calculator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,22 @@ After initialization, execute the agent using this sample command:
```
uipath run main.py '{"a": 0, "b": 1, "operator": "+"}'
```

# Run evaluations
```
uipath eval .\main.py .\evals\eval-sets\default.json --no-report --output-file output.json
```

# Add and register custom evaluator

1. (Optional) Add a new evaluator -> can be created manually in the evals/custom-evaluators directory
```
uipath add evaluator my_custom_evaluator
```
2. Implement the evaluator logic in the generated Python file under evals/custom-evaluators

3. Register the evaluator
```
uipath register evaluator my_custom_evaluator
```
4. Apply it to any dataset
130 changes: 71 additions & 59 deletions samples/calculator/evals/eval-sets/default.json
Original file line number Diff line number Diff line change
@@ -1,72 +1,84 @@
{
"fileName": "default.json",
"id": "default-eval-set-id",
"name": "Basic Calculator Evaluation Set",
"batchSize": 10,
"version": "1.0",
"id": "NewSchemaSampleEval",
"name": "New Schema Sample Evaluation",
"evaluatorRefs": [
"equality",
"llm-as-a-judge"
"ContainsEvaluator",
"ExactMatchEvaluator",
"JsonSimilarityEvaluator",
"LLMJudgeOutputEvaluator",
"LLMJudgeStrictJSONSimilarityOutputEvaluator",
"TrajectoryEvaluator",
"CorrectOperatorEvaluator"
],
"evaluations": [
{
"id": "test-addition",
"name": "Test Addition",
"inputs": {"a": 1, "b": 1, "operator": "+"},
"expectedOutput": {"result": 2},
"expectedAgentBehavior": "",
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
"id": "default",
"name": "Add",
"inputs": {
"a": 1,
"b": 4,
"operator": "+"
},
"evaluationCriterias": {
"ContainsEvaluator": null,
"ExactMatchEvaluator": null,
"JsonSimilarityEvaluator": null,
"LLMJudgeOutputEvaluator": null,
"LLMJudgeStrictJSONSimilarityOutputEvaluator": null,
"TrajectoryEvaluator": null,
"CorrectOperatorEvaluator": null
}
},
{
"id": "test-random-addition-using-mockito",
"name": "Test Random Addition Using Mockito",
"inputs": {"a": 1, "b": 1, "operator": "random"},
"expectedOutput": {"result": 2},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "mockito",
"behaviors": [
{
"function": "get_random_operator",
"arguments": {
"args": [],
"kwargs": {}
},
"then": [
{
"type": "return",
"value": {"result": "+"}
}
]
}
]
"id": "override",
"name": "Multiply",
"inputs": {
"a": 2,
"b": 4,
"operator": "*"
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
"evaluationCriterias": {
"ContainsEvaluator": {
"searchText": "8"
},
"CorrectOperatorEvaluator": {
"operator": "*"
},
"ExactMatchEvaluator": {
"expectedOutput": {
"result": "8.0"
}
},
"JsonSimilarityEvaluator": {
"expectedOutput": {
"result": 8.0
}
},
"LLMJudgeOutputEvaluator": {
"expectedOutput": {
"result": 8.0
}
},
"LLMJudgeStrictJSONSimilarityOutputEvaluator": {
"expectedOutput": {
"result": 8.0
}
},
"TrajectoryEvaluator": {
"expectedAgentBehavior": "The agent should correctly parse the multiply operation, perform the calculation 2 * 4, and return the result 8.0"
}
}
},
{
"id": "test-random-addition-using-llm",
"name": "Test Random Addition Using LLM",
"inputs": {"a": 1, "b": 1, "operator": "random"},
"expectedOutput": {"result": 2},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "llm",
"prompt": "The random operator is '+'.",
"toolsToSimulate": [{"name": "get_random_operator"}],
"model": {
"model": "gpt-4o-mini-2024-07-18",
"temperature": 0
}
"id": "skip",
"name": "Skip denial code check",
"inputs": {
"a": 1,
"b": 1,
"operator": "+"
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
"evaluationCriterias": {}
}
],
"modelSettings": [],
"createdAt": "2025-09-04T18:54:58.379Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
]
}
98 changes: 98 additions & 0 deletions samples/calculator/evals/eval-sets/legacy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"fileName": "default.json",
"id": "default-eval-set-id",
"name": "Basic Calculator Evaluation Set",
"batchSize": 10,
"evaluatorRefs": [
"equality",
"llm-as-a-judge",
"json-similarity",
"trajectory"
],
"evaluations": [
{
"id": "test-addition",
"name": "Test Addition",
"inputs": {
"a": 1,
"b": 1,
"operator": "+"
},
"expectedOutput": {
"result": 2
},
"expectedAgentBehavior": "",
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
},
{
"id": "test-random-addition-using-mockito",
"name": "Test Random Addition Using Mockito",
"inputs": {
"a": 1,
"b": 1,
"operator": "random"
},
"expectedOutput": {
"result": 2
},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "mockito",
"behaviors": [
{
"function": "get_random_operator",
"arguments": {
"args": [],
"kwargs": {}
},
"then": [
{
"type": "return",
"value": {
"result": "+"
}
}
]
}
]
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
},
{
"id": "test-random-addition-using-llm",
"name": "Test Random Addition Using LLM",
"inputs": {
"a": 1,
"b": 1,
"operator": "random"
},
"expectedOutput": {
"result": 2
},
"expectedAgentBehavior": "",
"mockingStrategy": {
"type": "llm",
"prompt": "The random operator is '+'.",
"toolsToSimulate": [
{
"name": "get_random_operator"
}
],
"model": {
"model": "gpt-4o-mini-2024-07-18",
"temperature": 0
}
},
"evalSetId": "default-eval-set-id",
"createdAt": "2025-09-04T18:54:58.378Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
}
],
"modelSettings": [],
"createdAt": "2025-09-04T18:54:58.379Z",
"updatedAt": "2025-09-04T18:55:55.416Z"
}
15 changes: 15 additions & 0 deletions samples/calculator/evals/evaluators/contains.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"version": "1.0",
"id": "ContainsEvaluator",
"description": "Checks if the response text includes the expected denial code.",
"evaluatorTypeId": "uipath-contains",
"evaluatorConfig": {
"name": "ContainsEvaluator",
"targetOutputKey": "result",
"negated": false,
"ignoreCase": false,
"defaultEvaluationCriteria": {
"searchText": "5"
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"version": "1.0",
"id": "CorrectOperatorEvaluator",
"evaluatorTypeId": "file://types/correct-operator-evaluator-types.json",
"evaluatorSchema": "file://correct_operator.py:CorrectOperatorEvaluator",
"description": "A custom evaluator that checks if the correct operator is being used by the agent ",
"evaluatorConfig": {
"name": "CorrectOperatorEvaluator",
"defaultEvaluationCriteria": {
"operator": "+"
},
"negated": false
}
}
46 changes: 46 additions & 0 deletions samples/calculator/evals/evaluators/custom/correct_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json

from uipath.eval.evaluators import BaseEvaluator, BaseEvaluationCriteria, BaseEvaluatorConfig
from uipath.eval.models import AgentExecution, EvaluationResult, NumericEvaluationResult
from opentelemetry.sdk.trace import ReadableSpan

class CorrectOperatorEvaluationCriteria(BaseEvaluationCriteria):
    """Evaluation criteria for the correct-operator evaluator."""

    # Operator symbol (e.g. "+", "*") the agent is expected to have used.
    operator: str

class CorrectOperatorEvaluatorConfig(BaseEvaluatorConfig[CorrectOperatorEvaluationCriteria]):
    """Configuration for the correct-operator evaluator."""

    name: str = "CorrectOperatorEvaluator"
    # When True, the score is inverted: the evaluation passes when the
    # operator does NOT match the expected one.
    negated: bool = False
    # Criteria applied when an eval case supplies none (see default.json,
    # where some cases pass null for this evaluator).
    default_evaluation_criteria: CorrectOperatorEvaluationCriteria = CorrectOperatorEvaluationCriteria(operator="+")

class CorrectOperatorEvaluator(BaseEvaluator[CorrectOperatorEvaluationCriteria, CorrectOperatorEvaluatorConfig, type(None)]):
    """A custom evaluator that checks if the correct operator is being used by the agent.

    The expected operator comes from the evaluation criteria; the actual
    operator is recovered from the agent's trace via the ``track_operator``
    span emitted during the run.
    """

    def extract_operator_from_spans(self, agent_trace: list[ReadableSpan]) -> str:
        """Return the operator recorded by the first usable ``track_operator`` span.

        The span's ``input.value`` attribute is expected to be a JSON-encoded
        string containing an ``operator`` key — TODO confirm against the
        tracing instrumentation that emits this span.

        Raises:
            ValueError: if no ``track_operator`` span with attributes is found.
        """
        for span in agent_trace:
            # Spans named "track_operator" but lacking attributes are skipped,
            # matching the search-until-found loop semantics.
            if span.name == "track_operator" and span.attributes:
                input_value_as_str = span.attributes.get("input.value", "{}")
                assert isinstance(input_value_as_str, str)
                input_value = json.loads(input_value_as_str)
                return input_value.get("operator")
        # ValueError subclasses Exception, so pre-existing broad handlers still apply.
        raise ValueError("No 'track_operator' span found in the agent trace")

    @classmethod
    def get_evaluator_id(cls) -> str:
        """Return the unique id used by eval sets to reference this evaluator."""
        return "CorrectOperatorEvaluator"

    async def evaluate(self, agent_execution: AgentExecution, evaluation_criteria: CorrectOperatorEvaluationCriteria) -> EvaluationResult:
        """Score 1.0 when the agent used the expected operator, else 0.0.

        The result is inverted when the config's ``negated`` flag is set.
        """
        actual_operator = self.extract_operator_from_spans(agent_execution.agent_trace)
        is_expected_operator = evaluation_criteria.operator == actual_operator
        if self.evaluator_config.negated:
            is_expected_operator = not is_expected_operator
        return NumericEvaluationResult(
            score=float(is_expected_operator),
        )
Loading
Loading