@@ -6,7 +6,7 @@
"evaluatorConfig": {
"name": "LLMJudgeOutputEvaluator",
"targetOutputKey": "*",
"model": "gpt-4o-mini",
"model": "gpt-4o-mini-2024-07-18",
"prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
"temperature": 0.0,
"defaultEvaluationCriteria": {
@@ -6,7 +6,7 @@
"evaluatorConfig": {
"name": "LLMJudgeStrictJSONSimilarityOutputEvaluator",
"targetOutputKey": "*",
"model": "gpt-4o-mini",
"model": "gpt-4o-mini-2024-07-18",
"prompt": "Compare the following JSON outputs for strict structural similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nEvaluate if the JSON structure and values match precisely. Provide a score from 0-100 where 100 means exact match and 0 means completely different.",
"temperature": 0.0,
"defaultEvaluationCriteria": {
15 changes: 12 additions & 3 deletions src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -63,9 +63,18 @@ def _display_successful_evaluation(self, eval_name: str, eval_results) -> None:
for eval_result in eval_results:
evaluator_name = self._get_evaluator_name(eval_result.evaluator_id)
score_value = self._convert_score_to_numeric(eval_result)
table.add_row(
f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
)

# Show error details if score type is ERROR
if eval_result.result.score_type == ScoreType.ERROR:
error_details = eval_result.result.details or "Unknown error"
table.add_row(
f"{evaluator_name}",
f"[red]{score_value:.1f} (Error: {error_details})[/red]"
)
else:
table.add_row(
f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
)

self.console.print(table)
else:
7 changes: 7 additions & 0 deletions src/uipath/_cli/_evals/_evaluator_factory.py
@@ -54,6 +54,13 @@ def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator:
def _create_evaluator_internal(
data: Dict[str, Any],
) -> BaseEvaluator[Any, Any, Any]:
# # Validate only the evaluatorConfig part to determine type
@radu-mocanu (Contributor), Oct 20, 2025: is this part needed?

# evaluator_config_data = data.get("evaluatorConfig", {})
# # Add evaluatorTypeId to the config data so discriminator can work
# evaluator_config_data_with_type = {
# "evaluatorTypeId": data.get("evaluatorTypeId"),
# **evaluator_config_data
# }
config: BaseEvaluatorConfig[Any] = TypeAdapter(EvaluatorConfig).validate_python(
data
)
144 changes: 102 additions & 42 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -31,7 +31,12 @@
EvaluationEvents,
)
from uipath._utils import Endpoint, RequestSpec
from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
from uipath._utils.constants import (
ENV_EVAL_BACKEND_URL,
ENV_TENANT_ID,
ENV_BASE_URL,
HEADER_INTERNAL_TENANT_ID,
)
from uipath.eval.evaluators import LegacyBaseEvaluator
from uipath.eval.models import EvalItemResult, ScoreType
from uipath.tracing import LlmOpsHttpExporter
@@ -47,14 +52,12 @@ async def wrapper(self, *args, **kwargs):
try:
return await func(self, *args, **kwargs)
except Exception as e:
if hasattr(self, "_console"):
error_type = type(e).__name__
logger.warning(
f"Cannot report progress to SW. "
f"Function: {func.__name__}, "
f"Error type: {error_type}, "
f"Details: {e}"
)
# Log at debug level for troubleshooting
logger.debug(
Reviewer (Contributor): if we change the logger level here, the users won't be able to see the errors when reporting is unsuccessful. Might be misleading.

f"Cannot report progress to SW. "
f"Function: {func.__name__}, "
f"Error: {e}"
)
return None

return wrapper
@@ -67,6 +70,7 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
self.spans_exporter = spans_exporter

logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
logging.getLogger("httpx").setLevel(logging.WARNING)
Reviewer (Contributor): we shouldn't change the global log level for those. If we wish to avoid writing them to the console (although I would not recommend it), we should exclude them just from the Console reporter part.
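A minimal sketch of the handler-scoped alternative described above, assuming the console output goes through its own logging handler; the handler and filter names are illustrative rather than taken from this PR:

```python
import logging


class DropHttpxRecords(logging.Filter):
    """Hide httpx records from a single handler without touching global levels."""

    def filter(self, record: logging.LogRecord) -> bool:
        return not record.name.startswith("httpx")


# Attach the filter only to the console handler; other handlers (including any
# serverless log sink) still receive the httpx records unchanged.
console_handler = logging.StreamHandler()
console_handler.addFilter(DropHttpxRecords())
logging.getLogger().addHandler(console_handler)
```

With this approach, `logging.getLogger("httpx").setLevel(...)` is not needed at all, so the suppression stays local to the reporter's console output.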

console_logger = ConsoleLogger.get_instance()
uipath = UiPath()

@@ -79,14 +83,32 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
"Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
)

# Get eval backend URL (can be overridden for local dev)
self._eval_backend_url = self._get_eval_backend_url()

self.eval_set_run_ids: Dict[str, str] = {}
self.evaluators: Dict[str, Any] = {}
self.evaluator_scores: Dict[str, List[float]] = {}
self.eval_run_ids: Dict[str, str] = {}

def _format_error_message(self, error: Exception, context: str) -> None:
"""Helper method to format and display error messages consistently."""
self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]")
# Only show simple message without full error details
self._rich_console.print(f" • ⚠ [dim]{context}[/dim]")
Reviewer (Contributor): let's use unicode instead of emojis, please.
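For reference, a small sketch of the escaped form being suggested; the standalone `Console` usage is illustrative and only mirrors the surrounding call:

```python
from rich.console import Console

console = Console()
context = "StudioWeb create eval run error"
# "\u26a0" is the WARNING SIGN code point; the escape avoids embedding the emoji
# glyph directly in the source while rendering the same symbol at runtime.
console.print(f"  • \u26a0 [dim]{context}[/dim]")
```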


def _get_eval_backend_url(self) -> str:
"""Get the eval backend URL from environment, falling back to UIPATH_URL."""
eval_url = os.getenv(ENV_EVAL_BACKEND_URL)
if eval_url:
logger.debug(f"Using eval backend URL: {eval_url}")
return eval_url.rstrip("/")

base_url = os.getenv(ENV_BASE_URL, "https://cloud.uipath.com")
return base_url.rstrip("/")

def _build_eval_endpoint_url(self, endpoint: Endpoint) -> str:
Reviewer (Contributor): let's refactor this into a property; we don't need both _get_eval_backend_url and _eval_backend_url.
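A minimal sketch of that property-based refactor, reusing the ENV_* constants this diff introduces; the class name is a placeholder and only the relevant slice is shown:

```python
import os
from functools import cached_property

from uipath._utils import Endpoint
from uipath._utils.constants import ENV_BASE_URL, ENV_EVAL_BACKEND_URL


class ProgressReporter:  # placeholder name; only the relevant methods are shown
    @cached_property
    def eval_backend_url(self) -> str:
        """Eval backend URL from the environment, falling back to the base URL."""
        url = os.getenv(ENV_EVAL_BACKEND_URL) or os.getenv(
            ENV_BASE_URL, "https://cloud.uipath.com"
        )
        return url.rstrip("/")

    def _build_eval_endpoint_url(self, endpoint: Endpoint) -> str:
        """Build the full URL for eval endpoints using the eval backend URL."""
        return f"{self.eval_backend_url}{endpoint}"
```

`functools.cached_property` collapses `_get_eval_backend_url` and the cached `_eval_backend_url` attribute into a single lazily computed property, which is what the comment is asking for.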

"""Build full URL for eval endpoints using the eval backend URL."""
return f"{self._eval_backend_url}{endpoint}"

@gracefully_handle_errors
async def create_eval_set_run(
@@ -100,7 +122,7 @@ async def create_eval_set_run(
spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
response = await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -124,7 +146,7 @@ async def create_eval_run(
spec = self._create_eval_run_spec(eval_item, eval_set_run_id)
response = await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -138,19 +160,19 @@ async def update_eval_run(
evaluators: dict[str, LegacyBaseEvaluator[Any]],
):
"""Update an evaluation run with results."""
assertion_runs, evaluator_scores = self._collect_results(
evaluator_runs, evaluator_scores = self._collect_results(
sw_progress_item.eval_results, evaluators
)
spec = self._update_eval_run_spec(
assertion_runs=assertion_runs,
evaluator_runs=evaluator_runs,
evaluator_scores=evaluator_scores,
eval_run_id=sw_progress_item.eval_run_id,
execution_time=sw_progress_item.agent_execution_time,
actual_output=sw_progress_item.agent_output,
)
await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -166,7 +188,7 @@ async def update_eval_set_run(
spec = self._update_eval_set_run_spec(eval_set_run_id, evaluator_scores)
await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -203,7 +225,7 @@ async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None:
self.eval_run_ids[payload.execution_id] = eval_run_id
logger.debug(f"Created eval run with ID: {eval_run_id}")
else:
logger.warning("Cannot create eval run: eval_set_run_id not available")
logger.debug("Cannot create eval run: eval_set_run_id not available")
Reviewer (Contributor): we should leave all the log levels as warning. There is no way to see the errors on serverless runs otherwise.


except Exception as e:
self._format_error_message(e, "StudioWeb create eval run error")
@@ -258,7 +280,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None:
)
logger.debug(f"Updated eval set run with ID: {eval_set_run_id}")
else:
logger.warning(
logger.debug(
"Cannot update eval set run: eval_set_run_id not available"
)

@@ -303,17 +325,19 @@ def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
input_schema=input_schema, output_schema=output_schema
)
except Exception as e:
logger.warning(f"Failed to extract agent snapshot: {e}")
logger.debug(f"Failed to extract agent snapshot: {e}")
return StudioWebAgentSnapshot(input_schema={}, output_schema={})

def _collect_results(
self,
eval_results: list[EvalItemResult],
evaluators: dict[str, LegacyBaseEvaluator[Any]],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
assertion_runs: list[dict[str, Any]] = []
evaluator_runs: list[dict[str, Any]] = []
evaluator_scores_list: list[dict[str, Any]] = []

for eval_result in eval_results:
# Build scores for the eval run result
evaluator_scores_list.append(
{
"type": eval_result.result.score_type.value,
@@ -322,10 +346,38 @@ def _collect_results(
"evaluatorId": eval_result.evaluator_id,
}
)
assertion_runs.append(

# Build evaluator runs for the new coded eval API
# Handle both legacy and coded evaluators
evaluator = evaluators[eval_result.evaluator_id]

# Get assertion type and output key based on evaluator type
if hasattr(evaluator, 'evaluator_type'):
# Legacy evaluator
assertion_type = evaluator.evaluator_type.name
output_key = evaluator.target_output_key
else:
# Coded evaluator - use name as type and default output key
assertion_type = evaluator.name if hasattr(evaluator, 'name') else "UnknownEvaluator"
output_key = "*" # Coded evaluators don't have target_output_key

evaluator_runs.append(
{
"status": EvaluationStatus.COMPLETED.value,
"evaluatorId": eval_result.evaluator_id,
"evaluatorSnapshot": {
"assertionType": assertion_type,
"outputKey": output_key,
},
"evaluationCriteria": None, # Optional field
"status": EvaluationStatus.COMPLETED.value,
"result": {
"output": {}, # Will be set from top-level result
"score": {
"type": eval_result.result.score_type.value,
"value": eval_result.result.score,
"justification": eval_result.result.details,
}
},
"completionMetrics": {
"duration": int(eval_result.result.evaluation_time)
if eval_result.result.evaluation_time
@@ -335,59 +387,64 @@ def _collect_results(
"completionTokens": 0,
"promptTokens": 0,
},
"assertionSnapshot": {
"assertionType": evaluators[
eval_result.evaluator_id
].evaluator_type.name,
"outputKey": evaluators[
eval_result.evaluator_id
].target_output_key,
},
}
)
return assertion_runs, evaluator_scores_list
return evaluator_runs, evaluator_scores_list

def _update_eval_run_spec(
self,
assertion_runs: list[dict[str, Any]],
evaluator_runs: list[dict[str, Any]],
evaluator_scores: list[dict[str, Any]],
eval_run_id: str,
actual_output: dict[str, Any],
execution_time: float,
) -> RequestSpec:
# Use new coded eval API endpoint
return RequestSpec(
method="PUT",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
f"api/execution/agents/{self._project_id}/coded/evalRun"
),
json={
"evalRunId": eval_run_id,
"status": EvaluationStatus.COMPLETED.value,
"result": {
"output": {"content": {**actual_output}},
"evaluatorScores": evaluator_scores,
"output": {**actual_output},
"scores": evaluator_scores,
},
"completionMetrics": {"duration": int(execution_time)},
"assertionRuns": assertion_runs,
"evaluatorRuns": evaluator_runs,
},
headers=self._tenant_header(),
)

def _create_eval_run_spec(
self, eval_item: LegacyEvaluationItem, eval_set_run_id: str
) -> RequestSpec:
# Use new coded eval API endpoint
# Handle both legacy and new evaluation item formats
evaluation_criterias = {}

# Check if it's a legacy item with expected_output or new item with evaluation_criterias
if hasattr(eval_item, 'expected_output'):
# Legacy format: expected_output is a dict at the item level
evaluation_criterias = eval_item.expected_output
elif hasattr(eval_item, 'evaluation_criterias'):
# New format: evaluation_criterias is already in the correct format
evaluation_criterias = eval_item.evaluation_criterias

return RequestSpec(
method="POST",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
f"api/execution/agents/{self._project_id}/coded/evalRun"
),
json={
"evalSetRunId": eval_set_run_id,
"evalSnapshot": {
"id": eval_item.id,
"name": eval_item.name,
"inputs": eval_item.inputs,
"expectedOutput": eval_item.expected_output,
"evaluationCriterias": evaluation_criterias,
},
"status": EvaluationStatus.IN_PROGRESS.value,
},
Expand All @@ -400,17 +457,19 @@ def _create_eval_set_run_spec(
agent_snapshot: StudioWebAgentSnapshot,
no_of_evals: int,
) -> RequestSpec:
# Use new coded eval API endpoint
return RequestSpec(
method="POST",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
f"api/execution/agents/{self._project_id}/coded/evalSetRun"
),
json={
"agentId": self._project_id,
"evalSetId": eval_set_id,
"agentId": self._project_id,
"agentSnapshot": agent_snapshot.model_dump(by_alias=True),
"status": EvaluationStatus.IN_PROGRESS.value,
"numberOfEvalsExecuted": no_of_evals,
"version": "1.0",
},
headers=self._tenant_header(),
)
Expand All @@ -420,15 +479,16 @@ def _update_eval_set_run_spec(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
) -> RequestSpec:
# Use new coded eval API endpoint
evaluator_scores_list = [
{"value": avg_score, "evaluatorId": evaluator_id}
{"evaluatorId": evaluator_id, "value": avg_score}
for evaluator_id, avg_score in evaluator_scores.items()
]

return RequestSpec(
method="PUT",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
f"api/execution/agents/{self._project_id}/coded/evalSetRun"
),
json={
"evalSetRunId": eval_set_run_id,