diff --git a/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json b/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json
index 623ffc89b..0b21f80dd 100644
--- a/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json
+++ b/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json
@@ -6,7 +6,7 @@
   "evaluatorConfig": {
     "name": "LLMJudgeOutputEvaluator",
     "targetOutputKey": "*",
-    "model": "gpt-4o-mini",
+    "model": "gpt-4o-mini-2024-07-18",
     "prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
     "temperature": 0.0,
     "defaultEvaluationCriteria": {
diff --git a/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json b/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json
index 9bfab8da8..142c28846 100644
--- a/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json
+++ b/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json
@@ -6,7 +6,7 @@
   "evaluatorConfig": {
     "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator",
     "targetOutputKey": "*",
-    "model": "gpt-4o-mini",
+    "model": "gpt-4o-mini-2024-07-18",
    "prompt": "Compare the following JSON outputs for strict structural similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nEvaluate if the JSON structure and values match precisely. Provide a score from 0-100 where 100 means exact match and 0 means completely different.",
     "temperature": 0.0,
     "defaultEvaluationCriteria": {
diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py
index 5d1d17f38..942015520 100644
--- a/src/uipath/_cli/_evals/_console_progress_reporter.py
+++ b/src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -63,9 +63,18 @@ def _display_successful_evaluation(self, eval_name: str, eval_results) -> None:
             for eval_result in eval_results:
                 evaluator_name = self._get_evaluator_name(eval_result.evaluator_id)
                 score_value = self._convert_score_to_numeric(eval_result)
-                table.add_row(
-                    f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
-                )
+
+                # Show error details if score type is ERROR
+                if eval_result.result.score_type == ScoreType.ERROR:
+                    error_details = eval_result.result.details or "Unknown error"
+                    table.add_row(
+                        f"{evaluator_name}",
+                        f"[red]{score_value:.1f} (Error: {error_details})[/red]"
+                    )
+                else:
+                    table.add_row(
+                        f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
+                    )

             self.console.print(table)
         else:
diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py
index c5492c7de..31a7e2d07 100644
--- a/src/uipath/_cli/_evals/_evaluator_factory.py
+++ b/src/uipath/_cli/_evals/_evaluator_factory.py
@@ -54,6 +54,13 @@ def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator:
     def _create_evaluator_internal(
         data: Dict[str, Any],
     ) -> BaseEvaluator[Any, Any, Any]:
+        # # Validate only the evaluatorConfig part to determine type
+        # evaluator_config_data = data.get("evaluatorConfig", {})
+        # # Add evaluatorTypeId to the config data so discriminator can work
+        # evaluator_config_data_with_type = {
+        #     "evaluatorTypeId": data.get("evaluatorTypeId"),
+        #     **evaluator_config_data
+        # }
         config: BaseEvaluatorConfig[Any] = TypeAdapter(EvaluatorConfig).validate_python(
             data
         )
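Reviewer note on the factory above: it validates the full evaluator payload through a Pydantic TypeAdapter over the EvaluatorConfig union. A minimal, self-contained sketch of that dispatch pattern is shown below; the config classes and the evaluatorTypeId discriminator field are hypothetical stand-ins for illustration only (the real union lives in the SDK's evaluator config models).

from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter


class LLMJudgeConfig(BaseModel):
    # Hypothetical config class for illustration only.
    evaluatorTypeId: Literal["llm-judge"]
    model: str
    prompt: str
    temperature: float = 0.0


class ExactMatchConfig(BaseModel):
    # Hypothetical config class for illustration only.
    evaluatorTypeId: Literal["exact-match"]
    targetOutputKey: str = "*"


# The discriminator field selects the concrete config class during validation.
EvaluatorConfigUnion = Annotated[
    Union[LLMJudgeConfig, ExactMatchConfig],
    Field(discriminator="evaluatorTypeId"),
]

config = TypeAdapter(EvaluatorConfigUnion).validate_python(
    {
        "evaluatorTypeId": "llm-judge",
        "model": "gpt-4o-mini-2024-07-18",
        "prompt": "Compare the outputs...",
    }
)
print(type(config).__name__)  # -> LLMJudgeConfig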
diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py
index 6a3044f3a..3eb5d5431 100644
--- a/src/uipath/_cli/_evals/_progress_reporter.py
+++ b/src/uipath/_cli/_evals/_progress_reporter.py
@@ -31,7 +31,12 @@
     EvaluationEvents,
 )
 from uipath._utils import Endpoint, RequestSpec
-from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
+from uipath._utils.constants import (
+    ENV_EVAL_BACKEND_URL,
+    ENV_TENANT_ID,
+    ENV_BASE_URL,
+    HEADER_INTERNAL_TENANT_ID,
+)
 from uipath.eval.evaluators import LegacyBaseEvaluator
 from uipath.eval.models import EvalItemResult, ScoreType
 from uipath.tracing import LlmOpsHttpExporter
@@ -47,14 +52,12 @@ async def wrapper(self, *args, **kwargs):
         try:
             return await func(self, *args, **kwargs)
         except Exception as e:
-            if hasattr(self, "_console"):
-                error_type = type(e).__name__
-                logger.warning(
-                    f"Cannot report progress to SW. "
-                    f"Function: {func.__name__}, "
-                    f"Error type: {error_type}, "
-                    f"Details: {e}"
-                )
+            # Log at debug level for troubleshooting
+            logger.debug(
+                f"Cannot report progress to SW. "
+                f"Function: {func.__name__}, "
+                f"Error: {e}"
+            )
             return None

     return wrapper
@@ -67,6 +70,7 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
         self.spans_exporter = spans_exporter

         logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
+        logging.getLogger("httpx").setLevel(logging.WARNING)

         console_logger = ConsoleLogger.get_instance()
         uipath = UiPath()
@@ -79,6 +83,9 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
                 "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
             )

+        # Get eval backend URL (can be overridden for local dev)
+        self._eval_backend_url = self._get_eval_backend_url()
+
         self.eval_set_run_ids: Dict[str, str] = {}
         self.evaluators: Dict[str, Any] = {}
         self.evaluator_scores: Dict[str, List[float]] = {}
@@ -86,7 +93,22 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):

     def _format_error_message(self, error: Exception, context: str) -> None:
         """Helper method to format and display error messages consistently."""
-        self._rich_console.print(f"  • \u26a0 [dim]{context}: {error}[/dim]")
+        # Only show simple message without full error details
+        self._rich_console.print(f"  • ⚠ [dim]{context}[/dim]")
+
+    def _get_eval_backend_url(self) -> str:
+        """Get the eval backend URL from environment, falling back to UIPATH_URL."""
+        eval_url = os.getenv(ENV_EVAL_BACKEND_URL)
+        if eval_url:
+            logger.debug(f"Using eval backend URL: {eval_url}")
+            return eval_url.rstrip("/")
+
+        base_url = os.getenv(ENV_BASE_URL, "https://cloud.uipath.com")
+        return base_url.rstrip("/")
+
+    def _build_eval_endpoint_url(self, endpoint: Endpoint) -> str:
+        """Build full URL for eval endpoints using the eval backend URL."""
+        return f"{self._eval_backend_url}{endpoint}"

     @gracefully_handle_errors
     async def create_eval_set_run(
@@ -100,7 +122,7 @@ async def create_eval_set_run(
         spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
         response = await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -124,7 +146,7 @@ async def create_eval_run(
         spec = self._create_eval_run_spec(eval_item, eval_set_run_id)
         response = await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -138,11 +160,11 @@ async def update_eval_run(
         evaluators: dict[str, LegacyBaseEvaluator[Any]],
     ):
         """Update an evaluation run with results."""
-        assertion_runs, evaluator_scores = self._collect_results(
+        evaluator_runs, evaluator_scores = self._collect_results(
             sw_progress_item.eval_results, evaluators
         )
         spec = self._update_eval_run_spec(
-            assertion_runs=assertion_runs,
+            evaluator_runs=evaluator_runs,
             evaluator_scores=evaluator_scores,
             eval_run_id=sw_progress_item.eval_run_id,
             execution_time=sw_progress_item.agent_execution_time,
@@ -150,7 +172,7 @@ async def update_eval_run(
         )
         await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -166,7 +188,7 @@ async def update_eval_set_run(
         spec = self._update_eval_set_run_spec(eval_set_run_id, evaluator_scores)
         await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -203,7 +225,7 @@ async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None:
                 self.eval_run_ids[payload.execution_id] = eval_run_id
                 logger.debug(f"Created eval run with ID: {eval_run_id}")
             else:
-                logger.warning("Cannot create eval run: eval_set_run_id not available")
+                logger.debug("Cannot create eval run: eval_set_run_id not available")
         except Exception as e:
             self._format_error_message(e, "StudioWeb create eval run error")
@@ -258,7 +280,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
                 )
                 logger.debug(f"Updated eval set run with ID: {eval_set_run_id}")
             else:
-                logger.warning(
+                logger.debug(
                     "Cannot update eval set run: eval_set_run_id not available"
                 )
@@ -303,7 +325,7 @@ def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
                 input_schema=input_schema, output_schema=output_schema
             )
         except Exception as e:
-            logger.warning(f"Failed to extract agent snapshot: {e}")
+            logger.debug(f"Failed to extract agent snapshot: {e}")
             return StudioWebAgentSnapshot(input_schema={}, output_schema={})

     def _collect_results(
@@ -311,9 +333,11 @@ def _collect_results(
         eval_results: list[EvalItemResult],
         evaluators: dict[str, LegacyBaseEvaluator[Any]],
     ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
-        assertion_runs: list[dict[str, Any]] = []
+        evaluator_runs: list[dict[str, Any]] = []
         evaluator_scores_list: list[dict[str, Any]] = []
+
         for eval_result in eval_results:
+            # Build scores for the eval run result
             evaluator_scores_list.append(
                 {
                     "type": eval_result.result.score_type.value,
@@ -322,10 +346,38 @@ def _collect_results(
                     "value": eval_result.result.score,
                     "justification": eval_result.result.details,
                     "evaluatorId": eval_result.evaluator_id,
                 }
             )
-            assertion_runs.append(
+
+            # Build evaluator runs for the new coded eval API
+            # Handle both legacy and coded evaluators
+            evaluator = evaluators[eval_result.evaluator_id]
+
+            # Get assertion type and output key based on evaluator type
+            if hasattr(evaluator, 'evaluator_type'):
+                # Legacy evaluator
+                assertion_type = evaluator.evaluator_type.name
+                output_key = evaluator.target_output_key
+            else:
+                # Coded evaluator - use name as type and default output key
+                assertion_type = evaluator.name if hasattr(evaluator, 'name') else "UnknownEvaluator"
+                output_key = "*"  # Coded evaluators don't have target_output_key
+
+            evaluator_runs.append(
                 {
-                    "status": EvaluationStatus.COMPLETED.value,
                     "evaluatorId": eval_result.evaluator_id,
+                    "evaluatorSnapshot": {
+                        "assertionType": assertion_type,
+                        "outputKey": output_key,
+                    },
"evaluationCriteria": None, # Optional field + "status": EvaluationStatus.COMPLETED.value, + "result": { + "output": {}, # Will be set from top-level result + "score": { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": eval_result.result.details, + } + }, "completionMetrics": { "duration": int(eval_result.result.evaluation_time) if eval_result.result.evaluation_time @@ -335,40 +387,33 @@ def _collect_results( "completionTokens": 0, "promptTokens": 0, }, - "assertionSnapshot": { - "assertionType": evaluators[ - eval_result.evaluator_id - ].evaluator_type.name, - "outputKey": evaluators[ - eval_result.evaluator_id - ].target_output_key, - }, } ) - return assertion_runs, evaluator_scores_list + return evaluator_runs, evaluator_scores_list def _update_eval_run_spec( self, - assertion_runs: list[dict[str, Any]], + evaluator_runs: list[dict[str, Any]], evaluator_scores: list[dict[str, Any]], eval_run_id: str, actual_output: dict[str, Any], execution_time: float, ) -> RequestSpec: + # Use new coded eval API endpoint return RequestSpec( method="PUT", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun" + f"api/execution/agents/{self._project_id}/coded/evalRun" ), json={ "evalRunId": eval_run_id, "status": EvaluationStatus.COMPLETED.value, "result": { - "output": {"content": {**actual_output}}, - "evaluatorScores": evaluator_scores, + "output": {**actual_output}, + "scores": evaluator_scores, }, "completionMetrics": {"duration": int(execution_time)}, - "assertionRuns": assertion_runs, + "evaluatorRuns": evaluator_runs, }, headers=self._tenant_header(), ) @@ -376,10 +421,22 @@ def _update_eval_run_spec( def _create_eval_run_spec( self, eval_item: LegacyEvaluationItem, eval_set_run_id: str ) -> RequestSpec: + # Use new coded eval API endpoint + # Handle both legacy and new evaluation item formats + evaluation_criterias = {} + + # Check if it's a legacy item with expected_output or new item with evaluation_criterias + if hasattr(eval_item, 'expected_output'): + # Legacy format: expected_output is a dict at the item level + evaluation_criterias = eval_item.expected_output + elif hasattr(eval_item, 'evaluation_criterias'): + # New format: evaluation_criterias is already in the correct format + evaluation_criterias = eval_item.evaluation_criterias + return RequestSpec( method="POST", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun" + f"api/execution/agents/{self._project_id}/coded/evalRun" ), json={ "evalSetRunId": eval_set_run_id, @@ -387,7 +444,7 @@ def _create_eval_run_spec( "id": eval_item.id, "name": eval_item.name, "inputs": eval_item.inputs, - "expectedOutput": eval_item.expected_output, + "evaluationCriterias": evaluation_criterias, }, "status": EvaluationStatus.IN_PROGRESS.value, }, @@ -400,17 +457,19 @@ def _create_eval_set_run_spec( agent_snapshot: StudioWebAgentSnapshot, no_of_evals: int, ) -> RequestSpec: + # Use new coded eval API endpoint return RequestSpec( method="POST", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun" + f"api/execution/agents/{self._project_id}/coded/evalSetRun" ), json={ - "agentId": self._project_id, "evalSetId": eval_set_id, + "agentId": self._project_id, "agentSnapshot": agent_snapshot.model_dump(by_alias=True), "status": EvaluationStatus.IN_PROGRESS.value, "numberOfEvalsExecuted": no_of_evals, + "version": "1.0", }, headers=self._tenant_header(), ) @@ -420,15 +479,16 @@ def _update_eval_set_run_spec( 
@@ -420,15 +479,16 @@ def _update_eval_set_run_spec(
         eval_set_run_id: str,
         evaluator_scores: dict[str, float],
     ) -> RequestSpec:
+        # Use new coded eval API endpoint
         evaluator_scores_list = [
-            {"value": avg_score, "evaluatorId": evaluator_id}
+            {"evaluatorId": evaluator_id, "value": avg_score}
             for evaluator_id, avg_score in evaluator_scores.items()
         ]

         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+                f"api/execution/agents/{self._project_id}/coded/evalSetRun"
             ),
             json={
                 "evalSetRunId": eval_set_run_id,
diff --git a/src/uipath/_cli/_utils/_tracing.py b/src/uipath/_cli/_utils/_tracing.py
index fdc4a2238..9a63d597a 100644
--- a/src/uipath/_cli/_utils/_tracing.py
+++ b/src/uipath/_cli/_utils/_tracing.py
@@ -10,18 +10,13 @@ def __init__(self, url_to_ignore):

     def filter(self, record):
         try:
+            # Suppress all HTTP Request logs from httpx
             if record.msg == 'HTTP Request: %s %s "%s %d %s"':
-                # Ignore the log if the URL matches the one we want to ignore
-                method = record.args[0]
-                url = record.args[1]
-
-                if method == "POST" and url.path.endswith(self.url_to_ignore):
-                    # Check if the URL contains the specific path we want to ignore
-                    return True
                 return False
         except Exception:
             return False
+        return True


 def setup_tracer_httpx_logging(url: str):
diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py
index 56cdcb775..5b742531d 100644
--- a/src/uipath/_cli/cli_eval.py
+++ b/src/uipath/_cli/cli_eval.py
@@ -5,7 +5,10 @@
 from typing import List, Optional

 import click
+from rich.console import Console
+from rich.table import Table

+from uipath import UiPath
 from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter
 from uipath._cli._evals._progress_reporter import StudioWebProgressReporter
 from uipath._cli._evals._runtime import (
@@ -21,6 +24,8 @@
 from uipath._cli._utils._folders import get_personal_workspace_key_async
 from uipath._cli.middlewares import Middlewares
 from uipath._events._event_bus import EventBus
+from uipath._utils import Endpoint
+from uipath._utils.constants import ENV_EVAL_BACKEND_URL, ENV_BASE_URL, ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
 from uipath.eval._helpers import auto_discover_entrypoint
 from uipath.tracing import LlmOpsHttpExporter
@@ -32,6 +37,102 @@
 console = ConsoleLogger()


+async def list_eval_runs() -> None:
+    """List previous evaluation runs for the current agent."""
+    try:
+        project_id = os.getenv(UIPATH_PROJECT_ID)
+        if not project_id:
+            console.error("UIPATH_PROJECT_ID environment variable not set. Please set it to list previous runs.")
+            return
+
+        tenant_id = os.getenv(ENV_TENANT_ID)
+        if not tenant_id:
+            console.error(f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'.")
+            return
+
+        # Get eval backend URL
+        eval_url = os.getenv(ENV_EVAL_BACKEND_URL)
+        if eval_url:
+            base_url = eval_url.rstrip("/")
+        else:
+            base_url = os.getenv(ENV_BASE_URL, "https://cloud.uipath.com").rstrip("/")
+
+        # Initialize UiPath client
+        uipath = UiPath()
+        client = uipath.api_client
+
+        # Build the endpoint URL
+        url = f"{base_url}/api/execution/agents/{project_id}/coded/evalSetRuns"
+
+        # Make the API call
+        response = await client.request_async(
+            method="GET",
+            url=url,
+            params={"agentId": project_id},
+            headers={HEADER_INTERNAL_TENANT_ID: tenant_id}
+        )
+
+        # Parse the response
+        import json
+        runs = json.loads(response.content)
+
+        if not runs:
+            console.info("No previous evaluation runs found for this agent.")
+            return
+
+        # Display results in a nice table
+        rich_console = Console()
+        table = Table(title=f"Evaluation Runs for Agent {project_id}")
+
+        table.add_column("Run ID", style="cyan", no_wrap=True)
+        table.add_column("Eval Set ID", style="magenta")
+        table.add_column("Status", style="green")
+        table.add_column("Evals Executed", justify="right")
+        table.add_column("Score", justify="right")
+        table.add_column("Duration (ms)", justify="right")
+        table.add_column("Created At", style="yellow")
+
+        for run in runs:
+            # Map status: API returns camelCase strings ("pending", "running", "completed")
+            status_value = run.get("status", "unknown")
+            if isinstance(status_value, str):
+                # Handle string status from API
+                status_map = {
+                    "pending": "Pending",
+                    "running": "Running",
+                    "completed": "Completed"
+                }
+                status = status_map.get(status_value.lower(), status_value.capitalize())
+            else:
+                # Handle integer status as fallback
+                status_map = {0: "Pending", 1: "Running", 2: "Completed"}
+                status = status_map.get(status_value, "Unknown")
+
+            table.add_row(
+                str(run.get("id", "N/A"))[:8] + "...",  # Truncate UUID for display
+                run.get("evalSetId", "N/A"),
+                status,
+                str(run.get("numberOfEvalsExecuted", "N/A")),
+                f"{run.get('score', 0):.2f}" if run.get("score") is not None else "N/A",
+                str(run.get("durationMilliseconds", "N/A")),
+                run.get("createdAt", "N/A")[:19],  # Truncate timestamp
+            )
+
+        rich_console.print(table)
+
+        # Show evaluator scores summary
+        rich_console.print("\n[bold]Evaluator Scores for Most Recent Run:[/bold]")
+        if runs and runs[0].get("evaluatorScores"):
+            scores = runs[0]["evaluatorScores"]
+            for score in scores:
+                evaluator_id = score.get("evaluatorId", "Unknown")
+                value = score.get("value", 0)
+                rich_console.print(f"  • {evaluator_id}: [green]{value:.2f}[/green]")
+
+    except Exception as e:
+        console.error(f"Failed to list eval runs: {e}")
+
+
 class LiteralOption(click.Option):
     def type_cast_value(self, ctx, value):
         try:
@@ -78,6 +179,12 @@ def setup_reporting_prereq(no_report: bool) -> bool:
     type=click.Path(exists=False),
     help="File path where the output will be written",
 )
+@click.option(
+    "--list-runs",
+    is_flag=True,
+    help="List previous evaluation runs for this agent",
+    default=False,
+)
 @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None)
 def eval(
     entrypoint: Optional[str],
@@ -86,6 +193,7 @@ def eval(
     no_report: bool,
     workers: int,
     output_file: Optional[str],
+    list_runs: bool,
 ) -> None:
     """Run an evaluation set against the agent.

@@ -95,7 +203,17 @@ def eval(
         eval_ids: Optional list of evaluation IDs
         workers: Number of parallel workers for running evaluations
         no_report: Do not report the evaluation results
+        list_runs: List previous evaluation runs for this agent
     """
+    # Suppress HTTP request logs from httpx
+    import logging
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
+    # Handle --list-runs flag
+    if list_runs:
+        asyncio.run(list_eval_runs())
+        return
+
     should_register_progress_reporter = setup_reporting_prereq(no_report)

     result = Middlewares.next(
diff --git a/src/uipath/_utils/constants.py b/src/uipath/_utils/constants.py
index c55d92a42..107131014 100644
--- a/src/uipath/_utils/constants.py
+++ b/src/uipath/_utils/constants.py
@@ -1,6 +1,7 @@
 # Environment variables
 DOTENV_FILE = ".env"
 ENV_BASE_URL = "UIPATH_URL"
+ENV_EVAL_BACKEND_URL = "UIPATH_EVAL_BACKEND_URL"
 ENV_UNATTENDED_USER_ACCESS_TOKEN = "UNATTENDED_USER_ACCESS_TOKEN"
 ENV_UIPATH_ACCESS_TOKEN = "UIPATH_ACCESS_TOKEN"
 ENV_FOLDER_KEY = "UIPATH_FOLDER_KEY"
diff --git a/src/uipath/eval/_helpers/helpers.py b/src/uipath/eval/_helpers/helpers.py
index 5059d6827..c1a73e024 100644
--- a/src/uipath/eval/_helpers/helpers.py
+++ b/src/uipath/eval/_helpers/helpers.py
@@ -58,10 +58,15 @@ def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]:
     @functools.wraps(func)
     async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult:
+        import logging
+        logger = logging.getLogger(__name__)
+
         start_time = time.time()
         try:
             result = await func(*args, **kwargs)
         except Exception as e:
+            # Log the full error for debugging
+            logger.error(f"Evaluator error in {func.__name__}: {type(e).__name__}: {e}", exc_info=True)
             result = ErrorEvaluationResult(
                 details="Exception thrown by evaluator: {}".format(e),
                 evaluation_time=time.time() - start_time,
diff --git a/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py
index 9bda57863..65d47879d 100644
--- a/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py
+++ b/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py
@@ -3,7 +3,7 @@
 import json
 from abc import abstractmethod
 from collections.abc import Callable
-from typing import Any, TypeVar
+from typing import Any, Dict, TypeVar

 from pydantic import BaseModel, Field, model_validator

@@ -28,6 +28,42 @@
 T = TypeVar("T", bound=BaseEvaluationCriteria)


+def _cleanup_schema(model_class: type[BaseModel]) -> Dict[str, Any]:
+    """Clean up a Pydantic model schema for use with LLM Gateway.
+
+    This function removes titles and ensures additionalProperties is set on objects.
+ """ + schema = model_class.model_json_schema() + + def clean_type(type_def): + """Clean property definitions by removing titles and cleaning nested items.""" + cleaned_type = {} + for key, value in type_def.items(): + if key == "title" or key == "properties": + continue + else: + cleaned_type[key] = value + if type_def.get("type") == "object" and "additionalProperties" not in type_def: + cleaned_type["additionalProperties"] = False + + if "properties" in type_def: + properties = type_def.get("properties", {}) + for key, value in properties.items(): + properties[key] = clean_type(value) + cleaned_type["properties"] = properties + + if "$defs" in type_def: + cleaned_defs = {} + for key, value in type_def["$defs"].items(): + cleaned_defs[key] = clean_type(value) + cleaned_type["$defs"] = cleaned_defs + return cleaned_type + + # Create clean schema + clean_schema = clean_type(schema) + return clean_schema + + class BaseLLMJudgeEvaluatorConfig(BaseEvaluatorConfig[T]): """Base config for all LLM evaluators. @@ -82,7 +118,7 @@ def _get_llm_service(self): try: uipath = UiPath() - return uipath.llm.chat_completions + return uipath.llm_openai.chat_completions except Exception as e: raise UiPathEvaluationError( code="FAILED_TO_GET_LLM_SERVICE", @@ -157,13 +193,16 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: "type": "json_schema", "json_schema": { "name": "evaluation_response", - "schema": self.output_schema.model_json_schema(), + "schema": _cleanup_schema(self.output_schema), }, }, - "max_tokens": self.evaluator_config.max_tokens, "temperature": self.evaluator_config.temperature, } + # Only include max_tokens if it's not None + if self.evaluator_config.max_tokens is not None: + request_data["max_tokens"] = self.evaluator_config.max_tokens + if self.llm_service is None: raise UiPathEvaluationError( code="LLM_SERVICE_NOT_INITIALIZED", diff --git a/src/uipath/eval/coded_evaluators/output_evaluator.py b/src/uipath/eval/coded_evaluators/output_evaluator.py index 2aa362e18..523bdc9a5 100644 --- a/src/uipath/eval/coded_evaluators/output_evaluator.py +++ b/src/uipath/eval/coded_evaluators/output_evaluator.py @@ -114,4 +114,11 @@ class OutputEvaluator(BaseOutputEvaluator[T_OutputCriteria, C, J]): def _get_full_expected_output(self, evaluation_criteria: T_OutputCriteria) -> Any: """Get the full expected output from the evaluation criteria.""" + if evaluation_criteria is None: + raise UiPathEvaluationError( + code="NO_EVALUATION_CRITERIA_PROVIDED", + title="No evaluation criteria provided for output evaluator", + detail="evaluation_criteria is None and no default_evaluation_criteria is configured in the evaluator config", + category=UiPathEvaluationErrorCategory.USER, + ) return evaluation_criteria.expected_output