diff --git a/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json b/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json
index 623ffc89b..0b21f80dd 100644
--- a/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json
+++ b/samples/calculator/evals/evaluators/llm-judge-semantic-similarity.json
@@ -6,7 +6,7 @@
   "evaluatorConfig": {
     "name": "LLMJudgeOutputEvaluator",
     "targetOutputKey": "*",
-    "model": "gpt-4o-mini",
+    "model": "gpt-4o-mini-2024-07-18",
     "prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
     "temperature": 0.0,
     "defaultEvaluationCriteria": {
diff --git a/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json b/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json
index 9bfab8da8..142c28846 100644
--- a/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json
+++ b/samples/calculator/evals/evaluators/llm-judge-strict-json-similarity.json
@@ -6,7 +6,7 @@
   "evaluatorConfig": {
     "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator",
     "targetOutputKey": "*",
-    "model": "gpt-4o-mini",
+    "model": "gpt-4o-mini-2024-07-18",
    "prompt": "Compare the following JSON outputs for strict structural similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nEvaluate if the JSON structure and values match precisely. Provide a score from 0-100 where 100 means exact match and 0 means completely different.",
     "temperature": 0.0,
     "defaultEvaluationCriteria": {
diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py
index 5d1d17f38..942015520 100644
--- a/src/uipath/_cli/_evals/_console_progress_reporter.py
+++ b/src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -63,9 +63,18 @@ def _display_successful_evaluation(self, eval_name: str, eval_results) -> None:
             for eval_result in eval_results:
                 evaluator_name = self._get_evaluator_name(eval_result.evaluator_id)
                 score_value = self._convert_score_to_numeric(eval_result)
-                table.add_row(
-                    f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
-                )
+
+                # Show error details if score type is ERROR
+                if eval_result.result.score_type == ScoreType.ERROR:
+                    error_details = eval_result.result.details or "Unknown error"
+                    table.add_row(
+                        f"{evaluator_name}",
+                        f"[red]{score_value:.1f} (Error: {error_details})[/red]"
+                    )
+                else:
+                    table.add_row(
+                        f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
+                    )

             self.console.print(table)
         else:
diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py
index c5492c7de..31a7e2d07 100644
--- a/src/uipath/_cli/_evals/_evaluator_factory.py
+++ b/src/uipath/_cli/_evals/_evaluator_factory.py
@@ -54,6 +54,13 @@ def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator:
     def _create_evaluator_internal(
         data: Dict[str, Any],
     ) -> BaseEvaluator[Any, Any, Any]:
+        # # Validate only the evaluatorConfig part to determine type
+        # evaluator_config_data = data.get("evaluatorConfig", {})
+        # # Add evaluatorTypeId to the config data so discriminator can work
+        # evaluator_config_data_with_type = {
+        #     "evaluatorTypeId": data.get("evaluatorTypeId"),
+        #     **evaluator_config_data
+        # }
         config: BaseEvaluatorConfig[Any] = TypeAdapter(EvaluatorConfig).validate_python(
             data
         )
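Reviewer note on the factory above: it validates the full evaluator payload through a Pydantic TypeAdapter over the EvaluatorConfig union. A minimal, self-contained sketch of that dispatch pattern is shown below; the config classes and the evaluatorTypeId discriminator field are hypothetical stand-ins for illustration only (the real union lives in the SDK's evaluator config models).

from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter


class LLMJudgeConfig(BaseModel):
    # Hypothetical config class for illustration only.
    evaluatorTypeId: Literal["llm-judge"]
    model: str
    prompt: str
    temperature: float = 0.0


class ExactMatchConfig(BaseModel):
    # Hypothetical config class for illustration only.
    evaluatorTypeId: Literal["exact-match"]
    targetOutputKey: str = "*"


# The discriminator field selects the concrete config class during validation.
EvaluatorConfigUnion = Annotated[
    Union[LLMJudgeConfig, ExactMatchConfig],
    Field(discriminator="evaluatorTypeId"),
]

config = TypeAdapter(EvaluatorConfigUnion).validate_python(
    {
        "evaluatorTypeId": "llm-judge",
        "model": "gpt-4o-mini-2024-07-18",
        "prompt": "Compare the outputs...",
    }
)
print(type(config).__name__)  # -> LLMJudgeConfig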
diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py
index 6a3044f3a..3eb5d5431 100644
--- a/src/uipath/_cli/_evals/_progress_reporter.py
+++ b/src/uipath/_cli/_evals/_progress_reporter.py
@@ -31,7 +31,12 @@
     EvaluationEvents,
 )
 from uipath._utils import Endpoint, RequestSpec
-from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
+from uipath._utils.constants import (
+    ENV_EVAL_BACKEND_URL,
+    ENV_TENANT_ID,
+    ENV_BASE_URL,
+    HEADER_INTERNAL_TENANT_ID,
+)
 from uipath.eval.evaluators import LegacyBaseEvaluator
 from uipath.eval.models import EvalItemResult, ScoreType
 from uipath.tracing import LlmOpsHttpExporter
@@ -47,14 +52,12 @@ async def wrapper(self, *args, **kwargs):
         try:
             return await func(self, *args, **kwargs)
         except Exception as e:
-            if hasattr(self, "_console"):
-                error_type = type(e).__name__
-                logger.warning(
-                    f"Cannot report progress to SW. "
-                    f"Function: {func.__name__}, "
-                    f"Error type: {error_type}, "
-                    f"Details: {e}"
-                )
+            # Log at debug level for troubleshooting
+            logger.debug(
+                f"Cannot report progress to SW. "
+                f"Function: {func.__name__}, "
+                f"Error: {e}"
+            )
             return None

     return wrapper
@@ -67,6 +70,7 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
         self.spans_exporter = spans_exporter

         logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
+        logging.getLogger("httpx").setLevel(logging.WARNING)

         console_logger = ConsoleLogger.get_instance()
         uipath = UiPath()
@@ -79,6 +83,9 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
                 "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
             )

+        # Get eval backend URL (can be overridden for local dev)
+        self._eval_backend_url = self._get_eval_backend_url()
+
         self.eval_set_run_ids: Dict[str, str] = {}
         self.evaluators: Dict[str, Any] = {}
         self.evaluator_scores: Dict[str, List[float]] = {}
@@ -86,7 +93,22 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):

     def _format_error_message(self, error: Exception, context: str) -> None:
         """Helper method to format and display error messages consistently."""
-        self._rich_console.print(f"  • \u26a0 [dim]{context}: {error}[/dim]")
+        # Only show simple message without full error details
+        self._rich_console.print(f"  • ⚠ [dim]{context}[/dim]")
+
+    def _get_eval_backend_url(self) -> str:
+        """Get the eval backend URL from environment, falling back to UIPATH_URL."""
+        eval_url = os.getenv(ENV_EVAL_BACKEND_URL)
+        if eval_url:
+            logger.debug(f"Using eval backend URL: {eval_url}")
+            return eval_url.rstrip("/")
+
+        base_url = os.getenv(ENV_BASE_URL, "https://cloud.uipath.com")
+        return base_url.rstrip("/")
+
+    def _build_eval_endpoint_url(self, endpoint: Endpoint) -> str:
+        """Build full URL for eval endpoints using the eval backend URL."""
+        return f"{self._eval_backend_url}{endpoint}"

     @gracefully_handle_errors
     async def create_eval_set_run(
@@ -100,7 +122,7 @@ async def create_eval_set_run(
         spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
         response = await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -124,7 +146,7 @@ async def create_eval_run(
         spec = self._create_eval_run_spec(eval_item, eval_set_run_id)
         response = await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -138,11 +160,11 @@ async def update_eval_run(
         evaluators: dict[str, LegacyBaseEvaluator[Any]],
     ):
         """Update an evaluation run with results."""
-        assertion_runs, evaluator_scores = self._collect_results(
+        evaluator_runs, evaluator_scores = self._collect_results(
             sw_progress_item.eval_results, evaluators
         )
         spec = self._update_eval_run_spec(
-            assertion_runs=assertion_runs,
+            evaluator_runs=evaluator_runs,
             evaluator_scores=evaluator_scores,
             eval_run_id=sw_progress_item.eval_run_id,
             execution_time=sw_progress_item.agent_execution_time,
@@ -150,7 +172,7 @@ async def update_eval_run(
         )
         await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -166,7 +188,7 @@ async def update_eval_set_run(
         spec = self._update_eval_set_run_spec(eval_set_run_id, evaluator_scores)
         await self._client.request_async(
             method=spec.method,
-            url=spec.endpoint,
+            url=self._build_eval_endpoint_url(spec.endpoint),
             params=spec.params,
             json=spec.json,
             headers=spec.headers,
@@ -203,7 +225,7 @@ async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None:
                 self.eval_run_ids[payload.execution_id] = eval_run_id
                 logger.debug(f"Created eval run with ID: {eval_run_id}")
             else:
-                logger.warning("Cannot create eval run: eval_set_run_id not available")
+                logger.debug("Cannot create eval run: eval_set_run_id not available")
         except Exception as e:
             self._format_error_message(e, "StudioWeb create eval run error")
@@ -258,7 +280,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N
                 )
                 logger.debug(f"Updated eval set run with ID: {eval_set_run_id}")
             else:
-                logger.warning(
+                logger.debug(
                     "Cannot update eval set run: eval_set_run_id not available"
                 )
@@ -303,7 +325,7 @@ def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
                 input_schema=input_schema, output_schema=output_schema
             )
         except Exception as e:
-            logger.warning(f"Failed to extract agent snapshot: {e}")
+            logger.debug(f"Failed to extract agent snapshot: {e}")
             return StudioWebAgentSnapshot(input_schema={}, output_schema={})

     def _collect_results(
@@ -311,9 +333,11 @@ def _collect_results(
         eval_results: list[EvalItemResult],
         evaluators: dict[str, LegacyBaseEvaluator[Any]],
     ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
-        assertion_runs: list[dict[str, Any]] = []
+        evaluator_runs: list[dict[str, Any]] = []
         evaluator_scores_list: list[dict[str, Any]] = []
+
         for eval_result in eval_results:
+            # Build scores for the eval run result
             evaluator_scores_list.append(
                 {
                     "type": eval_result.result.score_type.value,
@@ -322,10 +346,38 @@ def _collect_results(
                     "value": eval_result.result.score,
                     "justification": eval_result.result.details,
                     "evaluatorId": eval_result.evaluator_id,
                 }
             )
-            assertion_runs.append(
+
+            # Build evaluator runs for the new coded eval API
+            # Handle both legacy and coded evaluators
+            evaluator = evaluators[eval_result.evaluator_id]
+
+            # Get assertion type and output key based on evaluator type
+            if hasattr(evaluator, 'evaluator_type'):
+                # Legacy evaluator
+                assertion_type = evaluator.evaluator_type.name
+                output_key = evaluator.target_output_key
+            else:
+                # Coded evaluator - use name as type and default output key
+                assertion_type = evaluator.name if hasattr(evaluator, 'name') else "UnknownEvaluator"
+                output_key = "*"  # Coded evaluators don't have target_output_key
+
+            evaluator_runs.append(
                 {
-                    "status": EvaluationStatus.COMPLETED.value,
                     "evaluatorId": eval_result.evaluator_id,
+                    "evaluatorSnapshot": {
+                        "assertionType": assertion_type,
+                        "outputKey": output_key,
+                    },
"evaluationCriteria": None, # Optional field + "status": EvaluationStatus.COMPLETED.value, + "result": { + "output": {}, # Will be set from top-level result + "score": { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": eval_result.result.details, + } + }, "completionMetrics": { "duration": int(eval_result.result.evaluation_time) if eval_result.result.evaluation_time @@ -335,40 +387,33 @@ def _collect_results( "completionTokens": 0, "promptTokens": 0, }, - "assertionSnapshot": { - "assertionType": evaluators[ - eval_result.evaluator_id - ].evaluator_type.name, - "outputKey": evaluators[ - eval_result.evaluator_id - ].target_output_key, - }, } ) - return assertion_runs, evaluator_scores_list + return evaluator_runs, evaluator_scores_list def _update_eval_run_spec( self, - assertion_runs: list[dict[str, Any]], + evaluator_runs: list[dict[str, Any]], evaluator_scores: list[dict[str, Any]], eval_run_id: str, actual_output: dict[str, Any], execution_time: float, ) -> RequestSpec: + # Use new coded eval API endpoint return RequestSpec( method="PUT", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun" + f"api/execution/agents/{self._project_id}/coded/evalRun" ), json={ "evalRunId": eval_run_id, "status": EvaluationStatus.COMPLETED.value, "result": { - "output": {"content": {**actual_output}}, - "evaluatorScores": evaluator_scores, + "output": {**actual_output}, + "scores": evaluator_scores, }, "completionMetrics": {"duration": int(execution_time)}, - "assertionRuns": assertion_runs, + "evaluatorRuns": evaluator_runs, }, headers=self._tenant_header(), ) @@ -376,10 +421,22 @@ def _update_eval_run_spec( def _create_eval_run_spec( self, eval_item: LegacyEvaluationItem, eval_set_run_id: str ) -> RequestSpec: + # Use new coded eval API endpoint + # Handle both legacy and new evaluation item formats + evaluation_criterias = {} + + # Check if it's a legacy item with expected_output or new item with evaluation_criterias + if hasattr(eval_item, 'expected_output'): + # Legacy format: expected_output is a dict at the item level + evaluation_criterias = eval_item.expected_output + elif hasattr(eval_item, 'evaluation_criterias'): + # New format: evaluation_criterias is already in the correct format + evaluation_criterias = eval_item.evaluation_criterias + return RequestSpec( method="POST", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun" + f"api/execution/agents/{self._project_id}/coded/evalRun" ), json={ "evalSetRunId": eval_set_run_id, @@ -387,7 +444,7 @@ def _create_eval_run_spec( "id": eval_item.id, "name": eval_item.name, "inputs": eval_item.inputs, - "expectedOutput": eval_item.expected_output, + "evaluationCriterias": evaluation_criterias, }, "status": EvaluationStatus.IN_PROGRESS.value, }, @@ -400,17 +457,19 @@ def _create_eval_set_run_spec( agent_snapshot: StudioWebAgentSnapshot, no_of_evals: int, ) -> RequestSpec: + # Use new coded eval API endpoint return RequestSpec( method="POST", endpoint=Endpoint( - f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun" + f"api/execution/agents/{self._project_id}/coded/evalSetRun" ), json={ - "agentId": self._project_id, "evalSetId": eval_set_id, + "agentId": self._project_id, "agentSnapshot": agent_snapshot.model_dump(by_alias=True), "status": EvaluationStatus.IN_PROGRESS.value, "numberOfEvalsExecuted": no_of_evals, + "version": "1.0", }, headers=self._tenant_header(), ) @@ -420,15 +479,16 @@ def _update_eval_set_run_spec( 
@@ -420,15 +479,16 @@ def _update_eval_set_run_spec(
         eval_set_run_id: str,
         evaluator_scores: dict[str, float],
     ) -> RequestSpec:
+        # Use new coded eval API endpoint
         evaluator_scores_list = [
-            {"value": avg_score, "evaluatorId": evaluator_id}
+            {"evaluatorId": evaluator_id, "value": avg_score}
             for evaluator_id, avg_score in evaluator_scores.items()
         ]

         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
+                f"api/execution/agents/{self._project_id}/coded/evalSetRun"
             ),
             json={
                 "evalSetRunId": eval_set_run_id,
diff --git a/src/uipath/_cli/_utils/_tracing.py b/src/uipath/_cli/_utils/_tracing.py
index fdc4a2238..9a63d597a 100644
--- a/src/uipath/_cli/_utils/_tracing.py
+++ b/src/uipath/_cli/_utils/_tracing.py
@@ -10,18 +10,13 @@ def __init__(self, url_to_ignore):

     def filter(self, record):
         try:
+            # Suppress all HTTP Request logs from httpx
             if record.msg == 'HTTP Request: %s %s "%s %d %s"':
-                # Ignore the log if the URL matches the one we want to ignore
-                method = record.args[0]
-                url = record.args[1]
-
-                if method == "POST" and url.path.endswith(self.url_to_ignore):
-                    # Check if the URL contains the specific path we want to ignore
-                    return True
                 return False
         except Exception:
             return False
+        return True


 def setup_tracer_httpx_logging(url: str):
diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py
index 56cdcb775..5b742531d 100644
--- a/src/uipath/_cli/cli_eval.py
+++ b/src/uipath/_cli/cli_eval.py
@@ -5,7 +5,10 @@
 from typing import List, Optional

 import click
+from rich.console import Console
+from rich.table import Table

+from uipath import UiPath
 from uipath._cli._evals._console_progress_reporter import ConsoleProgressReporter
 from uipath._cli._evals._progress_reporter import StudioWebProgressReporter
 from uipath._cli._evals._runtime import (
@@ -21,6 +24,8 @@
 from uipath._cli._utils._folders import get_personal_workspace_key_async
 from uipath._cli.middlewares import Middlewares
 from uipath._events._event_bus import EventBus
+from uipath._utils import Endpoint
+from uipath._utils.constants import ENV_EVAL_BACKEND_URL, ENV_BASE_URL, ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
 from uipath.eval._helpers import auto_discover_entrypoint
 from uipath.tracing import LlmOpsHttpExporter
@@ -32,6 +37,102 @@
 console = ConsoleLogger()


+async def list_eval_runs() -> None:
+    """List previous evaluation runs for the current agent."""
+    try:
+        project_id = os.getenv(UIPATH_PROJECT_ID)
+        if not project_id:
+            console.error("UIPATH_PROJECT_ID environment variable not set. Please set it to list previous runs.")
+            return
+
+        tenant_id = os.getenv(ENV_TENANT_ID)
+        if not tenant_id:
+            console.error(f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'.")
+            return
+
+        # Get eval backend URL
+        eval_url = os.getenv(ENV_EVAL_BACKEND_URL)
+        if eval_url:
+            base_url = eval_url.rstrip("/")
+        else:
+            base_url = os.getenv(ENV_BASE_URL, "https://cloud.uipath.com").rstrip("/")
+
+        # Initialize UiPath client
+        uipath = UiPath()
+        client = uipath.api_client
+
+        # Build the endpoint URL
+        url = f"{base_url}/api/execution/agents/{project_id}/coded/evalSetRuns"
+
+        # Make the API call
+        response = await client.request_async(
+            method="GET",
+            url=url,
+            params={"agentId": project_id},
+            headers={HEADER_INTERNAL_TENANT_ID: tenant_id}
+        )
+
+        # Parse the response
+        import json
+        runs = json.loads(response.content)
+
+        if not runs:
+            console.info("No previous evaluation runs found for this agent.")
+            return
+
+        # Display results in a nice table
+        rich_console = Console()
+        table = Table(title=f"Evaluation Runs for Agent {project_id}")
+
+        table.add_column("Run ID", style="cyan", no_wrap=True)
+        table.add_column("Eval Set ID", style="magenta")
+        table.add_column("Status", style="green")
+        table.add_column("Evals Executed", justify="right")
+        table.add_column("Score", justify="right")
+        table.add_column("Duration (ms)", justify="right")
+        table.add_column("Created At", style="yellow")
+
+        for run in runs:
+            # Map status: API returns camelCase strings ("pending", "running", "completed")
+            status_value = run.get("status", "unknown")
+            if isinstance(status_value, str):
+                # Handle string status from API
+                status_map = {
+                    "pending": "Pending",
+                    "running": "Running",
+                    "completed": "Completed"
+                }
+                status = status_map.get(status_value.lower(), status_value.capitalize())
+            else:
+                # Handle integer status as fallback
+                status_map = {0: "Pending", 1: "Running", 2: "Completed"}
+                status = status_map.get(status_value, "Unknown")
+
+            table.add_row(
+                str(run.get("id", "N/A"))[:8] + "...",  # Truncate UUID for display
+                run.get("evalSetId", "N/A"),
+                status,
+                str(run.get("numberOfEvalsExecuted", "N/A")),
+                f"{run.get('score', 0):.2f}" if run.get("score") is not None else "N/A",
+                str(run.get("durationMilliseconds", "N/A")),
+                run.get("createdAt", "N/A")[:19],  # Truncate timestamp
+            )
+
+        rich_console.print(table)
+
+        # Show evaluator scores summary
+        rich_console.print("\n[bold]Evaluator Scores for Most Recent Run:[/bold]")
+        if runs and runs[0].get("evaluatorScores"):
+            scores = runs[0]["evaluatorScores"]
+            for score in scores:
+                evaluator_id = score.get("evaluatorId", "Unknown")
+                value = score.get("value", 0)
+                rich_console.print(f"  • {evaluator_id}: [green]{value:.2f}[/green]")
+
+    except Exception as e:
+        console.error(f"Failed to list eval runs: {e}")
+
+
 class LiteralOption(click.Option):
     def type_cast_value(self, ctx, value):
         try:
@@ -78,6 +179,12 @@ def setup_reporting_prereq(no_report: bool) -> bool:
     type=click.Path(exists=False),
     help="File path where the output will be written",
 )
+@click.option(
+    "--list-runs",
+    is_flag=True,
+    help="List previous evaluation runs for this agent",
+    default=False,
+)
 @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None)
 def eval(
     entrypoint: Optional[str],
@@ -86,6 +193,7 @@ def eval(
     no_report: bool,
     workers: int,
     output_file: Optional[str],
+    list_runs: bool,
 ) -> None:
     """Run an evaluation set against the agent.

@@ -95,7 +203,17 @@ def eval(
         eval_ids: Optional list of evaluation IDs
         workers: Number of parallel workers for running evaluations
         no_report: Do not report the evaluation results
+        list_runs: List previous evaluation runs for this agent
     """
+    # Suppress HTTP request logs from httpx
+    import logging
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
+    # Handle --list-runs flag
+    if list_runs:
+        asyncio.run(list_eval_runs())
+        return
+
     should_register_progress_reporter = setup_reporting_prereq(no_report)

     result = Middlewares.next(
diff --git a/src/uipath/_utils/constants.py b/src/uipath/_utils/constants.py
index c55d92a42..107131014 100644
--- a/src/uipath/_utils/constants.py
+++ b/src/uipath/_utils/constants.py
@@ -1,6 +1,7 @@
 # Environment variables
 DOTENV_FILE = ".env"
 ENV_BASE_URL = "UIPATH_URL"
+ENV_EVAL_BACKEND_URL = "UIPATH_EVAL_BACKEND_URL"
 ENV_UNATTENDED_USER_ACCESS_TOKEN = "UNATTENDED_USER_ACCESS_TOKEN"
 ENV_UIPATH_ACCESS_TOKEN = "UIPATH_ACCESS_TOKEN"
 ENV_FOLDER_KEY = "UIPATH_FOLDER_KEY"
diff --git a/src/uipath/eval/_helpers/helpers.py b/src/uipath/eval/_helpers/helpers.py
index 5059d6827..c1a73e024 100644
--- a/src/uipath/eval/_helpers/helpers.py
+++ b/src/uipath/eval/_helpers/helpers.py
@@ -58,10 +58,15 @@ def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]:
     @functools.wraps(func)
     async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult:
+        import logging
+        logger = logging.getLogger(__name__)
+
         start_time = time.time()
         try:
             result = await func(*args, **kwargs)
         except Exception as e:
+            # Log the full error for debugging
+            logger.error(f"Evaluator error in {func.__name__}: {type(e).__name__}: {e}", exc_info=True)
             result = ErrorEvaluationResult(
                 details="Exception thrown by evaluator: {}".format(e),
                 evaluation_time=time.time() - start_time,
diff --git a/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py
index 9bda57863..65d47879d 100644
--- a/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py
+++ b/src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py
@@ -3,7 +3,7 @@
 import json
 from abc import abstractmethod
 from collections.abc import Callable
-from typing import Any, TypeVar
+from typing import Any, Dict, TypeVar

 from pydantic import BaseModel, Field, model_validator

@@ -28,6 +28,42 @@
 T = TypeVar("T", bound=BaseEvaluationCriteria)


+def _cleanup_schema(model_class: type[BaseModel]) -> Dict[str, Any]:
+    """Clean up a Pydantic model schema for use with LLM Gateway.
+
+    This function removes titles and ensures additionalProperties is set on objects.
+ """ + schema = model_class.model_json_schema() + + def clean_type(type_def): + """Clean property definitions by removing titles and cleaning nested items.""" + cleaned_type = {} + for key, value in type_def.items(): + if key == "title" or key == "properties": + continue + else: + cleaned_type[key] = value + if type_def.get("type") == "object" and "additionalProperties" not in type_def: + cleaned_type["additionalProperties"] = False + + if "properties" in type_def: + properties = type_def.get("properties", {}) + for key, value in properties.items(): + properties[key] = clean_type(value) + cleaned_type["properties"] = properties + + if "$defs" in type_def: + cleaned_defs = {} + for key, value in type_def["$defs"].items(): + cleaned_defs[key] = clean_type(value) + cleaned_type["$defs"] = cleaned_defs + return cleaned_type + + # Create clean schema + clean_schema = clean_type(schema) + return clean_schema + + class BaseLLMJudgeEvaluatorConfig(BaseEvaluatorConfig[T]): """Base config for all LLM evaluators. @@ -82,7 +118,7 @@ def _get_llm_service(self): try: uipath = UiPath() - return uipath.llm.chat_completions + return uipath.llm_openai.chat_completions except Exception as e: raise UiPathEvaluationError( code="FAILED_TO_GET_LLM_SERVICE", @@ -157,13 +193,16 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: "type": "json_schema", "json_schema": { "name": "evaluation_response", - "schema": self.output_schema.model_json_schema(), + "schema": _cleanup_schema(self.output_schema), }, }, - "max_tokens": self.evaluator_config.max_tokens, "temperature": self.evaluator_config.temperature, } + # Only include max_tokens if it's not None + if self.evaluator_config.max_tokens is not None: + request_data["max_tokens"] = self.evaluator_config.max_tokens + if self.llm_service is None: raise UiPathEvaluationError( code="LLM_SERVICE_NOT_INITIALIZED", diff --git a/src/uipath/eval/coded_evaluators/output_evaluator.py b/src/uipath/eval/coded_evaluators/output_evaluator.py index 2aa362e18..523bdc9a5 100644 --- a/src/uipath/eval/coded_evaluators/output_evaluator.py +++ b/src/uipath/eval/coded_evaluators/output_evaluator.py @@ -114,4 +114,11 @@ class OutputEvaluator(BaseOutputEvaluator[T_OutputCriteria, C, J]): def _get_full_expected_output(self, evaluation_criteria: T_OutputCriteria) -> Any: """Get the full expected output from the evaluation criteria.""" + if evaluation_criteria is None: + raise UiPathEvaluationError( + code="NO_EVALUATION_CRITERIA_PROVIDED", + title="No evaluation criteria provided for output evaluator", + detail="evaluation_criteria is None and no default_evaluation_criteria is configured in the evaluator config", + category=UiPathEvaluationErrorCategory.USER, + ) return evaluation_criteria.expected_output