@@ -6,7 +6,7 @@
"evaluatorConfig": {
"name": "LLMJudgeOutputEvaluator",
"targetOutputKey": "*",
"model": "gpt-4o-mini",
"model": "gpt-4o-mini-2024-07-18",
"prompt": "Compare the following outputs and evaluate their semantic similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nProvide a score from 0-100 where 100 means semantically identical and 0 means completely different.",
"temperature": 0.0,
"defaultEvaluationCriteria": {
@@ -6,7 +6,7 @@
"evaluatorConfig": {
"name": "LLMJudgeStrictJSONSimilarityOutputEvaluator",
"targetOutputKey": "*",
"model": "gpt-4o-mini",
"model": "gpt-4o-mini-2024-07-18",
"prompt": "Compare the following JSON outputs for strict structural similarity.\n\nActual Output: {{ActualOutput}}\nExpected Output: {{ExpectedOutput}}\n\nEvaluate if the JSON structure and values match precisely. Provide a score from 0-100 where 100 means exact match and 0 means completely different.",
"temperature": 0.0,
"defaultEvaluationCriteria": {
15 changes: 12 additions & 3 deletions src/uipath/_cli/_evals/_console_progress_reporter.py
@@ -63,9 +63,18 @@ def _display_successful_evaluation(self, eval_name: str, eval_results) -> None:
for eval_result in eval_results:
evaluator_name = self._get_evaluator_name(eval_result.evaluator_id)
score_value = self._convert_score_to_numeric(eval_result)
table.add_row(
f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
)

# Show error details if score type is ERROR
if eval_result.result.score_type == ScoreType.ERROR:
error_details = eval_result.result.details or "Unknown error"
table.add_row(
f"{evaluator_name}",
f"[red]{score_value:.1f} (Error: {error_details})[/red]"
)
else:
table.add_row(
f"{evaluator_name}", f"[bold cyan]{score_value:.1f}[/bold cyan]"
)

self.console.print(table)
else:
7 changes: 7 additions & 0 deletions src/uipath/_cli/_evals/_evaluator_factory.py
@@ -54,6 +54,13 @@ def create_evaluator(cls, data: Dict[str, Any]) -> AnyEvaluator:
def _create_evaluator_internal(
data: Dict[str, Any],
) -> BaseEvaluator[Any, Any, Any]:
# # Validate only the evaluatorConfig part to determine type
@radu-mocanu (Contributor), Oct 20, 2025: is this part needed?

# evaluator_config_data = data.get("evaluatorConfig", {})
# # Add evaluatorTypeId to the config data so discriminator can work
# evaluator_config_data_with_type = {
# "evaluatorTypeId": data.get("evaluatorTypeId"),
# **evaluator_config_data
# }
config: BaseEvaluatorConfig[Any] = TypeAdapter(EvaluatorConfig).validate_python(
data
)
144 changes: 102 additions & 42 deletions src/uipath/_cli/_evals/_progress_reporter.py
@@ -31,7 +31,12 @@
EvaluationEvents,
)
from uipath._utils import Endpoint, RequestSpec
from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
from uipath._utils.constants import (
ENV_EVAL_BACKEND_URL,
ENV_TENANT_ID,
ENV_BASE_URL,
HEADER_INTERNAL_TENANT_ID,
)
from uipath.eval.evaluators import LegacyBaseEvaluator
from uipath.eval.models import EvalItemResult, ScoreType
from uipath.tracing import LlmOpsHttpExporter
@@ -47,14 +52,12 @@ async def wrapper(self, *args, **kwargs):
try:
return await func(self, *args, **kwargs)
except Exception as e:
if hasattr(self, "_console"):
error_type = type(e).__name__
logger.warning(
f"Cannot report progress to SW. "
f"Function: {func.__name__}, "
f"Error type: {error_type}, "
f"Details: {e}"
)
# Log at debug level for troubleshooting
logger.debug(
Reviewer (Contributor): if we change the logger level here, the users won't be able to see the errors when reporting is unsuccessful. Might be misleading.

f"Cannot report progress to SW. "
f"Function: {func.__name__}, "
f"Error: {e}"
)
return None

return wrapper
@@ -67,6 +70,7 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
self.spans_exporter = spans_exporter

logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL)
logging.getLogger("httpx").setLevel(logging.WARNING)
Reviewer (Contributor): we shouldn't change the global log level for those. If we wish to avoid writing them to the console (although I would not recommend it), we should exclude them just from the Console reporter part.
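A minimal sketch of the handler-scoped alternative described above, assuming the console output goes through its own logging handler; the handler and filter names are illustrative rather than taken from this PR:

```python
import logging


class DropHttpxRecords(logging.Filter):
    """Hide httpx records from a single handler without touching global levels."""

    def filter(self, record: logging.LogRecord) -> bool:
        return not record.name.startswith("httpx")


# Attach the filter only to the console handler; other handlers (including any
# serverless log sink) still receive the httpx records unchanged.
console_handler = logging.StreamHandler()
console_handler.addFilter(DropHttpxRecords())
logging.getLogger().addHandler(console_handler)
```

With this approach, `logging.getLogger("httpx").setLevel(...)` is not needed at all, so the suppression stays local to the reporter's console output.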

console_logger = ConsoleLogger.get_instance()
uipath = UiPath()

@@ -79,14 +83,32 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter):
"Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
)

# Get eval backend URL (can be overridden for local dev)
self._eval_backend_url = self._get_eval_backend_url()

self.eval_set_run_ids: Dict[str, str] = {}
self.evaluators: Dict[str, Any] = {}
self.evaluator_scores: Dict[str, List[float]] = {}
self.eval_run_ids: Dict[str, str] = {}

def _format_error_message(self, error: Exception, context: str) -> None:
"""Helper method to format and display error messages consistently."""
self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]")
# Only show simple message without full error details
self._rich_console.print(f" • ⚠ [dim]{context}[/dim]")
Reviewer (Contributor): let's use unicode instead of emojis, please.
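For reference, a small sketch of the escaped form being suggested; the standalone `Console` usage is illustrative and only mirrors the surrounding call:

```python
from rich.console import Console

console = Console()
context = "StudioWeb create eval run error"
# "\u26a0" is the WARNING SIGN code point; the escape avoids embedding the emoji
# glyph directly in the source while rendering the same symbol at runtime.
console.print(f"  • \u26a0 [dim]{context}[/dim]")
```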


def _get_eval_backend_url(self) -> str:
"""Get the eval backend URL from environment, falling back to UIPATH_URL."""
eval_url = os.getenv(ENV_EVAL_BACKEND_URL)
if eval_url:
logger.debug(f"Using eval backend URL: {eval_url}")
return eval_url.rstrip("/")

base_url = os.getenv(ENV_BASE_URL, "https://cloud.uipath.com")
return base_url.rstrip("/")

def _build_eval_endpoint_url(self, endpoint: Endpoint) -> str:
Reviewer (Contributor): let's refactor this into a property; we don't need both _get_eval_backend_url and _eval_backend_url.
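A minimal sketch of that property-based refactor, reusing the ENV_* constants this diff introduces; the class name is a placeholder and only the relevant slice is shown:

```python
import os
from functools import cached_property

from uipath._utils import Endpoint
from uipath._utils.constants import ENV_BASE_URL, ENV_EVAL_BACKEND_URL


class ProgressReporter:  # placeholder name; only the relevant methods are shown
    @cached_property
    def eval_backend_url(self) -> str:
        """Eval backend URL from the environment, falling back to the base URL."""
        url = os.getenv(ENV_EVAL_BACKEND_URL) or os.getenv(
            ENV_BASE_URL, "https://cloud.uipath.com"
        )
        return url.rstrip("/")

    def _build_eval_endpoint_url(self, endpoint: Endpoint) -> str:
        """Build the full URL for eval endpoints using the eval backend URL."""
        return f"{self.eval_backend_url}{endpoint}"
```

`functools.cached_property` collapses `_get_eval_backend_url` and the cached `_eval_backend_url` attribute into a single lazily computed property, which is what the comment is asking for.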

"""Build full URL for eval endpoints using the eval backend URL."""
return f"{self._eval_backend_url}{endpoint}"

@gracefully_handle_errors
async def create_eval_set_run(
@@ -100,7 +122,7 @@ async def create_eval_set_run(
spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
response = await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -124,7 +146,7 @@ async def create_eval_run(
spec = self._create_eval_run_spec(eval_item, eval_set_run_id)
response = await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -138,19 +160,19 @@ async def update_eval_run(
evaluators: dict[str, LegacyBaseEvaluator[Any]],
):
"""Update an evaluation run with results."""
assertion_runs, evaluator_scores = self._collect_results(
evaluator_runs, evaluator_scores = self._collect_results(
sw_progress_item.eval_results, evaluators
)
spec = self._update_eval_run_spec(
assertion_runs=assertion_runs,
evaluator_runs=evaluator_runs,
evaluator_scores=evaluator_scores,
eval_run_id=sw_progress_item.eval_run_id,
execution_time=sw_progress_item.agent_execution_time,
actual_output=sw_progress_item.agent_output,
)
await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -166,7 +188,7 @@ async def update_eval_set_run(
spec = self._update_eval_set_run_spec(eval_set_run_id, evaluator_scores)
await self._client.request_async(
method=spec.method,
url=spec.endpoint,
url=self._build_eval_endpoint_url(spec.endpoint),
params=spec.params,
json=spec.json,
headers=spec.headers,
@@ -203,7 +225,7 @@ async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None:
self.eval_run_ids[payload.execution_id] = eval_run_id
logger.debug(f"Created eval run with ID: {eval_run_id}")
else:
logger.warning("Cannot create eval run: eval_set_run_id not available")
logger.debug("Cannot create eval run: eval_set_run_id not available")
Reviewer (Contributor): we should leave all the log levels as warning. There is no way to see the errors on serverless runs otherwise.


except Exception as e:
self._format_error_message(e, "StudioWeb create eval run error")
@@ -258,7 +280,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None:
)
logger.debug(f"Updated eval set run with ID: {eval_set_run_id}")
else:
logger.warning(
logger.debug(
"Cannot update eval set run: eval_set_run_id not available"
)

@@ -303,17 +325,19 @@ def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
input_schema=input_schema, output_schema=output_schema
)
except Exception as e:
logger.warning(f"Failed to extract agent snapshot: {e}")
logger.debug(f"Failed to extract agent snapshot: {e}")
return StudioWebAgentSnapshot(input_schema={}, output_schema={})

def _collect_results(
self,
eval_results: list[EvalItemResult],
evaluators: dict[str, LegacyBaseEvaluator[Any]],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
assertion_runs: list[dict[str, Any]] = []
evaluator_runs: list[dict[str, Any]] = []
evaluator_scores_list: list[dict[str, Any]] = []

for eval_result in eval_results:
# Build scores for the eval run result
evaluator_scores_list.append(
{
"type": eval_result.result.score_type.value,
@@ -322,10 +346,38 @@ def _collect_results(
"evaluatorId": eval_result.evaluator_id,
}
)
assertion_runs.append(

# Build evaluator runs for the new coded eval API
# Handle both legacy and coded evaluators
evaluator = evaluators[eval_result.evaluator_id]

# Get assertion type and output key based on evaluator type
if hasattr(evaluator, 'evaluator_type'):
# Legacy evaluator
assertion_type = evaluator.evaluator_type.name
output_key = evaluator.target_output_key
else:
# Coded evaluator - use name as type and default output key
assertion_type = evaluator.name if hasattr(evaluator, 'name') else "UnknownEvaluator"
output_key = "*" # Coded evaluators don't have target_output_key

evaluator_runs.append(
{
"status": EvaluationStatus.COMPLETED.value,
"evaluatorId": eval_result.evaluator_id,
"evaluatorSnapshot": {
"assertionType": assertion_type,
"outputKey": output_key,
},
"evaluationCriteria": None, # Optional field
"status": EvaluationStatus.COMPLETED.value,
"result": {
"output": {}, # Will be set from top-level result
"score": {
"type": eval_result.result.score_type.value,
"value": eval_result.result.score,
"justification": eval_result.result.details,
}
},
"completionMetrics": {
"duration": int(eval_result.result.evaluation_time)
if eval_result.result.evaluation_time
@@ -335,59 +387,64 @@ def _collect_results(
"completionTokens": 0,
"promptTokens": 0,
},
"assertionSnapshot": {
"assertionType": evaluators[
eval_result.evaluator_id
].evaluator_type.name,
"outputKey": evaluators[
eval_result.evaluator_id
].target_output_key,
},
}
)
return assertion_runs, evaluator_scores_list
return evaluator_runs, evaluator_scores_list

def _update_eval_run_spec(
self,
assertion_runs: list[dict[str, Any]],
evaluator_runs: list[dict[str, Any]],
evaluator_scores: list[dict[str, Any]],
eval_run_id: str,
actual_output: dict[str, Any],
execution_time: float,
) -> RequestSpec:
# Use new coded eval API endpoint
return RequestSpec(
method="PUT",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
f"api/execution/agents/{self._project_id}/coded/evalRun"
),
json={
"evalRunId": eval_run_id,
"status": EvaluationStatus.COMPLETED.value,
"result": {
"output": {"content": {**actual_output}},
"evaluatorScores": evaluator_scores,
"output": {**actual_output},
"scores": evaluator_scores,
},
"completionMetrics": {"duration": int(execution_time)},
"assertionRuns": assertion_runs,
"evaluatorRuns": evaluator_runs,
},
headers=self._tenant_header(),
)

def _create_eval_run_spec(
self, eval_item: LegacyEvaluationItem, eval_set_run_id: str
) -> RequestSpec:
# Use new coded eval API endpoint
# Handle both legacy and new evaluation item formats
evaluation_criterias = {}

# Check if it's a legacy item with expected_output or new item with evaluation_criterias
if hasattr(eval_item, 'expected_output'):
# Legacy format: expected_output is a dict at the item level
evaluation_criterias = eval_item.expected_output
elif hasattr(eval_item, 'evaluation_criterias'):
# New format: evaluation_criterias is already in the correct format
evaluation_criterias = eval_item.evaluation_criterias

return RequestSpec(
method="POST",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
f"api/execution/agents/{self._project_id}/coded/evalRun"
),
json={
"evalSetRunId": eval_set_run_id,
"evalSnapshot": {
"id": eval_item.id,
"name": eval_item.name,
"inputs": eval_item.inputs,
"expectedOutput": eval_item.expected_output,
"evaluationCriterias": evaluation_criterias,
},
"status": EvaluationStatus.IN_PROGRESS.value,
},
Expand All @@ -400,17 +457,19 @@ def _create_eval_set_run_spec(
agent_snapshot: StudioWebAgentSnapshot,
no_of_evals: int,
) -> RequestSpec:
# Use new coded eval API endpoint
return RequestSpec(
method="POST",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
f"api/execution/agents/{self._project_id}/coded/evalSetRun"
),
json={
"agentId": self._project_id,
"evalSetId": eval_set_id,
"agentId": self._project_id,
"agentSnapshot": agent_snapshot.model_dump(by_alias=True),
"status": EvaluationStatus.IN_PROGRESS.value,
"numberOfEvalsExecuted": no_of_evals,
"version": "1.0",
},
headers=self._tenant_header(),
)
Expand All @@ -420,15 +479,16 @@ def _update_eval_set_run_spec(
eval_set_run_id: str,
evaluator_scores: dict[str, float],
) -> RequestSpec:
# Use new coded eval API endpoint
evaluator_scores_list = [
{"value": avg_score, "evaluatorId": evaluator_id}
{"evaluatorId": evaluator_id, "value": avg_score}
for evaluator_id, avg_score in evaluator_scores.items()
]

return RequestSpec(
method="PUT",
endpoint=Endpoint(
f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
f"api/execution/agents/{self._project_id}/coded/evalSetRun"
),
json={
"evalSetRunId": eval_set_run_id,