Changes from all commits (24 commits)
- 5e7cabc  add initial version of revamped coded evaluators (andrei-rusu, Sep 25, 2025)
- 1a3efa4  address pr comments and add tool call args evaluator (andrei-rusu, Sep 26, 2025)
- 4aa71ef  make schemas more apparent, make llm judge mixin to avoid fixing the … (andrei-rusu, Sep 26, 2025)
- 6667172  add tests for evaluating using raw criteria, putting the evaluator in… (andrei-rusu, Sep 26, 2025)
- c24a957  equalize the type extraction method for both config and eval criteria… (andrei-rusu, Sep 26, 2025)
- 82a8b9a  fix llm judge class structure to allow for easier subclassing to chan… (andrei-rusu, Sep 26, 2025)
- 42b4832  add double equal for count eval (andrei-rusu, Sep 26, 2025)
- fec883e  decouple evaluators from sdk as much as possible to prepare for separ… (andrei-rusu, Sep 26, 2025)
- 40e1837  add justification type to all evaluators and tests for that (andrei-rusu, Sep 26, 2025)
- d4b53e5  change label in prompt to AgentInput (andrei-rusu, Sep 29, 2025)
- 0312217  add tool call output evaluator, fix several issues and inconsistencie… (andrei-rusu, Sep 29, 2025)
- 9803899  add evaluator_name and datapoint_id identifiers to EvaluationResult +… (andrei-rusu, Sep 29, 2025)
- 97c6373  fix datapoint id generation and add tests (andrei-rusu, Sep 29, 2025)
- 0e56225  fix type inconsistency for evaluatorconfig-evaluationcriteria, genera… (andrei-rusu, Oct 1, 2025)
- e8ed10a  move final score calculation to UiPathEvalOutput and rm extra identif… (andrei-rusu, Oct 1, 2025)
- a4779dd  move types jsons up the hierarchy (andrei-rusu, Oct 2, 2025)
- 1230bd4  fix imports (andrei-rusu, Oct 2, 2025)
- b02e5b8  fix name inconsistency (andrei-rusu, Oct 3, 2025)
- c59541e  fix tool output evaluator and tests (andrei-rusu, Oct 3, 2025)
- 91fa6fd  fix judge names (andrei-rusu, Oct 3, 2025)
- 6d11d7c  fix bugs (andrei-rusu, Oct 3, 2025)
- 65959ec  update types (andrei-rusu, Oct 3, 2025)
- c5c541b  fix contains to have targetOutputKey (andrei-rusu, Oct 3, 2025)
- 9a89c2e  feat: wire new evaluators (radu-mocanu, Oct 6, 2025)
74 changes: 70 additions & 4 deletions src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -1,13 +1,29 @@
from enum import IntEnum
from typing import Any, Dict, List
from typing import Annotated, Any, Dict, List, Literal, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
from pydantic.alias_generators import to_camel

from uipath.eval.coded_evaluators import BaseEvaluator
from uipath.eval.evaluators import LegacyBaseEvaluator


class EvaluationItem(BaseModel):
"""Individual evaluation item within an evaluation set."""

model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
id: str
name: str
inputs: Dict[str, Any]
evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
..., alias="evaluationCriterias"
)
expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")


class LegacyEvaluationItem(BaseModel):
"""Individual evaluation item within an evaluation set."""

model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

id: str
@@ -28,12 +44,36 @@ class EvaluationItem(BaseModel):
class EvaluationSet(BaseModel):
"""Complete evaluation set model."""

model_config = ConfigDict(
alias_generator=to_camel, populate_by_name=True, extra="allow"
)

id: str
name: str
version: Literal["1.0"] = "1.0"
evaluator_refs: List[str] = Field(default_factory=list)
evaluations: List[EvaluationItem] = Field(default_factory=list)

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
for evaluation in self.evaluations:
if evaluation.id in eval_ids:
selected_evals.append(evaluation)
eval_ids.remove(evaluation.id)
if len(eval_ids) > 0:
raise ValueError("Unknown evaluation ids: {}".format(eval_ids))
self.evaluations = selected_evals


class LegacyEvaluationSet(BaseModel):
"""Complete evaluation set model."""

model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

id: str
file_name: str
evaluator_refs: List[str] = Field(default_factory=list)
evaluations: List[EvaluationItem] = Field(default_factory=list)
evaluations: List[LegacyEvaluationItem] = Field(default_factory=list)
name: str
batch_size: int = 10
timeout_minutes: int = 20
@@ -42,7 +82,7 @@ class EvaluationSet(BaseModel):
updated_at: str

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
selected_evals: list[LegacyEvaluationItem] = []
for evaluation in self.evaluations:
if evaluation.id in eval_ids:
selected_evals.append(evaluation)
@@ -56,3 +96,29 @@ class EvaluationStatus(IntEnum):
PENDING = 0
IN_PROGRESS = 1
COMPLETED = 2


def _discriminate_eval_set(
v: Any,
) -> Literal["evaluation_set", "legacy_evaluation_set"]:
"""Discriminator function that returns a tag based on version field."""
if isinstance(v, dict):
version = v.get("version")
if version == "1.0":
return "evaluation_set"
return "legacy_evaluation_set"


AnyEvaluationSet = Annotated[
Union[
Annotated[EvaluationSet, Tag("evaluation_set")],
Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
],
Discriminator(_discriminate_eval_set),
]

AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]

AnyEvaluator = Annotated[
Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]], "List of evaluators"
]
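
The version-tagged union above lets a single parse call route raw JSON to either the new or the legacy evaluation-set schema. Below is a minimal standalone sketch of that pattern, assuming pydantic v2's callable `Discriminator`/`Tag` API; the model names (`NewSet`, `LegacySet`) and fields are illustrative stand-ins, not the PR's classes.

```python
from typing import Annotated, Any, Literal, Union

from pydantic import BaseModel, Discriminator, Tag, TypeAdapter


class NewSet(BaseModel):
    id: str
    version: Literal["1.0"] = "1.0"


class LegacySet(BaseModel):
    id: str
    file_name: str


def _pick(v: Any) -> str:
    # Mirrors _discriminate_eval_set: only version == "1.0" selects the new schema.
    if isinstance(v, dict) and v.get("version") == "1.0":
        return "new"
    return "legacy"


AnySet = Annotated[
    Union[Annotated[NewSet, Tag("new")], Annotated[LegacySet, Tag("legacy")]],
    Discriminator(_pick),
]

adapter = TypeAdapter(AnySet)
print(type(adapter.validate_python({"id": "a", "version": "1.0"})).__name__)         # NewSet
print(type(adapter.validate_python({"id": "b", "file_name": "old.json"})).__name__)  # LegacySet
```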
146 changes: 146 additions & 0 deletions src/uipath/_cli/_evals/_models/_evaluator.py
@@ -0,0 +1,146 @@
from typing import Annotated, Any, Literal, Union

from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag

from uipath.eval.coded_evaluators.base_evaluator import BaseEvaluatorConfig
from uipath.eval.coded_evaluators.contains_evaluator import ContainsEvaluatorConfig
from uipath.eval.models.models import (
EvaluatorType,
LegacyEvaluatorCategory,
LegacyEvaluatorType,
)


class EvaluatorBaseParams(BaseModel):
"""Parameters for initializing the base evaluator."""

id: str
name: str
description: str
evaluator_type: LegacyEvaluatorType = Field(..., alias="type")
created_at: str = Field(..., alias="createdAt")
updated_at: str = Field(..., alias="updatedAt")
target_output_key: str = Field(..., alias="targetOutputKey")
file_name: str = Field(..., alias="fileName")


class LLMEvaluatorParams(EvaluatorBaseParams):
category: Literal[LegacyEvaluatorCategory.LlmAsAJudge] = Field(
..., alias="category"
)
prompt: str = Field(..., alias="prompt")
model: str = Field(..., alias="model")

model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class TrajectoryEvaluatorParams(EvaluatorBaseParams):
category: Literal[LegacyEvaluatorCategory.Trajectory] = Field(..., alias="category")
prompt: str = Field(..., alias="prompt")
model: str = Field(..., alias="model")

model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class EqualsEvaluatorParams(EvaluatorBaseParams):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class JsonSimilarityEvaluatorParams(EvaluatorBaseParams):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class UnknownEvaluatorParams(EvaluatorBaseParams):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class UnknownEvaluatorConfig(BaseEvaluatorConfig):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


def legacy_evaluator_discriminator(data: Any) -> str:
if isinstance(data, dict):
category = data.get("category")
evaluator_type = data.get("type")
match category:
case LegacyEvaluatorCategory.LlmAsAJudge:
return "LLMEvaluatorParams"
case LegacyEvaluatorCategory.Trajectory:
return "TrajectoryEvaluatorParams"
case LegacyEvaluatorCategory.Deterministic:
match evaluator_type:
case LegacyEvaluatorType.Equals:
return "EqualsEvaluatorParams"
case LegacyEvaluatorType.JsonSimilarity:
return "JsonSimilarityEvaluatorParams"
case _:
return "UnknownEvaluatorParams"
case _:
return "UnknownEvaluatorParams"
else:
return "UnknownEvaluatorParams"


def evaluator_config_discriminator(data: Any) -> str:
if isinstance(data, dict):
evaluator_type_id = data.get("evaluatorTypeId")
match evaluator_type_id:
case EvaluatorType.CONTAINS:
return "ContainsEvaluatorConfig"
case _:
return "UnknownEvaluatorConfig"
else:
return "UnknownEvaluatorConfig"


EvaluatorLegacy = Annotated[
Union[
Annotated[
LLMEvaluatorParams,
Tag("LLMEvaluatorParams"),
],
Annotated[
TrajectoryEvaluatorParams,
Tag("TrajectoryEvaluatorParams"),
],
Annotated[
EqualsEvaluatorParams,
Tag("EqualsEvaluatorParams"),
],
Annotated[
JsonSimilarityEvaluatorParams,
Tag("JsonSimilarityEvaluatorParams"),
],
Annotated[
UnknownEvaluatorParams,
Tag("UnknownEvaluatorParams"),
],
],
Field(discriminator=Discriminator(legacy_evaluator_discriminator)),
]

EvaluatorConfig = Annotated[
Union[
Annotated[
ContainsEvaluatorConfig,
Tag("ContainsEvaluatorConfig"),
],
Annotated[
UnknownEvaluatorConfig,
Tag("UnknownEvaluatorConfig"),
],
],
Field(discriminator=Discriminator(evaluator_config_discriminator)),
]
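
The two discriminator functions in this file fall back to permissive `Unknown*` models whenever a category or `evaluatorTypeId` is not recognized, so unfamiliar evaluator payloads still parse instead of failing validation. A hedged standalone sketch of that fallback behaviour, again assuming pydantic v2; `ContainsConfig`, `UnknownConfig`, and the `"contains"` id are illustrative placeholders rather than the PR's actual definitions.

```python
from typing import Annotated, Any, Literal, Union

from pydantic import BaseModel, ConfigDict, Discriminator, Tag, TypeAdapter


class ContainsConfig(BaseModel):
    evaluatorTypeId: Literal["contains"]
    targetOutputKey: str = "*"


class UnknownConfig(BaseModel):
    # extra="allow" keeps unrecognized fields instead of rejecting the payload.
    model_config = ConfigDict(extra="allow")
    evaluatorTypeId: str


def _route(data: Any) -> str:
    # Mirrors evaluator_config_discriminator: known ids get a concrete config,
    # everything else degrades to the permissive UnknownConfig.
    if isinstance(data, dict) and data.get("evaluatorTypeId") == "contains":
        return "contains"
    return "unknown"


AnyConfig = Annotated[
    Union[
        Annotated[ContainsConfig, Tag("contains")],
        Annotated[UnknownConfig, Tag("unknown")],
    ],
    Discriminator(_route),
]

adapter = TypeAdapter(AnyConfig)
print(type(adapter.validate_python({"evaluatorTypeId": "contains"})).__name__)     # ContainsConfig
print(type(adapter.validate_python({"evaluatorTypeId": "exact-match"})).__name__)  # UnknownConfig
```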
6 changes: 3 additions & 3 deletions src/uipath/_cli/_evals/_models/_evaluator_base_params.py
@@ -1,14 +1,14 @@
from pydantic import BaseModel

from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
from uipath.eval.models.models import LegacyEvaluatorCategory, LegacyEvaluatorType


class EvaluatorBaseParams(BaseModel):
"""Parameters for initializing the base evaluator."""

id: str
category: EvaluatorCategory
evaluator_type: EvaluatorType
category: LegacyEvaluatorCategory
evaluator_type: LegacyEvaluatorType
name: str
description: str
created_at: str
91 changes: 88 additions & 3 deletions src/uipath/_cli/_evals/_models/_output.py
@@ -1,8 +1,10 @@
from typing import List, Optional
from collections import defaultdict
from typing import Any, Dict, List, Optional

from opentelemetry.sdk.trace import ReadableSpan
from pydantic import BaseModel, ConfigDict, model_serializer
from pydantic.alias_generators import to_camel
from pydantic_core import core_schema

from uipath._cli._runtime._contracts import UiPathRuntimeResult
from uipath.eval.models.models import EvaluationResult, ScoreType
@@ -22,11 +24,15 @@ class EvaluationResultDto(BaseModel):
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

score: float
details: Optional[str] = None
details: Optional[str | BaseModel] = None
evaluation_time: Optional[float] = None

@model_serializer(mode="wrap")
def serialize_model(self, serializer, info):
def serialize_model(
self,
serializer: core_schema.SerializerFunctionWrapHandler,
info: core_schema.SerializationInfo,
) -> Any:
data = serializer(self)
if self.details is None and isinstance(data, dict):
data.pop("details", None)
@@ -96,3 +102,82 @@ def compute_average_score(self) -> None:
eval_result.score for eval_result in self.evaluation_set_results
]
self.score = sum(eval_item_scores) / len(eval_item_scores)

def calculate_final_score(
self,
evaluator_weights: Dict[str, float] | None = None,
default_weight: float = 1.0,
) -> tuple[float, Dict[str, float]]:
"""Aggregate evaluation results with deduplication and weighted scoring.

This function performs the following steps:
1. Flattens the nested evaluation_set_results structure
2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
3. Calculates average score per evaluator across all datapoints
4. Computes final weighted score across evaluators

Args:
evaluator_weights: Optional dict mapping evaluator names to weights
default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)

Returns:
Tuple of (final_score, agg_metrics_per_evaluator)
- final_score: Weighted average across evaluators
- agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
"""
if not self.evaluation_set_results:
return 0.0, {}

if evaluator_weights is None:
evaluator_weights = {}

# Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
# datapoint_id = evaluation_name, evaluator_name from EvaluationRunResultDto
grouped_by_datapoint_evaluator: defaultdict[
str, defaultdict[str, list[float]]
] = defaultdict(lambda: defaultdict(list))

for eval_run_result in self.evaluation_set_results:
datapoint_id = eval_run_result.evaluation_name
for eval_run_result_dto in eval_run_result.evaluation_run_results:
evaluator_name = eval_run_result_dto.evaluator_name
score = eval_run_result_dto.result.score
grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
score
)

# Step 2: Deduplicate by averaging same evaluator results for same datapoint
dedup_scores: list[tuple[str, str, float]] = []
for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
for evaluator_name, scores_list in evaluators_dict.items():
if scores_list:
# Average the scores for this evaluator on this datapoint
avg_score = sum(scores_list) / len(scores_list)
dedup_scores.append((datapoint_id, evaluator_name, avg_score))

# Step 3: Group by evaluator and calculate average score per evaluator
grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
for _datapoint_id, evaluator_name, score in dedup_scores:
grouped_by_evaluator[evaluator_name].append(score)

agg_metrics_per_evaluator = {}
for evaluator_name, scores_list in grouped_by_evaluator.items():
avg_score = sum(scores_list) / len(scores_list)
agg_metrics_per_evaluator[evaluator_name] = avg_score

# Step 4: Calculate final weighted score
if not agg_metrics_per_evaluator:
return 0.0, {}

total_weighted_score = 0.0
total_weight = 0.0

for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
weight = evaluator_weights.get(evaluator_name, default_weight)
total_weighted_score += avg_score * weight
total_weight += weight

final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0

self.score = final_score
return final_score, agg_metrics_per_evaluator
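
A small worked example of the aggregation that `calculate_final_score` describes, on toy data: duplicate (datapoint, evaluator) scores are averaged first, then scores are averaged per evaluator, then combined with per-evaluator weights. The evaluator names and the weight of 2.0 for the judge are hypothetical.

```python
from collections import defaultdict

# Toy (datapoint_id, evaluator_name, score) triples; dp1/contains has a duplicate run.
results = [
    ("dp1", "contains", 1.0),
    ("dp1", "contains", 0.0),   # deduped by averaging -> 0.5
    ("dp1", "llm-judge", 0.8),
    ("dp2", "contains", 1.0),
    ("dp2", "llm-judge", 0.6),
]
weights = {"llm-judge": 2.0}    # hypothetical weighting; unlisted evaluators default to 1.0

# Steps 1-2: group per (datapoint, evaluator) and average duplicate runs.
per_datapoint: defaultdict[tuple[str, str], list[float]] = defaultdict(list)
for datapoint, evaluator, score in results:
    per_datapoint[(datapoint, evaluator)].append(score)

# Step 3: average per evaluator across datapoints.
per_evaluator: defaultdict[str, list[float]] = defaultdict(list)
for (_, evaluator), scores in per_datapoint.items():
    per_evaluator[evaluator].append(sum(scores) / len(scores))
avg = {evaluator: sum(s) / len(s) for evaluator, s in per_evaluator.items()}
# contains: (0.5 + 1.0) / 2 = 0.75, llm-judge: (0.8 + 0.6) / 2 = 0.7

# Step 4: weighted average across evaluators.
total_weight = sum(weights.get(evaluator, 1.0) for evaluator in avg)
final = sum(s * weights.get(evaluator, 1.0) for evaluator, s in avg.items()) / total_weight
print(avg, round(final, 4))     # final = (0.75 * 1 + 0.7 * 2) / 3 ≈ 0.7167
```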