Changes from all commits (24 commits)
- 5e7cabc  add initial version of revamped coded evaluators (andrei-rusu, Sep 25, 2025)
- 1a3efa4  address pr comments and add tool call args evaluator (andrei-rusu, Sep 26, 2025)
- 4aa71ef  make schemas more apparent, make llm judge mixin to avoid fixing the … (andrei-rusu, Sep 26, 2025)
- 6667172  add tests for evaluating using raw criteria, putting the evaluator in… (andrei-rusu, Sep 26, 2025)
- c24a957  equalize the type extraction method for both config and eval criteria… (andrei-rusu, Sep 26, 2025)
- 82a8b9a  fix llm judge class structure to allow for easier subclassing to chan… (andrei-rusu, Sep 26, 2025)
- 42b4832  add double equal for count eval (andrei-rusu, Sep 26, 2025)
- fec883e  decouple evaluators from sdk as much as possible to prepare for separ… (andrei-rusu, Sep 26, 2025)
- 40e1837  add justification type to all evaluators and tests for that (andrei-rusu, Sep 26, 2025)
- d4b53e5  change label in prompt to AgentInput (andrei-rusu, Sep 29, 2025)
- 0312217  add tool call output evaluator, fix several issues and inconsistencie… (andrei-rusu, Sep 29, 2025)
- 9803899  add evaluator_name and datapoint_id identifiers to EvaluationResult +… (andrei-rusu, Sep 29, 2025)
- 97c6373  fix datapoint id generation and add tests (andrei-rusu, Sep 29, 2025)
- 0e56225  fix type inconsistency for evaluatorconfig-evaluationcriteria, genera… (andrei-rusu, Oct 1, 2025)
- e8ed10a  move final score calculation to UiPathEvalOutput and rm extra identif… (andrei-rusu, Oct 1, 2025)
- a4779dd  move types jsons up the hierarchy (andrei-rusu, Oct 2, 2025)
- 1230bd4  fix imports (andrei-rusu, Oct 2, 2025)
- b02e5b8  fix name inconsistency (andrei-rusu, Oct 3, 2025)
- c59541e  fix tool output evaluator and tests (andrei-rusu, Oct 3, 2025)
- 91fa6fd  fix judge names (andrei-rusu, Oct 3, 2025)
- 6d11d7c  fix bugs (andrei-rusu, Oct 3, 2025)
- 65959ec  update types (andrei-rusu, Oct 3, 2025)
- c5c541b  fix contains to have targetOutputKey (andrei-rusu, Oct 3, 2025)
- 9a89c2e  feat: wire new evaluators (radu-mocanu, Oct 6, 2025)
74 changes: 70 additions & 4 deletions src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -1,13 +1,29 @@
from enum import IntEnum
from typing import Any, Dict, List
from typing import Annotated, Any, Dict, List, Literal, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
from pydantic.alias_generators import to_camel

from uipath.eval.coded_evaluators import BaseEvaluator
from uipath.eval.evaluators import LegacyBaseEvaluator


class EvaluationItem(BaseModel):
"""Individual evaluation item within an evaluation set."""

model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
id: str
name: str
inputs: Dict[str, Any]
evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
..., alias="evaluationCriterias"
)
expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")


class LegacyEvaluationItem(BaseModel):
"""Individual evaluation item within an evaluation set."""

model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

id: str
@@ -28,12 +44,36 @@ class EvaluationItem(BaseModel):
class EvaluationSet(BaseModel):
"""Complete evaluation set model."""

model_config = ConfigDict(
alias_generator=to_camel, populate_by_name=True, extra="allow"
)

id: str
name: str
version: Literal["1.0"] = "1.0"
evaluator_refs: List[str] = Field(default_factory=list)
evaluations: List[EvaluationItem] = Field(default_factory=list)

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
for evaluation in self.evaluations:
if evaluation.id in eval_ids:
selected_evals.append(evaluation)
eval_ids.remove(evaluation.id)
if len(eval_ids) > 0:
raise ValueError("Unknown evaluation ids: {}".format(eval_ids))
self.evaluations = selected_evals


class LegacyEvaluationSet(BaseModel):
"""Complete evaluation set model."""

model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

id: str
file_name: str
evaluator_refs: List[str] = Field(default_factory=list)
evaluations: List[EvaluationItem] = Field(default_factory=list)
evaluations: List[LegacyEvaluationItem] = Field(default_factory=list)
name: str
batch_size: int = 10
timeout_minutes: int = 20
@@ -42,7 +82,7 @@ class EvaluationSet(BaseModel):
updated_at: str

def extract_selected_evals(self, eval_ids) -> None:
selected_evals: list[EvaluationItem] = []
selected_evals: list[LegacyEvaluationItem] = []
for evaluation in self.evaluations:
if evaluation.id in eval_ids:
selected_evals.append(evaluation)
@@ -56,3 +96,29 @@ class EvaluationStatus(IntEnum):
PENDING = 0
IN_PROGRESS = 1
COMPLETED = 2


def _discriminate_eval_set(
v: Any,
) -> Literal["evaluation_set", "legacy_evaluation_set"]:
"""Discriminator function that returns a tag based on version field."""
if isinstance(v, dict):
version = v.get("version")
if version == "1.0":
return "evaluation_set"
return "legacy_evaluation_set"


AnyEvaluationSet = Annotated[
Union[
Annotated[EvaluationSet, Tag("evaluation_set")],
Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
],
Discriminator(_discriminate_eval_set),
]

AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]

AnyEvaluator = Annotated[
Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]], "List of evaluators"
]
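
The version-tagged union above lets a single parse call route raw JSON to either the new or the legacy evaluation-set schema. Below is a minimal standalone sketch of that pattern, assuming pydantic v2's callable `Discriminator`/`Tag` API; the model names (`NewSet`, `LegacySet`) and fields are illustrative stand-ins, not the PR's classes.

```python
from typing import Annotated, Any, Literal, Union

from pydantic import BaseModel, Discriminator, Tag, TypeAdapter


class NewSet(BaseModel):
    id: str
    version: Literal["1.0"] = "1.0"


class LegacySet(BaseModel):
    id: str
    file_name: str


def _pick(v: Any) -> str:
    # Mirrors _discriminate_eval_set: only version == "1.0" selects the new schema.
    if isinstance(v, dict) and v.get("version") == "1.0":
        return "new"
    return "legacy"


AnySet = Annotated[
    Union[Annotated[NewSet, Tag("new")], Annotated[LegacySet, Tag("legacy")]],
    Discriminator(_pick),
]

adapter = TypeAdapter(AnySet)
print(type(adapter.validate_python({"id": "a", "version": "1.0"})).__name__)         # NewSet
print(type(adapter.validate_python({"id": "b", "file_name": "old.json"})).__name__)  # LegacySet
```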
146 changes: 146 additions & 0 deletions src/uipath/_cli/_evals/_models/_evaluator.py
@@ -0,0 +1,146 @@
from typing import Annotated, Any, Literal, Union

from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag

from uipath.eval.coded_evaluators.base_evaluator import BaseEvaluatorConfig
from uipath.eval.coded_evaluators.contains_evaluator import ContainsEvaluatorConfig
from uipath.eval.models.models import (
EvaluatorType,
LegacyEvaluatorCategory,
LegacyEvaluatorType,
)


class EvaluatorBaseParams(BaseModel):
"""Parameters for initializing the base evaluator."""

id: str
name: str
description: str
evaluator_type: LegacyEvaluatorType = Field(..., alias="type")
created_at: str = Field(..., alias="createdAt")
updated_at: str = Field(..., alias="updatedAt")
target_output_key: str = Field(..., alias="targetOutputKey")
file_name: str = Field(..., alias="fileName")


class LLMEvaluatorParams(EvaluatorBaseParams):
category: Literal[LegacyEvaluatorCategory.LlmAsAJudge] = Field(
..., alias="category"
)
prompt: str = Field(..., alias="prompt")
model: str = Field(..., alias="model")

model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class TrajectoryEvaluatorParams(EvaluatorBaseParams):
category: Literal[LegacyEvaluatorCategory.Trajectory] = Field(..., alias="category")
prompt: str = Field(..., alias="prompt")
model: str = Field(..., alias="model")

model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class EqualsEvaluatorParams(EvaluatorBaseParams):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class JsonSimilarityEvaluatorParams(EvaluatorBaseParams):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class UnknownEvaluatorParams(EvaluatorBaseParams):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


class UnknownEvaluatorConfig(BaseEvaluatorConfig):
model_config = ConfigDict(
validate_by_name=True, validate_by_alias=True, extra="allow"
)


def legacy_evaluator_discriminator(data: Any) -> str:
if isinstance(data, dict):
category = data.get("category")
evaluator_type = data.get("type")
match category:
case LegacyEvaluatorCategory.LlmAsAJudge:
return "LLMEvaluatorParams"
case LegacyEvaluatorCategory.Trajectory:
return "TrajectoryEvaluatorParams"
case LegacyEvaluatorCategory.Deterministic:
match evaluator_type:
case LegacyEvaluatorType.Equals:
return "EqualsEvaluatorParams"
case LegacyEvaluatorType.JsonSimilarity:
return "JsonSimilarityEvaluatorParams"
case _:
return "UnknownEvaluatorParams"
case _:
return "UnknownEvaluatorParams"
else:
return "UnknownEvaluatorParams"


def evaluator_config_discriminator(data: Any) -> str:
if isinstance(data, dict):
evaluator_type_id = data.get("evaluatorTypeId")
match evaluator_type_id:
case EvaluatorType.CONTAINS:
return "ContainsEvaluatorConfig"
case _:
return "UnknownEvaluatorConfig"
else:
return "UnknownEvaluatorConfig"


EvaluatorLegacy = Annotated[
Union[
Annotated[
LLMEvaluatorParams,
Tag("LLMEvaluatorParams"),
],
Annotated[
TrajectoryEvaluatorParams,
Tag("TrajectoryEvaluatorParams"),
],
Annotated[
EqualsEvaluatorParams,
Tag("EqualsEvaluatorParams"),
],
Annotated[
JsonSimilarityEvaluatorParams,
Tag("JsonSimilarityEvaluatorParams"),
],
Annotated[
UnknownEvaluatorParams,
Tag("UnknownEvaluatorParams"),
],
],
Field(discriminator=Discriminator(legacy_evaluator_discriminator)),
]

EvaluatorConfig = Annotated[
Union[
Annotated[
ContainsEvaluatorConfig,
Tag("ContainsEvaluatorConfig"),
],
Annotated[
UnknownEvaluatorConfig,
Tag("UnknownEvaluatorConfig"),
],
],
Field(discriminator=Discriminator(evaluator_config_discriminator)),
]
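
The two discriminator functions in this file fall back to permissive `Unknown*` models whenever a category or `evaluatorTypeId` is not recognized, so unfamiliar evaluator payloads still parse instead of failing validation. A hedged standalone sketch of that fallback behaviour, again assuming pydantic v2; `ContainsConfig`, `UnknownConfig`, and the `"contains"` id are illustrative placeholders rather than the PR's actual definitions.

```python
from typing import Annotated, Any, Literal, Union

from pydantic import BaseModel, ConfigDict, Discriminator, Tag, TypeAdapter


class ContainsConfig(BaseModel):
    evaluatorTypeId: Literal["contains"]
    targetOutputKey: str = "*"


class UnknownConfig(BaseModel):
    # extra="allow" keeps unrecognized fields instead of rejecting the payload.
    model_config = ConfigDict(extra="allow")
    evaluatorTypeId: str


def _route(data: Any) -> str:
    # Mirrors evaluator_config_discriminator: known ids get a concrete config,
    # everything else degrades to the permissive UnknownConfig.
    if isinstance(data, dict) and data.get("evaluatorTypeId") == "contains":
        return "contains"
    return "unknown"


AnyConfig = Annotated[
    Union[
        Annotated[ContainsConfig, Tag("contains")],
        Annotated[UnknownConfig, Tag("unknown")],
    ],
    Discriminator(_route),
]

adapter = TypeAdapter(AnyConfig)
print(type(adapter.validate_python({"evaluatorTypeId": "contains"})).__name__)     # ContainsConfig
print(type(adapter.validate_python({"evaluatorTypeId": "exact-match"})).__name__)  # UnknownConfig
```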
6 changes: 3 additions & 3 deletions src/uipath/_cli/_evals/_models/_evaluator_base_params.py
@@ -1,14 +1,14 @@
from pydantic import BaseModel

from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
from uipath.eval.models.models import LegacyEvaluatorCategory, LegacyEvaluatorType


class EvaluatorBaseParams(BaseModel):
"""Parameters for initializing the base evaluator."""

id: str
category: EvaluatorCategory
evaluator_type: EvaluatorType
category: LegacyEvaluatorCategory
evaluator_type: LegacyEvaluatorType
name: str
description: str
created_at: str
91 changes: 88 additions & 3 deletions src/uipath/_cli/_evals/_models/_output.py
@@ -1,8 +1,10 @@
from typing import List, Optional
from collections import defaultdict
from typing import Any, Dict, List, Optional

from opentelemetry.sdk.trace import ReadableSpan
from pydantic import BaseModel, ConfigDict, model_serializer
from pydantic.alias_generators import to_camel
from pydantic_core import core_schema

from uipath._cli._runtime._contracts import UiPathRuntimeResult
from uipath.eval.models.models import EvaluationResult, ScoreType
@@ -22,11 +24,15 @@ class EvaluationResultDto(BaseModel):
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

score: float
details: Optional[str] = None
details: Optional[str | BaseModel] = None
evaluation_time: Optional[float] = None

@model_serializer(mode="wrap")
def serialize_model(self, serializer, info):
def serialize_model(
self,
serializer: core_schema.SerializerFunctionWrapHandler,
info: core_schema.SerializationInfo,
) -> Any:
data = serializer(self)
if self.details is None and isinstance(data, dict):
data.pop("details", None)
@@ -96,3 +102,82 @@ def compute_average_score(self) -> None:
eval_result.score for eval_result in self.evaluation_set_results
]
self.score = sum(eval_item_scores) / len(eval_item_scores)

def calculate_final_score(
self,
evaluator_weights: Dict[str, float] | None = None,
default_weight: float = 1.0,
) -> tuple[float, Dict[str, float]]:
"""Aggregate evaluation results with deduplication and weighted scoring.

This function performs the following steps:
1. Flattens the nested evaluation_set_results structure
2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
3. Calculates average score per evaluator across all datapoints
4. Computes final weighted score across evaluators

Args:
evaluator_weights: Optional dict mapping evaluator names to weights
default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)

Returns:
Tuple of (final_score, agg_metrics_per_evaluator)
- final_score: Weighted average across evaluators
- agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
"""
if not self.evaluation_set_results:
return 0.0, {}

if evaluator_weights is None:
evaluator_weights = {}

# Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
# datapoint_id = evaluation_name, evaluator_name from EvaluationRunResultDto
grouped_by_datapoint_evaluator: defaultdict[
str, defaultdict[str, list[float]]
] = defaultdict(lambda: defaultdict(list))

for eval_run_result in self.evaluation_set_results:
datapoint_id = eval_run_result.evaluation_name
for eval_run_result_dto in eval_run_result.evaluation_run_results:
evaluator_name = eval_run_result_dto.evaluator_name
score = eval_run_result_dto.result.score
grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
score
)

# Step 2: Deduplicate by averaging same evaluator results for same datapoint
dedup_scores: list[tuple[str, str, float]] = []
for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
for evaluator_name, scores_list in evaluators_dict.items():
if scores_list:
# Average the scores for this evaluator on this datapoint
avg_score = sum(scores_list) / len(scores_list)
dedup_scores.append((datapoint_id, evaluator_name, avg_score))

# Step 3: Group by evaluator and calculate average score per evaluator
grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
for _datapoint_id, evaluator_name, score in dedup_scores:
grouped_by_evaluator[evaluator_name].append(score)

agg_metrics_per_evaluator = {}
for evaluator_name, scores_list in grouped_by_evaluator.items():
avg_score = sum(scores_list) / len(scores_list)
agg_metrics_per_evaluator[evaluator_name] = avg_score

# Step 4: Calculate final weighted score
if not agg_metrics_per_evaluator:
return 0.0, {}

total_weighted_score = 0.0
total_weight = 0.0

for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
weight = evaluator_weights.get(evaluator_name, default_weight)
total_weighted_score += avg_score * weight
total_weight += weight

final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0

self.score = final_score
return final_score, agg_metrics_per_evaluator
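
A small worked example of the aggregation that `calculate_final_score` describes, on toy data: duplicate (datapoint, evaluator) scores are averaged first, then scores are averaged per evaluator, then combined with per-evaluator weights. The evaluator names and the weight of 2.0 for the judge are hypothetical.

```python
from collections import defaultdict

# Toy (datapoint_id, evaluator_name, score) triples; dp1/contains has a duplicate run.
results = [
    ("dp1", "contains", 1.0),
    ("dp1", "contains", 0.0),   # deduped by averaging -> 0.5
    ("dp1", "llm-judge", 0.8),
    ("dp2", "contains", 1.0),
    ("dp2", "llm-judge", 0.6),
]
weights = {"llm-judge": 2.0}    # hypothetical weighting; unlisted evaluators default to 1.0

# Steps 1-2: group per (datapoint, evaluator) and average duplicate runs.
per_datapoint: defaultdict[tuple[str, str], list[float]] = defaultdict(list)
for datapoint, evaluator, score in results:
    per_datapoint[(datapoint, evaluator)].append(score)

# Step 3: average per evaluator across datapoints.
per_evaluator: defaultdict[str, list[float]] = defaultdict(list)
for (_, evaluator), scores in per_datapoint.items():
    per_evaluator[evaluator].append(sum(scores) / len(scores))
avg = {evaluator: sum(s) / len(s) for evaluator, s in per_evaluator.items()}
# contains: (0.5 + 1.0) / 2 = 0.75, llm-judge: (0.8 + 0.6) / 2 = 0.7

# Step 4: weighted average across evaluators.
total_weight = sum(weights.get(evaluator, 1.0) for evaluator in avg)
final = sum(s * weights.get(evaluator, 1.0) for evaluator, s in avg.items()) / total_weight
print(avg, round(final, 4))     # final = (0.75 * 1 + 0.7 * 2) / 3 ≈ 0.7167
```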