|
15 | 15 | from __future__ import annotations |
16 | 16 |
|
17 | 17 | import asyncio |
| 18 | +import inspect |
18 | 19 | import logging |
19 | 20 | from typing import AsyncGenerator |
20 | 21 | from typing import Callable |
|
31 | 32 | from ..sessions.in_memory_session_service import InMemorySessionService |
32 | 33 | from ..utils.feature_decorator import working_in_progress |
33 | 34 | from .base_eval_service import BaseEvalService |
| 35 | +from .base_eval_service import EvaluateConfig |
34 | 36 | from .base_eval_service import EvaluateRequest |
35 | 37 | from .base_eval_service import InferenceRequest |
36 | 38 | from .base_eval_service import InferenceResult |
37 | 39 | from .base_eval_service import InferenceStatus |
| 40 | +from .eval_case import Invocation |
| 41 | +from .eval_metrics import EvalMetric |
| 42 | +from .eval_metrics import EvalMetricResult |
| 43 | +from .eval_metrics import EvalMetricResultPerInvocation |
38 | 44 | from .eval_result import EvalCaseResult |
39 | 45 | from .eval_set import EvalCase |
40 | 46 | from .eval_set_results_manager import EvalSetResultsManager |
41 | 47 | from .eval_sets_manager import EvalSetsManager |
42 | 48 | from .evaluation_generator import EvaluationGenerator |
| 49 | +from .evaluator import EvalStatus |
| 50 | +from .evaluator import EvaluationResult |
43 | 51 | from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY |
44 | 52 | from .metric_evaluator_registry import MetricEvaluatorRegistry |
45 | 53 |
|
@@ -136,7 +144,188 @@ async def evaluate( |
136 | 144 | evaluate_request: The request to perform metric evaluations on the |
137 | 145 | inferences. |
138 | 146 | """ |
139 | | - raise NotImplementedError() |
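| | + # Bound the number of concurrently running evaluations by the parallelism |
| | + # value configured in the evaluate config. |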
| 147 | + semaphore = asyncio.Semaphore( |
| 148 | + value=evaluate_request.evaluate_config.parallelism |
| 149 | + ) |
| 150 | + |
| 151 | + async def run_evaluation(inference_result): |
| 152 | + async with semaphore: |
| 153 | + return await self._evaluate_single_inference_result( |
| 154 | + inference_result=inference_result, |
| 155 | + evaluate_config=evaluate_request.evaluate_config, |
| 156 | + ) |
| 157 | + |
| 158 | + evaluation_tasks = [ |
| 159 | + run_evaluation(inference_result) |
| 160 | + for inference_result in evaluate_request.inference_results |
| 161 | + ] |
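| | + # Yield each EvalCaseResult as soon as its evaluation completes; results |
| | + # may arrive in a different order than the inference results. |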
| 162 | + for evaluation_task in asyncio.as_completed(evaluation_tasks): |
| 163 | + yield await evaluation_task |
| 164 | + |
| 165 | + async def _evaluate_single_inference_result( |
| 166 | + self, inference_result: InferenceResult, evaluate_config: EvaluateConfig |
| 167 | + ) -> EvalCaseResult: |
| 168 | + """Returns EvalCaseResult for the given inference result. |
| 169 | +
|
| 170 | + A single inference result can have multiple invocations. For each |
| 171 | + invocation, this method evaluates the metrics present in the evaluate config. |
| 172 | +
|
| 173 | + The EvalCaseResult contains scores for each metric per invocation and the |
| 174 | + overall score. |
| 175 | + """ |
| 176 | + eval_case = self._eval_sets_manager.get_eval_case( |
| 177 | + app_name=inference_result.app_name, |
| 178 | + eval_set_id=inference_result.eval_set_id, |
| 179 | + eval_case_id=inference_result.eval_case_id, |
| 180 | + ) |
| 181 | + |
| 182 | + if eval_case is None: |
| 183 | + raise NotFoundError( |
| 184 | + f'Eval case with id {inference_result.eval_case_id} not found for' |
| 185 | + f' app {inference_result.app_name} and eval set' |
| 186 | + f' {inference_result.eval_set_id}.' |
| 187 | + ) |
| 188 | + |
| 189 | + # Metric results for each invocation |
| 190 | + eval_metric_result_per_invocation = [] |
| 191 | + |
| 192 | + # We also keep track of the overall score for a metric, derived from all |
| 193 | + # invocations. For example, if we track a metric that compares how well |
| 194 | + # the final response matches a golden answer, then each invocation will |
| 195 | + # have a value for this metric, and an aggregation strategy across all |
| 196 | + # invocations produces an overall score. That overall score is the score |
| 197 | + # for the eval case. |
| 198 | + overall_eval_metric_results = [] |
| 199 | + |
| 200 | + if len(inference_result.inferences) != len(eval_case.conversation): |
| 201 | + raise ValueError( |
| 202 | + 'Inferences should match conversations in the eval case. Found ' |
| 203 | + f'{len(inference_result.inferences)} inferences and ' |
| 204 | + f'{len(eval_case.conversation)} conversations in the eval case.' |
| 205 | + ) |
| 206 | + |
| 207 | + # Pre-create the EvalMetricResultPerInvocation entry for each invocation. |
| 208 | + for actual, expected in zip( |
| 209 | + inference_result.inferences, eval_case.conversation |
| 210 | + ): |
| 211 | + eval_metric_result_per_invocation.append( |
| 212 | + EvalMetricResultPerInvocation( |
| 213 | + actual_invocation=actual, |
| 214 | + expected_invocation=expected, |
| 215 | + # We will fill this as we evaluate each metric per invocation. |
| 216 | + eval_metric_results=[], |
| 217 | + ) |
| 218 | + ) |
| 219 | + |
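| | + # Evaluate each metric across the whole conversation. Every metric |
| | + # produces one overall result plus one result per invocation. |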
| 220 | + for eval_metric in evaluate_config.eval_metrics: |
| 221 | + # Evaluate this metric across all invocations of the eval case. |
| 222 | + evaluation_result = await self._evaluate_metric( |
| 223 | + eval_metric=eval_metric, |
| 224 | + actual_invocations=inference_result.inferences, |
| 225 | + expected_invocations=eval_case.conversation, |
| 226 | + ) |
| 227 | + |
| 228 | + # Track the overall score across all invocations. |
| 229 | + overall_eval_metric_results.append( |
| 230 | + EvalMetricResult( |
| 231 | + metric_name=eval_metric.metric_name, |
| 232 | + threshold=eval_metric.threshold, |
| 233 | + score=evaluation_result.overall_score, |
| 234 | + eval_status=evaluation_result.overall_eval_status, |
| 235 | + ) |
| 236 | + ) |
| 237 | + |
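| | + # Sanity check: the evaluator must return exactly one result per |
| | + # invocation. |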
| 238 | + if len(evaluation_result.per_invocation_results) != len( |
| 239 | + eval_metric_result_per_invocation |
| 240 | + ): |
| 241 | + raise ValueError( |
| 242 | + 'Eval metric should return results for each invocation. Found ' |
| 243 | + f'{len(evaluation_result.per_invocation_results)} results for ' |
| 244 | + f'{len(eval_metric_result_per_invocation)} invocations.' |
| 245 | + ) |
| 246 | + |
| 247 | + # Track the score for each individual invocation. |
| 248 | + for invocation_result, invocation in zip( |
| 249 | + evaluation_result.per_invocation_results, |
| 250 | + eval_metric_result_per_invocation, |
| 251 | + ): |
| 252 | + invocation.eval_metric_results.append( |
| 253 | + EvalMetricResult( |
| 254 | + metric_name=eval_metric.metric_name, |
| 255 | + threshold=eval_metric.threshold, |
| 256 | + score=invocation_result.score, |
| 257 | + eval_status=invocation_result.eval_status, |
| 258 | + ) |
| 259 | + ) |
| 260 | + |
| 261 | + final_eval_status = self._generate_final_eval_status( |
| 262 | + overall_eval_metric_results |
| 263 | + ) |
| 264 | + user_id = ( |
| 265 | + eval_case.session_input.user_id |
| 266 | + if eval_case.session_input and eval_case.session_input.user_id |
| 267 | + else 'test_user_id' |
| 268 | + ) |
| 269 | + |
| 270 | + return EvalCaseResult( |
| 271 | + eval_set_file=inference_result.eval_set_id, |
| 272 | + eval_set_id=inference_result.eval_set_id, |
| 273 | + eval_id=inference_result.eval_case_id, |
| 274 | + final_eval_status=final_eval_status, |
| 275 | + overall_eval_metric_results=overall_eval_metric_results, |
| 276 | + eval_metric_result_per_invocation=eval_metric_result_per_invocation, |
| 277 | + session_id=inference_result.session_id, |
| 278 | + user_id=user_id, |
| 279 | + ) |
| 280 | + |
| 281 | + async def _evaluate_metric( |
| 282 | + self, |
| 283 | + eval_metric: EvalMetric, |
| 284 | + actual_invocations: list[Invocation], |
| 285 | + expected_invocations: list[Invocation], |
| 286 | + ) -> EvaluationResult: |
| 287 | + """Returns EvaluationResult obtained from evaluating a metric using an Evaluator.""" |
| 288 | + |
| 289 | + # Get the metric evaluator from the registry. |
| 290 | + metric_evaluator = self._metric_evaluator_registry.get_evaluator( |
| 291 | + eval_metric=eval_metric |
| 292 | + ) |
| 293 | + |
| 294 | + if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations): |
| 295 | + # Some evaluators can be async, for example those that use an LLM as a |
| 296 | + # judge, so we need to await them. |
| 297 | + return await metric_evaluator.evaluate_invocations( |
| 298 | + actual_invocations=actual_invocations, |
| 299 | + expected_invocations=expected_invocations, |
| 300 | + ) |
| 301 | + else: |
| 302 | + # Other metrics compute their results synchronously and usually don't |
| 303 | + # perform any I/O. An example is the calculation of the rouge_1 score. |
| 304 | + return metric_evaluator.evaluate_invocations( |
| 305 | + actual_invocations=actual_invocations, |
| 306 | + expected_invocations=expected_invocations, |
| 307 | + ) |
| 308 | + |
| 309 | + def _generate_final_eval_status( |
| 310 | + self, overall_eval_metric_results: list[EvalMetricResult] |
| 311 | + ) -> EvalStatus: |
| 312 | + final_eval_status = EvalStatus.NOT_EVALUATED |
| 313 | + # Go over all the eval statuses and mark the final eval status as failed |
| 314 | + # if any metric failed; otherwise mark it as passed if at least one metric |
| 315 | + # passed. Metrics that were not evaluated are skipped. |
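| | + # For example, [PASSED, NOT_EVALUATED, PASSED] aggregates to PASSED, while |
| | + # [PASSED, FAILED] aggregates to FAILED. |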
| 316 | + for overall_eval_metric_result in overall_eval_metric_results: |
| 317 | + overall_eval_status = overall_eval_metric_result.eval_status |
| 318 | + if overall_eval_status == EvalStatus.PASSED: |
| 319 | + final_eval_status = EvalStatus.PASSED |
| 320 | + elif overall_eval_status == EvalStatus.NOT_EVALUATED: |
| 321 | + continue |
| 322 | + elif overall_eval_status == EvalStatus.FAILED: |
| 323 | + final_eval_status = EvalStatus.FAILED |
| 324 | + break |
| 325 | + else: |
| 326 | + raise ValueError(f'Unknown eval status: {overall_eval_status}.') |
| 327 | + |
| 328 | + return final_eval_status |
140 | 329 |
|
141 | 330 | async def _perform_inference_sigle_eval_item( |
142 | 331 | self, |
|