
Commit 33eec34

ankursharmas authored and copybara-github committed
feat: Adding implementation of evaluate method in LocalEvalService
Also, delete the agent_creator.py file; we added this file by mistake.

PiperOrigin-RevId: 782193593
1 parent 48971d4 commit 33eec34

File tree

4 files changed: +389 -36 lines changed


src/google/adk/evaluation/agent_creator.py

Lines changed: 0 additions & 35 deletions
This file was deleted.

src/google/adk/evaluation/base_eval_service.py

Lines changed: 11 additions & 0 deletions
@@ -42,6 +42,17 @@ class EvaluateConfig(BaseModel):
       description="""The list of metrics to be used in Eval.""",
   )
 
+  parallelism: int = Field(
+      default=4,
+      description="""Number of parallel evaluations to run during an Eval. A few
+factors to consider while changing this value:
+
+1) Your available quota with the model, especially for those metrics that use
+a model as a judge. Models tend to enforce per-minute or per-second SLAs. Using
+a larger value could result in the eval quickly consuming the quota.
+""",
+  )
+
 
 class InferenceConfig(BaseModel):
   """Contains configurations need to run inferences."""

src/google/adk/evaluation/local_eval_service.py

Lines changed: 190 additions & 1 deletion
@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 import logging
 from typing import AsyncGenerator
 from typing import Callable
@@ -31,15 +32,22 @@
 from ..sessions.in_memory_session_service import InMemorySessionService
 from ..utils.feature_decorator import working_in_progress
 from .base_eval_service import BaseEvalService
+from .base_eval_service import EvaluateConfig
 from .base_eval_service import EvaluateRequest
 from .base_eval_service import InferenceRequest
 from .base_eval_service import InferenceResult
 from .base_eval_service import InferenceStatus
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
 from .eval_result import EvalCaseResult
 from .eval_set import EvalCase
 from .eval_set_results_manager import EvalSetResultsManager
 from .eval_sets_manager import EvalSetsManager
 from .evaluation_generator import EvaluationGenerator
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
 from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
 from .metric_evaluator_registry import MetricEvaluatorRegistry

@@ -136,7 +144,188 @@ async def evaluate(
       evaluate_request: The request to perform metric evaluations on the
         inferences.
     """
-    raise NotImplementedError()
+    semaphore = asyncio.Semaphore(
+        value=evaluate_request.evaluate_config.parallelism
+    )
+
+    async def run_evaluation(inference_result):
+      async with semaphore:
+        return await self._evaluate_single_inference_result(
+            inference_result=inference_result,
+            evaluate_config=evaluate_request.evaluate_config,
+        )
+
+    evaluation_tasks = [
+        run_evaluation(inference_result)
+        for inference_result in evaluate_request.inference_results
+    ]
+    for evaluation_task in asyncio.as_completed(evaluation_tasks):
+      yield await evaluation_task
+
+  async def _evaluate_single_inference_result(
+      self, inference_result: InferenceResult, evaluate_config: EvaluateConfig
+  ) -> EvalCaseResult:
+    """Returns EvalCaseResult for the given inference result.
+
+    A single inference result can have multiple invocations. For each
+    invocation, this method evaluates the metrics present in the evaluate
+    config.
+
+    The EvalCaseResult contains scores for each metric per invocation and the
+    overall score.
+    """
+    eval_case = self._eval_sets_manager.get_eval_case(
+        app_name=inference_result.app_name,
+        eval_set_id=inference_result.eval_set_id,
+        eval_case_id=inference_result.eval_case_id,
+    )
+
+    if eval_case is None:
+      raise NotFoundError(
+          f'Eval case with id {inference_result.eval_case_id} not found for'
+          f' app {inference_result.app_name} and eval set'
+          f' {inference_result.eval_set_id}.'
+      )
+
+    # Metric results for each invocation.
+    eval_metric_result_per_invocation = []
+
+    # We also keep track of the overall score for a metric, derived from all
+    # invocations. For example, if we were tracking a metric that compares
+    # how good the final response is relative to a golden answer, then each
+    # invocation would have a value for this metric. We would also have an
+    # overall score, computed with an aggregation strategy across all
+    # invocations. That would be the score for the eval case.
+    overall_eval_metric_results = []
+
+    if len(inference_result.inferences) != len(eval_case.conversation):
+      raise ValueError(
+          'Inferences should match conversations in the eval case. Found '
+          f'{len(inference_result.inferences)} inferences and '
+          f'{len(eval_case.conversation)} conversations in the eval case.'
+      )
+
+    # Pre-create the EvalMetricResult entries for each invocation.
+    for actual, expected in zip(
+        inference_result.inferences, eval_case.conversation
+    ):
+      eval_metric_result_per_invocation.append(
+          EvalMetricResultPerInvocation(
+              actual_invocation=actual,
+              expected_invocation=expected,
+              # We will fill this as we evaluate each metric per invocation.
+              eval_metric_results=[],
+          )
+      )
+
+    for eval_metric in evaluate_config.eval_metrics:
+      # Perform evaluation of the metric.
+      evaluation_result = await self._evaluate_metric(
+          eval_metric=eval_metric,
+          actual_invocations=inference_result.inferences,
+          expected_invocations=eval_case.conversation,
+      )
+
+      # Track the overall score across all invocations.
+      overall_eval_metric_results.append(
+          EvalMetricResult(
+              metric_name=eval_metric.metric_name,
+              threshold=eval_metric.threshold,
+              score=evaluation_result.overall_score,
+              eval_status=evaluation_result.overall_eval_status,
+          )
+      )
+
+      if len(evaluation_result.per_invocation_results) != len(
+          eval_metric_result_per_invocation
+      ):
+        raise ValueError(
+            'Eval metric should return results for each invocation. Found '
+            f'{len(evaluation_result.per_invocation_results)} results for '
+            f'{len(eval_metric_result_per_invocation)} invocations.'
+        )
+
+      # Track the score for each individual invocation.
+      for invocation_result, invocation in zip(
+          evaluation_result.per_invocation_results,
+          eval_metric_result_per_invocation,
+      ):
+        invocation.eval_metric_results.append(
+            EvalMetricResult(
+                metric_name=eval_metric.metric_name,
+                threshold=eval_metric.threshold,
+                score=invocation_result.score,
+                eval_status=invocation_result.eval_status,
+            )
+        )
+
+    final_eval_status = self._generate_final_eval_status(
+        overall_eval_metric_results
+    )
+    user_id = (
+        eval_case.session_input.user_id
+        if eval_case.session_input and eval_case.session_input.user_id
+        else 'test_user_id'
+    )
+
+    return EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=final_eval_status,
+        overall_eval_metric_results=overall_eval_metric_results,
+        eval_metric_result_per_invocation=eval_metric_result_per_invocation,
+        session_id=inference_result.session_id,
+        user_id=user_id,
+    )
+
+  async def _evaluate_metric(
+      self,
+      eval_metric: EvalMetric,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    """Returns the EvaluationResult obtained from evaluating a metric using an Evaluator."""
+
+    # Get the metric evaluator from the registry.
+    metric_evaluator = self._metric_evaluator_registry.get_evaluator(
+        eval_metric=eval_metric
+    )
+
+    if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+      # Some evaluators can be async, for example those that use an LLM as a
+      # judge, so we need to make sure that we await them.
+      return await metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+    else:
+      # Metrics that perform their computation synchronously; mostly these
+      # don't perform any I/O. An example would be the calculation of the
+      # rouge_1 score.
+      return metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+
+  def _generate_final_eval_status(
+      self, overall_eval_metric_results: list[EvalMetricResult]
+  ) -> EvalStatus:
+    final_eval_status = EvalStatus.NOT_EVALUATED
+    # Go over all the eval statuses and mark the final eval status as passed
+    # if all of them pass; otherwise mark the final eval status as failed.
+    for overall_eval_metric_result in overall_eval_metric_results:
+      overall_eval_status = overall_eval_metric_result.eval_status
+      if overall_eval_status == EvalStatus.PASSED:
+        final_eval_status = EvalStatus.PASSED
+      elif overall_eval_status == EvalStatus.NOT_EVALUATED:
+        continue
+      elif overall_eval_status == EvalStatus.FAILED:
+        final_eval_status = EvalStatus.FAILED
+        break
+      else:
+        raise ValueError(f'Unknown eval status: {overall_eval_status}.')
+
+    return final_eval_status
 
 
   async def _perform_inference_sigle_eval_item(
       self,
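As a self-contained sketch (standard library only, not ADK code), the concurrency pattern the new evaluate() relies on looks like the following: an asyncio.Semaphore caps in-flight work at parallelism, while asyncio.as_completed yields results in completion order, which is what lets evaluate() stream EvalCaseResults as they finish. All names here are placeholders.

import asyncio

async def evaluate_one(item: int) -> int:
  # Stand-in for _evaluate_single_inference_result.
  await asyncio.sleep(0.1)
  return item * item

async def main() -> None:
  parallelism = 4
  semaphore = asyncio.Semaphore(value=parallelism)

  async def run_one(item: int) -> int:
    # At most `parallelism` evaluations run at the same time.
    async with semaphore:
      return await evaluate_one(item)

  tasks = [run_one(i) for i in range(10)]
  # Results arrive in completion order, not submission order.
  for task in asyncio.as_completed(tasks):
    print(await task)

asyncio.run(main())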

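Similarly, a generic sketch (not ADK code; all names hypothetical) of the inspect.iscoroutinefunction dispatch that _evaluate_metric uses to support both synchronous evaluators (e.g. rouge_1-style computations) and async ones (e.g. LLM-as-judge):

import asyncio
import inspect
from typing import Any, Callable

async def call_maybe_async(fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
  # Await coroutine functions; call plain functions directly.
  if inspect.iscoroutinefunction(fn):
    return await fn(*args, **kwargs)
  return fn(*args, **kwargs)

def sync_metric(score: float) -> float:
  return score * 2

async def async_metric(score: float) -> float:
  await asyncio.sleep(0)
  return score * 3

async def main() -> None:
  print(await call_maybe_async(sync_metric, 1.0))   # 2.0
  print(await call_maybe_async(async_metric, 1.0))  # 3.0

asyncio.run(main())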