Skip to content

Commit c25911c

Browse files
ankursharmascopybara-github
authored and committed
feat: Implement in memory EvalSetsManager
This version of the EvalSetsManager is intended to support two main use cases: 1) The agent developer wants to bring in their own eval set file, which is usually the case with the `adk eval` CLI. Once their eval sets are uploaded into this version of the eval sets manager, the EvalSetsManager can be handed over to the eval system for running evals. 2) As a part of AgentEvaluator testing, we expect developers to supply eval cases in JSON files. The in-memory version of the EvalSetsManager will help us run those test cases using LocalEvalService. PiperOrigin-RevId: 783198788
1 parent a504199 commit c25911c

File tree

2 files changed

+350
-0
lines changed

2 files changed

+350
-0
lines changed
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import time
18+
from typing import Optional
19+
20+
from typing_extensions import override
21+
22+
from ..errors.not_found_error import NotFoundError
23+
from .eval_case import EvalCase
24+
from .eval_set import EvalSet
25+
from .eval_sets_manager import EvalSetsManager
26+
27+
28+
class InMemoryEvalSetsManager(EvalSetsManager):
  """An in-memory implementation of EvalSetsManager using dictionaries.

  You can use this class:
  1) As a part of your testcase.
  2) For cases where other implementations of EvalSetsManager are too expensive
  to use.

  All state lives in two dictionaries owned by the instance. Only
  `create_eval_set` registers a new app; lookups and rejected mutations leave
  the stored state untouched.
  """

  def __init__(self):
    # {app_name: {eval_set_id: EvalSet}}
    self._eval_sets: dict[str, dict[str, EvalSet]] = {}
    # {app_name: {eval_set_id: {eval_case_id: EvalCase}}}
    self._eval_cases: dict[str, dict[str, dict[str, EvalCase]]] = {}

  def _ensure_app_exists(self, app_name: str):
    """Registers empty eval set / eval case maps for a not-yet-seen app."""
    if app_name not in self._eval_sets:
      self._eval_sets[app_name] = {}
      self._eval_cases[app_name] = {}

  def _get_eval_set_or_raise(self, app_name: str, eval_set_id: str) -> EvalSet:
    """Returns the stored EvalSet without touching state.

    Raises:
      NotFoundError: If the app is unknown or has no eval set with this id.
    """
    app_eval_sets = self._eval_sets.get(app_name)
    if app_eval_sets is None or eval_set_id not in app_eval_sets:
      raise NotFoundError(
          f"EvalSet {eval_set_id} not found for app {app_name}."
      )
    return app_eval_sets[eval_set_id]

  @override
  def get_eval_set(self, app_name: str, eval_set_id: str) -> Optional[EvalSet]:
    """Returns the EvalSet for the app, or None if either is unknown."""
    # A pure read: an unknown app must not be registered as a side effect.
    return self._eval_sets.get(app_name, {}).get(eval_set_id)

  @override
  def create_eval_set(self, app_name: str, eval_set_id: str):
    """Creates an empty EvalSet, registering the app on first use.

    Raises:
      ValueError: If the app already has an eval set with this id.
    """
    self._ensure_app_exists(app_name)
    if eval_set_id in self._eval_sets[app_name]:
      raise ValueError(
          f"EvalSet {eval_set_id} already exists for app {app_name}."
      )

    new_eval_set = EvalSet(
        eval_set_id=eval_set_id,
        eval_cases=[],
        creation_timestamp=time.time(),
    )
    self._eval_sets[app_name][eval_set_id] = new_eval_set
    # Keep the per-set case index in lockstep with the EvalSet itself.
    self._eval_cases[app_name][eval_set_id] = {}

  @override
  def list_eval_sets(self, app_name: str) -> list[str]:
    """Returns the ids of the app's eval sets; empty for an unknown app."""
    return list(self._eval_sets.get(app_name, {}))

  @override
  def get_eval_case(
      self, app_name: str, eval_set_id: str, eval_case_id: str
  ) -> Optional[EvalCase]:
    """Returns the EvalCase, or None if app, eval set or case is unknown."""
    app_eval_cases = self._eval_cases.get(app_name)
    if app_eval_cases is None:
      return None
    if eval_set_id not in app_eval_cases:
      return None
    return app_eval_cases[eval_set_id].get(eval_case_id)

  @override
  def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
    """Adds an EvalCase to an existing EvalSet.

    Raises:
      NotFoundError: If the eval set does not exist for the app.
      ValueError: If a case with the same eval_id is already in the eval set.
    """
    eval_set = self._get_eval_set_or_raise(app_name, eval_set_id)
    cases_by_id = self._eval_cases[app_name][eval_set_id]
    if eval_case.eval_id in cases_by_id:
      raise ValueError(
          f"EvalCase {eval_case.eval_id} already exists in EvalSet"
          f" {eval_set_id} for app {app_name}."
      )

    cases_by_id[eval_case.eval_id] = eval_case
    # Also update the list in the EvalSet object
    eval_set.eval_cases.append(eval_case)

  @override
  def update_eval_case(
      self, app_name: str, eval_set_id: str, updated_eval_case: EvalCase
  ):
    """Replaces the stored EvalCase that shares `updated_eval_case.eval_id`.

    Raises:
      NotFoundError: If the eval set, or a case with that eval_id, is missing.
    """
    eval_set = self._get_eval_set_or_raise(app_name, eval_set_id)
    cases_by_id = self._eval_cases[app_name][eval_set_id]
    if updated_eval_case.eval_id not in cases_by_id:
      raise NotFoundError(
          f"EvalCase {updated_eval_case.eval_id} not found in EvalSet"
          f" {eval_set_id} for app {app_name}."
      )

    # Full replace
    cases_by_id[updated_eval_case.eval_id] = updated_eval_case

    # Update the list in the EvalSet object, keeping the case's position.
    for i, case in enumerate(eval_set.eval_cases):
      if case.eval_id == updated_eval_case.eval_id:
        eval_set.eval_cases[i] = updated_eval_case
        break

  @override
  def delete_eval_case(
      self, app_name: str, eval_set_id: str, eval_case_id: str
  ):
    """Removes an EvalCase from an EvalSet.

    Raises:
      NotFoundError: If the eval set, or the case within it, is missing.
    """
    eval_set = self._get_eval_set_or_raise(app_name, eval_set_id)
    cases_by_id = self._eval_cases[app_name][eval_set_id]
    if eval_case_id not in cases_by_id:
      raise NotFoundError(
          f"EvalCase {eval_case_id} not found in EvalSet {eval_set_id}"
          f" for app {app_name}."
      )

    del cases_by_id[eval_case_id]

    # Remove from the list in the EvalSet object
    eval_set.eval_cases = [
        case for case in eval_set.eval_cases if case.eval_id != eval_case_id
    ]
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import time
16+
17+
from google.adk.errors.not_found_error import NotFoundError
18+
from google.adk.evaluation.eval_case import EvalCase
19+
from google.adk.evaluation.in_memory_eval_sets_manager import InMemoryEvalSetsManager
20+
import pytest
21+
22+
23+
@pytest.fixture
def app_name() -> str:
  """App name used as the owning app in these tests."""
  return "test_app"
26+
27+
28+
@pytest.fixture
def manager() -> InMemoryEvalSetsManager:
  """Return a newly constructed InMemoryEvalSetsManager."""
  new_manager = InMemoryEvalSetsManager()
  return new_manager
31+
32+
33+
@pytest.fixture
def eval_set_id() -> str:
  """Eval set id used by the tests that need a single eval set."""
  return "test_eval_set"
36+
37+
38+
@pytest.fixture
def eval_case_id() -> str:
  """Eval case id used by the tests that need a single eval case."""
  return "test_eval_case"
41+
42+
43+
def test_create_eval_set(manager, app_name, eval_set_id):
  """A created eval set can be fetched and starts with no eval cases."""
  manager.create_eval_set(app_name, eval_set_id)

  created = manager.get_eval_set(app_name, eval_set_id)

  assert created is not None
  assert created.eval_set_id == eval_set_id
  assert created.eval_cases == []
49+
50+
51+
def test_create_eval_set_already_exists(manager, app_name, eval_set_id):
  """Creating the same eval set twice for one app raises ValueError."""
  create_args = (app_name, eval_set_id)
  manager.create_eval_set(*create_args)

  with pytest.raises(ValueError):
    manager.create_eval_set(*create_args)
55+
56+
57+
def test_get_eval_set(manager, app_name, eval_set_id):
  """get_eval_set returns the eval set stored under the requested id."""
  manager.create_eval_set(app_name, eval_set_id)

  fetched = manager.get_eval_set(app_name, eval_set_id)

  assert fetched is not None
  assert fetched.eval_set_id == eval_set_id
62+
63+
64+
def test_get_eval_set_not_found(manager, app_name):
  """An eval set id that was never created resolves to None."""
  assert manager.get_eval_set(app_name, "nonexistent_set") is None
67+
68+
69+
def test_get_eval_set_wrong_app(manager, app_name, eval_set_id):
  """An eval set created for one app is not visible under another app."""
  manager.create_eval_set(app_name, eval_set_id)

  from_other_app = manager.get_eval_set("wrong_app", eval_set_id)

  assert from_other_app is None
73+
74+
75+
def test_list_eval_sets(manager, app_name):
  """list_eval_sets reports exactly the ids created for the app."""
  for set_id in ("set1", "set2"):
    manager.create_eval_set(app_name, set_id)

  listed_ids = manager.list_eval_sets(app_name)

  assert len(listed_ids) == 2
  for set_id in ("set1", "set2"):
    assert set_id in listed_ids
82+
83+
84+
def test_list_eval_sets_wrong_app(manager, app_name):
  """Listing eval sets for an app that has none yields an empty result."""
  manager.create_eval_set(app_name, "set1")

  listed_ids = manager.list_eval_sets("wrong_app")

  assert len(listed_ids) == 0
88+
89+
90+
def test_add_eval_case(manager, app_name, eval_set_id, eval_case_id):
  """An added case is reachable by id and through its eval set's list."""
  manager.create_eval_set(app_name, eval_set_id)
  new_case = EvalCase(eval_id=eval_case_id, conversation=[])
  manager.add_eval_case(app_name, eval_set_id, new_case)

  # Lookup by id.
  stored_case = manager.get_eval_case(app_name, eval_set_id, eval_case_id)
  assert stored_case is not None
  assert stored_case.eval_id == eval_case_id

  # Lookup through the owning eval set.
  cases_in_set = manager.get_eval_set(app_name, eval_set_id).eval_cases
  assert len(cases_in_set) == 1
  assert cases_in_set[0].eval_id == eval_case_id
102+
103+
104+
def test_add_eval_case_set_not_found(
    manager, app_name, eval_set_id, eval_case_id
):
  """Adding a case to an eval set that was never created raises NotFoundError."""
  orphan_case = EvalCase(eval_id=eval_case_id, conversation=[])

  with pytest.raises(NotFoundError):
    manager.add_eval_case(app_name, eval_set_id, orphan_case)
110+
111+
112+
def test_add_eval_case_already_exists(
    manager, app_name, eval_set_id, eval_case_id
):
  """Adding the same case to an eval set a second time raises ValueError."""
  manager.create_eval_set(app_name, eval_set_id)
  duplicate_case = EvalCase(eval_id=eval_case_id, conversation=[])
  add_args = (app_name, eval_set_id, duplicate_case)
  manager.add_eval_case(*add_args)

  with pytest.raises(ValueError):
    manager.add_eval_case(*add_args)
120+
121+
122+
def test_get_eval_case(manager, app_name, eval_set_id, eval_case_id):
  """get_eval_case returns the case previously added under that id."""
  manager.create_eval_set(app_name, eval_set_id)
  manager.add_eval_case(
      app_name, eval_set_id, EvalCase(eval_id=eval_case_id, conversation=[])
  )

  fetched_case = manager.get_eval_case(app_name, eval_set_id, eval_case_id)

  assert fetched_case is not None
  assert fetched_case.eval_id == eval_case_id
129+
130+
131+
def test_get_eval_case_not_found(manager, app_name, eval_set_id):
  """An unknown case id inside an existing eval set resolves to None."""
  manager.create_eval_set(app_name, eval_set_id)

  missing_case = manager.get_eval_case(
      app_name, eval_set_id, "nonexistent_case"
  )

  assert missing_case is None
137+
138+
139+
def test_get_eval_case_set_not_found(manager, app_name, eval_case_id):
  """Looking up a case in an eval set that was never created yields None."""
  missing_case = manager.get_eval_case(
      app_name, "nonexistent_set", eval_case_id
  )

  assert missing_case is None
144+
145+
146+
def test_update_eval_case(manager, app_name, eval_set_id, eval_case_id):
  """Updating a case replaces it both by id and in the eval set's list."""
  manager.create_eval_set(app_name, eval_set_id)
  manager.add_eval_case(
      app_name, eval_set_id, EvalCase(eval_id=eval_case_id, conversation=[])
  )

  # Same eval_id, but with a creation timestamp taken from the clock.
  replacement = EvalCase(
      eval_id=eval_case_id, conversation=[], creation_timestamp=time.time()
  )
  manager.update_eval_case(app_name, eval_set_id, replacement)

  stored_case = manager.get_eval_case(app_name, eval_set_id, eval_case_id)
  assert stored_case is not None
  assert stored_case.creation_timestamp != 0.0
  assert stored_case.creation_timestamp == replacement.creation_timestamp

  cases_in_set = manager.get_eval_set(app_name, eval_set_id).eval_cases
  assert len(cases_in_set) == 1
  assert cases_in_set[0].creation_timestamp == replacement.creation_timestamp
169+
170+
171+
def test_update_eval_case_not_found(
    manager, app_name, eval_set_id, eval_case_id
):
  """Updating a case that was never added raises NotFoundError."""
  manager.create_eval_set(app_name, eval_set_id)
  never_added = EvalCase(eval_id=eval_case_id, conversation=[])

  with pytest.raises(NotFoundError):
    manager.update_eval_case(app_name, eval_set_id, never_added)
178+
179+
180+
def test_delete_eval_case(manager, app_name, eval_set_id, eval_case_id):
  """A deleted case disappears from id lookup and from the eval set's list."""
  manager.create_eval_set(app_name, eval_set_id)
  manager.add_eval_case(
      app_name, eval_set_id, EvalCase(eval_id=eval_case_id, conversation=[])
  )

  manager.delete_eval_case(app_name, eval_set_id, eval_case_id)

  assert manager.get_eval_case(app_name, eval_set_id, eval_case_id) is None

  remaining_cases = manager.get_eval_set(app_name, eval_set_id).eval_cases
  assert len(remaining_cases) == 0
192+
193+
194+
def test_delete_eval_case_not_found(
    manager, app_name, eval_set_id, eval_case_id
):
  """Deleting a case that was never added raises NotFoundError."""
  manager.create_eval_set(app_name, eval_set_id)
  delete_args = (app_name, eval_set_id, eval_case_id)

  with pytest.raises(NotFoundError):
    manager.delete_eval_case(*delete_args)

0 commit comments

Comments
 (0)