@@ -5,14 +5,15 @@
 # the root directory of this source tree.

 import os
-import unittest
 from unittest.mock import MagicMock, patch

 import pytest

 from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.common.job_types import Job, JobStatus
 from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
+from llama_stack.apis.inference.inference import TopPSamplingStrategy
+from llama_stack.apis.resource import ResourceType
 from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
 from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl
@@ -21,181 +22,205 @@
 MOCK_BENCHMARK_ID = "test-benchmark"


-class TestNVIDIAEvalImpl(unittest.TestCase):
-    def setUp(self):
-        os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
-
-        # Create mock APIs
-        self.datasetio_api = MagicMock()
-        self.datasets_api = MagicMock()
-        self.scoring_api = MagicMock()
-        self.inference_api = MagicMock()
-        self.agents_api = MagicMock()
-
-        self.config = NVIDIAEvalConfig(
-            evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
-        )
-
-        self.eval_impl = NVIDIAEvalImpl(
-            config=self.config,
-            datasetio_api=self.datasetio_api,
-            datasets_api=self.datasets_api,
-            scoring_api=self.scoring_api,
-            inference_api=self.inference_api,
-            agents_api=self.agents_api,
-        )
-
-        # Mock the HTTP request methods
-        self.evaluator_get_patcher = patch(
-            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get"
-        )
-        self.evaluator_post_patcher = patch(
-            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
-        )
-
-        self.mock_evaluator_get = self.evaluator_get_patcher.start()
-        self.mock_evaluator_post = self.evaluator_post_patcher.start()
-
-    def tearDown(self):
-        """Clean up after each test."""
-        self.evaluator_get_patcher.stop()
-        self.evaluator_post_patcher.stop()
-
-    def _assert_request_body(self, expected_json):
-        """Helper method to verify request body in Evaluator POST request is correct"""
-        call_args = self.mock_evaluator_post.call_args
-        actual_json = call_args[0][1]
-
-        # Check that all expected keys contain the expected values in the actual JSON
-        for key, value in expected_json.items():
-            assert key in actual_json, f"Key '{key}' missing in actual JSON"
-
-            if isinstance(value, dict):
-                for nested_key, nested_value in value.items():
-                    assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
-                    assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
-            else:
-                assert actual_json[key] == value, f"Value mismatch for '{key}'"
-
-    @pytest.fixture(autouse=True)
-    def inject_fixtures(self, run_async):
-        self.run_async = run_async
-
-    def test_register_benchmark(self):
-        eval_config = {
-            "type": "custom",
-            "params": {"parallelism": 8},
-            "tasks": {
-                "qa": {
-                    "type": "completion",
-                    "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
-                    "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
-                    "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
-                }
-            },
+@pytest.fixture
+def nvidia_eval_setup():
+    """Set up the NVIDIA eval implementation with mocked dependencies."""
+    os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
+
+    # Create mock APIs
+    datasetio_api = MagicMock()
+    datasets_api = MagicMock()
+    scoring_api = MagicMock()
+    inference_api = MagicMock()
+    agents_api = MagicMock()
+
+    config = NVIDIAEvalConfig(
+        evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
+    )
+
+    eval_impl = NVIDIAEvalImpl(
+        config=config,
+        datasetio_api=datasetio_api,
+        datasets_api=datasets_api,
+        scoring_api=scoring_api,
+        inference_api=inference_api,
+        agents_api=agents_api,
+    )
+
+    # Mock the HTTP request methods
+    with (
+        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get,
+        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post,
+    ):
+        yield {
+            "eval_impl": eval_impl,
+            "mock_evaluator_get": mock_evaluator_get,
+            "mock_evaluator_post": mock_evaluator_post,
+            "datasetio_api": datasetio_api,
+            "datasets_api": datasets_api,
+            "scoring_api": scoring_api,
+            "inference_api": inference_api,
+            "agents_api": agents_api,
         }

-        benchmark = Benchmark(
-            provider_id="nvidia",
-            type="benchmark",
-            identifier=MOCK_BENCHMARK_ID,
-            dataset_id=MOCK_DATASET_ID,
-            scoring_functions=["basic::equality"],
-            metadata=eval_config,
-        )
-
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
-
-        # Register the benchmark
-        self.run_async(self.eval_impl.register_benchmark(benchmark))
-
-        # Verify the Evaluator API was called correctly
-        self.mock_evaluator_post.assert_called_once()
-        self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})
-
-    def test_run_eval(self):
-        benchmark_config = BenchmarkConfig(
-            eval_candidate=ModelCandidate(
-                type="model",
-                model=CoreModelId.llama3_1_8b_instruct.value,
-                sampling_params=SamplingParams(max_tokens=100, temperature=0.7),
-            )
-        )
-
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "created"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
-
-        # Run the Evaluation job
-        result = self.run_async(
-            self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
-        )

-        # Verify the Evaluator API was called correctly
-        self.mock_evaluator_post.assert_called_once()
-        self._assert_request_body(
-            {
-                "config": f"nvidia/{MOCK_BENCHMARK_ID}",
-                "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
+def _assert_request_body(mock_evaluator_post, expected_json):
+    """Helper method to verify request body in Evaluator POST request is correct"""
+    call_args = mock_evaluator_post.call_args
+    actual_json = call_args[0][1]
+
+    # Check that all expected keys contain the expected values in the actual JSON
+    for key, value in expected_json.items():
+        assert key in actual_json, f"Key '{key}' missing in actual JSON"
+
+        if isinstance(value, dict):
+            for nested_key, nested_value in value.items():
+                assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
+                assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
+        else:
+            assert actual_json[key] == value, f"Value mismatch for '{key}'"
+
+
+@pytest.mark.asyncio
+async def test_register_benchmark(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    eval_config = {
+        "type": "custom",
+        "params": {"parallelism": 8},
+        "tasks": {
+            "qa": {
+                "type": "completion",
+                "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
+                "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
+                "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
             }
+        },
+    }
+
+    benchmark = Benchmark(
+        provider_id="nvidia",
+        type=ResourceType.benchmark,
+        identifier=MOCK_BENCHMARK_ID,
+        dataset_id=MOCK_DATASET_ID,
+        scoring_functions=["basic::equality"],
+        metadata=eval_config,
+    )
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Register the benchmark
+    await eval_impl.register_benchmark(benchmark)
+
+    # Verify the Evaluator API was called correctly
+    mock_evaluator_post.assert_called_once()
+    _assert_request_body(
+        mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}
+    )
+
+
+@pytest.mark.asyncio
+async def test_run_eval(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    benchmark_config = BenchmarkConfig(
+        eval_candidate=ModelCandidate(
+            type="model",
+            model=CoreModelId.llama3_1_8b_instruct.value,
+            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
         )
-
-        # Verify the result
-        assert isinstance(result, Job)
-        assert result.job_id == "job-123"
-        assert result.status == JobStatus.in_progress
-
-    def test_job_status(self):
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "completed"}
-        self.mock_evaluator_get.return_value = mock_evaluator_response
-
-        # Get the Evaluation job
-        result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-        # Verify the result
-        assert isinstance(result, Job)
-        assert result.job_id == "job-123"
-        assert result.status == JobStatus.completed
-
-        # Verify the API was called correctly
-        self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
-
-    def test_job_cancel(self):
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
-
-        # Cancel the Evaluation job
-        self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-        # Verify the API was called correctly
-        self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
-
-    def test_job_result(self):
-        # Mock Evaluator API responses
-        mock_job_status_response = {"id": "job-123", "status": "completed"}
-        mock_job_results_response = {
-            "id": "job-123",
-            "status": "completed",
-            "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
-        }
-        self.mock_evaluator_get.side_effect = [
-            mock_job_status_response,  # First call to retrieve job
-            mock_job_results_response,  # Second call to retrieve job results
-        ]
-
-        # Get the Evaluation job results
-        result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-        # Verify the result
-        assert isinstance(result, EvaluateResponse)
-        assert MOCK_BENCHMARK_ID in result.scores
-        assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
-
-        # Verify the API was called correctly
-        assert self.mock_evaluator_get.call_count == 2
-        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
-        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
+    )
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "created"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Run the Evaluation job
+    result = await eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
+
+    # Verify the Evaluator API was called correctly
+    mock_evaluator_post.assert_called_once()
+    _assert_request_body(
+        mock_evaluator_post,
+        {
+            "config": f"nvidia/{MOCK_BENCHMARK_ID}",
+            "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
+        },
+    )
+
+    # Verify the result
+    assert isinstance(result, Job)
+    assert result.job_id == "job-123"
+    assert result.status == JobStatus.in_progress
+
+
+@pytest.mark.asyncio
+async def test_job_status(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "completed"}
+    mock_evaluator_get.return_value = mock_evaluator_response
+
+    # Get the Evaluation job
+    result = await eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the result
+    assert isinstance(result, Job)
+    assert result.job_id == "job-123"
+    assert result.status == JobStatus.completed
+
+    # Verify the API was called correctly
+    mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
+
+
+@pytest.mark.asyncio
+async def test_job_cancel(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Cancel the Evaluation job
+    await eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the API was called correctly
+    mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
+
+
+@pytest.mark.asyncio
+async def test_job_result(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
+
+    # Mock Evaluator API responses
+    mock_job_status_response = {"id": "job-123", "status": "completed"}
+    mock_job_results_response = {
+        "id": "job-123",
+        "status": "completed",
+        "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
+    }
+    mock_evaluator_get.side_effect = [
+        mock_job_status_response,  # First call to retrieve job
+        mock_job_results_response,  # Second call to retrieve job results
+    ]
+
+    # Get the Evaluation job results
+    result = await eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the result
+    assert isinstance(result, EvaluateResponse)
+    assert MOCK_BENCHMARK_ID in result.scores
+    assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
+
+    # Verify the API was called correctly
+    assert mock_evaluator_get.call_count == 2
+    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
+    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
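Note on the fixture pattern in the new code: wrapping the two patch(...) calls in a with block around the yield keeps the mocks active for the duration of each test and undoes them automatically when the generator resumes afterwards, which is what replaces the removed setUp/tearDown pair. The tests themselves are plain async functions under @pytest.mark.asyncio, which assumes the pytest-asyncio plugin is installed and enabled for the suite (not shown in this diff). A minimal, self-contained sketch of the same yield-fixture-with-patch pattern; the Service and fetch names here are hypothetical and not part of llama_stack:

    import pytest
    from unittest.mock import patch


    class Service:
        def fetch(self):
            # Stand-in for a real HTTP call; tests never reach this body.
            raise RuntimeError("network disabled in tests")


    @pytest.fixture
    def service_with_mocked_fetch():
        # The patch is active while the test body runs and is removed
        # when execution resumes after the yield (pytest's teardown phase).
        with patch.object(Service, "fetch", return_value={"status": "ok"}) as mock_fetch:
            yield Service(), mock_fetch


    def test_fetch_is_mocked(service_with_mocked_fetch):
        service, mock_fetch = service_with_mocked_fetch
        assert service.fetch() == {"status": "ok"}
        mock_fetch.assert_called_once()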
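Also worth noting: _assert_request_body is a partial match, not an equality check. Every expected key must be present with the same value, recursing one level into dict values, while keys that appear only in the recorded payload are ignored. A small standalone illustration of that behavior against a throwaway MagicMock; the endpoint string is a placeholder, not a real Evaluator route:

    from unittest.mock import MagicMock

    mock_post = MagicMock()
    # Record one call the way the tests expect _evaluator_post to be invoked:
    # positional args are (path, json_body), and the helper reads call_args[0][1].
    mock_post("/placeholder/endpoint", {"namespace": "nvidia", "name": "test-benchmark", "extra": "ignored"})

    # Passes: the expected keys are a subset of the recorded body; "extra" is never checked.
    _assert_request_body(mock_post, {"namespace": "nvidia", "name": "test-benchmark"})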