
Commit be97e08

chore(test): migrate unit tests from unittest to pytest nvidia test eval
Signed-off-by: Mustafa Elbehery <[email protected]>
1 parent 75fad44 commit be97e08
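
For context, the diff below follows the common unittest-to-pytest recipe: the TestCase's setUp/tearDown pair becomes a yield fixture that builds the implementation with mocked dependencies and keeps the patch() context managers active for the duration of each test, and the old run_async helper is replaced by pytest.mark.asyncio test functions. A minimal sketch of that pattern is shown here; the names (service_setup, client, fetch) are illustrative only and not taken from this commit, and the async test assumes pytest-asyncio is installed, as it is for the real test suite.

import asyncio
from unittest.mock import MagicMock, patch

import pytest


@pytest.fixture
def service_setup():
    """Replaces setUp/tearDown: build the object under test with mocked dependencies."""
    client = MagicMock()
    # patch() used as a context manager stays active while the test body runs,
    # and is undone automatically when the generator resumes after the test.
    with patch("os.getcwd") as mock_getcwd:
        yield {"client": client, "mock_getcwd": mock_getcwd}


@pytest.mark.asyncio
async def test_uses_fixture(service_setup):
    # Async test functions replace the old self.run_async(...) helper calls.
    client = service_setup["client"]
    client.fetch.return_value = "ok"
    await asyncio.sleep(0)  # stand-in for awaiting the code under test
    assert client.fetch() == "ok"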

File tree

1 file changed: +199, -174 lines

tests/unit/providers/nvidia/test_eval.py

Lines changed: 199 additions & 174 deletions
@@ -5,14 +5,15 @@
 # the root directory of this source tree.
 
 import os
-import unittest
 from unittest.mock import MagicMock, patch
 
 import pytest
 
 from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.common.job_types import Job, JobStatus
 from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
+from llama_stack.apis.inference.inference import TopPSamplingStrategy
+from llama_stack.apis.resource import ResourceType
 from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
 from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl
@@ -21,181 +22,205 @@
 MOCK_BENCHMARK_ID = "test-benchmark"
 
 
-class TestNVIDIAEvalImpl(unittest.TestCase):
-    def setUp(self):
-        os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
-
-        # Create mock APIs
-        self.datasetio_api = MagicMock()
-        self.datasets_api = MagicMock()
-        self.scoring_api = MagicMock()
-        self.inference_api = MagicMock()
-        self.agents_api = MagicMock()
-
-        self.config = NVIDIAEvalConfig(
-            evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
-        )
-
-        self.eval_impl = NVIDIAEvalImpl(
-            config=self.config,
-            datasetio_api=self.datasetio_api,
-            datasets_api=self.datasets_api,
-            scoring_api=self.scoring_api,
-            inference_api=self.inference_api,
-            agents_api=self.agents_api,
-        )
-
-        # Mock the HTTP request methods
-        self.evaluator_get_patcher = patch(
-            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get"
-        )
-        self.evaluator_post_patcher = patch(
-            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
-        )
-
-        self.mock_evaluator_get = self.evaluator_get_patcher.start()
-        self.mock_evaluator_post = self.evaluator_post_patcher.start()
-
-    def tearDown(self):
-        """Clean up after each test."""
-        self.evaluator_get_patcher.stop()
-        self.evaluator_post_patcher.stop()
-
-    def _assert_request_body(self, expected_json):
-        """Helper method to verify request body in Evaluator POST request is correct"""
-        call_args = self.mock_evaluator_post.call_args
-        actual_json = call_args[0][1]
-
-        # Check that all expected keys contain the expected values in the actual JSON
-        for key, value in expected_json.items():
-            assert key in actual_json, f"Key '{key}' missing in actual JSON"
-
-            if isinstance(value, dict):
-                for nested_key, nested_value in value.items():
-                    assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
-                    assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
-            else:
-                assert actual_json[key] == value, f"Value mismatch for '{key}'"
-
-    @pytest.fixture(autouse=True)
-    def inject_fixtures(self, run_async):
-        self.run_async = run_async
-
-    def test_register_benchmark(self):
-        eval_config = {
-            "type": "custom",
-            "params": {"parallelism": 8},
-            "tasks": {
-                "qa": {
-                    "type": "completion",
-                    "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
-                    "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
-                    "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
-                }
-            },
+@pytest.fixture
+def nvidia_eval_setup():
+    """Set up the NVIDIA eval implementation with mocked dependencies."""
+    os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
+
+    # Create mock APIs
+    datasetio_api = MagicMock()
+    datasets_api = MagicMock()
+    scoring_api = MagicMock()
+    inference_api = MagicMock()
+    agents_api = MagicMock()
+
+    config = NVIDIAEvalConfig(
+        evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
+    )
+
+    eval_impl = NVIDIAEvalImpl(
+        config=config,
+        datasetio_api=datasetio_api,
+        datasets_api=datasets_api,
+        scoring_api=scoring_api,
+        inference_api=inference_api,
+        agents_api=agents_api,
+    )
+
+    # Mock the HTTP request methods
+    with (
+        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get,
+        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post,
+    ):
+        yield {
+            "eval_impl": eval_impl,
+            "mock_evaluator_get": mock_evaluator_get,
+            "mock_evaluator_post": mock_evaluator_post,
+            "datasetio_api": datasetio_api,
+            "datasets_api": datasets_api,
+            "scoring_api": scoring_api,
+            "inference_api": inference_api,
+            "agents_api": agents_api,
         }
 
-        benchmark = Benchmark(
-            provider_id="nvidia",
-            type="benchmark",
-            identifier=MOCK_BENCHMARK_ID,
-            dataset_id=MOCK_DATASET_ID,
-            scoring_functions=["basic::equality"],
-            metadata=eval_config,
-        )
-
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
-
-        # Register the benchmark
-        self.run_async(self.eval_impl.register_benchmark(benchmark))
-
-        # Verify the Evaluator API was called correctly
-        self.mock_evaluator_post.assert_called_once()
-        self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})
-
-    def test_run_eval(self):
-        benchmark_config = BenchmarkConfig(
-            eval_candidate=ModelCandidate(
-                type="model",
-                model=CoreModelId.llama3_1_8b_instruct.value,
-                sampling_params=SamplingParams(max_tokens=100, temperature=0.7),
-            )
-        )
-
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "created"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
-
-        # Run the Evaluation job
-        result = self.run_async(
-            self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
-        )
 
-        # Verify the Evaluator API was called correctly
-        self.mock_evaluator_post.assert_called_once()
-        self._assert_request_body(
-            {
-                "config": f"nvidia/{MOCK_BENCHMARK_ID}",
-                "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
+def _assert_request_body(mock_evaluator_post, expected_json):
+    """Helper method to verify request body in Evaluator POST request is correct"""
+    call_args = mock_evaluator_post.call_args
+    actual_json = call_args[0][1]
+
+    # Check that all expected keys contain the expected values in the actual JSON
+    for key, value in expected_json.items():
+        assert key in actual_json, f"Key '{key}' missing in actual JSON"
+
+        if isinstance(value, dict):
+            for nested_key, nested_value in value.items():
+                assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
+                assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
+        else:
+            assert actual_json[key] == value, f"Value mismatch for '{key}'"
+
+
+@pytest.mark.asyncio
+async def test_register_benchmark(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    eval_config = {
+        "type": "custom",
+        "params": {"parallelism": 8},
+        "tasks": {
+            "qa": {
+                "type": "completion",
+                "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
+                "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
+                "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
             }
+        },
+    }
+
+    benchmark = Benchmark(
+        provider_id="nvidia",
+        type=ResourceType.benchmark,
+        identifier=MOCK_BENCHMARK_ID,
+        dataset_id=MOCK_DATASET_ID,
+        scoring_functions=["basic::equality"],
+        metadata=eval_config,
+    )
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Register the benchmark
+    await eval_impl.register_benchmark(benchmark)
+
+    # Verify the Evaluator API was called correctly
+    mock_evaluator_post.assert_called_once()
+    _assert_request_body(
+        mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}
+    )
+
+
+@pytest.mark.asyncio
+async def test_run_eval(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    benchmark_config = BenchmarkConfig(
+        eval_candidate=ModelCandidate(
+            type="model",
+            model=CoreModelId.llama3_1_8b_instruct.value,
+            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
         )
-
-        # Verify the result
-        assert isinstance(result, Job)
-        assert result.job_id == "job-123"
-        assert result.status == JobStatus.in_progress
-
-    def test_job_status(self):
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "completed"}
-        self.mock_evaluator_get.return_value = mock_evaluator_response
-
-        # Get the Evaluation job
-        result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-        # Verify the result
-        assert isinstance(result, Job)
-        assert result.job_id == "job-123"
-        assert result.status == JobStatus.completed
-
-        # Verify the API was called correctly
-        self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
-
-    def test_job_cancel(self):
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
-
-        # Cancel the Evaluation job
-        self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-        # Verify the API was called correctly
-        self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
-
-    def test_job_result(self):
-        # Mock Evaluator API responses
-        mock_job_status_response = {"id": "job-123", "status": "completed"}
-        mock_job_results_response = {
-            "id": "job-123",
-            "status": "completed",
-            "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
-        }
-        self.mock_evaluator_get.side_effect = [
-            mock_job_status_response,  # First call to retrieve job
-            mock_job_results_response,  # Second call to retrieve job results
-        ]
-
-        # Get the Evaluation job results
-        result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-        # Verify the result
-        assert isinstance(result, EvaluateResponse)
-        assert MOCK_BENCHMARK_ID in result.scores
-        assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
-
-        # Verify the API was called correctly
-        assert self.mock_evaluator_get.call_count == 2
-        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
-        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
+    )
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "created"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Run the Evaluation job
+    result = await eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
+
+    # Verify the Evaluator API was called correctly
+    mock_evaluator_post.assert_called_once()
+    _assert_request_body(
+        mock_evaluator_post,
+        {
+            "config": f"nvidia/{MOCK_BENCHMARK_ID}",
+            "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"},
+        },
+    )
+
+    # Verify the result
+    assert isinstance(result, Job)
+    assert result.job_id == "job-123"
+    assert result.status == JobStatus.in_progress
+
+
+@pytest.mark.asyncio
+async def test_job_status(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "completed"}
+    mock_evaluator_get.return_value = mock_evaluator_response
+
+    # Get the Evaluation job
+    result = await eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the result
+    assert isinstance(result, Job)
+    assert result.job_id == "job-123"
+    assert result.status == JobStatus.completed
+
+    # Verify the API was called correctly
+    mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
+
+
+@pytest.mark.asyncio
+async def test_job_cancel(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Cancel the Evaluation job
+    await eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the API was called correctly
+    mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
+
+
+@pytest.mark.asyncio
+async def test_job_result(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
+
+    # Mock Evaluator API responses
+    mock_job_status_response = {"id": "job-123", "status": "completed"}
+    mock_job_results_response = {
+        "id": "job-123",
+        "status": "completed",
+        "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
+    }
+    mock_evaluator_get.side_effect = [
+        mock_job_status_response,  # First call to retrieve job
+        mock_job_results_response,  # Second call to retrieve job results
+    ]
+
+    # Get the Evaluation job results
+    result = await eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the result
+    assert isinstance(result, EvaluateResponse)
+    assert MOCK_BENCHMARK_ID in result.scores
+    assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
+
+    # Verify the API was called correctly
+    assert mock_evaluator_get.call_count == 2
+    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
+    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
