
Commit db15033

Extended descriptions for source_data and model_info. Added check if model is from HF
1 parent 6f27c88 commit db15033

9 files changed: +45 -23 lines changed

eval_converters/common/adapter.py

Lines changed: 8 additions & 1 deletion
@@ -2,6 +2,7 @@
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from enum import Enum
+from huggingface_hub import model_info, HfApi
 from schema.eval_types import EvaluationLog
 from typing import Any, List, Union
 from pathlib import Path
@@ -173,4 +174,10 @@ def _handle_transformation_error(self, error: Exception, context: str):
         if self.strict_validation:
             raise TransformationError(error_msg) from error
         else:
-            self.logger.warning(error_msg)
+            self.logger.warning(error_msg)
+
+    def _check_if_model_is_on_huggingface(self, model_path):
+        try:
+            info = model_info(model_path)
+        except Exception as e:
+            self.logger.warning(f"Model '{model_path}' not found on Hugging Face.")

eval_converters/helm/adapter.py

Lines changed: 6 additions & 6 deletions
@@ -173,7 +173,7 @@ def transform_from_directory(self, dir_path):
             context_window = 1
 
         except Exception as e:
-            print(f"Error getting context window: {e}")
+            self.logger.error(f"Error getting context window: {e}")
             context_window = 1
 
         configuration = Configuration(
@@ -184,7 +184,7 @@ def transform_from_directory(self, dir_path):
         try:
             precision, method = infer_quantization(adapter_spec.model)
         except Exception as e:
-            print(f"Error getting quantization: {e}")
+            self.logger.warning(f"Error getting quantization: {e}")
             precision = BitPrecision.none
             method = Method.None_
 
@@ -271,7 +271,7 @@ def transform_from_directory(self, dir_path):
             score = instance_scores[request_state.instance.id]
 
         except Exception as e:
-            print(f"Error getting instance scores: {e}")
+            self.logger.warning(f"Error getting instance scores: {e}")
             score = 0.0
 
         evaluation = Evaluation(
@@ -361,7 +361,7 @@ def _transform_single(self, raw_data, base_dir=None):
             context_window = 1
 
         except Exception as e:
-            print(f"Error getting context window: {e}")
+            self.logger.warning(f"Error getting context window: {e}")
             context_window = 1
 
         configuration = Configuration(
@@ -372,7 +372,7 @@ def _transform_single(self, raw_data, base_dir=None):
         try:
             precision, method = infer_quantization(adapter_spec.model)
         except Exception as e:
-            print(f"Error getting quantization: {e}")
+            self.logger.warning(f"Error getting quantization: {e}")
             precision = BitPrecision.none
             method = Method.None_
 
@@ -479,7 +479,7 @@ def _transform_single(self, raw_data, base_dir=None):
                 break
 
         except Exception as e:
-            print(f"Error getting instance scores: {e}")
+            self.logger.warning(f"Error getting instance scores: {e}")
             instance_scores = {}
 
         score = instance_scores.get(request_state.instance.id, 0.0)
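These HELM changes route every fallback path through self.logger instead of bare print (context-window failures as error, the rest as warnings), so conversion noise can be filtered by handler configuration. A sketch of the per-adapter logger this presumes; the actual setup lives elsewhere in the repo and is not shown in this commit:

import logging

class HELMAdapterSketch:
    def __init__(self) -> None:
        # Named logger: warnings from this adapter can be silenced or
        # redirected via logging.getLogger("HELMAdapterSketch").setLevel(...)
        self.logger = logging.getLogger(self.__class__.__name__)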

eval_converters/inspect/adapter.py

Lines changed: 16 additions & 4 deletions
@@ -74,8 +74,6 @@ def _transform_single(self, raw_data: EvalLog, source_metadata: SourceMetadata)
 
         retrieved_timestamp = eval_stats.started_at or eval_spec.created
 
-        evaluation_id = f'inspect_ai/{eval_spec.model}/{eval_spec.dataset.name}/{retrieved_timestamp}'
-
         source_data = SourceData(
             dataset_name=eval_spec.dataset.name.split('/')[-1],
             hf_repo=eval_spec.dataset.location,
@@ -88,8 +86,20 @@ def _transform_single(self, raw_data: EvalLog, source_metadata: SourceMetadata)
             evaluation_source_type=EvaluationSourceType.evaluation_platform
         )
 
+        model_path = eval_spec.model
+        if raw_data.samples:
+            model_name = raw_data.samples[0].output.model
+            model_path_parts = model_path.split('/')
+
+            if model_path_parts[-1] in model_name:
+                model_path_parts[-1] = model_name
+
+            model_path = '/'.join(model_path_parts)
+
+        self._check_if_model_is_on_huggingface(model_path)
+
         model_info = ModelInfo(
-            name=eval_spec.model,
+            name=model_path,
             developer=eval_spec.model.split('/')[0],
             inference_platform="/".join(eval_spec.model.split('/')[:-1])
         )
@@ -137,9 +147,11 @@ def _transform_single(self, raw_data: EvalLog, source_metadata: SourceMetadata)
             )
         )
 
+        evaluation_id = f'inspect_ai/{model_path}/{eval_spec.dataset.name}/{retrieved_timestamp}'
+
         return EvaluationLog(
             schema_version=SCHEMA_VERSION,
-            evaluation_id=evaluation_id.replace('/', '_'),
+            evaluation_id=evaluation_id,
             retrieved_timestamp=retrieved_timestamp,
             source_data=source_data,
             evaluation_source=evaluation_source,
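The new block upgrades the generic model alias from eval_spec.model to the fully versioned name reported in the first sample's output, and the evaluation_id (now built after that upgrade) keeps its '/' separators. Traced with the values from the renamed fixture below (a worked illustration, not code from the commit):

model_path = 'openai/azure/gpt-4o-mini'    # eval_spec.model
model_name = 'gpt-4o-mini-2024-07-18'      # raw_data.samples[0].output.model

parts = model_path.split('/')              # ['openai', 'azure', 'gpt-4o-mini']
if parts[-1] in model_name:                # 'gpt-4o-mini' is a substring of the versioned name
    parts[-1] = model_name
model_path = '/'.join(parts)               # 'openai/azure/gpt-4o-mini-2024-07-18'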

eval_converters/inspect/converter.py

Lines changed: 1 addition & 5 deletions
@@ -50,14 +50,10 @@ def convert_to_unified_schema(self, source_metadata: SourceMetadata) -> Evaluati
 
     def save_to_file(self, unified_eval_log: EvaluationLog, output_filename: str) -> bool:
         try:
-            # data = unified_eval_log.model_dump()
             json_str = unified_eval_log.model_dump_json(indent=2)
 
             with open(f'{self.output_dir}/{output_filename}', 'w') as json_file:
                 json_file.write(json_str)
-            # json_str = json.dumps(data, indent=2)
-            # with open(f'{self.output_dir}/{output_filename}', 'w') as json_file:
-            #     json.dump(json_str, json_file)
 
             print(f'Unified eval log was successfully saved to {output_filename} file.')
         except Exception as e:
@@ -86,7 +82,7 @@ def save_to_hf_datasets(self, unified_eval_log: EvaluationLog) -> bool:
 
 unified_output: EvaluationLog = inspect_converter.convert_to_unified_schema(source_metadata)
 if unified_output:
-    output_filename = f'{str(unified_output.evaluation_id)}.json'
+    output_filename = f'{str(unified_output.evaluation_id).replace('/', '_')}.json'
     inspect_converter.save_to_file(unified_output, output_filename)
 else:
     print("Missing unified schema result!")

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
 dependencies = [
     "crfm-helm>=0.5.6",
     "dacite>=1.9.2",
+    "huggingface-hub>=0.33.2",
     "numpy>=2.3.1",
     "openai>=1.93.0",
     "pandas>=2.3.0",

schema/eval.schema.json

Lines changed: 3 additions & 2 deletions
@@ -26,6 +26,7 @@
       "description": "Timestamp for when this record was created"
     },
     "source_data": {
+      "description": "Source of dataset used for evaluation. There are two options supported: HuggingFace dataset or url for other data source.",
       "oneOf": [
         {
           "type": "array",
@@ -36,7 +37,7 @@
         },
         {
           "type": "object",
-          "description": "Details about dataset used for evaluation",
+          "description": "Details about HuggingFace dataset used for evaluation",
           "required": [
             "dataset_name"
           ],
@@ -131,7 +132,7 @@
     "properties": {
       "name": {
         "type": "string",
-        "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
+        "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
       },
       "developer": {
         "type": "string",

schema/eval_types.py

Lines changed: 6 additions & 3 deletions
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-10-13T22:57:16+00:00
+#   timestamp: 2025-10-14T21:00:44+00:00
 
 from __future__ import annotations
 
@@ -62,7 +62,7 @@ class SourceMetadata(BaseModel):
 class ModelInfo(BaseModel):
     name: str = Field(
         ...,
-        description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)',
+        description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)',
     )
     developer: Optional[str] = Field(
         None, description="Name of organization that provides the model (e.g. 'OpenAI')"
@@ -156,7 +156,10 @@ class EvaluationLog(BaseModel):
     retrieved_timestamp: str = Field(
         ..., description='Timestamp for when this record was created'
     )
-    source_data: Union[List[str], SourceData]
+    source_data: Union[List[str], SourceData] = Field(
+        ...,
+        description='Source of dataset used for evaluation. There are two options supported: HuggingFace dataset or url for other data source.',
+    )
     evaluation_source: Optional[EvaluationSource] = Field(
         None,
         description='Details about evaluation origin. There are options that evaluations come from leaderboards or evaluation platforms.',
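eval_types.py is generated from eval.schema.json by datamodel-codegen, so these Field descriptions simply track the schema edits above. A plausible regeneration command (the exact flags the repo uses are not shown in this commit):

datamodel-codegen --input schema/eval.schema.json --input-file-type jsonschema --output schema/eval_types.py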

schema/inspect_ai_openai_azure_gpt-4o-mini_bigbio_pubmed_qa_2025-07-03T16:44:30+02:00.json renamed to schema/inspect_ai_openai_azure_gpt-4o-mini-2024-07-18_bigbio_pubmed_qa_2025-07-03T16:44:30+02:00.json

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 {
   "schema_version": "0.0.1",
-  "evaluation_id": "inspect_ai_openai_azure_gpt-4o-mini_bigbio_pubmed_qa_2025-07-03T16:44:30+02:00",
+  "evaluation_id": "inspect_ai/openai/azure/gpt-4o-mini-2024-07-18/bigbio/pubmed_qa/2025-07-03T16:44:30+02:00",
   "retrieved_timestamp": "2025-07-03T16:44:30+02:00",
   "source_data": {
     "dataset_name": "pubmed_qa",
@@ -23,7 +23,7 @@
     "evaluator_relationship": "other"
   },
   "model_info": {
-    "name": "openai/azure/gpt-4o-mini",
+    "name": "openai/azure/gpt-4o-mini-2024-07-18",
     "developer": "openai",
     "inference_platform": "openai/azure"
   },

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.
