
Commit db15033

Extended descriptions for source_data and model_info. Added check if model is from HF
1 parent 6f27c88 commit db15033

9 files changed: +45 -23 lines changed

eval_converters/common/adapter.py

Lines changed: 8 additions & 1 deletion
@@ -2,6 +2,7 @@
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from enum import Enum
+from huggingface_hub import model_info, HfApi
 from schema.eval_types import EvaluationLog
 from typing import Any, List, Union
 from pathlib import Path
@@ -173,4 +174,10 @@ def _handle_transformation_error(self, error: Exception, context: str):
         if self.strict_validation:
             raise TransformationError(error_msg) from error
         else:
-            self.logger.warning(error_msg)
+            self.logger.warning(error_msg)
+
+    def _check_if_model_is_on_huggingface(self, model_path):
+        try:
+            info = model_info(model_path)
+        except Exception as e:
+            self.logger.warning(f"Model '{model_path}' not found on Hugging Face.")

eval_converters/helm/adapter.py

Lines changed: 6 additions & 6 deletions
@@ -173,7 +173,7 @@ def transform_from_directory(self, dir_path):
             context_window = 1
 
         except Exception as e:
-            print(f"Error getting context window: {e}")
+            self.logger.error(f"Error getting context window: {e}")
             context_window = 1
 
         configuration = Configuration(
@@ -184,7 +184,7 @@ def transform_from_directory(self, dir_path):
         try:
             precision, method = infer_quantization(adapter_spec.model)
         except Exception as e:
-            print(f"Error getting quantization: {e}")
+            self.logger.warning(f"Error getting quantization: {e}")
             precision = BitPrecision.none
             method = Method.None_
 
@@ -271,7 +271,7 @@ def transform_from_directory(self, dir_path):
             score = instance_scores[request_state.instance.id]
 
         except Exception as e:
-            print(f"Error getting instance scores: {e}")
+            self.logger.warning(f"Error getting instance scores: {e}")
             score = 0.0
 
         evaluation = Evaluation(
@@ -361,7 +361,7 @@ def _transform_single(self, raw_data, base_dir=None):
             context_window = 1
 
         except Exception as e:
-            print(f"Error getting context window: {e}")
+            self.logger.warning(f"Error getting context window: {e}")
             context_window = 1
 
         configuration = Configuration(
@@ -372,7 +372,7 @@ def _transform_single(self, raw_data, base_dir=None):
         try:
             precision, method = infer_quantization(adapter_spec.model)
         except Exception as e:
-            print(f"Error getting quantization: {e}")
+            self.logger.warning(f"Error getting quantization: {e}")
             precision = BitPrecision.none
             method = Method.None_
 
@@ -479,7 +479,7 @@ def _transform_single(self, raw_data, base_dir=None):
                 break
 
         except Exception as e:
-            print(f"Error getting instance scores: {e}")
+            self.logger.warning(f"Error getting instance scores: {e}")
             instance_scores = {}
 
         score = instance_scores.get(request_state.instance.id, 0.0)
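These HELM changes route every fallback path through self.logger instead of bare print (context-window failures as error, the rest as warnings), so conversion noise can be filtered by handler configuration. A sketch of the per-adapter logger this presumes; the actual setup lives elsewhere in the repo and is not shown in this commit:

import logging

class HELMAdapterSketch:
    def __init__(self) -> None:
        # Named logger: warnings from this adapter can be silenced or
        # redirected via logging.getLogger("HELMAdapterSketch").setLevel(...)
        self.logger = logging.getLogger(self.__class__.__name__)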

eval_converters/inspect/adapter.py

Lines changed: 16 additions & 4 deletions
@@ -74,8 +74,6 @@ def _transform_single(self, raw_data: EvalLog, source_metadata: SourceMetadata)
 
         retrieved_timestamp = eval_stats.started_at or eval_spec.created
 
-        evaluation_id = f'inspect_ai/{eval_spec.model}/{eval_spec.dataset.name}/{retrieved_timestamp}'
-
         source_data = SourceData(
             dataset_name=eval_spec.dataset.name.split('/')[-1],
             hf_repo=eval_spec.dataset.location,
@@ -88,8 +86,20 @@ def _transform_single(self, raw_data: EvalLog, source_metadata: SourceMetadata)
             evaluation_source_type=EvaluationSourceType.evaluation_platform
         )
 
+        model_path = eval_spec.model
+        if raw_data.samples:
+            model_name = raw_data.samples[0].output.model
+            model_path_parts = model_path.split('/')
+
+            if model_path_parts[-1] in model_name:
+                model_path_parts[-1] = model_name
+
+            model_path = '/'.join(model_path_parts)
+
+        self._check_if_model_is_on_huggingface(model_path)
+
         model_info = ModelInfo(
-            name=eval_spec.model,
+            name=model_path,
             developer=eval_spec.model.split('/')[0],
             inference_platform="/".join(eval_spec.model.split('/')[:-1])
         )
@@ -137,9 +147,11 @@ def _transform_single(self, raw_data: EvalLog, source_metadata: SourceMetadata)
             )
         )
 
+        evaluation_id = f'inspect_ai/{model_path}/{eval_spec.dataset.name}/{retrieved_timestamp}'
+
         return EvaluationLog(
             schema_version=SCHEMA_VERSION,
-            evaluation_id=evaluation_id.replace('/', '_'),
+            evaluation_id=evaluation_id,
             retrieved_timestamp=retrieved_timestamp,
             source_data=source_data,
             evaluation_source=evaluation_source,
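The new block upgrades the generic model alias from eval_spec.model to the fully versioned name reported in the first sample's output, and the evaluation_id (now built after that upgrade) keeps its '/' separators. Traced with the values from the renamed fixture below (a worked illustration, not code from the commit):

model_path = 'openai/azure/gpt-4o-mini'    # eval_spec.model
model_name = 'gpt-4o-mini-2024-07-18'      # raw_data.samples[0].output.model

parts = model_path.split('/')              # ['openai', 'azure', 'gpt-4o-mini']
if parts[-1] in model_name:                # 'gpt-4o-mini' is a substring of the versioned name
    parts[-1] = model_name
model_path = '/'.join(parts)               # 'openai/azure/gpt-4o-mini-2024-07-18'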

eval_converters/inspect/converter.py

Lines changed: 1 addition & 5 deletions
@@ -50,14 +50,10 @@ def convert_to_unified_schema(self, source_metadata: SourceMetadata) -> Evaluati
 
     def save_to_file(self, unified_eval_log: EvaluationLog, output_filename: str) -> bool:
         try:
-            # data = unified_eval_log.model_dump()
             json_str = unified_eval_log.model_dump_json(indent=2)
 
             with open(f'{self.output_dir}/{output_filename}', 'w') as json_file:
                 json_file.write(json_str)
-            # json_str = json.dumps(data, indent=2)
-            # with open(f'{self.output_dir}/{output_filename}', 'w') as json_file:
-            #     json.dump(json_str, json_file)
 
             print(f'Unified eval log was successfully saved to {output_filename} file.')
         except Exception as e:
@@ -86,7 +82,7 @@ def save_to_hf_datasets(self, unified_eval_log: EvaluationLog) -> bool:
 
 unified_output: EvaluationLog = inspect_converter.convert_to_unified_schema(source_metadata)
 if unified_output:
-    output_filename = f'{str(unified_output.evaluation_id)}.json'
+    output_filename = f'{str(unified_output.evaluation_id).replace('/', '_')}.json'
     inspect_converter.save_to_file(unified_output, output_filename)
 else:
     print("Missing unified schema result!")

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
 dependencies = [
     "crfm-helm>=0.5.6",
     "dacite>=1.9.2",
+    "huggingface-hub>=0.33.2",
     "numpy>=2.3.1",
     "openai>=1.93.0",
     "pandas>=2.3.0",

schema/eval.schema.json

Lines changed: 3 additions & 2 deletions
@@ -26,6 +26,7 @@
       "description": "Timestamp for when this record was created"
     },
     "source_data": {
+      "description": "Source of dataset used for evaluation. There are two options supported: HuggingFace dataset or url for other data source.",
       "oneOf": [
         {
           "type": "array",
@@ -36,7 +37,7 @@
         },
         {
           "type": "object",
-          "description": "Details about dataset used for evaluation",
+          "description": "Details about HuggingFace dataset used for evaluation",
           "required": [
             "dataset_name"
           ],
@@ -131,7 +132,7 @@
     "properties": {
       "name": {
         "type": "string",
-        "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
+        "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
       },
       "developer": {
         "type": "string",

schema/eval_types.py

Lines changed: 6 additions & 3 deletions
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-10-13T22:57:16+00:00
+#   timestamp: 2025-10-14T21:00:44+00:00
 
 from __future__ import annotations
 
@@ -62,7 +62,7 @@ class SourceMetadata(BaseModel):
 class ModelInfo(BaseModel):
     name: str = Field(
         ...,
-        description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)',
+        description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)',
     )
     developer: Optional[str] = Field(
         None, description="Name of organization that provides the model (e.g. 'OpenAI')"
@@ -156,7 +156,10 @@ class EvaluationLog(BaseModel):
     retrieved_timestamp: str = Field(
         ..., description='Timestamp for when this record was created'
     )
-    source_data: Union[List[str], SourceData]
+    source_data: Union[List[str], SourceData] = Field(
+        ...,
+        description='Source of dataset used for evaluation. There are two options supported: HuggingFace dataset or url for other data source.',
+    )
     evaluation_source: Optional[EvaluationSource] = Field(
         None,
         description='Details about evaluation origin. There are options that evaluations come from leaderboards or evaluation platforms.',
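eval_types.py is generated from eval.schema.json by datamodel-codegen, so these Field descriptions simply track the schema edits above. A plausible regeneration command (the exact flags the repo uses are not shown in this commit):

datamodel-codegen --input schema/eval.schema.json --input-file-type jsonschema --output schema/eval_types.py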

schema/inspect_ai_openai_azure_gpt-4o-mini_bigbio_pubmed_qa_2025-07-03T16:44:30+02:00.json renamed to schema/inspect_ai_openai_azure_gpt-4o-mini-2024-07-18_bigbio_pubmed_qa_2025-07-03T16:44:30+02:00.json

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 {
   "schema_version": "0.0.1",
-  "evaluation_id": "inspect_ai_openai_azure_gpt-4o-mini_bigbio_pubmed_qa_2025-07-03T16:44:30+02:00",
+  "evaluation_id": "inspect_ai/openai/azure/gpt-4o-mini-2024-07-18/bigbio/pubmed_qa/2025-07-03T16:44:30+02:00",
   "retrieved_timestamp": "2025-07-03T16:44:30+02:00",
   "source_data": {
     "dataset_name": "pubmed_qa",
@@ -23,7 +23,7 @@
     "evaluator_relationship": "other"
   },
   "model_info": {
-    "name": "openai/azure/gpt-4o-mini",
+    "name": "openai/azure/gpt-4o-mini-2024-07-18",
     "developer": "openai",
     "inference_platform": "openai/azure"
   },

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.
