33 changes: 33 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,33 @@
name: Run tests and lint

on:
  pull_request:
    branches:
      - "**" # runs for all branches with PRs

jobs:
  lint-and-test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "$HOME/.local/bin" >> $GITHUB_PATH

      - name: Sync dependencies
        run: uv sync --extra dev

      - name: Run Ruff linter
        run: uv run ruff check .

      - name: Run tests with pytest
        run: uv run pytest -v
57 changes: 57 additions & 0 deletions README.md
@@ -14,3 +14,60 @@ This repository provides a unified and extensible framework for running and orga
```bash
uv sync
```

## Automatic Evaluation Conversion Scripts

We support the following evaluation platforms for automatically converting their evaluations into our unified schema.

### Inspect
Convert an eval log from Inspect AI into JSON format with the following command:

```bash
uv run inspect log convert path_to_eval_file_generated_by_inspect --to json --output-dir inspect_json
```

Then the Inspect evaluation log can be converted into the unified schema via `eval_converters/inspect/converter.py`. A conversion of the example data can be generated with the command below:

```bash
uv run python3 -m eval_converters.inspect.converter
```

For example:

```bash
uv run python3 -m eval_converters.inspect.converter --log_path tests/data/inspect/data_arc_qwen.json
```


The full usage reference for converting your own Inspect evaluation log into the unified schema is shown below:

```bash
usage: converter.py [-h] [--log_path LOG_PATH]
[--huggingface_dataset HUGGINGFACE_DATASET]
[--output_dir OUTPUT_DIR]
[--source_organization_name SOURCE_ORGANIZATION_NAME]
[--evaluator_relationship {first_party,third_party,collaborative,other}]
[--source_organization_url SOURCE_ORGANIZATION_URL]
[--source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL]

options:
-h, --help show this help message and exit
--log_path LOG_PATH
--huggingface_dataset HUGGINGFACE_DATASET
--output_dir OUTPUT_DIR
--source_organization_name SOURCE_ORGANIZATION_NAME
Organization which pushed the evaluation to the evalHub.
--evaluator_relationship {first_party,third_party,collaborative,other}
Relationship of evaluation author to the model
--source_organization_url SOURCE_ORGANIZATION_URL
--source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL
```
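
For example, a hypothetical invocation that also sets the optional provenance flags could look like this (the output directory and organization name below are placeholders, not values from this repository):

```bash
uv run python3 -m eval_converters.inspect.converter \
  --log_path tests/data/inspect/data_arc_qwen.json \
  --output_dir converted_logs \
  --source_organization_name "Example Org" \
  --evaluator_relationship third_party
```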

## Tests

Use the commands below to run unit tests and lint checks for all evaluation platforms.

```bash
uv run pytest -s
uv run ruff check
```
29 changes: 19 additions & 10 deletions eval_converters/common/adapter.py
@@ -2,12 +2,14 @@
from dataclasses import dataclass
from abc import ABC, abstractmethod
from enum import Enum
from schema.eval_types import EvaluationResult
from huggingface_hub import model_info
from schema.eval_types import EvaluationLog
from typing import Any, List, Union
from pathlib import Path
import json

from eval_converters.common.error import AdapterError, TransformationError
from schema.eval_types import SourceMetadata

@dataclass
class AdapterMetadata:
@@ -58,22 +60,22 @@ def supported_library(self) -> SupportedLibrary:
pass

@abstractmethod
def _transform_single(self, raw_data: Any) -> EvaluationResult:
def _transform_single(self, raw_data: Any, source_metadata: SourceMetadata) -> EvaluationLog:
"""
Transform a single evaluation record.

Args:
raw_data: Single evaluation record in library-specific format

Returns:
EvaluationResult in unified schema format
EvaluationLog in unified schema format

Raises:
TransformationError: If transformation fails
"""
pass

def transform(self, data: Any) -> Union[EvaluationResult, List[EvaluationResult]]:
def transform(self, data: Any, source_metadata: SourceMetadata) -> Union[EvaluationLog, List[EvaluationLog]]:
"""
Transform evaluation data to unified schema format.

@@ -89,18 +91,18 @@ def transform(self, data: Any) -> Union[EvaluationResult, List[EvaluationResult]
results = []
for i, item in enumerate(data):
try:
result = self._transform_single(item)
result = self._transform_single(item, source_metadata)
results.append(result)
except Exception as e:
self._handle_transformation_error(e, f"item {i}")
return results
else:
return self._transform_single(data)
return self._transform_single(data, source_metadata)

except Exception as e:
self._handle_transformation_error(e, "data transformation")

def transform_from_file(self, file_path: Union[str, Path]) -> Union[EvaluationResult, List[EvaluationResult]]:
def transform_from_file(self, file_path: Union[str, Path], source_metadata: SourceMetadata) -> Union[EvaluationLog, List[EvaluationLog]]:
"""
Load and transform evaluation data from file.

@@ -117,12 +119,12 @@ def transform_from_file(self, file_path: Union[str, Path]) -> Union[EvaluationRe

try:
data = self._load_file(file_path)
return self.transform(data)
return self.transform(data, source_metadata)
except Exception as e:
raise AdapterError(f"Failed to load file {file_path}: {str(e)}")

@abstractmethod
def transform_from_directory(self, dir_path: Union[str, Path]) -> Union[EvaluationResult, List[EvaluationResult]]:
def transform_from_directory(self, dir_path: Union[str, Path]) -> Union[EvaluationLog, List[EvaluationLog]]:
"""
Load and transform evaluation data from all files in a directory.

@@ -172,4 +174,11 @@ def _handle_transformation_error(self, error: Exception, context: str):
if self.strict_validation:
raise TransformationError(error_msg) from error
else:
self.logger.warning(error_msg)
self.logger.warning(error_msg)

def _check_if_model_is_on_huggingface(self, model_path):
try:
info = model_info(model_path)
return info
except Exception:
self.logger.warning(f"Model '{model_path}' not found on Hugging Face.")
77 changes: 56 additions & 21 deletions eval_converters/common/utils.py
@@ -1,28 +1,63 @@
from schema.eval_types import Family, HfSplit
from datetime import datetime
from huggingface_hub import HfApi
from typing import Dict

def detect_family(model_name: str) -> Family:
"""Return the Family enum if any of its values is a substring of model_name."""
model_name_lower = model_name.lower()
for family in Family:
if family.value and family.value.lower() in model_name_lower:
return family
return Family.NoneType_None
def convert_timestamp_to_unix_format(timestamp: str):
dt = datetime.fromisoformat(timestamp)
return str(dt.timestamp())

def detect_hf_split(split_str: str) -> HfSplit:
def get_model_organization_info(model_base_name: str) -> Dict:
"""
Determines the type of dataset split from a given string.

Searches the Hugging Face Hub for a model based on its base name
and attempts to find the organization that published the most relevant/original version.

Args:
split_str (str): The input string to classify.
model_base_name: The model name without an organization (e.g., 'deepseek-coder-6.7b-base').

Returns:
HfSplit: One of "train", "test", or "validation".
A dictionary containing the best-guess organization and full repository ID,
or an error message.
"""
s = split_str.strip().lower()

if s == "test":
return HfSplit.test
elif "train" in s:
return HfSplit.train
else:
return HfSplit.validation
api = HfApi()

try:
models = api.list_models(
search=model_base_name,
sort="downloads",
direction=-1,
limit=50
)
models_list = list(models)
except Exception as e:
return f"Failed to connect to Hugging Face Hub: {e}"

if not models_list:
return 'not_found'

# Heuristic to find the 'Original' Organization:
# The original model is usually the one with the shortest repo_id
# that includes the base model name (e.g., 'deepseek-ai/deepseek-coder-6.7b-base').
# We also prioritize the one with the highest downloads.

best_match = models_list[0] # Start with the most downloaded model

for model in models_list:
repo_id = model.modelId

parts = repo_id.split('/')
if len(parts) != 2:
continue

org, name = parts

# A good heuristic: the model name part (name) should exactly match the base name,
# or be a very close variant (e.g., -instruct) with the highest download count.
if model_base_name in name and name == model_base_name:
best_match = model
break

full_repo_id = best_match.modelId
organization = full_repo_id.split('/')[0]

return organization
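
An illustrative call of the two new helpers; the printed organization is only the expected outcome of the heuristic described in the comments above, and the real result depends on what the Hub search returns at query time.

```python
from eval_converters.common.utils import (
    convert_timestamp_to_unix_format,
    get_model_organization_info,
)

# ISO-8601 timestamp -> Unix epoch seconds, returned as a string
print(convert_timestamp_to_unix_format("2024-05-01T12:34:56+00:00"))  # "1714566896.0"

# Base model name without an organization -> best-guess publishing organization
print(get_model_organization_info("deepseek-coder-6.7b-base"))  # e.g. "deepseek-ai"
```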
File renamed without changes.
File renamed without changes.
15 changes: 7 additions & 8 deletions eval_converters/helm/adapter.py
@@ -1,8 +1,7 @@
import os
import wget
import json
from typing import List, Dict, Sequence, Optional, Any
import tempfile
from typing import List, Dict
from helm.benchmark.metrics.metric import PerInstanceStats
from helm.benchmark.presentation.schema import Schema, read_schema
from helm.benchmark.adaptation.scenario_state import ScenarioState
@@ -173,7 +172,7 @@ def transform_from_directory(self, dir_path):
context_window = 1

except Exception as e:
print(f"Error getting context window: {e}")
self.logger.error(f"Error getting context window: {e}")
context_window = 1

configuration = Configuration(
@@ -184,7 +183,7 @@
try:
precision, method = infer_quantization(adapter_spec.model)
except Exception as e:
print(f"Error getting quantization: {e}")
self.logger.warning(f"Error getting quantization: {e}")
precision = BitPrecision.none
method = Method.None_

@@ -271,7 +270,7 @@ def transform_from_directory(self, dir_path):
score = instance_scores[request_state.instance.id]

except Exception as e:
print(f"Error getting instance scores: {e}")
self.logger.warning(f"Error getting instance scores: {e}")
score = 0.0

evaluation = Evaluation(
@@ -361,7 +360,7 @@ def _transform_single(self, raw_data, base_dir=None):
context_window = 1

except Exception as e:
print(f"Error getting context window: {e}")
self.logger.warning(f"Error getting context window: {e}")
context_window = 1

configuration = Configuration(
@@ -372,7 +371,7 @@
try:
precision, method = infer_quantization(adapter_spec.model)
except Exception as e:
print(f"Error getting quantization: {e}")
self.logger.warning(f"Error getting quantization: {e}")
precision = BitPrecision.none
method = Method.None_

@@ -479,7 +478,7 @@ def _transform_single(self, raw_data, base_dir=None):
break

except Exception as e:
print(f"Error getting instance scores: {e}")
self.logger.warning(f"Error getting instance scores: {e}")
instance_scores = {}

score = instance_scores.get(request_state.instance.id, 0.0)