33 changes: 33 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,33 @@
name: Run tests and lint

on:
  pull_request:
    branches:
      - "**" # runs for all branches with PRs

jobs:
  lint-and-test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          echo "$HOME/.local/bin" >> $GITHUB_PATH

      - name: Sync dependencies
        run: uv sync --extra dev

      - name: Run Ruff linter
        run: uv run ruff check .

      - name: Run tests with pytest
        run: uv run pytest -v
57 changes: 57 additions & 0 deletions README.md
@@ -14,3 +14,60 @@ This repository provides a unified and extensible framework for running and orga
```bash
uv sync
```

## Automatic Evaluation Conversion Scripts

We support the following evaluation platforms for automatically converting their evaluations into our unified schema.

### Inspect
Convert an eval log from Inspect AI into JSON format with the following command:

```bash
uv run inspect log convert path_to_eval_file_generated_by_inspect --to json --output-dir inspect_json
```

Then the Inspect evaluation log can be converted into the unified schema via `eval_converters/inspect/converter.py`. A conversion of the example data can be generated with the command below:

```bash
uv run python3 -m eval_converters.inspect.converter
```

For example:

```bash
uv run python3 -m eval_converters.inspect.converter --log_path tests/data/inspect/data_arc_qwen.json
```


The full usage reference for converting your own Inspect evaluation log into the unified schema is shown below:

```bash
usage: converter.py [-h] [--log_path LOG_PATH]
[--huggingface_dataset HUGGINGFACE_DATASET]
[--output_dir OUTPUT_DIR]
[--source_organization_name SOURCE_ORGANIZATION_NAME]
[--evaluator_relationship {first_party,third_party,collaborative,other}]
[--source_organization_url SOURCE_ORGANIZATION_URL]
[--source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL]

options:
-h, --help show this help message and exit
--log_path LOG_PATH
--huggingface_dataset HUGGINGFACE_DATASET
--output_dir OUTPUT_DIR
--source_organization_name SOURCE_ORGANIZATION_NAME
Organization which pushed the evaluation to the evalHub.
--evaluator_relationship {first_party,third_party,collaborative,other}
Relationship of evaluation author to the model
--source_organization_url SOURCE_ORGANIZATION_URL
--source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL
```
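
For example, a hypothetical invocation that also sets the optional provenance flags could look like this (the output directory and organization name below are placeholders, not values from this repository):

```bash
uv run python3 -m eval_converters.inspect.converter \
  --log_path tests/data/inspect/data_arc_qwen.json \
  --output_dir converted_logs \
  --source_organization_name "Example Org" \
  --evaluator_relationship third_party
```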

## Tests

Use the commands below to run unit tests and lint checks for all evaluation platforms.

```bash
uv run pytest -s
uv run ruff check
```
29 changes: 19 additions & 10 deletions eval_converters/common/adapter.py
@@ -2,12 +2,14 @@
from dataclasses import dataclass
from abc import ABC, abstractmethod
from enum import Enum
from schema.eval_types import EvaluationResult
from huggingface_hub import model_info
from schema.eval_types import EvaluationLog
from typing import Any, List, Union
from pathlib import Path
import json

from eval_converters.common.error import AdapterError, TransformationError
from schema.eval_types import SourceMetadata

@dataclass
class AdapterMetadata:
@@ -58,22 +60,22 @@ def supported_library(self) -> SupportedLibrary:
pass

@abstractmethod
def _transform_single(self, raw_data: Any) -> EvaluationResult:
def _transform_single(self, raw_data: Any, source_metadata: SourceMetadata) -> EvaluationLog:
"""
Transform a single evaluation record.

Args:
raw_data: Single evaluation record in library-specific format

Returns:
EvaluationResult in unified schema format
EvaluationLog in unified schema format

Raises:
TransformationError: If transformation fails
"""
pass

def transform(self, data: Any) -> Union[EvaluationResult, List[EvaluationResult]]:
def transform(self, data: Any, source_metadata: SourceMetadata) -> Union[EvaluationLog, List[EvaluationLog]]:
"""
Transform evaluation data to unified schema format.

@@ -89,18 +91,18 @@ def transform(self, data: Any) -> Union[EvaluationResult, List[EvaluationResult]
results = []
for i, item in enumerate(data):
try:
result = self._transform_single(item)
result = self._transform_single(item, source_metadata)
results.append(result)
except Exception as e:
self._handle_transformation_error(e, f"item {i}")
return results
else:
return self._transform_single(data)
return self._transform_single(data, source_metadata)

except Exception as e:
self._handle_transformation_error(e, "data transformation")

def transform_from_file(self, file_path: Union[str, Path]) -> Union[EvaluationResult, List[EvaluationResult]]:
def transform_from_file(self, file_path: Union[str, Path], source_metadata: SourceMetadata) -> Union[EvaluationLog, List[EvaluationLog]]:
"""
Load and transform evaluation data from file.

@@ -117,12 +119,12 @@ def transform_from_file(self, file_path: Union[str, Path]) -> Union[EvaluationRe

try:
data = self._load_file(file_path)
return self.transform(data)
return self.transform(data, source_metadata)
except Exception as e:
raise AdapterError(f"Failed to load file {file_path}: {str(e)}")

@abstractmethod
def transform_from_directory(self, dir_path: Union[str, Path]) -> Union[EvaluationResult, List[EvaluationResult]]:
def transform_from_directory(self, dir_path: Union[str, Path]) -> Union[EvaluationLog, List[EvaluationLog]]:
"""
Load and transform evaluation data from all files in a directory.

@@ -172,4 +174,11 @@ def _handle_transformation_error(self, error: Exception, context: str):
if self.strict_validation:
raise TransformationError(error_msg) from error
else:
self.logger.warning(error_msg)
self.logger.warning(error_msg)

def _check_if_model_is_on_huggingface(self, model_path):
try:
info = model_info(model_path)
return info
except Exception:
self.logger.warning(f"Model '{model_path}' not found on Hugging Face.")
77 changes: 56 additions & 21 deletions eval_converters/common/utils.py
@@ -1,28 +1,63 @@
from schema.eval_types import Family, HfSplit
from datetime import datetime
from huggingface_hub import HfApi
from typing import Dict

def detect_family(model_name: str) -> Family:
"""Return the Family enum if any of its values is a substring of model_name."""
model_name_lower = model_name.lower()
for family in Family:
if family.value and family.value.lower() in model_name_lower:
return family
return Family.NoneType_None
def convert_timestamp_to_unix_format(timestamp: str):
dt = datetime.fromisoformat(timestamp)
return str(dt.timestamp())

def detect_hf_split(split_str: str) -> HfSplit:
def get_model_organization_info(model_base_name: str) -> Dict:
"""
Determines the type of dataset split from a given string.

Searches the Hugging Face Hub for a model based on its base name
and attempts to find the organization that published the most relevant/original version.

Args:
split_str (str): The input string to classify.
model_base_name: The model name without an organization (e.g., 'deepseek-coder-6.7b-base').

Returns:
HfSplit: One of "train", "test", or "validation".
A dictionary containing the best-guess organization and full repository ID,
or an error message.
"""
s = split_str.strip().lower()

if s == "test":
return HfSplit.test
elif "train" in s:
return HfSplit.train
else:
return HfSplit.validation
api = HfApi()

try:
models = api.list_models(
search=model_base_name,
sort="downloads",
direction=-1,
limit=50
)
models_list = list(models)
except Exception as e:
return f"Failed to connect to Hugging Face Hub: {e}"

if not models_list:
return 'not_found'

# Heuristic to find the 'Original' Organization:
# The original model is usually the one with the shortest repo_id
# that includes the base model name (e.g., 'deepseek-ai/deepseek-coder-6.7b-base').
# We also prioritize the one with the highest downloads.

best_match = models_list[0] # Start with the most downloaded model

for model in models_list:
repo_id = model.modelId

parts = repo_id.split('/')
if len(parts) != 2:
continue

org, name = parts

# A good heuristic: the model name part (name) should exactly match the base name,
# or be a very close variant (e.g., -instruct) with the highest download count.
if model_base_name in name and name == model_base_name:
best_match = model
break

full_repo_id = best_match.modelId
organization = full_repo_id.split('/')[0]

return organization
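
An illustrative call of the two new helpers; the printed organization is only the expected outcome of the heuristic described in the comments above, and the real result depends on what the Hub search returns at query time.

```python
from eval_converters.common.utils import (
    convert_timestamp_to_unix_format,
    get_model_organization_info,
)

# ISO-8601 timestamp -> Unix epoch seconds, returned as a string
print(convert_timestamp_to_unix_format("2024-05-01T12:34:56+00:00"))  # "1714566896.0"

# Base model name without an organization -> best-guess publishing organization
print(get_model_organization_info("deepseek-coder-6.7b-base"))  # e.g. "deepseek-ai"
```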
File renamed without changes.
File renamed without changes.
15 changes: 7 additions & 8 deletions eval_converters/helm/adapter.py
@@ -1,8 +1,7 @@
import os
import wget
import json
from typing import List, Dict, Sequence, Optional, Any
import tempfile
from typing import List, Dict
from helm.benchmark.metrics.metric import PerInstanceStats
from helm.benchmark.presentation.schema import Schema, read_schema
from helm.benchmark.adaptation.scenario_state import ScenarioState
@@ -173,7 +172,7 @@ def transform_from_directory(self, dir_path):
context_window = 1

except Exception as e:
print(f"Error getting context window: {e}")
self.logger.error(f"Error getting context window: {e}")
context_window = 1

configuration = Configuration(
@@ -184,7 +183,7 @@
try:
precision, method = infer_quantization(adapter_spec.model)
except Exception as e:
print(f"Error getting quantization: {e}")
self.logger.warning(f"Error getting quantization: {e}")
precision = BitPrecision.none
method = Method.None_

@@ -271,7 +270,7 @@ def transform_from_directory(self, dir_path):
score = instance_scores[request_state.instance.id]

except Exception as e:
print(f"Error getting instance scores: {e}")
self.logger.warning(f"Error getting instance scores: {e}")
score = 0.0

evaluation = Evaluation(
@@ -361,7 +360,7 @@ def _transform_single(self, raw_data, base_dir=None):
context_window = 1

except Exception as e:
print(f"Error getting context window: {e}")
self.logger.warning(f"Error getting context window: {e}")
context_window = 1

configuration = Configuration(
@@ -372,7 +371,7 @@
try:
precision, method = infer_quantization(adapter_spec.model)
except Exception as e:
print(f"Error getting quantization: {e}")
self.logger.warning(f"Error getting quantization: {e}")
precision = BitPrecision.none
method = Method.None_

@@ -479,7 +478,7 @@ def _transform_single(self, raw_data, base_dir=None):
break

except Exception as e:
print(f"Error getting instance scores: {e}")
self.logger.warning(f"Error getting instance scores: {e}")
instance_scores = {}

score = instance_scores.get(request_state.instance.id, 0.0)