56 changes: 56 additions & 0 deletions evalscope/api/benchmark/adapters/default_data_adapter.py
@@ -2,6 +2,7 @@
from collections import defaultdict
from functools import partial
from overrides import override
from tqdm.auto import tqdm
from typing import Any, Callable, Dict, List, Optional, Tuple, Type

from evalscope.api.dataset import DataLoader, Dataset, DatasetDict, LocalDataLoader, RemoteDataLoader, Sample
@@ -612,6 +613,61 @@ def calculate_metrics(self, task_state: TaskState) -> SampleScore:

return sample_score

def batch_match_score(
self, original_predictions: List[str], filtered_predictions: List[str], references: List[str],
task_states: List[TaskState]
) -> Optional[List[Score]]:
"""
Batch calculate evaluation scores by comparing predictions with references.

This method computes scores using all configured metrics for a batch of samples
and creates a list of Score objects with detailed evaluation results.

Args:
original_predictions (List[str]): The original, unfiltered model predictions
filtered_predictions (List[str]): The filtered and processed predictions
references (List[str]): The ground truth reference answers
task_states (List[TaskState]): The complete task states for context

Returns:
Optional[List[Score]]: Score objects containing all calculated metric scores and
metadata, or None if the adapter does not implement batch scoring.
"""
return None # Default implementation does not support batch scoring

@override
def batch_calculate_metrics(self, task_states: List[TaskState],
sample_scores: List[SampleScore]) -> List[SampleScore]:
"""Batch calculate metrics for a list of task states with tqdm progress and batch processing."""
total = len(task_states)
if total == 0:
return sample_scores

# Prepare lists for batch processing
original_predictions: List[str] = []
filtered_predictions: List[str] = []
references: List[str] = []

for ts in task_states:
pred = ts.output.completion
original_predictions.append(pred)
filtered_predictions.append(self.filter_prediction(pred, ts))
references.append(ts.target)

batch_scores = self.batch_match_score(
original_predictions=original_predictions,
filtered_predictions=filtered_predictions,
references=references,
task_states=task_states
)

if batch_scores is not None:
assert len(batch_scores) == len(sample_scores), \
'Batch scores length must match sample scores length.'
for batch_score, sample_score in zip(batch_scores, sample_scores):
sample_score.score.value.update(batch_score.value)

return sample_scores

@override
def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
"""
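
As a reference for reviewers, a minimal sketch (not part of this diff) of how a concrete adapter could opt into the new hook: it enables use_batch_scoring and returns one Score per sample, mirroring the signatures introduced above. The ExactMatchAdapter name and the exact_match metric key are illustrative; import paths follow the file locations shown in this PR.

from typing import List, Optional

from evalscope.api.benchmark.adapters.default_data_adapter import DefaultDataAdapter
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score


class ExactMatchAdapter(DefaultDataAdapter):
    """Hypothetical adapter used only to illustrate the batch-scoring contract."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_batch_scoring = True  # opt in to the batch_calculate_metrics path

    def batch_match_score(
        self, original_predictions: List[str], filtered_predictions: List[str],
        references: List[str], task_states: List[TaskState]
    ) -> Optional[List[Score]]:
        # One Score per sample; each value dict is merged into the matching
        # SampleScore via value.update() in batch_calculate_metrics above.
        return [
            Score(
                extracted_prediction=pred,
                prediction=orig,
                value={'exact_match': float(pred.strip() == ref.strip())},
            )
            for orig, pred, ref in zip(original_predictions, filtered_predictions, references)
        ]
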
9 changes: 9 additions & 0 deletions evalscope/api/benchmark/benchmark.py
@@ -40,6 +40,9 @@ def __init__(self, benchmark_meta: 'BenchmarkMeta', task_config: Optional['TaskC
self.shuffle_choices = False
"""Whether to shuffle the choices in the dataset"""

self.use_batch_scoring = False
"""Whether to use batch scoring for metrics that support it, need to be enabled in the benchmark as well"""

self.save_metadata = True
"""Whether to save metadata in the review result"""

@@ -81,6 +84,12 @@ def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs)
def calculate_metrics(self, task_state: TaskState) -> SampleScore:
pass

@abstractmethod
def batch_calculate_metrics(self, task_states: List[TaskState],
sample_scores: List[SampleScore]) -> List[SampleScore]:
"""Batch calculate metrics for a list of task states. Need to update sample_scores in place."""
pass

@abstractmethod
def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
pass
2 changes: 1 addition & 1 deletion evalscope/api/metric/__init__.py
@@ -1,2 +1,2 @@
from .metric import Metric, T2IMetric
from .metric import Metric, SingletonMetric, T2IMetric
from .scorer import Aggregator, AggScore, SampleScore, Score, Value
7 changes: 6 additions & 1 deletion evalscope/api/metric/metric.py
@@ -28,7 +28,8 @@ def __call__(self, prediction: str, reference: str) -> float:
return self.apply([prediction], [reference])[0]


class T2IMetric(Metric):
class SingletonMetric(Metric):
"""Singleton base class for metrics."""
_instance = None

@thread_safe
@@ -48,6 +49,10 @@ def __init__(self, *args, **kwargs):
def _init_once(self, *args, **kwargs):
pass


class T2IMetric(SingletonMetric):
"""Singleton base class for T2I metrics."""

def apply(self, images: List[str], texts: List[str], **kwargs) -> List[Union[float, dict]]:
pass

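
For context, the pattern SingletonMetric factors out (a thread-safe, lazily constructed singleton whose subclasses put one-time setup in _init_once) can be sketched standalone roughly as follows. This is an illustration of the idea, not the evalscope implementation; the double-checked locking here stands in for the thread_safe decorator used above.

import threading


class SingletonBase:
    """Illustrative thread-safe singleton: one shared instance, one-time setup in _init_once."""
    _instance = None
    _lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        # Double-checked locking so concurrent first constructions create only one instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._init_once(*args, **kwargs)
                    cls._instance = instance
        return cls._instance

    def _init_once(self, *args, **kwargs):
        pass


class HeavyMetric(SingletonBase):
    def _init_once(self, model_name: str = 'roberta-large'):
        # Expensive setup (e.g. loading a scoring model) runs only once per process.
        self.model_name = model_name


assert HeavyMetric() is HeavyMetric()  # repeated construction returns the same instance
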
77 changes: 29 additions & 48 deletions evalscope/benchmarks/drivelology/drivelology_writing_adapter.py
@@ -8,7 +8,6 @@
from evalscope.api.metric.scorer import AggScore, SampleScore, Score
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.import_utils import check_import
from evalscope.utils.logger import get_logger

logger = get_logger()
@@ -51,29 +50,6 @@
""".strip() # noqa: E501


def compute_bertscore_one_sample(
predictions: List[str], references: List[str], lang: str = 'en', model_type: str = 'roberta-large'
) -> dict:
check_import('bert_score', 'bert_score', raise_error=True, feature_name='Text similarity metrics')
from bert_score import score as bert_score_fn
try:
P, R, F1 = bert_score_fn(
predictions, references, lang=lang, model_type=model_type, rescale_with_baseline=False, verbose=False
)
return {
'bertscore-precision': round(P[0].item(), 6),
'bertscore-recall': round(R[0].item(), 6),
'bertscore-f1': round(F1[0].item(), 6),
}
except Exception as e:
logger.error(f'BERTScore error: {e}')
return {
'bertscore-precision': 0.0,
'bertscore-recall': 0.0,
'bertscore-f1': 0.0,
}


@register_benchmark(
BenchmarkMeta(
name='drivel_writing',
@@ -82,7 +58,13 @@ def compute_bertscore_one_sample(
description=DESCRIPTION.strip(),
dataset_id='extraordinarylab/drivel-hub',
subset_list=['narrative-writing-english'],
metric_list=['gpt_score', 'bert_score'],
metric_list={
'bert_score': {
'model_id_or_path': 'AI-ModelScope/roberta-large',
'model_type': 'roberta-large'
},
'llm_match_score': {}
},
few_shot_num=0,
train_split=None,
eval_split='test',
@@ -94,6 +76,7 @@ class DrivelologyNarrativeWritingAdapter(DefaultDataAdapter):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._use_llm_judge = True # Use LLM as a judge by default
self.use_batch_scoring = True # Enable batch scoring

def record_to_sample(self, record: Dict[str, Any]) -> Sample:
"""
@@ -117,6 +100,25 @@ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
}
)

def batch_match_score(self, original_predictions, filtered_predictions, references, task_states):
"""
Batch calculate the match scores using BERTScore.
"""
from evalscope.metrics.metric import BertScore

score_args = self.metric_list.get('bert_score', {})
bert_scorer = BertScore(**score_args)
bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
scores = []
for i in range(len(original_predictions)):
score = Score(
extracted_prediction=filtered_predictions[i],
prediction=original_predictions[i],
value={'bert_score': bert_score_f1[i]}
)
scores.append(score)
return scores

def llm_match_score(
self,
original_prediction: str,
@@ -135,27 +137,6 @@ def llm_match_score(
# Initialize score value dictionary
score.value = {}

# Calculate BERTScore
if filtered_prediction and reference:
try:
# Truncate if needed to prevent memory issues
max_length = 1024
filtered_prediction_trunc = filtered_prediction[:max_length]
reference_trunc = reference[:max_length]

bertscore_results = compute_bertscore_one_sample(
predictions=[filtered_prediction_trunc], references=[reference_trunc]
)

score.value['bert_score'] = bertscore_results['bertscore-f1']
logger.info(f"BERTScore: {score.value['bert_score']}")
except Exception as e:
logger.error(f'BERTScore calculation failed: {e}')
# Use 0.0 for failures to avoid positively biasing the aggregate score.
score.value['bert_score'] = 0.0
else:
score.value['bert_score'] = 0.0

# Use LLM judge to evaluate narrative quality
eval_prompt = NARRATIVE_EVALUATION_TEMPLATE.format(candidate=filtered_prediction, reference=reference)

@@ -212,8 +193,8 @@ def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
bert_scores = [ss.score.value.get('bert_score', 0.0) for ss in sample_scores]

# Calculate averages
avg_gpt_score = sum(gpt_scores) / len(gpt_scores)
avg_bert_score = sum(bert_scores) / len(bert_scores)
avg_gpt_score = sum(gpt_scores) / len(gpt_scores) if gpt_scores else 0.0
avg_bert_score = sum(bert_scores) / len(bert_scores) if bert_scores else 0.0

return [
AggScore(
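
The deleted helper above called bert_score once per sample, paying setup and batching overhead for every prediction; the batch path added in batch_match_score scores the whole list in a single call instead. A rough standalone equivalent of that per-batch computation, using the bert_score package directly (the same call the removed helper relied on) rather than the BertScore wrapper this PR switches to, could look like this:

from typing import List

from bert_score import score as bert_score_fn  # same call the removed helper used


def compute_bertscore_batch(
    predictions: List[str], references: List[str],
    lang: str = 'en', model_type: str = 'roberta-large'
) -> List[float]:
    """Return one BERTScore F1 per (prediction, reference) pair from a single scoring pass."""
    _, _, f1 = bert_score_fn(
        predictions, references, lang=lang, model_type=model_type,
        rescale_with_baseline=False, verbose=False
    )
    return [round(v.item(), 6) for v in f1]
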
73 changes: 58 additions & 15 deletions evalscope/evaluator/evaluator.py
@@ -10,7 +10,8 @@
import os
import traceback
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, List
from tqdm import tqdm
from typing import TYPE_CHECKING, Callable, Dict, List

from evalscope.api.dataset import Dataset, DatasetDict, Sample
from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
@@ -161,17 +162,17 @@ def get_answers(self, subset: str, dataset: Dataset) -> List[TaskState]:
"""
# Initialize task state list and filter cached predictions if caching is enabled
if self.use_cache:
task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
cached_task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
else:
task_state_list = []
cached_task_state_list = []

# Get output directory for storing model predictions
model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))

# Convert dataset to list for parallel processing
dataset_list = list(dataset)
if not dataset_list:
return task_state_list
return cached_task_state_list

logger.info(f'Processing {len(dataset_list)} samples, if data is large, it may take a while.')

@@ -190,19 +191,19 @@ def on_error(sample: Sample, exc: Exception) -> None:
return
raise exc

new_task_states = run_in_threads_with_progress(
finished_task_states = run_in_threads_with_progress(
dataset_list,
worker,
desc=f'Predicting[{self.benchmark_name}@{subset}]: ',
max_workers=self.task_config.eval_batch_size,
heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
on_result=on_result,
on_error=on_error,
filter_none_results=True,
)
task_state_list.extend(new_task_states)

logger.info(f'Finished getting predictions for subset: {subset}.')
return task_state_list
return cached_task_state_list + finished_task_states

def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
"""
@@ -239,14 +240,14 @@ def get_reviews(self, subset: str, task_states: List[TaskState]) -> List[SampleS
"""
# Initialize sample score list and filter cached reviews if caching is enabled
if self.use_cache and not self.task_config.rerun_review:
sample_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
cached_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
else:
# Init a clean sample score list
sample_score_list = []
cached_score_list = []
self.cache_manager.delete_review_cache(subset)

if not task_states:
return sample_score_list
return cached_score_list

logger.info(f'Reviewing {len(task_states)} samples, if data is large, it may take a while.')

@@ -270,19 +271,27 @@ def on_error(task_state: TaskState, exc: Exception) -> None:
return
raise exc

new_scores = run_in_threads_with_progress(
# Run reviews in parallel
reviewed_scores = run_in_threads_with_progress(
task_states,
worker,
desc=f'Reviewing[{self.benchmark_name}@{subset}]: ',
max_workers=self.task_config.judge_worker_num,
heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
on_result=on_result,
on_error=on_error,
# Do not persist interim results when batch scoring is enabled
on_result=None if self.benchmark.use_batch_scoring else on_result,
filter_none_results=False,
)
sample_score_list.extend(new_scores)

logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(sample_score_list)}')
return sample_score_list
# Batch calculate metrics if supported by the benchmark
if self.benchmark.use_batch_scoring:
reviewed_scores = self._batch_review_task_states(
task_states=task_states, reviewed_scores=reviewed_scores, on_result=on_result
)

logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(reviewed_scores)}')
return cached_score_list + reviewed_scores

def _review_task_state(self, task_state: TaskState) -> SampleScore:
"""
@@ -298,6 +307,40 @@ def _review_task_state(self, task_state: TaskState) -> SampleScore:
sample_score = self.benchmark.calculate_metrics(task_state=task_state)
return sample_score

def _batch_review_task_states(
self, task_states: List[TaskState], reviewed_scores: List[SampleScore],
on_result: Callable[[TaskState, SampleScore], None]
) -> List[SampleScore]:
valid_indices = [i for i, score in enumerate(reviewed_scores) if score is not None]
if not valid_indices:
return reviewed_scores

task_states = [task_states[i] for i in valid_indices]
reviewed_scores = [reviewed_scores[i] for i in valid_indices]

# Iterate in batches with progress bar
all_reviewed_scores = []
total = len(task_states)
batch_size = self.task_config.judge_worker_num
with tqdm(total=total, desc='Scoring (batch)', unit='sample') as pbar:
for start in range(0, total, batch_size):
# Process batch
end = min(start + batch_size, total)
batch_task_states = task_states[start:end]
batch_scores = reviewed_scores[start:end]
# Batch calculate metrics
updated_reviewed_scores = self.benchmark.batch_calculate_metrics(
task_states=batch_task_states, sample_scores=batch_scores
)
# Append results
all_reviewed_scores.extend(updated_reviewed_scores)
# Save each result to cache
for task_state, sample_score in zip(batch_task_states, updated_reviewed_scores):
on_result(task_state, sample_score)

pbar.update(len(batch_task_states))
return all_reviewed_scores

def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
"""
Generate a comprehensive evaluation report from aggregated scores.
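
The loop in _batch_review_task_states is an inline version of a common chunk-and-score pattern. Stated generically (the helper names below are illustrative, not evalscope APIs), it amounts to:

from typing import Callable, Iterator, List, Sequence, TypeVar

from tqdm import tqdm

T = TypeVar('T')
S = TypeVar('S')


def iter_batches(items: Sequence[T], batch_size: int) -> Iterator[Sequence[T]]:
    """Yield consecutive slices of at most batch_size items."""
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]


def score_in_batches(
    states: List[T], scores: List[S],
    batch_fn: Callable[[List[T], List[S]], List[S]], batch_size: int
) -> List[S]:
    """Apply batch_fn to aligned chunks of states/scores, reporting progress per sample."""
    results: List[S] = []
    with tqdm(total=len(states), desc='Scoring (batch)', unit='sample') as pbar:
        for state_chunk, score_chunk in zip(iter_batches(states, batch_size), iter_batches(scores, batch_size)):
            results.extend(batch_fn(list(state_chunk), list(score_chunk)))
            pbar.update(len(state_chunk))
    return results
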