From f31a7e022ce9875f48bd51543ed9eec5f9dd4b9c Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Mon, 19 May 2025 10:28:29 +0000 Subject: [PATCH 1/8] Add Chinese (zh) translation of documentation --- docs/translate/zh/_toctree.yml | 54 ++++ docs/translate/zh/adding-a-custom-task.mdx | 143 ++++++++++ docs/translate/zh/adding-a-new-metric.mdx | 86 ++++++ docs/translate/zh/available-tasks.mdx | 252 ++++++++++++++++++ ...ntributing-to-multilingual-evaluations.mdx | 104 ++++++++ .../zh/evaluating-a-custom-model.mdx | 127 +++++++++ docs/translate/zh/index.mdx | 14 + docs/translate/zh/installation.mdx | 46 ++++ docs/translate/zh/metric-list.mdx | 76 ++++++ .../package_reference/evaluation_tracker.mdx | 3 + .../zh/package_reference/logging.mdx | 15 ++ .../zh/package_reference/metrics.mdx | 70 +++++ .../translate/zh/package_reference/models.mdx | 40 +++ .../zh/package_reference/pipeline.mdx | 13 + docs/translate/zh/package_reference/tasks.mdx | 38 +++ docs/translate/zh/quicktour.mdx | 159 +++++++++++ .../zh/saving-and-reading-results.mdx | 183 +++++++++++++ ...-inference-endpoints-or-tgi-as-backend.mdx | 50 ++++ .../zh/use-inference-providers-as-backend.mdx | 41 +++ docs/translate/zh/use-litellm-as-backend.mdx | 38 +++ docs/translate/zh/use-sglang-as-backend.mdx | 77 ++++++ docs/translate/zh/use-vllm-as-backend.mdx | 124 +++++++++ docs/translate/zh/using-the-python-api.mdx | 63 +++++ 23 files changed, 1816 insertions(+) create mode 100644 docs/translate/zh/_toctree.yml create mode 100644 docs/translate/zh/adding-a-custom-task.mdx create mode 100644 docs/translate/zh/adding-a-new-metric.mdx create mode 100644 docs/translate/zh/available-tasks.mdx create mode 100644 docs/translate/zh/contributing-to-multilingual-evaluations.mdx create mode 100644 docs/translate/zh/evaluating-a-custom-model.mdx create mode 100644 docs/translate/zh/index.mdx create mode 100644 docs/translate/zh/installation.mdx create mode 100644 docs/translate/zh/metric-list.mdx create mode 100644 
docs/translate/zh/package_reference/evaluation_tracker.mdx create mode 100644 docs/translate/zh/package_reference/logging.mdx create mode 100644 docs/translate/zh/package_reference/metrics.mdx create mode 100644 docs/translate/zh/package_reference/models.mdx create mode 100644 docs/translate/zh/package_reference/pipeline.mdx create mode 100644 docs/translate/zh/package_reference/tasks.mdx create mode 100644 docs/translate/zh/quicktour.mdx create mode 100644 docs/translate/zh/saving-and-reading-results.mdx create mode 100644 docs/translate/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx create mode 100644 docs/translate/zh/use-inference-providers-as-backend.mdx create mode 100644 docs/translate/zh/use-litellm-as-backend.mdx create mode 100644 docs/translate/zh/use-sglang-as-backend.mdx create mode 100644 docs/translate/zh/use-vllm-as-backend.mdx create mode 100644 docs/translate/zh/using-the-python-api.mdx diff --git a/docs/translate/zh/_toctree.yml b/docs/translate/zh/_toctree.yml new file mode 100644 index 000000000..74b6d31de --- /dev/null +++ b/docs/translate/zh/_toctree.yml @@ -0,0 +1,54 @@ +- sections: + - local: index + title: 🤗 Lighteval + - local: installation + title: 安装 + - local: quicktour + title: 快速上手 + title: 入门指南 +- sections: + - local: saving-and-reading-results + title: 保存和读取结果 + - local: using-the-python-api + title: 使用Python API + - local: adding-a-custom-task + title: 添加自定义任务 + - local: adding-a-new-metric + title: 添加自定义指标 + - local: evaluating-a-custom-model + title: 评估自定义模型 + - local: use-inference-providers-as-backend + title: 使用HF的推理提供商作为后端 + - local: use-litellm-as-backend + title: 使用litellm作为后端 + - local: use-vllm-as-backend + title: 使用vllm作为后端 + - local: use-sglang-as-backend + title: 使用SGLang作为后端 + - local: use-huggingface-inference-endpoints-or-tgi-as-backend + title: 使用Hugging Face推理端点或TGI作为后端 + - local: contributing-to-multilingual-evaluations + title: 贡献多语言评估 + title: 指南 +- sections: + - local: metric-list + title: 可用指标 
+ - local: available-tasks + title: 可用任务 + title: API +- sections: + - sections: + - local: package_reference/evaluation_tracker + title: EvaluationTracker + - local: package_reference/models + title: 模型和模型配置 + - local: package_reference/pipeline + title: 流水线 + title: 主要类 + - local: package_reference/metrics + title: 指标 + - local: package_reference/tasks + title: 任务 + - local: package_reference/logging + title: 日志 + title: 参考 \ No newline at end of file diff --git a/docs/translate/zh/adding-a-custom-task.mdx b/docs/translate/zh/adding-a-custom-task.mdx new file mode 100644 index 000000000..555e73bc8 --- /dev/null +++ b/docs/translate/zh/adding-a-custom-task.mdx @@ -0,0 +1,143 @@ +# 添加自定义任务 + +要添加新任务,首先打开一个issue,确定它是否将被集成到lighteval的核心评估中、扩展任务中还是社区任务中,并在hub上添加其数据集。 + +- 核心评估是在其度量和处理中只需要标准逻辑的评估,我们会将其添加到我们的测试套件中,以确保随着时间的推移不会出现回归。它们在社区中已经有很高的使用率。 +- 扩展评估是在其度量中需要自定义逻辑的评估(复杂的规范化、LLM作为评判等),我们添加它们是为了方便用户。它们在社区中已经有很高的使用率。 +- 社区评估是社区提交的新任务。 + +随着时间的推移,一个受欢迎的社区评估可以发展成为扩展评估或核心评估。 + +> [!TIP] +> 您可以在`community_tasks`目录中找到自定义任务的示例。 + +## 逐步创建自定义任务 + +> [!WARNING] +> 要将您的自定义任务贡献给lighteval仓库,您首先需要通过运行`pip install -e .[dev]`安装所需的开发依赖项,然后运行`pre-commit install`安装pre-commit钩子。 + +首先,在`community_tasks`目录下创建一个Python文件。 + +您需要定义一个提示函数,该函数将把来自数据集的一行转换为用于评估的文档。 + +```python +# 根据您不同的任务需求定义尽可能多的函数 +def prompt_fn(line, task_name: str = None): + """定义如何从数据集行到doc对象。 + 参考src/lighteval/tasks/default_prompts.py中的例子, + 或者在README中获取关于此函数应该做什么的更多信息。 + """ + return Doc( + task_name=task_name, + query=line["question"], + choices=[f" {c}" for c in line["choices"]], + gold_index=line["gold"], + instruction="", + ) +``` + +然后,您需要选择一个指标:您可以使用现有的指标(在[`lighteval.metrics.metrics.Metrics`]中定义)或[创建自定义指标](adding-a-new-metric)。 +[//]: # (TODO: 一旦添加了自动文档,将lighteval.metrics.metrics.Metrics替换为~metrics.metrics.Metrics) + +```python +custom_metric = SampleLevelMetric( + metric_name="my_custom_metric_name", + higher_is_better=True, + category=MetricCategory.IGNORED, + use_case=MetricUseCase.NONE, + sample_level_fn=lambda
x: x, # 如何计算一个样本的分数 + corpus_level_fn=np.mean, # 如何聚合样本指标 +) +``` + +然后,您需要使用[`~tasks.lighteval_task.LightevalTaskConfig`]定义您的任务。 +您可以定义有或没有子集的任务。 +要定义没有子集的任务: + +```python +# 这是如何创建一个简单的任务(如hellaswag),它有一个单一的子集附加到它,并且可能有一个评估。 +task = LightevalTaskConfig( + name="myothertask", + prompt_function=prompt_fn, # 必须在文件中定义或从src/lighteval/tasks/tasks_prompt_formatting.py导入 + suite=["community"], + hf_repo="", + hf_subset="default", + hf_avail_splits=[], + evaluation_splits=[], + few_shots_split=None, + few_shots_select=None, + metric=[], # 在Metrics中选择您的指标 +) +``` + +如果您想创建具有多个子集的任务,请将它们添加到`SAMPLE_SUBSETS`列表中,并为每个子集创建一个任务。 + +```python +SAMPLE_SUBSETS = [] # 用于此评估的所有子集列表 + + +class CustomSubsetTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=prompt_fn, # 必须在文件中定义或从src/lighteval/tasks/tasks_prompt_formatting.py导入 + hf_repo="", + metric=[custom_metric], # 在Metrics中选择您的指标或使用您的custom_metric + hf_avail_splits=[], + evaluation_splits=[], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=None, + ) +SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] +``` + +以下是参数及其含义的列表: + +- `name` (str),您的评估名称 +- `suite` (list),您的评估应该属于的套件。此字段允许我们比较不同的任务实现,并用作任务选择以区分要启动的版本。目前,您会找到关键词["helm", "bigbench", "original", "lighteval", "community", "custom"];对于核心评估,请选择`lighteval`。 +- `prompt_function` (Callable),您在上面步骤中定义的提示函数 +- `hf_repo` (str),hub上您的评估数据集的路径 +- `hf_subset` (str),您想用于评估的特定子集(注意:当数据集没有子集时,请用`"default"`填充此字段,而不是用`None`或`""`) +- `hf_avail_splits` (list),您的数据集可用的所有分割(训练、验证、测试、其他...) 
+- `evaluation_splits` (list),您想用于评估的分割 +- `few_shots_split` (str,可以为`null`),您想从中选择样本作为少量样本示例的特定分割。它应该与`evaluation_splits`中包含的集合不同 +- `few_shots_select` (str,可以为`null`),您将用来为少量样本示例选择项目的方法。可以为`null`,或以下之一: + - `balanced` 从`few_shots_split`中选择带有平衡标签的示例,以避免将少量样本示例(因此是模型生成)偏向特定标签 + - `random` 从`few_shots_split`中随机选择示例 + - `random_sampling` 为每个新项目从`few_shots_split`中随机选择新示例,但如果采样项等于当前项,则从可用样本中删除 + - `random_sampling_from_train` 为每个新项目从`few_shots_split`中随机选择新示例,但如果采样项等于当前项,则保留!仅在您知道自己在做什么时使用此选项。 + - `sequential` 选择`few_shots_split`的前`n`个示例 +- `generation_size` (int),生成评估允许的最大令牌数。如果您的评估是对数似然评估(多选),此值应为-1 +- `stop_sequence` (list),作为生成的句子结束标记的字符串列表 +- `metric` (list),您想用于评估的指标(有关详细说明,请参见下一节) +- `trust_dataset` (bool),如果您信任数据集,则设置为True + + +然后,您需要将您的任务添加到`TASKS_TABLE`列表中。 + +```python +# 存储您的评估 + +# 带有子集的任务: +TASKS_TABLE = SUBSET_TASKS + +# 不带子集的任务: +# TASKS_TABLE = [task] +``` + +创建文件后,您可以使用以下命令运行评估: + +```bash +lighteval accelerate \ + "model_name=HuggingFaceH4/zephyr-7b-beta" \ + "community|{custom_task}|{fewshots}|{truncate_few_shot}" \ + --custom-tasks {path_to_your_custom_task_file} +``` \ No newline at end of file diff --git a/docs/translate/zh/adding-a-new-metric.mdx b/docs/translate/zh/adding-a-new-metric.mdx new file mode 100644 index 000000000..e3dc5a11c --- /dev/null +++ b/docs/translate/zh/adding-a-new-metric.mdx @@ -0,0 +1,86 @@ +# 添加新指标 + +首先,检查是否可以使用[语料库指标](package_reference/metrics#corpus-metrics)或[样本指标](package_reference/metrics#sample-metrics)中的参数化函数。 + +如果不能,您可以使用`custom_task`系统注册您的新指标: + +> [!TIP] +> 要查看与自定义任务一起添加的自定义指标示例,请查看IFEval自定义任务。 + + +> [!WARNING] +> 要将您的自定义指标贡献给lighteval仓库,您首先需要通过运行`pip install -e .[dev]`安装所需的开发依赖项,然后运行`pre-commit install`安装pre-commit钩子。 + + +- 创建一个包含指标完整逻辑的新Python文件。 +- 该文件还需要以这些导入开始 + +```python +from aenum import extend_enum +from lighteval.metrics import Metrics +``` + +您需要定义一个样本级指标: + +```python +def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> bool: + response = predictions[0] + return response == 
formatted_doc.choices[formatted_doc.gold_index] +``` + +这里的样本级指标只返回一个指标,如果您想为每个样本返回多个指标,您需要返回一个字典,以指标为键,值为值。 + +```python +def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict: + response = predictions[0] + return {"accuracy": response == formatted_doc.choices[formatted_doc.gold_index], "other_metric": 0.5} +``` + +然后,如果需要,您可以定义一个聚合函数,常见的聚合函数是`np.mean`。 + +```python +def agg_function(items): + flat_items = [item for sublist in items for item in sublist] + score = sum(flat_items) / len(flat_items) + return score +``` + +最后,您可以定义您的指标。如果是样本级指标,您可以使用以下代码和[`~metrics.utils.metric_utils.SampleLevelMetric`]: + +```python +my_custom_metric = SampleLevelMetric( + metric_name={custom_metric_name}, + higher_is_better={True或False}, + category={MetricCategory}, + use_case={MetricUseCase}, + sample_level_fn=custom_metric, + corpus_level_fn=agg_function, +) +``` + +如果您的指标为每个样本定义多个指标,您可以使用以下代码和[`~metrics.utils.metric_utils.SampleLevelMetricGrouping`]: + +```python +custom_metric = SampleLevelMetricGrouping( + metric_name={submetric_names}, + higher_is_better={n: {True或False} for n in submetric_names}, + category={MetricCategory}, + use_case={MetricUseCase}, + sample_level_fn=custom_metric, + corpus_level_fn={ + "accuracy": np.mean, + "other_metric": agg_function, + }, +) +``` + +最后,添加以下内容,以便在作为模块加载时将您的指标添加到我们的指标列表中。 + +```python +# 将指标添加到指标列表! 
+extend_enum(Metrics, "metric_name", metric_function) +if __name__ == "__main__": + print("Imported metric") +``` + +您可以通过在启动lighteval时使用`--custom-tasks path_to_your_file`来提供您的自定义指标。 \ No newline at end of file diff --git a/docs/translate/zh/available-tasks.mdx b/docs/translate/zh/available-tasks.mdx new file mode 100644 index 000000000..6fc867958 --- /dev/null +++ b/docs/translate/zh/available-tasks.mdx @@ -0,0 +1,252 @@ +# 可用任务 + +您可以通过运行以下命令获取所有可用任务的列表: + +```bash +lighteval tasks list +``` + +您还可以通过运行以下命令来检查特定任务: + +```bash +lighteval tasks inspect <task_name> +``` + +## 任务列表 + +- bigbench: + - bigbench|abstract_narrative_understanding + - bigbench|anachronisms + - bigbench|analogical_similarity + - bigbench|analytic_entailment + - bigbench|arithmetic_bb + - bigbench|ascii_word_recognition + - bigbench|authorship_verification + - bigbench|auto_categorization + - bigbench|auto_debugging + - bigbench|bbq_lite_json + - bigbench|bridging_anaphora_resolution_barqa + - bigbench|causal_judgment + - bigbench|cause_and_effect + - bigbench|checkmate_in_one + - bigbench|chess_state_tracking + - bigbench|chinese_remainder_theorem + - bigbench|cifar10_classification + - bigbench|code_line_description + - bigbench|codenames + - bigbench|color + - bigbench|common_morpheme + - bigbench|conceptual_combinations + - bigbench|conlang_translation + - bigbench|contextual_parametric_knowledge_conflicts + - bigbench|coqa_bb + - bigbench|crash_blossom + - bigbench|crass_ai + - bigbench|cryobiology_spanish + - bigbench|cryptonite + - bigbench|cs_algorithms + - bigbench|dark_humor_detection + - bigbench|date_understanding + - bigbench|disambiguation_qa + - bigbench|discourse_marker_prediction + - bigbench|disfl_qa + - bigbench|dyck_languages + - bigbench|elementary_math_qa + - bigbench|emoji_movie + - bigbench|emojis_emotion_prediction + - bigbench|empirical_judgments + - bigbench|english_proverbs + - bigbench|english_russian_proverbs + - bigbench|entailed_polarity + -
bigbench|entailed_polarity_hindi + - bigbench|epistemic_reasoning + - bigbench|evaluating_information_essentiality + - bigbench|fact_checker + - bigbench|fantasy_reasoning + - bigbench|few_shot_nlg + - bigbench|figure_of_speech_detection + - bigbench|formal_fallacies_syllogisms_negation + - bigbench|gem + - bigbench|gender_inclusive_sentences_german + - bigbench|general_knowledge + - bigbench|geometric_shapes + - bigbench|goal_step_wikihow + - bigbench|gre_reading_comprehension + - bigbench|hhh_alignment + - bigbench|hindi_question_answering + - bigbench|hindu_knowledge + - bigbench|hinglish_toxicity + - bigbench|human_organs_senses + - bigbench|hyperbaton + - bigbench|identify_math_theorems + - bigbench|identify_odd_metaphor + - bigbench|implicatures + - bigbench|implicit_relations + - bigbench|intent_recognition + - bigbench|international_phonetic_alphabet_nli + - bigbench|international_phonetic_alphabet_transliterate + - bigbench|intersect_geometry + - bigbench|irony_identification + - bigbench|kanji_ascii + - bigbench|kannada + - bigbench|key_value_maps + - bigbench|known_unknowns + - bigbench|language_games + - bigbench|language_identification + - bigbench|linguistic_mappings + - bigbench|linguistics_puzzles + - bigbench|logic_grid_puzzle + - bigbench|logical_args + - bigbench|logical_deduction + - bigbench|logical_fallacy_detection + - bigbench|logical_sequence + - bigbench|mathematical_induction + - bigbench|matrixshapes + - bigbench|metaphor_boolean + - bigbench|metaphor_understanding + - bigbench|minute_mysteries_qa + - bigbench|misconceptions + - bigbench|misconceptions_russian + - bigbench|mnist_ascii + - bigbench|modified_arithmetic + - bigbench|moral_permissibility + - bigbench|movie_dialog_same_or_different + - bigbench|movie_recommendation + - bigbench|mult_data_wrangling + - bigbench|multiemo + - bigbench|natural_instructions + - bigbench|navigate + - bigbench|nonsense_words_grammar + - bigbench|novel_concepts + - bigbench|object_counting + - 
bigbench|odd_one_out + - bigbench|operators + - bigbench|paragraph_segmentation + - bigbench|parsinlu_qa + - bigbench|parsinlu_reading_comprehension + - bigbench|penguins_in_a_table + - bigbench|periodic_elements + - bigbench|persian_idioms + - bigbench|phrase_relatedness + - bigbench|physical_intuition + - bigbench|physics + - bigbench|physics_questions + - bigbench|play_dialog_same_or_different + - bigbench|polish_sequence_labeling + - bigbench|presuppositions_as_nli + - bigbench|qa_wikidata + - bigbench|question_selection + - bigbench|real_or_fake_text + - bigbench|reasoning_about_colored_objects + - bigbench|repeat_copy_logic + - bigbench|rephrase + - bigbench|rhyming + - bigbench|riddle_sense + - bigbench|ruin_names + - bigbench|salient_translation_error_detection + - bigbench|scientific_press_release + - bigbench|semantic_parsing_in_context_sparc + - bigbench|semantic_parsing_spider + - bigbench|sentence_ambiguity + - bigbench|similarities_abstraction + - bigbench|simp_turing_concept + - bigbench|simple_arithmetic_json + - bigbench|simple_arithmetic_json_multiple_choice + - bigbench|simple_arithmetic_json_subtasks + - bigbench|simple_arithmetic_multiple_targets_json + - bigbench|simple_ethical_questions + - bigbench|simple_text_editing + - bigbench|snarks + - bigbench|social_iqa + - bigbench|social_support + - bigbench|sports_understanding + - bigbench|strange_stories + - bigbench|strategyqa + - bigbench|sufficient_information + - bigbench|suicide_risk + - bigbench|swahili_english_proverbs + - bigbench|swedish_to_german_proverbs + - bigbench|symbol_interpretation + - bigbench|tellmewhy + - bigbench|temporal_sequences + - bigbench|tense + - bigbench|timedial + - bigbench|topical_chat + - bigbench|tracking_shuffled_objects + - bigbench|understanding_fables + - bigbench|undo_permutation + - bigbench|unit_conversion + - bigbench|unit_interpretation + - bigbench|unnatural_in_context_learning + - bigbench|vitaminc_fact_verification + - bigbench|what_is_the_tao + - 
bigbench|which_wiki_edit + - bigbench|wino_x_german + - bigbench|winowhy + - bigbench|word_sorting + - bigbench|word_unscrambling + +- harness: + - harness|bbh:boolean_expressions + - harness|bbh:causal_judgment + - harness|bbh:date_understanding + - harness|bbh:disambiguation_qa + - harness|bbh:dyck_languages + - harness|bbh:formal_fallacies + - harness|bbh:geometric_shapes + - harness|bbh:hyperbaton + - harness|bbh:logical_deduction_five_objects + - harness|bbh:logical_deduction_seven_objects + - harness|bbh:logical_deduction_three_objects + - harness|bbh:movie_recommendation + - harness|bbh:multistep_arithmetic_two + - harness|bbh:navigate + - harness|bbh:object_counting + - harness|bbh:penguins_in_a_table + - harness|bbh:reasoning_about_colored_objects + - harness|bbh:ruin_names + - harness|bbh:salient_translation_error_detection + - harness|bbh:snarks + - harness|bbh:sports_understanding + - harness|bbh:temporal_sequences + - harness|bbh:tracking_shuffled_objects_five_objects + - harness|bbh:tracking_shuffled_objects_seven_objects + - harness|bbh:tracking_shuffled_objects_three_objects + - harness|bbh:web_of_lies + - harness|bbh:word_sorting + - harness|bigbench:causal_judgment + - harness|bigbench:date_understanding + - harness|bigbench:disambiguation_qa + - harness|bigbench:geometric_shapes + - harness|bigbench:logical_deduction_five_objects + - harness|bigbench:logical_deduction_seven_objects + - harness|bigbench:logical_deduction_three_objects + - harness|bigbench:movie_recommendation + - harness|bigbench:navigate + - harness|bigbench:reasoning_about_colored_objects + - harness|bigbench:ruin_names + - harness|bigbench:salient_translation_error_detection + - harness|bigbench:snarks + - harness|bigbench:sports_understanding + - harness|bigbench:temporal_sequences + - harness|bigbench:tracking_shuffled_objects_five_objects + - harness|bigbench:tracking_shuffled_objects_seven_objects + - harness|bigbench:tracking_shuffled_objects_three_objects + - 
harness|wikitext:103:document_level + +- helm: + - helm|babi_qa + - helm|bbq + - helm|bbq:Age + - helm|bbq:Disability_status + - helm|bbq:Gender_identity + - helm|bbq:Physical_appearance + - helm|bbq:Race_ethnicity + - helm|bbq:Race_x_SES + - helm|bbq:Race_x_gender + - helm|bbq:Religion + - helm|bbq:SES + - helm|bbq:Sexual_orientation + - helm|bbq:Nationality + # ... (此列表较长,为简洁起见,此处省略其余部分) + # 完整列表请运行`lighteval tasks list`查看 \ No newline at end of file diff --git a/docs/translate/zh/contributing-to-multilingual-evaluations.mdx b/docs/translate/zh/contributing-to-multilingual-evaluations.mdx new file mode 100644 index 000000000..9ab47bc20 --- /dev/null +++ b/docs/translate/zh/contributing-to-multilingual-evaluations.mdx @@ -0,0 +1,104 @@ +# 贡献多语言评估 + +## 贡献小型翻译 + +我们定义了19个`literals`,这些是在自动创建评估提示时使用的基本关键词或标点符号,例如`yes`、`no`、`because`等。 + +我们欢迎您提供您语言的翻译! + +要贡献,您需要: +1. 打开[translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py)文件 +2. 编辑文件,为您感兴趣的语言添加或扩展字面量。 + +```python + Language.ENGLISH: TranslationLiterals( + language=Language.ENGLISH, + question_word="question", # 用法:"Question: How are you?" + answer="answer", # 用法:"Answer: I am fine" + confirmation_word="right", # 用法:"He is smart, right?" + yes="yes", # 用法:"Yes, he is" + no="no", # 用法:"No, he is not" + also="also", # 用法:"Also, she is smart." + cause_word="because", # 用法:"She is smart, because she is tall" + effect_word="therefore", # 用法:"He is tall therefore he is smart" + or_word="or", # 用法:"He is tall or small" + true="true", # 用法:"He is smart, true, false or neither?" + false="false", # 用法:"He is smart, true, false or neither?" + neither="neither", # 用法:"He is smart, true, false or neither?" + # 标点和间距:仅在您的语言使用与英语不同的内容时调整 + full_stop=".", + comma=",", + question_mark="?", + exclamation_mark="!", + word_space=" ", + sentence_space=" ", + colon=":", + # 用于枚举的字母表的第一个字符,如果与英语不同 + indices=["A", "B", "C", ...] + ) +``` + +3. 提交带有您修改的PR!就是这样!
+ +## 贡献新的多语言任务 + +您应该首先阅读我们关于[添加自定义任务](adding-a-custom-task)的指南,以更好地理解我们使用的不同参数。 + +然后,您应该查看当前的[多语言任务](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py)文件,以了解它们是如何定义的。对于多语言评估,`prompt_function`应该由语言适应模板实现。模板将负责正确的格式化,正确且一致地使用语言调整的提示锚点(例如问题/回答)和标点符号。 + +浏览[此处](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/templates)的所有模板列表,以查看哪些最适合您自己的任务。 + +然后,准备好后,要定义您自己的任务,您应该: +1. 按照上述指南创建一个Python文件 +2. 导入与您的任务类型相关的模板(XNLI、Copa、多项选择、问答等) +3. 使用我们可参数化的[`~tasks.lighteval_task.LightevalTaskConfig`]类为每个相关语言和评估表述(对于多项选择)定义一个或一组任务 + +```python +your_tasks = [ + LightevalTaskConfig( + # 您的评估名称 + name=f"evalname_{language.value}_{formulation.name.lower()}", + # 此评估由社区贡献 + suite=["community"], + # 这将自动获取您所选表述的正确指标 + metric=get_metrics_for_formulation( + formulation, + [ + loglikelihood_acc_metric(normalization=None), + loglikelihood_acc_metric(normalization=LogProbTokenNorm()), + loglikelihood_acc_metric(normalization=LogProbCharNorm()), + ], + ), + # 在此函数中,您选择要遵循的模板以及使用哪种语言和表述 + prompt_function=get_template_prompt_function( + language=language, + # 然后使用适配器定义模板键(左)和数据集键(右)之间的映射 + # 要了解需要哪些模板键以及可用哪些模板键, + # 请查阅相应的适配器类型和文档字符串。 + adapter=lambda line: { + "key": line["relevant_key"], + ... + }, + formulation=formulation, + ), + # 您还可以添加特定过滤器以删除不相关的样本 + hf_filter=lambda line: line["label"] in <标签的筛选条件>, + # 然后选择您的huggingface数据集以及可用于评估的分割 + hf_repo=<数据集id>, + hf_subset=<数据集子集>, + evaluation_splits=["train"], + hf_avail_splits=["train"], + ) + for language in [ + Language.YOUR_LANGUAGE, ... + ] + for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] +] +``` +4. 然后,您可以回到指南,测试您的任务是否正确实现! + +> [!TIP] +> 所有[`~tasks.lighteval_task.LightevalTaskConfig`]参数都是强类型的,包括模板函数的输入。确保利用您的IDE的功能,使正确填写这些参数变得更容易。 + + +一旦一切就绪,提交PR,我们将很乐意审查它!
\ No newline at end of file diff --git a/docs/translate/zh/evaluating-a-custom-model.mdx b/docs/translate/zh/evaluating-a-custom-model.mdx new file mode 100644 index 000000000..1eea6a13f --- /dev/null +++ b/docs/translate/zh/evaluating-a-custom-model.mdx @@ -0,0 +1,127 @@ +# 评估自定义模型 + +Lighteval允许您通过创建继承自`LightevalModel`的自定义模型类来评估自定义模型实现。当您想评估标准后端(transformers、vllm等)不直接支持的模型时,这非常有用。 + +## 创建自定义模型 + +1. 创建包含您的自定义模型实现的Python文件。该模型必须继承自`LightevalModel`并实现所有必需的方法。 + +以下是一个基本示例: + +```python +from lighteval.models.abstract_model import LightevalModel + +class MyCustomModel(LightevalModel): + def __init__(self, config): + super().__init__(config) + # 在这里初始化您的模型... + + def greedy_until(self, requests, max_tokens=None, stop_sequences=None): + # 实现生成逻辑 + pass + + def loglikelihood(self, requests, log=True): + # 实现对数似然计算 + pass + + def loglikelihood_rolling(self, requests): + # 实现滚动对数似然计算 + pass + + def loglikelihood_single_token(self, requests): + # 实现单个令牌对数似然计算 + pass +``` + +2. 自定义模型文件应该只包含一个继承自`LightevalModel`的类。在加载模型时,这个类将被自动检测并实例化。 + +> [!TIP] +> 您可以在`examples/custom_models/google_translate_model.py`中找到一个完整的自定义模型实现示例。 + +## 运行评估 + +您可以使用命令行界面或Python API评估您的自定义模型。 + +### 使用命令行 + +```bash +lighteval custom \ + "google-translate" \ + "examples/custom_models/google_translate_model.py" \ + "lighteval|wmt20:fr-de|0|0" \ + --max-samples 10 +``` + +该命令需要三个必要参数: +- 模型名称(用于在结果/日志中跟踪) +- 您的模型实现文件的路径 +- 要评估的任务(格式与其他后端相同) + +### 使用Python API + +```python +from lighteval.logging.evaluation_tracker import EvaluationTracker +from lighteval.models.custom.custom_model import CustomModelConfig +from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters + +# 设置评估跟踪 +evaluation_tracker = EvaluationTracker( + output_dir="results", + save_details=True +) + +# 配置流水线 +pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.CUSTOM, +) + +# 配置您的自定义模型 +model_config = CustomModelConfig( + model="my-custom-model", + model_definition_file_path="path/to/my_model.py" +) + +# 创建并运行流水线
+pipeline = Pipeline( + tasks="leaderboard|truthfulqa:mc|0|0", + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config +) + +pipeline.evaluate() +pipeline.save_and_push_results() +``` + +## 必需的方法 + +您的自定义模型必须实现这些核心方法: + +- `greedy_until`:用于生成文本,直到达到停止序列或最大令牌数 +- `loglikelihood`:用于计算特定续写的对数概率 +- `loglikelihood_rolling`:用于计算序列的滚动对数概率 +- `loglikelihood_single_token`:用于计算单个令牌的对数概率 + +有关详细的方法签名和要求,请参阅`LightevalModel`基类文档。 + +## 最佳实践 + +1. **错误处理**:在您的模型方法中实现健壮的错误处理,以优雅地处理边缘情况。 + +2. **批处理**:考虑在您的模型方法中实现高效的批处理,以提高性能。 + +3. **资源管理**:在您的模型的`__init__`和`__del__`方法中正确管理任何资源(例如,API连接、模型权重)。 + +4. **文档**:为您的模型类和方法添加清晰的文档字符串,解释任何特定的要求或限制。 + +## 示例用例 + +自定义模型特别适用于: + +- 评估通过自定义API访问的模型 +- 包装具有专门预处理/后处理的模型 +- 测试新型模型架构 +- 评估集成模型 +- 与外部服务或工具集成 + +有关包装Google Translate API的自定义模型的完整示例,请参阅`examples/custom_models/google_translate_model.py`。 \ No newline at end of file diff --git a/docs/translate/zh/index.mdx b/docs/translate/zh/index.mdx new file mode 100644 index 000000000..a2673e067 --- /dev/null +++ b/docs/translate/zh/index.mdx @@ -0,0 +1,14 @@ +# Lighteval + +🤗 Lighteval 是一款多功能的大语言模型评估工具箱,支持多种后端平台 — 无论您使用的是 +[transformers](https://github.com/huggingface/transformers)、 +[tgi](https://github.com/huggingface/text-generation-inference)、 +[inference providers](https://huggingface.co/docs/huggingface_hub/en/guides/inference)、 +[vllm](https://github.com/vllm-project/vllm) 还是 +[nanotron](https://github.com/huggingface/nanotron), +都能轻松实现评估。通过保存和分析详细的样本级结果,您可以深入了解模型性能,进行调试并比较不同模型的表现。 + +Lighteval 提供了灵活的自定义功能:您可以根据需求轻松创建[新任务](adding-a-custom-task)和[新指标](adding-a-new-metric), +也可以直接使用我们已有的丰富任务和指标库。 + +您可以无缝地进行实验、基准测试,并将结果存储在 Hugging Face Hub、S3 或本地环境中。 \ No newline at end of file diff --git a/docs/translate/zh/installation.mdx b/docs/translate/zh/installation.mdx new file mode 100644 index 000000000..5c14ec019 --- /dev/null +++ b/docs/translate/zh/installation.mdx @@ -0,0 +1,46 @@ +# 安装 + +Lighteval可以通过PyPi或源代码两种方式安装。 + +## 从PyPi安装 + +```bash 
+pip install lighteval +``` + +## 从源代码安装 +此方式主要适用于希望在`lighteval`上进行开发的用户: + +```bash +git clone https://github.com/huggingface/lighteval.git +cd lighteval +pip install -e . +``` + +## 额外依赖 + +Lighteval提供了多个可选依赖包,可以通过指定额外组来安装: +`pip install lighteval[<extra_group>]`或`pip install -e .[<extra_group>]` + +如需使用`sglang`作为Lighteval后端,请参考[sglang安装文档](https://docs.sglang.ai/start/install.html)。 + +| 额外名称 | 描述 | +|--------------|---------------------------------------------------| +| tgi | 使用Text Generation Inference API评估模型 | +| nanotron | 支持评估nanotron模型 | +| quantization | 支持评估量化模型 | +| adapters | 支持评估适配器模型(delta和peft) | +| tensorboardX | 允许将结果上传到tensorboard | +| vllm | 使用vllm作为推理后端 | +| sglang | 使用sglang作为推理后端 | +| s3 | 支持将结果上传到s3存储 | + + +## Hugging Face登录 + +如果您希望将评估结果推送到Hugging Face Hub或评估私有模型, +需要将访问令牌添加到环境变量`HF_TOKEN`中,可通过以下命令完成: + +```bash +huggingface-cli login +``` \ No newline at end of file diff --git a/docs/translate/zh/metric-list.mdx b/docs/translate/zh/metric-list.mdx new file mode 100644 index 000000000..da89d692f --- /dev/null +++ b/docs/translate/zh/metric-list.mdx @@ -0,0 +1,76 @@ +# 指标列表 + +## 多项选择任务的自动指标 + +这些指标基于不同可能答案选项的对数似然值计算。 +- `loglikelihood_acc`:选择对数概率最高选项且选择正确的样本比例 - 对于选项仅包含单个token的任务,有一个更高效的版本 (`loglikelihood_acc_single_token`)。 +- `loglikelihood_acc_norm`:选择按序列长度归一化后对数概率最高选项且选择正确的样本比例 - 对于选项仅包含单个token的任务,有一个更高效的版本 (`loglikelihood_acc_norm_single_token`)。 +- `loglikelihood_acc_norm_nospace`:选择按序列长度归一化后对数概率最高选项且选择正确的样本比例,忽略首个空格。 +- `loglikelihood_f1`:多项选择的语料库级别F1分数 - 对于选项仅包含单个token的任务,有一个更高效的版本 (`loglikelihood_f1_single_token`)。 +- `mcc`:马修斯相关系数(衡量统计分布之间一致性的指标)。 +- `recall_at_1`:选择对数概率最高选项且选择正确的样本比例 - 对于每个选项仅包含单个token的任务,有一个更高效的版本 (`recall_at_1_single_token`)。 +- `recall_at_2`:选择对数概率第二高或更高选项且选择正确的样本比例 - 对于每个选项仅包含单个token的任务,有一个更高效的版本 (`recall_at_2_single_token`)。 +- `mrr`:平均倒数排名,评估按正确性/相关性排序的选项质量 - 对于选项仅包含单个token的任务,有一个更高效的版本 (`mrr_single_token`)。 +- `target_perplexity`:不同选项的困惑度。 +- `acc_golds_likelihood`:检查单个目标的平均对数概率是否高于或低于0.5。 +- `multi_f1_numeric`:多个正确答案选项的对数似然F1分数。
+ +上述所有指标都有"单token"版本(`loglikelihood_acc_single_token`、`loglikelihood_acc_norm_single_token`、`loglikelihood_f1_single_token`、`mcc_single_token`、`recall@2_single_token` 和 `mrr_single_token`)。当多项选择选项仅比较单个token(例如:"A"vs"B"vs"C"vs"D",或"是"vs"否")时,使用这些单token版本可以将评估时间缩短至原来的1/N(N为选项数量)。单token评估还包括: +- `multi_f1_numeric`:计算所有可能选项的F1分数并取平均值。 + +## 困惑度和语言建模的自动指标 +这些指标基于输入文本的对数似然值计算。 +- `word_perplexity`:按序列词数加权的困惑度(输入的对数概率)。 +- `byte_perplexity`:按序列字节数加权的困惑度(输入的对数概率)。 +- `bits_per_byte`:根据模型概率计算的每字节平均位数。 +- `log_prob`:预测输出的平均对数概率(语言建模的输入对数概率)。 + +## 生成任务的自动指标 +这些指标需要模型生成输出文本,因此评估速度相对较慢。 +- 基本指标: + - `perfect_exact_match`:预测与参考答案完全匹配的样本比例。 + - `exact_match`:预测在去除首尾空白后与参考答案匹配的样本比例(即对两者应用`strip`后比较)。 + - `quasi_exact_match`:归一化预测与归一化参考答案匹配的样本比例(归一化处理包括空白、冠词、大小写等)。还有其他变体使用不同归一化方法,例如`quasi_exact_match_triviaqa`,仅在对所有文本应用`strip`后比较。 + - `prefix_exact_match`:预测的开头部分与参考答案匹配的样本比例(去除首尾空白后比较)。 + - `prefix_quasi_exact_match`:预测的归一化开头部分与归一化参考答案匹配的样本比例。 + - `exact_match_indicator`:移除特定指示符前的上下文后的精确匹配。 + - `f1_score_quasi`:模型输出和参考答案之间词语重叠的平均F1分数,两者都先进行归一化。 + - `f1_score`:模型输出和参考答案之间词语重叠的平均F1分数,不进行归一化。 + - `f1_score_macro`:语料库级别的宏观F1分数。 + - `f1_score_micro`:语料库级别的微观F1分数。 + - `maj_at_5` 和 `maj_at_8`:模型多数投票。从模型中获取n(5或8)个生成结果,并将出现频率最高的结果作为最终预测。 +- 摘要指标: + - `rouge`:平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。 + - `rouge1`:基于1-gram重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。 + - `rouge2`:基于2-gram重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。 + - `rougeL`:基于最长公共子序列重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。 + - `rougeLsum`:基于最长公共子序列重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。 + - `rouge_t5` (BigBench):所有可用ROUGE指标的语料库级别ROUGE分数。 + - `faithfulness`:基于SummaC方法 [(Laban et al. 
(2022)](https://aclanthology.org/2022.tacl-1.10/)) 的忠实度评分。 + - `extractiveness`:基于 [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/) 的内容评估: + - `summarization_coverage`:模型生成摘要中来自源文档的抽取片段比例, + - `summarization_density`:模型生成摘要对源文档的抽取程度, + - `summarization_compression`:模型生成摘要相对源文档的压缩比率。 + - `bert_score`:模型生成摘要和参考摘要之间的平均BERTScore精确率、召回率和F1分数 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr)。 +- 翻译指标: + - `bleu`:语料库级别BLEU分数 [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - 使用sacrebleu实现。 + - `bleu_1`:基于1-gram重叠的平均样本BLEU分数 [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - 使用nltk实现。 + - `bleu_4`:基于4-gram重叠的平均样本BLEU分数 [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - 使用nltk实现。 + - `chrf`:字符n-gram匹配的F值评分。 + - `ter`:翻译编辑/错误率。 +- 版权相关指标: + - `copyright`:评估以下方面: + - `longest_common_prefix_length`:模型生成和参考文本之间最长公共前缀的平均长度, + - `edit_distance`:模型生成和参考文本之间的平均Levenshtein编辑距离, + - `edit_similarity`:模型生成和参考文本之间的平均Levenshtein编辑相似度(按较长序列长度归一化)。 +- 数学评估指标: + - `quasi_exact_match_math`:归一化预测与归一化参考答案匹配的样本比例(针对数学问题特别优化,删除了LaTeX符号、单位等)。 + - `maj_at_4_math`:多数投票评估,对预测和参考答案使用数学专用归一化。 + - `quasi_exact_match_gsm8k`:归一化预测与归一化参考答案匹配的样本比例(针对gsm8k问题特别优化,删除了LaTeX符号、单位等)。 + - `maj_at_8_gsm8k`:多数投票评估,对预测和参考答案使用gsm8k专用归一化。 + +## LLM作为评判 +- `llm_judge_gpt3p5`:适用于任何生成任务,使用OpenAI API的GPT-3.5模型评分。 +- `llm_judge_llama_3_405b`:适用于任何生成任务,使用HuggingFace API的Llama 3 405B模型评分。 +- `llm_judge_multi_turn_gpt3p5`:适用于多轮对话任务(如mt-bench),使用OpenAI API的GPT-3.5模型评分。 +- `llm_judge_multi_turn_llama_3_405b`:适用于多轮对话任务(如mt-bench),使用HuggingFace API的Llama 3 405B模型评分。 \ No newline at end of file diff --git a/docs/translate/zh/package_reference/evaluation_tracker.mdx b/docs/translate/zh/package_reference/evaluation_tracker.mdx new file mode 100644 index 000000000..06297ebbf --- /dev/null +++ b/docs/translate/zh/package_reference/evaluation_tracker.mdx @@ -0,0 +1,3 @@ +# 评估跟踪器 + +[[autodoc]] logging.evaluation_tracker.EvaluationTracker \ No newline at end of file diff 
--git a/docs/translate/zh/package_reference/logging.mdx b/docs/translate/zh/package_reference/logging.mdx new file mode 100644 index 000000000..99fd42a16 --- /dev/null +++ b/docs/translate/zh/package_reference/logging.mdx @@ -0,0 +1,15 @@ +# 日志记录 + +## EvaluationTracker(评估跟踪器) +[[autodoc]] logging.evaluation_tracker.EvaluationTracker + +## GeneralConfigLogger(通用配置记录器) +[[autodoc]] logging.info_loggers.GeneralConfigLogger +## DetailsLogger(详情记录器) +[[autodoc]] logging.info_loggers.DetailsLogger +## MetricsLogger(指标记录器) +[[autodoc]] logging.info_loggers.MetricsLogger +## VersionsLogger(版本记录器) +[[autodoc]] logging.info_loggers.VersionsLogger +## TaskConfigLogger(任务配置记录器) +[[autodoc]] logging.info_loggers.TaskConfigLogger \ No newline at end of file diff --git a/docs/translate/zh/package_reference/metrics.mdx b/docs/translate/zh/package_reference/metrics.mdx new file mode 100644 index 000000000..ffc71a9f7 --- /dev/null +++ b/docs/translate/zh/package_reference/metrics.mdx @@ -0,0 +1,70 @@ +# 指标 + +## 指标 +[//]: # (TODO: aenum.Enum raises error when generating docs: not supported by inspect.signature. 
See: https://github.com/ethanfurman/aenum/issues/44) +[//]: # (### Metrics) +[//]: # ([[autodoc]] metrics.metrics.Metrics) +### Metric(指标) +[[autodoc]] metrics.utils.metric_utils.Metric +### CorpusLevelMetric(语料级指标) +[[autodoc]] metrics.utils.metric_utils.CorpusLevelMetric +### SampleLevelMetric(样本级指标) +[[autodoc]] metrics.utils.metric_utils.SampleLevelMetric +### MetricGrouping(指标分组) +[[autodoc]] metrics.utils.metric_utils.MetricGrouping +### CorpusLevelMetricGrouping(语料级指标分组) +[[autodoc]] metrics.utils.metric_utils.CorpusLevelMetricGrouping +### SampleLevelMetricGrouping(样本级指标分组) +[[autodoc]] metrics.utils.metric_utils.SampleLevelMetricGrouping + +## Corpus Metrics(语料级指标) +### CorpusLevelF1Score +[[autodoc]] metrics.metrics_corpus.CorpusLevelF1Score +### CorpusLevelPerplexityMetric +[[autodoc]] metrics.metrics_corpus.CorpusLevelPerplexityMetric +### CorpusLevelTranslationMetric +[[autodoc]] metrics.metrics_corpus.CorpusLevelTranslationMetric +### matthews_corrcoef +[[autodoc]] metrics.metrics_corpus.matthews_corrcoef + +## Sample Metrics(样本级指标) +### ExactMatches(精确匹配) +[[autodoc]] metrics.metrics_sample.ExactMatches +### F1_score(F1分数) +[[autodoc]] metrics.metrics_sample.F1_score +### LoglikelihoodAcc +[[autodoc]] metrics.metrics_sample.LoglikelihoodAcc +### NormalizedMultiChoiceProbability(归一化多选概率) +[[autodoc]] metrics.metrics_sample.NormalizedMultiChoiceProbability +### Probability(概率) +[[autodoc]] metrics.metrics_sample.Probability +### Recall(召回率) +[[autodoc]] metrics.metrics_sample.Recall +### MRR +[[autodoc]] metrics.metrics_sample.MRR +### ROUGE +[[autodoc]] metrics.metrics_sample.ROUGE +### BertScore +[[autodoc]] metrics.metrics_sample.BertScore +### Extractiveness(抽取性) +[[autodoc]] metrics.metrics_sample.Extractiveness +### Faithfulness(忠实度) +[[autodoc]] metrics.metrics_sample.Faithfulness +### BLEURT +[[autodoc]] metrics.metrics_sample.BLEURT +### BLEU +[[autodoc]] metrics.metrics_sample.BLEU +### StringDistance(字符串距离) +[[autodoc]] 
metrics.metrics_sample.StringDistance +### JudgeLLM +[[autodoc]] metrics.metrics_sample.JudgeLLM +### JudgeLLMMTBench +[[autodoc]] metrics.metrics_sample.JudgeLLMMTBench +### JudgeLLMMixEval +[[autodoc]] metrics.metrics_sample.JudgeLLMMixEval +### MajAtK +[[autodoc]] metrics.metrics_sample.MajAtK + +## LLM-as-a-Judge(LLM作为评判) +### JudgeLM +[[autodoc]] metrics.llm_as_judge.JudgeLM \ No newline at end of file diff --git a/docs/translate/zh/package_reference/models.mdx b/docs/translate/zh/package_reference/models.mdx new file mode 100644 index 000000000..8c2d7957a --- /dev/null +++ b/docs/translate/zh/package_reference/models.mdx @@ -0,0 +1,40 @@ +# 模型 + +## 模型 +### LightevalModel +[[autodoc]] models.abstract_model.LightevalModel + + +## Accelerate和Transformers模型 +### TransformersModel +[[autodoc]] models.transformers.transformers_model.TransformersModelConfig +[[autodoc]] models.transformers.transformers_model.TransformersModel + +### AdapterModel +[[autodoc]] models.transformers.adapter_model.AdapterModelConfig +[[autodoc]] models.transformers.adapter_model.AdapterModel + +### DeltaModel +[[autodoc]] models.transformers.delta_model.DeltaModelConfig +[[autodoc]] models.transformers.delta_model.DeltaModel + +## 基于端点的模型 +### InferenceEndpointModel +[[autodoc]] models.endpoints.endpoint_model.InferenceEndpointModelConfig +[[autodoc]] models.endpoints.endpoint_model.ServerlessEndpointModelConfig +[[autodoc]] models.endpoints.endpoint_model.InferenceEndpointModel + +### TGI ModelClient +[[autodoc]] models.endpoints.tgi_model.TGIModelConfig +[[autodoc]] models.endpoints.tgi_model.ModelClient + +### 自定义模型 +[[autodoc]] models.custom.custom_model.CustomModelConfig + +### Open AI模型 +[[autodoc]] models.endpoints.openai_model.OpenAIClient + +## VLLM模型 +### VLLMModel +[[autodoc]] models.vllm.vllm_model.VLLMModelConfig +[[autodoc]] models.vllm.vllm_model.VLLMModel \ No newline at end of file diff --git a/docs/translate/zh/package_reference/pipeline.mdx 
b/docs/translate/zh/package_reference/pipeline.mdx new file mode 100644 index 000000000..68f712ee6 --- /dev/null +++ b/docs/translate/zh/package_reference/pipeline.mdx @@ -0,0 +1,13 @@ +# 流水线 + +## Pipeline(流水线) + +[[autodoc]] pipeline.Pipeline + +## PipelineParameters(流水线参数) + +[[autodoc]] pipeline.PipelineParameters + +## ParallelismManager(并行管理器) + +[[autodoc]] pipeline.ParallelismManager \ No newline at end of file diff --git a/docs/translate/zh/package_reference/tasks.mdx b/docs/translate/zh/package_reference/tasks.mdx new file mode 100644 index 000000000..91b6fb4c8 --- /dev/null +++ b/docs/translate/zh/package_reference/tasks.mdx @@ -0,0 +1,38 @@ +# 任务 + +## LightevalTask +### LightevalTaskConfig +[[autodoc]] tasks.lighteval_task.LightevalTaskConfig +### LightevalTask +[[autodoc]] tasks.lighteval_task.LightevalTask + +## PromptManager(提示管理器) + +[[autodoc]] tasks.prompt_manager.PromptManager + +## Registry(注册表) + +[[autodoc]] tasks.registry.Registry + +## Requests(请求) + +[[autodoc]] tasks.requests.Request + +[[autodoc]] tasks.requests.LoglikelihoodRequest + +[[autodoc]] tasks.requests.LoglikelihoodSingleTokenRequest + +[[autodoc]] tasks.requests.LoglikelihoodRollingRequest + +[[autodoc]] tasks.requests.GreedyUntilRequest + +[[autodoc]] tasks.requests.GreedyUntilMultiTurnRequest + +## Datasets(数据集) + +[[autodoc]] data.DynamicBatchDataset +[[autodoc]] data.LoglikelihoodDataset +[[autodoc]] data.LoglikelihoodSingleTokenDataset +[[autodoc]] data.GenerativeTaskDataset +[[autodoc]] data.GenerativeTaskDatasetNanotron +[[autodoc]] data.GenDistributedSampler \ No newline at end of file diff --git a/docs/translate/zh/quicktour.mdx b/docs/translate/zh/quicktour.mdx new file mode 100644 index 000000000..abf33a830 --- /dev/null +++ b/docs/translate/zh/quicktour.mdx @@ -0,0 +1,159 @@ +# 快速上手 + + +> [!TIP] +> 建议使用`--help`参数了解每个命令的可用选项。 +> `lighteval --help` + +Lighteval 支持多种命令方式: + +- `lighteval accelerate`: 使用[🤗 
Accelerate](https://github.com/huggingface/accelerate)在CPU或多GPU环境评估模型 +- `lighteval nanotron`: 通过[⚡️ Nanotron](https://github.com/huggingface/nanotron)在分布式环境中评估模型 +- `lighteval vllm`: 基于[🚀 VLLM](https://github.com/vllm-project/vllm)在单个或多个GPU上评估模型 +- `lighteval endpoint` + - `inference-endpoint`: 使用[🔗 Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated)评估模型 + - `tgi`: 通过[🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index)评估模型 + - `openai`: 基于[🔗 OpenAI API](https://platform.openai.com/)评估模型 + +## 基本用法 + +要使用[🤗 Accelerate](https://github.com/huggingface/accelerate)在Truthful QA基准上评估`GPT-2`模型,运行: + +```bash +lighteval accelerate \ + "model_name=openai-community/gpt2" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +在这里,首先选择后端(`accelerate`、`nanotron`或`vllm`),然后指定要评估的模型和任务。 + +模型参数采用`key1=value1,key2=value2`这样的语法格式。 +有效的键值对取决于所选后端,详细说明请参见下文的[后端配置](#后端配置)一节。 + +任务规范的语法格式如下: + +```txt +{套件}|{任务}|{少样本数量}|{0表示严格使用指定的少样本数量,1表示允许在上下文过长时自动截断} +``` + +当第四个值设为1时,lighteval会检查整个提示(包括少样本示例)是否超出任务或模型的上下文长度限制。 +如果超出限制,系统会自动减少少样本示例的数量。 + +所有官方支持的任务可在[任务列表](available-tasks)和 +[extended文件夹](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/extended)中找到。 +社区贡献的任务则位于 +[community](https://github.com/huggingface/lighteval/tree/main/community_tasks)文件夹。 +关于任务实现的更多细节,如提示构建方式或使用的评估指标,请查阅 +[源文件](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/default_tasks.py)。 + +Lighteval支持同时运行多个任务,可通过逗号分隔列表或指定配置文件路径来实现。 +配置文件应按照[examples/tasks/recommended_set.txt](https://github.com/huggingface/lighteval/blob/main/examples/tasks/recommended_set.txt)的结构编写。 +指定文件路径时应以`./`开头。 + +```bash +lighteval accelerate \ + "model_name=openai-community/gpt2" \ + ./path/to/lighteval/examples/tasks/recommended_set.txt +# 或者使用逗号分隔的任务列表,例如:"leaderboard|truthfulqa:mc|0|0,leaderboard|gsm8k|3|1" +``` + +## 在多GPU环境中评估模型 + +#### 数据并行 + +要在多GPU环境中评估模型,首先需要创建多GPU配置: + +```bash +accelerate config +``` + +然后,可以使用8个GPU的数据并行方式来评估模型: + +```bash 
+accelerate launch --multi_gpu --num_processes=8 -m \ + lighteval accelerate \ + "model_name=openai-community/gpt2" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +其中,`--override_batch_size`定义每个设备的批处理大小,实际总批处理大小为`override_batch_size * num_gpus`。 + +#### 流水线并行 + +要使用2个或更多GPU的流水线并行方式评估模型,运行: + +```bash +lighteval accelerate \ + "model_name=openai-community/gpt2,model_parallel=True" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +这会自动使用accelerate将模型分布在多个GPU上。 + +> [!TIP] +> 数据并行和流水线并行可以结合使用,只需设置`model_parallel=True`并使用accelerate进行数据分布。 + +## 后端配置 + +`model-args`参数接受一个模型参数列表字符串。可用参数取决于所选后端(vllm或accelerate)。 + +### Accelerate + +- **pretrained** (str): + HuggingFace Hub模型ID或预训练模型路径,相当于HuggingFace `transformers` API中`from_pretrained`的`pretrained_model_name_or_path`参数。 +- **tokenizer** (Optional[str]): 用于分词的HuggingFace Hub分词器ID。 +- **multichoice_continuations_start_space** (Optional[bool]): 在多选项生成中是否在每个选项开头添加空格。 + 例如,对于问题"法国的首都是什么?"和选项"巴黎"、"伦敦", + 会被分词为"法国的首都是什么?巴黎"和"法国的首都是什么?伦敦"。 + True表示添加空格,False表示去除空格,None表示不做处理。 +- **subfolder** (Optional[str]): 模型仓库中的子文件夹。 +- **revision** (str): 模型的版本。 +- **max_gen_toks** (Optional[int]): 生成的最大token数量。 +- **max_length** (Optional[int]): 生成输出的最大长度。 +- **add_special_tokens** (bool, optional, defaults to True): 是否向输入序列添加特殊token。 + 如果为`None`,对于seq2seq模型(如T5)默认值为`True`,对于因果模型默认为`False`。 +- **model_parallel** (bool, optional, defaults to None): + True/False: 强制使用或不使用`accelerate`库在多设备间分布大型模型。 + 默认为None,会比较进程数与GPU数:若进程数小于GPU数则启用模型并行,否则不启用。 +- **dtype** (Union[str, torch.dtype], optional, defaults to None): + 如指定,则将模型权重转换为该数据类型。字符串会被转换为`torch.dtype`对象(如`float16` -> `torch.float16`)。 + 使用`dtype="auto"`可从模型权重自动推导类型。 +- **device** (Union[int, str]): 用于模型训练的设备。 +- **quantization_config** (Optional[BitsAndBytesConfig]): 模型量化配置,用于以量化精度加载原本为浮点的模型。4位和8位精度需要此配置。 +- **trust_remote_code** (bool): 加载模型时是否信任远程代码。 + +### VLLM + +- **pretrained** (str): HuggingFace Hub模型ID或预训练模型路径。 +- **gpu_memory_utilization** (float): GPU内存使用比例。 +- **batch_size** (int): 
模型训练的批处理大小。 +- **revision** (str): 模型版本。 +- **dtype** (str, None): 模型使用的数据类型。 +- **tensor_parallel_size** (int): 使用的张量并行单元数量。 +- **data_parallel_size** (int): 使用的数据并行单元数量。 +- **max_model_length** (int): 模型的最大长度。 +- **swap_space** (int): 每个GPU的CPU交换空间大小(GiB)。 +- **seed** (int): 模型使用的随机种子。 +- **trust_remote_code** (bool): 加载模型时是否信任远程代码。 +- **use_chat_template** (bool): 是否使用聊天模板。 +- **add_special_tokens** (bool): 是否向输入序列添加特殊token。 +- **multichoice_continuations_start_space** (bool): 在多选项生成中是否在每个选项开头添加空格。 +- **subfolder** (Optional[str]): 模型仓库中的子文件夹。 + +## Nanotron + +要评估使用nanotron训练的模型: + +> [!WARNING] +> Nanotron模型必须使用torchrun进行评估。 + + +```bash + torchrun --standalone --nnodes=1 --nproc-per-node=1 \ + src/lighteval/__main__.py nanotron \ + --checkpoint-config-path ../nanotron/checkpoints/10/config.yaml \ + --lighteval-config-path examples/nanotron/lighteval_config_override_template.yaml + ``` + +`nproc-per-node`参数应与`lighteval_config_override_template.yaml`文件中配置的并行设置匹配, +即:`nproc-per-node = data_parallelism * tensor_parallelism * pipeline_parallelism`。 \ No newline at end of file diff --git a/docs/translate/zh/saving-and-reading-results.mdx b/docs/translate/zh/saving-and-reading-results.mdx new file mode 100644 index 000000000..6f2e9dc91 --- /dev/null +++ b/docs/translate/zh/saving-and-reading-results.mdx @@ -0,0 +1,183 @@ +# 保存和读取结果 + +## 本地保存结果 + +Lighteval将自动在使用`--output-dir`选项设置的目录中保存结果和评估详情。结果将保存在`{output_dir}/results/{model_name}/results_{timestamp}.json`中。[这里有一个结果文件的示例](#结果文件示例)。输出路径可以是任何符合[fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html)的路径(本地、s3、hf hub、gdrive、ftp等)。 + +要保存评估的详细信息,您可以使用`--save-details`选项。详细信息将保存在parquet文件`{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`中。 + +## 将结果推送到HuggingFace hub + +您可以将结果和评估详情推送到HuggingFace hub。要这样做,您需要设置`--push-to-hub`以及`--results-org`选项。结果将保存在名为`{results_org}/{model_org}/{model_name}`的数据集中。要推送详细信息,您需要设置`--save-details`选项。 +默认情况下,创建的数据集将是私有的,您可以通过设置`--public-run`选项使其公开。 + + +## 
将结果推送到Tensorboard + +您可以通过设置`--push-to-tensorboard`将结果推送到Tensorboard。这将在使用`--results-org`选项设置的HF组织中创建一个Tensorboard仪表板。 + + +## 将结果推送到WandB + +您可以通过设置`--wandb`将结果推送到WandB。这将初始化一个WandB运行并记录结果。 + +Wandb参数需要在您的环境变量中设置。 + +``` +export WANDB_PROJECT="lighteval" +``` + +您可以在[wandb文档](https://docs.wandb.ai/guides/track/environment-variables/)中找到变量列表。 + + +## 如何加载和研究详细信息 + +### 从本地详细信息文件加载 + +```python +from datasets import load_dataset +import glob + +output_dir = "evals_doc" +model_name = "HuggingFaceH4/zephyr-7b-beta" +timestamp = "latest" +task = "lighteval|gsm8k|0" + +if timestamp == "latest": + path = f"{output_dir}/details/{model_name}/*/" + timestamps = glob.glob(path) + timestamp = sorted(timestamps)[-1].split("/")[-2] + print(f"Latest timestamp: {timestamp}") + +details_path = f"{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet" + +# 加载详细信息 +details = load_dataset("parquet", data_files=details_path, split="train") + +for detail in details: + print(detail) +``` + +### 从HuggingFace hub加载 + +```python +from datasets import load_dataset + +results_org = "SaylorTwift" +model_name = "HuggingFaceH4/zephyr-7b-beta" +sanitized_model_name = model_name.replace("/", "__") +task = "lighteval|gsm8k|0" +public_run = False + +dataset_path = f"{results_org}/details_{sanitized_model_name}{'_private' if not public_run else ''}" +details = load_dataset(dataset_path, task.replace("|", "_"), split="latest") + +for detail in details: + print(detail) +``` + + +详细信息文件包含以下列: +- `choices`:在多选任务的情况下,向模型呈现的选项。 +- `gold`:黄金答案。 +- `gold_index`:黄金答案在选项列表中的索引。 +- `cont_tokens`:续写的令牌。 +- `example`:文本形式的输入。 +- `full_prompt`:完整提示,将输入到模型中。 +- `input_tokens`:完整提示的令牌。 +- `instruction`:给模型的指令。 +- `metrics`:为示例计算的指标。 +- `num_asked_few_shots`:要求模型的少样本数量。 +- `num_effective_few_shots`:有效的少样本数量。 +- `padded`:输入是否被填充。 +- `pred_logits`:模型的logits。 +- `predictions`:模型的预测。 +- `specifics`:任务的具体细节。 +- `truncated`:输入是否被截断。 + + +## 结果文件示例 + +```json +{ + "config_general": { + 
"lighteval_sha": "203045a8431bc9b77245c9998e05fc54509ea07f", + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": 1, + "job_id": "", + "start_time": 620979.879320166, + "end_time": 621004.632108041, + "total_evaluation_time_secondes": "24.752787875011563", + "model_name": "gpt2", + "model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e", + "model_dtype": null, + "model_size": "476.2 MB" + }, + "results": { + "lighteval|gsm8k|0": { + "qem": 0.0, + "qem_stderr": 0.0, + "maj@8": 0.0, + "maj@8_stderr": 0.0 + }, + "all": { + "qem": 0.0, + "qem_stderr": 0.0, + "maj@8": 0.0, + "maj@8_stderr": 0.0 + } + }, + "versions": { + "lighteval|gsm8k|0": 0 + }, + "config_tasks": { + "lighteval|gsm8k": { + "name": "gsm8k", + "prompt_function": "gsm8k", + "hf_repo": "gsm8k", + "hf_subset": "main", + "metric": [ + { + "metric_name": "qem", + "higher_is_better": true, + "category": "3", + "use_case": "5", + "sample_level_fn": "compute", + "corpus_level_fn": "mean" + }, + { + "metric_name": "maj@8", + "higher_is_better": true, + "category": "5", + "use_case": "5", + "sample_level_fn": "compute", + "corpus_level_fn": "mean" + } + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 256, + "generation_grammar": null, + "stop_sequence": [ + "Question=" + ], + "num_samples": null, + "suite": [ + "lighteval" + ], + "original_num_docs": 1319, + "effective_num_docs": 1, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + } + } +} \ No newline at end of file diff --git a/docs/translate/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx b/docs/translate/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx new file mode 100644 index 000000000..3045f171e --- /dev/null +++ b/docs/translate/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx @@ -0,0 +1,50 @@ +# 在服务器或容器上评估模型 + 
+除了本地运行评估外,您还可以在兼容TGI的服务器或容器上部署模型,然后通过向服务器发送请求来执行评估。命令格式与前面相同,只需额外指定yaml配置文件的路径: + +```bash +lighteval endpoint {tgi,inference-endpoint} \ + "/path/to/config/file"\ + <任务参数> +``` + +在服务器上运行时,支持以下两种类型的配置文件: + +### Hugging Face Inference Endpoints + +要使用HuggingFace的Inference Endpoints部署模型,您需要提供`endpoint_model.yaml`配置文件。Lighteval会自动部署端点,运行评估,并在完成后删除端点(除非您指定使用已启动的端点,这种情况下评估结束后不会删除端点)。 + +__配置文件示例:__ + +```yaml +model_parameters: + reuse_existing: false # 如果设为true,将忽略实例中的所有参数,且评估后不删除端点 +# endpoint_name: "llama-2-7B-lighteval" # 名称必须使用小写字母,不含特殊字符 + model_name: "meta-llama/Llama-2-7b-hf" + revision: "main" # 默认为"main" + dtype: "float16" # 可选值包括"awq"、"eetq"、"gptq"、"4bit"或"8bit"(使用bitsandbytes)、"bfloat16"或"float16" + accelerator: "gpu" + region: "eu-west-1" + vendor: "aws" + instance_type: "nvidia-a10g" + instance_size: "x1" + framework: "pytorch" + endpoint_type: "protected" + namespace: null # 端点部署的命名空间,默认为当前用户的命名空间 + image_url: null # (可选)指定部署端点时使用的docker镜像,例如使用支持更新模型的最新TGI容器 + env_vars: + null # (可选)启动端点时设置的环境变量,例如:`MAX_INPUT_LENGTH: 2048` +``` + +### Text Generation Inference (TGI) + +如需使用已部署在TGI服务器上的模型(例如HuggingFace的无服务器推理服务): + +__配置文件示例:__ + +```yaml +model_parameters: + inference_server_address: "" + inference_server_auth: null + model_id: null # 可选,仅当TGI容器以指向本地目录的model_id启动时需要 +``` \ No newline at end of file diff --git a/docs/translate/zh/use-inference-providers-as-backend.mdx b/docs/translate/zh/use-inference-providers-as-backend.mdx new file mode 100644 index 000000000..dbe6228a3 --- /dev/null +++ b/docs/translate/zh/use-inference-providers-as-backend.mdx @@ -0,0 +1,41 @@ +# 使用Inference Providers作为后端 + +Lighteval支持通过Hugging Face的Inference Providers在多种服务提供商上评估大语言模型,包括Black Forest Labs、Cerebras、Fireworks AI、Nebius、Together AI等。 + +## 快速使用 + +> [!WARNING] +> 请务必设置您的HuggingFace API密钥。 +> 您可以通过`HF_TOKEN`环境变量或使用`huggingface-cli`命令来设置密钥。 + + +```bash +lighteval endpoint inference-providers \ + "model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \ + "lighteval|gsm8k|0|0" 
+``` + +## 使用配置文件 + +您可以通过配置文件来定义要使用的模型和服务提供商。 + +```bash +lighteval endpoint inference-providers \ + examples/model_configs/inference_providers.yaml \ + "lighteval|gsm8k|0|0" +``` + +配置文件示例: + +```yaml +model_parameters: + model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" + provider: "novita" + timeout: null + proxies: null + parallel_calls_count: 10 + generation_parameters: + temperature: 0.8 + top_k: 10 + max_new_tokens: 10000 +``` \ No newline at end of file diff --git a/docs/translate/zh/use-litellm-as-backend.mdx b/docs/translate/zh/use-litellm-as-backend.mdx new file mode 100644 index 000000000..22252acc9 --- /dev/null +++ b/docs/translate/zh/use-litellm-as-backend.mdx @@ -0,0 +1,38 @@ +# 使用Litellm作为后端 + +Lighteval支持使用litellm作为后端,这是一个统一接口工具,允许您以OpenAI格式调用各种LLM API(包括Bedrock、Huggingface、VertexAI、TogetherAI、Azure、OpenAI、Groq等)。 + +关于可用API和兼容端点的详细文档可在[此处](https://docs.litellm.ai/docs/)查阅。 + +## 快速使用 + +```bash +lighteval endpoint litellm \ + "provider=openai,model_name=gpt-3.5-turbo" \ + "lighteval|gsm8k|0|0" \ + --use-chat-template +``` + +> [!WARNING] +> 使用litellm时必须添加`--use-chat-template`参数才能正常工作。 + +## 使用配置文件 + +Litellm能够连接任何与OpenAI兼容的端点进行文本生成,例如,您可以评估在本地vllm服务器上运行的模型。 + +要实现这一点,您需要使用类似以下的配置文件: + +```yaml +model_parameters: + model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" + base_url: "您要使用的端点的URL" + api_key: "" # 根据需要删除或保留为空 + generation_parameters: + temperature: 0.5 + max_new_tokens: 256 + stop_tokens: [""] + top_p: 0.9 + seed: 0 + repetition_penalty: 1.0 + frequency_penalty: 0.0 +``` \ No newline at end of file diff --git a/docs/translate/zh/use-sglang-as-backend.mdx b/docs/translate/zh/use-sglang-as-backend.mdx new file mode 100644 index 000000000..e59fd0b63 --- /dev/null +++ b/docs/translate/zh/use-sglang-as-backend.mdx @@ -0,0 +1,77 @@ +# 使用SGLang作为后端 + +Lighteval支持使用`sglang`作为后端,这能显著提升评估速度。 +要启用此功能,只需在`model_args`中指定您希望传递给sglang的相关参数即可。 + +```bash +lighteval sglang \ + "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ + 
"leaderboard|truthfulqa:mc|0|0" +``` + +`sglang`能够通过数据并行和张量并行方式在多GPU环境中分布式部署模型。 +您可以在`model_args`中设置相应参数来选择合适的并行策略。 + +例如,如果您有4个GPU,可以使用`tp_size`参数实现张量并行: + +```bash +lighteval sglang \ + "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +或者,如果您的模型能够适合单个GPU,可以利用`dp_size`参数实现数据并行来加速评估过程: + +```bash +lighteval sglang \ + "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +## 使用配置文件 + +对于更高级的配置需求,您可以使用配置文件来定义模型参数。 +以下是一个示例配置文件,完整版本可在`examples/model_configs/sglang_model_config.yaml`中找到: + +```bash +lighteval sglang \ + "examples/model_configs/sglang_model_config.yaml" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +> [!TIP] +> sglang的详细配置参数文档可在[此处](https://docs.sglang.ai/backend/server_arguments.html)查阅 + +```yaml +model_parameters: + model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct" + dtype: "auto" + tp_size: 1 + dp_size: 1 + context_length: null + random_seed: 1 + trust_remote_code: False + use_chat_template: False + device: "cuda" + skip_tokenizer_init: False + kv_cache_dtype: "auto" + add_special_tokens: True + pairwise_tokenization: False + sampling_backend: null + attention_backend: null + mem_fraction_static: 0.8 + chunked_prefill_size: 4096 + generation_parameters: + max_new_tokens: 1024 + min_new_tokens: 0 + temperature: 1.0 + top_k: 50 + min_p: 0.0 + top_p: 1.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + frequency_penalty: 0.0 +``` + +> [!WARNING] +> 如果遇到内存溢出(OOM)问题,您可能需要减小模型的上下文窗口大小,并降低`mem_fraction_static`和`chunked_prefill_size`参数值。 \ No newline at end of file diff --git a/docs/translate/zh/use-vllm-as-backend.mdx b/docs/translate/zh/use-vllm-as-backend.mdx new file mode 100644 index 000000000..d41be7aa6 --- /dev/null +++ b/docs/translate/zh/use-vllm-as-backend.mdx @@ -0,0 +1,124 @@ +# 使用VLLM作为后端 + +Lighteval支持使用`vllm`作为后端,这能显著提升评估速度。 +要启用此功能,只需在`model_args`中指定您希望传递给vllm的相关参数即可。 + + +> [!TIP] +> 
vllm引擎的详细参数文档可在[此处](https://docs.vllm.ai/en/latest/serving/engine_args.html)查阅 + +```bash +lighteval vllm \ + "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +`vllm`能够通过数据并行、流水线并行或张量并行方式在多GPU环境中分布式部署模型。 +您可以在`model_args`中设置相应参数来选择合适的并行策略。 + +例如,如果您有4个GPU,可以使用`tensor_parallelism`将模型拆分: + +```bash +export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \ + "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +或者,如果您的模型能够适合单个GPU,可以利用`data_parallelism`来加速评估过程: + +```bash +lighteval vllm \ + "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +## 使用配置文件 + +对于更高级的配置需求,您可以使用配置文件来定义模型参数。 +以下是一个示例配置文件,完整版本可在`examples/model_configs/vllm_model_config.yaml`中找到: + +```bash +lighteval vllm \ + "examples/model_configs/vllm_model_config.yaml" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +```yaml +model_parameters: + model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct" + revision: "main" + dtype: "bfloat16" + tensor_parallel_size: 1 + data_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_length: 2048 + swap_space: 4 + seed: 1 + trust_remote_code: True + use_chat_template: True + add_special_tokens: True + multichoice_continuations_start_space: True + pairwise_tokenization: True + subfolder: null + generation_parameters: + presence_penalty: 0.0 + repetition_penalty: 1.0 + frequency_penalty: 0.0 + temperature: 1.0 + top_k: 50 + min_p: 0.0 + top_p: 1.0 + seed: 42 + stop_tokens: null + max_new_tokens: 1024 + min_new_tokens: 0 +``` + +> [!WARNING] +> 如果遇到内存溢出(OOM)问题,您可能需要减小模型的上下文窗口大小,并降低`gpu_memory_utilization`参数值。 + + +## 动态调整指标配置 + +对于特殊类型的指标,如`Pass@K`或LiveCodeBench的`codegen`指标,有时需要传递特定参数值,例如生成样本数量。这可以在`yaml`配置文件中通过以下方式实现: + +```yaml +model_parameters: + model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct" + revision: "main" + dtype: "bfloat16" + tensor_parallel_size: 
1 + data_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_length: 2048 + swap_space: 4 + seed: 1 + trust_remote_code: True + use_chat_template: True + add_special_tokens: True + multichoice_continuations_start_space: True + pairwise_tokenization: True + subfolder: null + generation_parameters: + presence_penalty: 0.0 + repetition_penalty: 1.0 + frequency_penalty: 0.0 + temperature: 1.0 + top_k: 50 + min_p: 0.0 + top_p: 1.0 + seed: 42 + stop_tokens: null + max_new_tokens: 1024 + min_new_tokens: 0 +metric_options: # 可选的指标参数 + codegen_pass@1:16: + num_samples: 16 +``` + +您可以通过在yaml文件中添加可选的`metric_options`键来自定义指标参数, +使用的指标名称应与`Metric.metric_name`中定义的一致。 +在上例中,我们为任务中定义的`codegen_pass@1:16`指标将`num_samples`值设置为16, +这会覆盖该指标原本的默认值。 \ No newline at end of file diff --git a/docs/translate/zh/using-the-python-api.mdx b/docs/translate/zh/using-the-python-api.mdx new file mode 100644 index 000000000..72e535f01 --- /dev/null +++ b/docs/translate/zh/using-the-python-api.mdx @@ -0,0 +1,63 @@ +# 使用Python API + +Lighteval可以从自定义Python脚本中使用。要评估模型,您需要设置 +[`~logging.evaluation_tracker.EvaluationTracker`]、[`~pipeline.PipelineParameters`]、 +[`model`](package_reference/models)或[`model_config`](package_reference/model_config) +以及[`~pipeline.Pipeline`]。 + +之后,只需运行流水线并保存结果。 + + +```python +import lighteval +from lighteval.logging.evaluation_tracker import EvaluationTracker +from lighteval.models.vllm.vllm_model import VLLMModelConfig +from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters +from lighteval.utils.utils import EnvConfig +from lighteval.utils.imports import is_accelerate_available + +if is_accelerate_available(): + from datetime import timedelta + from accelerate import Accelerator, InitProcessGroupKwargs + accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) +else: + accelerator = None + +def main(): + evaluation_tracker = EvaluationTracker( + output_dir="./results", + 
save_details=True, + push_to_hub=True, + hub_results_org="您的用户名", + ) + + pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.ACCELERATE, + env_config=EnvConfig(cache_dir="tmp/"), + custom_task_directory=None, # 如果使用自定义任务 + # 一旦您的配置经过测试,删除以下2个参数 + override_batch_size=1, + max_samples=10 + ) + + model_config = VLLMModelConfig( + model_name="HuggingFaceH4/zephyr-7b-beta", + dtype="float16", + use_chat_template=True, + ) + + task = "helm|mmlu|5|1" + + pipeline = Pipeline( + tasks=task, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + pipeline.save_and_push_results() + pipeline.show_results() + +if __name__ == "__main__": + main() \ No newline at end of file From 0cbce02ab3edac476de5c65556624f6ab40117d6 Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Mon, 19 May 2025 12:09:16 +0000 Subject: [PATCH 2/8] fix: rename docs/translate to docs/translations for correct noun usage in Chinese docs --- docs/{translate => translations}/zh/_toctree.yml | 0 docs/{translate => translations}/zh/adding-a-custom-task.mdx | 0 docs/{translate => translations}/zh/adding-a-new-metric.mdx | 0 docs/{translate => translations}/zh/available-tasks.mdx | 0 .../zh/contributing-to-multilingual-evaluations.mdx | 0 docs/{translate => translations}/zh/evaluating-a-custom-model.mdx | 0 docs/{translate => translations}/zh/index.mdx | 0 docs/{translate => translations}/zh/installation.mdx | 0 docs/{translate => translations}/zh/metric-list.mdx | 0 .../zh/package_reference/evaluation_tracker.mdx | 0 docs/{translate => translations}/zh/package_reference/logging.mdx | 0 docs/{translate => translations}/zh/package_reference/metrics.mdx | 0 docs/{translate => translations}/zh/package_reference/models.mdx | 0 .../{translate => translations}/zh/package_reference/pipeline.mdx | 0 docs/{translate => translations}/zh/package_reference/tasks.mdx | 0 docs/{translate => translations}/zh/quicktour.mdx | 0 
.../{translate => translations}/zh/saving-and-reading-results.mdx | 0 .../zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx | 0 .../zh/use-inference-providers-as-backend.mdx | 0 docs/{translate => translations}/zh/use-litellm-as-backend.mdx | 0 docs/{translate => translations}/zh/use-sglang-as-backend.mdx | 0 docs/{translate => translations}/zh/use-vllm-as-backend.mdx | 0 docs/{translate => translations}/zh/using-the-python-api.mdx | 0 23 files changed, 0 insertions(+), 0 deletions(-) rename docs/{translate => translations}/zh/_toctree.yml (100%) rename docs/{translate => translations}/zh/adding-a-custom-task.mdx (100%) rename docs/{translate => translations}/zh/adding-a-new-metric.mdx (100%) rename docs/{translate => translations}/zh/available-tasks.mdx (100%) rename docs/{translate => translations}/zh/contributing-to-multilingual-evaluations.mdx (100%) rename docs/{translate => translations}/zh/evaluating-a-custom-model.mdx (100%) rename docs/{translate => translations}/zh/index.mdx (100%) rename docs/{translate => translations}/zh/installation.mdx (100%) rename docs/{translate => translations}/zh/metric-list.mdx (100%) rename docs/{translate => translations}/zh/package_reference/evaluation_tracker.mdx (100%) rename docs/{translate => translations}/zh/package_reference/logging.mdx (100%) rename docs/{translate => translations}/zh/package_reference/metrics.mdx (100%) rename docs/{translate => translations}/zh/package_reference/models.mdx (100%) rename docs/{translate => translations}/zh/package_reference/pipeline.mdx (100%) rename docs/{translate => translations}/zh/package_reference/tasks.mdx (100%) rename docs/{translate => translations}/zh/quicktour.mdx (100%) rename docs/{translate => translations}/zh/saving-and-reading-results.mdx (100%) rename docs/{translate => translations}/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx (100%) rename docs/{translate => translations}/zh/use-inference-providers-as-backend.mdx (100%) rename 
docs/{translate => translations}/zh/use-litellm-as-backend.mdx (100%) rename docs/{translate => translations}/zh/use-sglang-as-backend.mdx (100%) rename docs/{translate => translations}/zh/use-vllm-as-backend.mdx (100%) rename docs/{translate => translations}/zh/using-the-python-api.mdx (100%) diff --git a/docs/translate/zh/_toctree.yml b/docs/translations/zh/_toctree.yml similarity index 100% rename from docs/translate/zh/_toctree.yml rename to docs/translations/zh/_toctree.yml diff --git a/docs/translate/zh/adding-a-custom-task.mdx b/docs/translations/zh/adding-a-custom-task.mdx similarity index 100% rename from docs/translate/zh/adding-a-custom-task.mdx rename to docs/translations/zh/adding-a-custom-task.mdx diff --git a/docs/translate/zh/adding-a-new-metric.mdx b/docs/translations/zh/adding-a-new-metric.mdx similarity index 100% rename from docs/translate/zh/adding-a-new-metric.mdx rename to docs/translations/zh/adding-a-new-metric.mdx diff --git a/docs/translate/zh/available-tasks.mdx b/docs/translations/zh/available-tasks.mdx similarity index 100% rename from docs/translate/zh/available-tasks.mdx rename to docs/translations/zh/available-tasks.mdx diff --git a/docs/translate/zh/contributing-to-multilingual-evaluations.mdx b/docs/translations/zh/contributing-to-multilingual-evaluations.mdx similarity index 100% rename from docs/translate/zh/contributing-to-multilingual-evaluations.mdx rename to docs/translations/zh/contributing-to-multilingual-evaluations.mdx diff --git a/docs/translate/zh/evaluating-a-custom-model.mdx b/docs/translations/zh/evaluating-a-custom-model.mdx similarity index 100% rename from docs/translate/zh/evaluating-a-custom-model.mdx rename to docs/translations/zh/evaluating-a-custom-model.mdx diff --git a/docs/translate/zh/index.mdx b/docs/translations/zh/index.mdx similarity index 100% rename from docs/translate/zh/index.mdx rename to docs/translations/zh/index.mdx diff --git a/docs/translate/zh/installation.mdx 
b/docs/translations/zh/installation.mdx similarity index 100% rename from docs/translate/zh/installation.mdx rename to docs/translations/zh/installation.mdx diff --git a/docs/translate/zh/metric-list.mdx b/docs/translations/zh/metric-list.mdx similarity index 100% rename from docs/translate/zh/metric-list.mdx rename to docs/translations/zh/metric-list.mdx diff --git a/docs/translate/zh/package_reference/evaluation_tracker.mdx b/docs/translations/zh/package_reference/evaluation_tracker.mdx similarity index 100% rename from docs/translate/zh/package_reference/evaluation_tracker.mdx rename to docs/translations/zh/package_reference/evaluation_tracker.mdx diff --git a/docs/translate/zh/package_reference/logging.mdx b/docs/translations/zh/package_reference/logging.mdx similarity index 100% rename from docs/translate/zh/package_reference/logging.mdx rename to docs/translations/zh/package_reference/logging.mdx diff --git a/docs/translate/zh/package_reference/metrics.mdx b/docs/translations/zh/package_reference/metrics.mdx similarity index 100% rename from docs/translate/zh/package_reference/metrics.mdx rename to docs/translations/zh/package_reference/metrics.mdx diff --git a/docs/translate/zh/package_reference/models.mdx b/docs/translations/zh/package_reference/models.mdx similarity index 100% rename from docs/translate/zh/package_reference/models.mdx rename to docs/translations/zh/package_reference/models.mdx diff --git a/docs/translate/zh/package_reference/pipeline.mdx b/docs/translations/zh/package_reference/pipeline.mdx similarity index 100% rename from docs/translate/zh/package_reference/pipeline.mdx rename to docs/translations/zh/package_reference/pipeline.mdx diff --git a/docs/translate/zh/package_reference/tasks.mdx b/docs/translations/zh/package_reference/tasks.mdx similarity index 100% rename from docs/translate/zh/package_reference/tasks.mdx rename to docs/translations/zh/package_reference/tasks.mdx diff --git a/docs/translate/zh/quicktour.mdx 
b/docs/translations/zh/quicktour.mdx similarity index 100% rename from docs/translate/zh/quicktour.mdx rename to docs/translations/zh/quicktour.mdx diff --git a/docs/translate/zh/saving-and-reading-results.mdx b/docs/translations/zh/saving-and-reading-results.mdx similarity index 100% rename from docs/translate/zh/saving-and-reading-results.mdx rename to docs/translations/zh/saving-and-reading-results.mdx diff --git a/docs/translate/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx b/docs/translations/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx similarity index 100% rename from docs/translate/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx rename to docs/translations/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx diff --git a/docs/translate/zh/use-inference-providers-as-backend.mdx b/docs/translations/zh/use-inference-providers-as-backend.mdx similarity index 100% rename from docs/translate/zh/use-inference-providers-as-backend.mdx rename to docs/translations/zh/use-inference-providers-as-backend.mdx diff --git a/docs/translate/zh/use-litellm-as-backend.mdx b/docs/translations/zh/use-litellm-as-backend.mdx similarity index 100% rename from docs/translate/zh/use-litellm-as-backend.mdx rename to docs/translations/zh/use-litellm-as-backend.mdx diff --git a/docs/translate/zh/use-sglang-as-backend.mdx b/docs/translations/zh/use-sglang-as-backend.mdx similarity index 100% rename from docs/translate/zh/use-sglang-as-backend.mdx rename to docs/translations/zh/use-sglang-as-backend.mdx diff --git a/docs/translate/zh/use-vllm-as-backend.mdx b/docs/translations/zh/use-vllm-as-backend.mdx similarity index 100% rename from docs/translate/zh/use-vllm-as-backend.mdx rename to docs/translations/zh/use-vllm-as-backend.mdx diff --git a/docs/translate/zh/using-the-python-api.mdx b/docs/translations/zh/using-the-python-api.mdx similarity index 100% rename from docs/translate/zh/using-the-python-api.mdx rename to 
docs/translations/zh/using-the-python-api.mdx From 39e659886ff37796d47efb833b15b488c400aeb7 Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Mon, 19 May 2025 14:28:35 +0000 Subject: [PATCH 3/8] fix: optimize Chinese description for few_shots_split field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the description of the few_shots_split field in docs/translations/zh/adding-a-custom-task.mdx from “您想从中选择样本作为少量样本示例的特定分割” to “您想从中选择少量示例样本的特定数据划分”. --- docs/translations/zh/adding-a-custom-task.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/translations/zh/adding-a-custom-task.mdx b/docs/translations/zh/adding-a-custom-task.mdx index 555e73bc8..a983ca119 100644 --- a/docs/translations/zh/adding-a-custom-task.mdx +++ b/docs/translations/zh/adding-a-custom-task.mdx @@ -108,7 +108,7 @@ SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for - `hf_subset` (str),您想用于评估的特定子集(注意:当数据集没有子集时,请用`"default"`填充此字段,而不是用`None`或`""`) - `hf_avail_splits` (list),您的数据集可用的所有分割(训练、验证、测试、其他...) - `evaluation_splits` (list),您想用于评估的分割 -- `few_shots_split` (str,可以为`null`),您想从中选择样本作为少量样本示例的特定分割。它应该与`evaluation_splits`中包含的集合不同 +- `few_shots_split` (str,可以为`null`),您想从中选择少量示例样本的特定数据划分。它应该与`evaluation_splits`中包含的集合不同 - `few_shots_select` (str,可以为`null`),您将用来为少量样本示例选择项目的方法。可以为`null`,或以下之一: - `balanced` 从`few_shots_split`中选择带有平衡标签的示例,以避免将少量样本示例(因此是模型生成)偏向特定标签 - `random` 从`few_shots_split`中随机选择示例 From c0de297b2db550a52c1639762d39ab3bc0e7f1e5 Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Mon, 19 May 2025 14:49:01 +0000 Subject: [PATCH 4/8] refactor(docs): reorganize documentation structure for multilingual support Rename docs/source to docs/source/en to explicitly separate English documentation. Move docs/translations/zh to docs/source/zh to follow the standard multilingual documentation structure. Update CI configuration to support both English and Chinese documentation builds. 
--- .github/workflows/doc-build.yml | 1 + docs/source/{ => en}/_toctree.yml | 0 docs/source/{ => en}/adding-a-custom-task.mdx | 0 docs/source/{ => en}/adding-a-new-metric.mdx | 0 docs/source/{ => en}/available-tasks.mdx | 0 .../source/{ => en}/contributing-to-multilingual-evaluations.mdx | 0 docs/source/{ => en}/evaluating-a-custom-model.mdx | 0 docs/source/{ => en}/index.mdx | 0 docs/source/{ => en}/installation.mdx | 0 docs/source/{ => en}/metric-list.mdx | 0 docs/source/{ => en}/package_reference/evaluation_tracker.mdx | 0 docs/source/{ => en}/package_reference/logging.mdx | 0 docs/source/{ => en}/package_reference/metrics.mdx | 0 docs/source/{ => en}/package_reference/models.mdx | 0 docs/source/{ => en}/package_reference/pipeline.mdx | 0 docs/source/{ => en}/package_reference/tasks.mdx | 0 docs/source/{ => en}/quicktour.mdx | 0 docs/source/{ => en}/saving-and-reading-results.mdx | 0 .../use-huggingface-inference-endpoints-or-tgi-as-backend.mdx | 0 docs/source/{ => en}/use-inference-providers-as-backend.mdx | 0 docs/source/{ => en}/use-litellm-as-backend.mdx | 0 docs/source/{ => en}/use-sglang-as-backend.mdx | 0 docs/source/{ => en}/use-vllm-as-backend.mdx | 0 docs/source/{ => en}/using-the-python-api.mdx | 0 docs/{translations => source}/zh/_toctree.yml | 0 docs/{translations => source}/zh/adding-a-custom-task.mdx | 0 docs/{translations => source}/zh/adding-a-new-metric.mdx | 0 docs/{translations => source}/zh/available-tasks.mdx | 0 .../zh/contributing-to-multilingual-evaluations.mdx | 0 docs/{translations => source}/zh/evaluating-a-custom-model.mdx | 0 docs/{translations => source}/zh/index.mdx | 0 docs/{translations => source}/zh/installation.mdx | 0 docs/{translations => source}/zh/metric-list.mdx | 0 .../zh/package_reference/evaluation_tracker.mdx | 0 docs/{translations => source}/zh/package_reference/logging.mdx | 0 docs/{translations => source}/zh/package_reference/metrics.mdx | 0 docs/{translations => source}/zh/package_reference/models.mdx | 0 
docs/{translations => source}/zh/package_reference/pipeline.mdx | 0 docs/{translations => source}/zh/package_reference/tasks.mdx | 0 docs/{translations => source}/zh/quicktour.mdx | 0 docs/{translations => source}/zh/saving-and-reading-results.mdx | 0 .../zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx | 0 .../zh/use-inference-providers-as-backend.mdx | 0 docs/{translations => source}/zh/use-litellm-as-backend.mdx | 0 docs/{translations => source}/zh/use-sglang-as-backend.mdx | 0 docs/{translations => source}/zh/use-vllm-as-backend.mdx | 0 docs/{translations => source}/zh/using-the-python-api.mdx | 0 47 files changed, 1 insertion(+) rename docs/source/{ => en}/_toctree.yml (100%) rename docs/source/{ => en}/adding-a-custom-task.mdx (100%) rename docs/source/{ => en}/adding-a-new-metric.mdx (100%) rename docs/source/{ => en}/available-tasks.mdx (100%) rename docs/source/{ => en}/contributing-to-multilingual-evaluations.mdx (100%) rename docs/source/{ => en}/evaluating-a-custom-model.mdx (100%) rename docs/source/{ => en}/index.mdx (100%) rename docs/source/{ => en}/installation.mdx (100%) rename docs/source/{ => en}/metric-list.mdx (100%) rename docs/source/{ => en}/package_reference/evaluation_tracker.mdx (100%) rename docs/source/{ => en}/package_reference/logging.mdx (100%) rename docs/source/{ => en}/package_reference/metrics.mdx (100%) rename docs/source/{ => en}/package_reference/models.mdx (100%) rename docs/source/{ => en}/package_reference/pipeline.mdx (100%) rename docs/source/{ => en}/package_reference/tasks.mdx (100%) rename docs/source/{ => en}/quicktour.mdx (100%) rename docs/source/{ => en}/saving-and-reading-results.mdx (100%) rename docs/source/{ => en}/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx (100%) rename docs/source/{ => en}/use-inference-providers-as-backend.mdx (100%) rename docs/source/{ => en}/use-litellm-as-backend.mdx (100%) rename docs/source/{ => en}/use-sglang-as-backend.mdx (100%) rename docs/source/{ => 
en}/use-vllm-as-backend.mdx (100%) rename docs/source/{ => en}/using-the-python-api.mdx (100%) rename docs/{translations => source}/zh/_toctree.yml (100%) rename docs/{translations => source}/zh/adding-a-custom-task.mdx (100%) rename docs/{translations => source}/zh/adding-a-new-metric.mdx (100%) rename docs/{translations => source}/zh/available-tasks.mdx (100%) rename docs/{translations => source}/zh/contributing-to-multilingual-evaluations.mdx (100%) rename docs/{translations => source}/zh/evaluating-a-custom-model.mdx (100%) rename docs/{translations => source}/zh/index.mdx (100%) rename docs/{translations => source}/zh/installation.mdx (100%) rename docs/{translations => source}/zh/metric-list.mdx (100%) rename docs/{translations => source}/zh/package_reference/evaluation_tracker.mdx (100%) rename docs/{translations => source}/zh/package_reference/logging.mdx (100%) rename docs/{translations => source}/zh/package_reference/metrics.mdx (100%) rename docs/{translations => source}/zh/package_reference/models.mdx (100%) rename docs/{translations => source}/zh/package_reference/pipeline.mdx (100%) rename docs/{translations => source}/zh/package_reference/tasks.mdx (100%) rename docs/{translations => source}/zh/quicktour.mdx (100%) rename docs/{translations => source}/zh/saving-and-reading-results.mdx (100%) rename docs/{translations => source}/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx (100%) rename docs/{translations => source}/zh/use-inference-providers-as-backend.mdx (100%) rename docs/{translations => source}/zh/use-litellm-as-backend.mdx (100%) rename docs/{translations => source}/zh/use-sglang-as-backend.mdx (100%) rename docs/{translations => source}/zh/use-vllm-as-backend.mdx (100%) rename docs/{translations => source}/zh/using-the-python-api.mdx (100%) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index cd345d3d3..54a42e3cb 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml 
@@ -12,6 +12,7 @@ jobs: uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main with: commit_sha: ${{ github.sha }} + languages: en zh package: lighteval secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/docs/source/_toctree.yml b/docs/source/en/_toctree.yml similarity index 100% rename from docs/source/_toctree.yml rename to docs/source/en/_toctree.yml diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/en/adding-a-custom-task.mdx similarity index 100% rename from docs/source/adding-a-custom-task.mdx rename to docs/source/en/adding-a-custom-task.mdx diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/en/adding-a-new-metric.mdx similarity index 100% rename from docs/source/adding-a-new-metric.mdx rename to docs/source/en/adding-a-new-metric.mdx diff --git a/docs/source/available-tasks.mdx b/docs/source/en/available-tasks.mdx similarity index 100% rename from docs/source/available-tasks.mdx rename to docs/source/en/available-tasks.mdx diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/en/contributing-to-multilingual-evaluations.mdx similarity index 100% rename from docs/source/contributing-to-multilingual-evaluations.mdx rename to docs/source/en/contributing-to-multilingual-evaluations.mdx diff --git a/docs/source/evaluating-a-custom-model.mdx b/docs/source/en/evaluating-a-custom-model.mdx similarity index 100% rename from docs/source/evaluating-a-custom-model.mdx rename to docs/source/en/evaluating-a-custom-model.mdx diff --git a/docs/source/index.mdx b/docs/source/en/index.mdx similarity index 100% rename from docs/source/index.mdx rename to docs/source/en/index.mdx diff --git a/docs/source/installation.mdx b/docs/source/en/installation.mdx similarity index 100% rename from docs/source/installation.mdx rename to docs/source/en/installation.mdx diff --git a/docs/source/metric-list.mdx b/docs/source/en/metric-list.mdx similarity index 100% rename from 
docs/source/metric-list.mdx rename to docs/source/en/metric-list.mdx diff --git a/docs/source/package_reference/evaluation_tracker.mdx b/docs/source/en/package_reference/evaluation_tracker.mdx similarity index 100% rename from docs/source/package_reference/evaluation_tracker.mdx rename to docs/source/en/package_reference/evaluation_tracker.mdx diff --git a/docs/source/package_reference/logging.mdx b/docs/source/en/package_reference/logging.mdx similarity index 100% rename from docs/source/package_reference/logging.mdx rename to docs/source/en/package_reference/logging.mdx diff --git a/docs/source/package_reference/metrics.mdx b/docs/source/en/package_reference/metrics.mdx similarity index 100% rename from docs/source/package_reference/metrics.mdx rename to docs/source/en/package_reference/metrics.mdx diff --git a/docs/source/package_reference/models.mdx b/docs/source/en/package_reference/models.mdx similarity index 100% rename from docs/source/package_reference/models.mdx rename to docs/source/en/package_reference/models.mdx diff --git a/docs/source/package_reference/pipeline.mdx b/docs/source/en/package_reference/pipeline.mdx similarity index 100% rename from docs/source/package_reference/pipeline.mdx rename to docs/source/en/package_reference/pipeline.mdx diff --git a/docs/source/package_reference/tasks.mdx b/docs/source/en/package_reference/tasks.mdx similarity index 100% rename from docs/source/package_reference/tasks.mdx rename to docs/source/en/package_reference/tasks.mdx diff --git a/docs/source/quicktour.mdx b/docs/source/en/quicktour.mdx similarity index 100% rename from docs/source/quicktour.mdx rename to docs/source/en/quicktour.mdx diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/en/saving-and-reading-results.mdx similarity index 100% rename from docs/source/saving-and-reading-results.mdx rename to docs/source/en/saving-and-reading-results.mdx diff --git a/docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx 
b/docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx similarity index 100% rename from docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx rename to docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx diff --git a/docs/source/use-inference-providers-as-backend.mdx b/docs/source/en/use-inference-providers-as-backend.mdx similarity index 100% rename from docs/source/use-inference-providers-as-backend.mdx rename to docs/source/en/use-inference-providers-as-backend.mdx diff --git a/docs/source/use-litellm-as-backend.mdx b/docs/source/en/use-litellm-as-backend.mdx similarity index 100% rename from docs/source/use-litellm-as-backend.mdx rename to docs/source/en/use-litellm-as-backend.mdx diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/en/use-sglang-as-backend.mdx similarity index 100% rename from docs/source/use-sglang-as-backend.mdx rename to docs/source/en/use-sglang-as-backend.mdx diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/en/use-vllm-as-backend.mdx similarity index 100% rename from docs/source/use-vllm-as-backend.mdx rename to docs/source/en/use-vllm-as-backend.mdx diff --git a/docs/source/using-the-python-api.mdx b/docs/source/en/using-the-python-api.mdx similarity index 100% rename from docs/source/using-the-python-api.mdx rename to docs/source/en/using-the-python-api.mdx diff --git a/docs/translations/zh/_toctree.yml b/docs/source/zh/_toctree.yml similarity index 100% rename from docs/translations/zh/_toctree.yml rename to docs/source/zh/_toctree.yml diff --git a/docs/translations/zh/adding-a-custom-task.mdx b/docs/source/zh/adding-a-custom-task.mdx similarity index 100% rename from docs/translations/zh/adding-a-custom-task.mdx rename to docs/source/zh/adding-a-custom-task.mdx diff --git a/docs/translations/zh/adding-a-new-metric.mdx b/docs/source/zh/adding-a-new-metric.mdx similarity index 100% rename from docs/translations/zh/adding-a-new-metric.mdx rename to 
docs/source/zh/adding-a-new-metric.mdx diff --git a/docs/translations/zh/available-tasks.mdx b/docs/source/zh/available-tasks.mdx similarity index 100% rename from docs/translations/zh/available-tasks.mdx rename to docs/source/zh/available-tasks.mdx diff --git a/docs/translations/zh/contributing-to-multilingual-evaluations.mdx b/docs/source/zh/contributing-to-multilingual-evaluations.mdx similarity index 100% rename from docs/translations/zh/contributing-to-multilingual-evaluations.mdx rename to docs/source/zh/contributing-to-multilingual-evaluations.mdx diff --git a/docs/translations/zh/evaluating-a-custom-model.mdx b/docs/source/zh/evaluating-a-custom-model.mdx similarity index 100% rename from docs/translations/zh/evaluating-a-custom-model.mdx rename to docs/source/zh/evaluating-a-custom-model.mdx diff --git a/docs/translations/zh/index.mdx b/docs/source/zh/index.mdx similarity index 100% rename from docs/translations/zh/index.mdx rename to docs/source/zh/index.mdx diff --git a/docs/translations/zh/installation.mdx b/docs/source/zh/installation.mdx similarity index 100% rename from docs/translations/zh/installation.mdx rename to docs/source/zh/installation.mdx diff --git a/docs/translations/zh/metric-list.mdx b/docs/source/zh/metric-list.mdx similarity index 100% rename from docs/translations/zh/metric-list.mdx rename to docs/source/zh/metric-list.mdx diff --git a/docs/translations/zh/package_reference/evaluation_tracker.mdx b/docs/source/zh/package_reference/evaluation_tracker.mdx similarity index 100% rename from docs/translations/zh/package_reference/evaluation_tracker.mdx rename to docs/source/zh/package_reference/evaluation_tracker.mdx diff --git a/docs/translations/zh/package_reference/logging.mdx b/docs/source/zh/package_reference/logging.mdx similarity index 100% rename from docs/translations/zh/package_reference/logging.mdx rename to docs/source/zh/package_reference/logging.mdx diff --git a/docs/translations/zh/package_reference/metrics.mdx 
b/docs/source/zh/package_reference/metrics.mdx similarity index 100% rename from docs/translations/zh/package_reference/metrics.mdx rename to docs/source/zh/package_reference/metrics.mdx diff --git a/docs/translations/zh/package_reference/models.mdx b/docs/source/zh/package_reference/models.mdx similarity index 100% rename from docs/translations/zh/package_reference/models.mdx rename to docs/source/zh/package_reference/models.mdx diff --git a/docs/translations/zh/package_reference/pipeline.mdx b/docs/source/zh/package_reference/pipeline.mdx similarity index 100% rename from docs/translations/zh/package_reference/pipeline.mdx rename to docs/source/zh/package_reference/pipeline.mdx diff --git a/docs/translations/zh/package_reference/tasks.mdx b/docs/source/zh/package_reference/tasks.mdx similarity index 100% rename from docs/translations/zh/package_reference/tasks.mdx rename to docs/source/zh/package_reference/tasks.mdx diff --git a/docs/translations/zh/quicktour.mdx b/docs/source/zh/quicktour.mdx similarity index 100% rename from docs/translations/zh/quicktour.mdx rename to docs/source/zh/quicktour.mdx diff --git a/docs/translations/zh/saving-and-reading-results.mdx b/docs/source/zh/saving-and-reading-results.mdx similarity index 100% rename from docs/translations/zh/saving-and-reading-results.mdx rename to docs/source/zh/saving-and-reading-results.mdx diff --git a/docs/translations/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx b/docs/source/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx similarity index 100% rename from docs/translations/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx rename to docs/source/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx diff --git a/docs/translations/zh/use-inference-providers-as-backend.mdx b/docs/source/zh/use-inference-providers-as-backend.mdx similarity index 100% rename from docs/translations/zh/use-inference-providers-as-backend.mdx rename to 
docs/source/zh/use-inference-providers-as-backend.mdx diff --git a/docs/translations/zh/use-litellm-as-backend.mdx b/docs/source/zh/use-litellm-as-backend.mdx similarity index 100% rename from docs/translations/zh/use-litellm-as-backend.mdx rename to docs/source/zh/use-litellm-as-backend.mdx diff --git a/docs/translations/zh/use-sglang-as-backend.mdx b/docs/source/zh/use-sglang-as-backend.mdx similarity index 100% rename from docs/translations/zh/use-sglang-as-backend.mdx rename to docs/source/zh/use-sglang-as-backend.mdx diff --git a/docs/translations/zh/use-vllm-as-backend.mdx b/docs/source/zh/use-vllm-as-backend.mdx similarity index 100% rename from docs/translations/zh/use-vllm-as-backend.mdx rename to docs/source/zh/use-vllm-as-backend.mdx diff --git a/docs/translations/zh/using-the-python-api.mdx b/docs/source/zh/using-the-python-api.mdx similarity index 100% rename from docs/translations/zh/using-the-python-api.mdx rename to docs/source/zh/using-the-python-api.mdx From ebce5b908f50e87115e5983f9f73300f0f6d68ff Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Mon, 19 May 2025 16:05:12 +0000 Subject: [PATCH 5/8] chore: update doc-pr workflow yml files --- .github/workflows/doc-pr-build.yml | 2 ++ .github/workflows/doc-pr-upload.yml | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/doc-pr-build.yml b/.github/workflows/doc-pr-build.yml index f96e20583..a8e5e9b2c 100644 --- a/.github/workflows/doc-pr-build.yml +++ b/.github/workflows/doc-pr-build.yml @@ -14,3 +14,5 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: lighteval + languages: en zh + diff --git a/.github/workflows/doc-pr-upload.yml b/.github/workflows/doc-pr-upload.yml index ab6f32d7a..2b450ea1b 100644 --- a/.github/workflows/doc-pr-upload.yml +++ b/.github/workflows/doc-pr-upload.yml @@ -11,6 +11,7 @@ jobs: uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main with: package_name: lighteval + 
languages: en zh secrets: hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} From 7c83b4179191daa79c2081ba4c15b175d8182184 Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Sat, 24 May 2025 17:37:40 +0000 Subject: [PATCH 6/8] fix(docs): rename translation files to .mdx extension to resolve build failures This commit updates the file extensions for Chinese translation files from .md to .mdx to ensure compatibility with the documentation build system and prevent build failures. --- docs/source/zh/index.mdx | 20 +++++++++++++++++++ docs/source/zh/installation.mdx | 20 +++++++++++++++++++ docs/source/zh/quicktour.mdx | 20 +++++++++++++++++++ docs/source/zh/saving-and-reading-results.mdx | 20 +++++++++++++++++++ docs/source/zh/using-the-python-api.mdx | 20 +++++++++++++++++++ 5 files changed, 100 insertions(+) diff --git a/docs/source/zh/index.mdx b/docs/source/zh/index.mdx index a2673e067..31dcf91aa 100644 --- a/docs/source/zh/index.mdx +++ b/docs/source/zh/index.mdx @@ -1,3 +1,23 @@ + + # Lighteval 🤗 Lighteval 是一款多功能的大语言模型评估工具箱,支持多种后端平台 — 无论您使用的是 diff --git a/docs/source/zh/installation.mdx b/docs/source/zh/installation.mdx index 5c14ec019..0f3f2e1aa 100644 --- a/docs/source/zh/installation.mdx +++ b/docs/source/zh/installation.mdx @@ -1,3 +1,23 @@ + + # 安装 Lighteval可以通过PyPi或源代码两种方式安装。 diff --git a/docs/source/zh/quicktour.mdx b/docs/source/zh/quicktour.mdx index abf33a830..bfb35e6a9 100644 --- a/docs/source/zh/quicktour.mdx +++ b/docs/source/zh/quicktour.mdx @@ -1,3 +1,23 @@ + + # 快速上手 diff --git a/docs/source/zh/saving-and-reading-results.mdx b/docs/source/zh/saving-and-reading-results.mdx index 6f2e9dc91..41eebd025 100644 --- a/docs/source/zh/saving-and-reading-results.mdx +++ b/docs/source/zh/saving-and-reading-results.mdx @@ -1,3 +1,23 @@ + + # 保存和读取结果 ## 本地保存结果 diff --git a/docs/source/zh/using-the-python-api.mdx b/docs/source/zh/using-the-python-api.mdx index 72e535f01..c74fca156 100644 --- 
a/docs/source/zh/using-the-python-api.mdx +++ b/docs/source/zh/using-the-python-api.mdx @@ -1,3 +1,23 @@ + + # 使用Python API Lighteval可以从自定义Python脚本中使用。要评估模型,您需要设置 From 401778dc57d62ca4503f1a8db49c70c934d98a45 Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Sat, 24 May 2025 17:50:05 +0000 Subject: [PATCH 7/8] =?UTF-8?q?=E9=87=8D=E6=9E=84=E6=96=87=E6=A1=A3?= =?UTF-8?q?=E7=BB=93=E6=9E=84=E4=BB=A5=E5=8C=B9=E9=85=8D=E4=B8=8A=E6=B8=B8?= =?UTF-8?q?=E4=BB=93=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 将所有英文文档从 docs/source/en/ 移动到 docs/source/ - 保持中文翻译在 docs/source/zh/ 目录不变 - 同步上游 lighteval 仓库的最新文档结构变更 --- docs/source/{en => }/_toctree.yml | 0 docs/source/{en => }/adding-a-custom-task.mdx | 0 docs/source/{en => }/adding-a-new-metric.mdx | 0 docs/source/{en => }/available-tasks.mdx | 0 docs/source/{en => }/contributing-to-multilingual-evaluations.mdx | 0 docs/source/{en => }/evaluating-a-custom-model.mdx | 0 docs/source/{en => }/index.mdx | 0 docs/source/{en => }/installation.mdx | 0 docs/source/{en => }/metric-list.mdx | 0 docs/source/{en => }/package_reference/evaluation_tracker.mdx | 0 docs/source/{en => }/package_reference/logging.mdx | 0 docs/source/{en => }/package_reference/metrics.mdx | 0 docs/source/{en => }/package_reference/models.mdx | 0 docs/source/{en => }/package_reference/pipeline.mdx | 0 docs/source/{en => }/package_reference/tasks.mdx | 0 docs/source/{en => }/quicktour.mdx | 0 docs/source/{en => }/saving-and-reading-results.mdx | 0 .../use-huggingface-inference-endpoints-or-tgi-as-backend.mdx | 0 docs/source/{en => }/use-inference-providers-as-backend.mdx | 0 docs/source/{en => }/use-litellm-as-backend.mdx | 0 docs/source/{en => }/use-sglang-as-backend.mdx | 0 docs/source/{en => }/use-vllm-as-backend.mdx | 0 docs/source/{en => }/using-the-python-api.mdx | 0 23 files changed, 0 insertions(+), 0 deletions(-) rename docs/source/{en => }/_toctree.yml (100%) rename docs/source/{en => 
}/adding-a-custom-task.mdx (100%) rename docs/source/{en => }/adding-a-new-metric.mdx (100%) rename docs/source/{en => }/available-tasks.mdx (100%) rename docs/source/{en => }/contributing-to-multilingual-evaluations.mdx (100%) rename docs/source/{en => }/evaluating-a-custom-model.mdx (100%) rename docs/source/{en => }/index.mdx (100%) rename docs/source/{en => }/installation.mdx (100%) rename docs/source/{en => }/metric-list.mdx (100%) rename docs/source/{en => }/package_reference/evaluation_tracker.mdx (100%) rename docs/source/{en => }/package_reference/logging.mdx (100%) rename docs/source/{en => }/package_reference/metrics.mdx (100%) rename docs/source/{en => }/package_reference/models.mdx (100%) rename docs/source/{en => }/package_reference/pipeline.mdx (100%) rename docs/source/{en => }/package_reference/tasks.mdx (100%) rename docs/source/{en => }/quicktour.mdx (100%) rename docs/source/{en => }/saving-and-reading-results.mdx (100%) rename docs/source/{en => }/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx (100%) rename docs/source/{en => }/use-inference-providers-as-backend.mdx (100%) rename docs/source/{en => }/use-litellm-as-backend.mdx (100%) rename docs/source/{en => }/use-sglang-as-backend.mdx (100%) rename docs/source/{en => }/use-vllm-as-backend.mdx (100%) rename docs/source/{en => }/using-the-python-api.mdx (100%) diff --git a/docs/source/en/_toctree.yml b/docs/source/_toctree.yml similarity index 100% rename from docs/source/en/_toctree.yml rename to docs/source/_toctree.yml diff --git a/docs/source/en/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx similarity index 100% rename from docs/source/en/adding-a-custom-task.mdx rename to docs/source/adding-a-custom-task.mdx diff --git a/docs/source/en/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx similarity index 100% rename from docs/source/en/adding-a-new-metric.mdx rename to docs/source/adding-a-new-metric.mdx diff --git a/docs/source/en/available-tasks.mdx 
b/docs/source/available-tasks.mdx similarity index 100% rename from docs/source/en/available-tasks.mdx rename to docs/source/available-tasks.mdx diff --git a/docs/source/en/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx similarity index 100% rename from docs/source/en/contributing-to-multilingual-evaluations.mdx rename to docs/source/contributing-to-multilingual-evaluations.mdx diff --git a/docs/source/en/evaluating-a-custom-model.mdx b/docs/source/evaluating-a-custom-model.mdx similarity index 100% rename from docs/source/en/evaluating-a-custom-model.mdx rename to docs/source/evaluating-a-custom-model.mdx diff --git a/docs/source/en/index.mdx b/docs/source/index.mdx similarity index 100% rename from docs/source/en/index.mdx rename to docs/source/index.mdx diff --git a/docs/source/en/installation.mdx b/docs/source/installation.mdx similarity index 100% rename from docs/source/en/installation.mdx rename to docs/source/installation.mdx diff --git a/docs/source/en/metric-list.mdx b/docs/source/metric-list.mdx similarity index 100% rename from docs/source/en/metric-list.mdx rename to docs/source/metric-list.mdx diff --git a/docs/source/en/package_reference/evaluation_tracker.mdx b/docs/source/package_reference/evaluation_tracker.mdx similarity index 100% rename from docs/source/en/package_reference/evaluation_tracker.mdx rename to docs/source/package_reference/evaluation_tracker.mdx diff --git a/docs/source/en/package_reference/logging.mdx b/docs/source/package_reference/logging.mdx similarity index 100% rename from docs/source/en/package_reference/logging.mdx rename to docs/source/package_reference/logging.mdx diff --git a/docs/source/en/package_reference/metrics.mdx b/docs/source/package_reference/metrics.mdx similarity index 100% rename from docs/source/en/package_reference/metrics.mdx rename to docs/source/package_reference/metrics.mdx diff --git a/docs/source/en/package_reference/models.mdx 
b/docs/source/package_reference/models.mdx similarity index 100% rename from docs/source/en/package_reference/models.mdx rename to docs/source/package_reference/models.mdx diff --git a/docs/source/en/package_reference/pipeline.mdx b/docs/source/package_reference/pipeline.mdx similarity index 100% rename from docs/source/en/package_reference/pipeline.mdx rename to docs/source/package_reference/pipeline.mdx diff --git a/docs/source/en/package_reference/tasks.mdx b/docs/source/package_reference/tasks.mdx similarity index 100% rename from docs/source/en/package_reference/tasks.mdx rename to docs/source/package_reference/tasks.mdx diff --git a/docs/source/en/quicktour.mdx b/docs/source/quicktour.mdx similarity index 100% rename from docs/source/en/quicktour.mdx rename to docs/source/quicktour.mdx diff --git a/docs/source/en/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx similarity index 100% rename from docs/source/en/saving-and-reading-results.mdx rename to docs/source/saving-and-reading-results.mdx diff --git a/docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx b/docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx similarity index 100% rename from docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx rename to docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx diff --git a/docs/source/en/use-inference-providers-as-backend.mdx b/docs/source/use-inference-providers-as-backend.mdx similarity index 100% rename from docs/source/en/use-inference-providers-as-backend.mdx rename to docs/source/use-inference-providers-as-backend.mdx diff --git a/docs/source/en/use-litellm-as-backend.mdx b/docs/source/use-litellm-as-backend.mdx similarity index 100% rename from docs/source/en/use-litellm-as-backend.mdx rename to docs/source/use-litellm-as-backend.mdx diff --git a/docs/source/en/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx similarity index 100% rename 
from docs/source/en/use-sglang-as-backend.mdx rename to docs/source/use-sglang-as-backend.mdx diff --git a/docs/source/en/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx similarity index 100% rename from docs/source/en/use-vllm-as-backend.mdx rename to docs/source/use-vllm-as-backend.mdx diff --git a/docs/source/en/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx similarity index 100% rename from docs/source/en/using-the-python-api.mdx rename to docs/source/using-the-python-api.mdx From f6f59207444a89b1b124766b9425c25f33d04b3f Mon Sep 17 00:00:00 2001 From: CassiopeiaCode Date: Sat, 24 May 2025 17:52:52 +0000 Subject: [PATCH 8/8] Move English documentation to en folder - Move all English docs from docs/source/ to docs/source/en/ - Maintain Chinese translations in docs/source/zh/ directory - Restore traditional language-specific folder structure --- docs/source/{ => en}/_toctree.yml | 0 docs/source/{ => en}/adding-a-custom-task.mdx | 0 docs/source/{ => en}/adding-a-new-metric.mdx | 0 docs/source/{ => en}/available-tasks.mdx | 0 docs/source/{ => en}/contributing-to-multilingual-evaluations.mdx | 0 docs/source/{ => en}/evaluating-a-custom-model.mdx | 0 docs/source/{ => en}/index.mdx | 0 docs/source/{ => en}/installation.mdx | 0 docs/source/{ => en}/metric-list.mdx | 0 docs/source/{ => en}/package_reference/evaluation_tracker.mdx | 0 docs/source/{ => en}/package_reference/logging.mdx | 0 docs/source/{ => en}/package_reference/metrics.mdx | 0 docs/source/{ => en}/package_reference/models.mdx | 0 docs/source/{ => en}/package_reference/pipeline.mdx | 0 docs/source/{ => en}/package_reference/tasks.mdx | 0 docs/source/{ => en}/quicktour.mdx | 0 docs/source/{ => en}/saving-and-reading-results.mdx | 0 .../use-huggingface-inference-endpoints-or-tgi-as-backend.mdx | 0 docs/source/{ => en}/use-inference-providers-as-backend.mdx | 0 docs/source/{ => en}/use-litellm-as-backend.mdx | 0 docs/source/{ => en}/use-sglang-as-backend.mdx | 0 docs/source/{ => 
en}/use-vllm-as-backend.mdx | 0 docs/source/{ => en}/using-the-python-api.mdx | 0 23 files changed, 0 insertions(+), 0 deletions(-) rename docs/source/{ => en}/_toctree.yml (100%) rename docs/source/{ => en}/adding-a-custom-task.mdx (100%) rename docs/source/{ => en}/adding-a-new-metric.mdx (100%) rename docs/source/{ => en}/available-tasks.mdx (100%) rename docs/source/{ => en}/contributing-to-multilingual-evaluations.mdx (100%) rename docs/source/{ => en}/evaluating-a-custom-model.mdx (100%) rename docs/source/{ => en}/index.mdx (100%) rename docs/source/{ => en}/installation.mdx (100%) rename docs/source/{ => en}/metric-list.mdx (100%) rename docs/source/{ => en}/package_reference/evaluation_tracker.mdx (100%) rename docs/source/{ => en}/package_reference/logging.mdx (100%) rename docs/source/{ => en}/package_reference/metrics.mdx (100%) rename docs/source/{ => en}/package_reference/models.mdx (100%) rename docs/source/{ => en}/package_reference/pipeline.mdx (100%) rename docs/source/{ => en}/package_reference/tasks.mdx (100%) rename docs/source/{ => en}/quicktour.mdx (100%) rename docs/source/{ => en}/saving-and-reading-results.mdx (100%) rename docs/source/{ => en}/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx (100%) rename docs/source/{ => en}/use-inference-providers-as-backend.mdx (100%) rename docs/source/{ => en}/use-litellm-as-backend.mdx (100%) rename docs/source/{ => en}/use-sglang-as-backend.mdx (100%) rename docs/source/{ => en}/use-vllm-as-backend.mdx (100%) rename docs/source/{ => en}/using-the-python-api.mdx (100%) diff --git a/docs/source/_toctree.yml b/docs/source/en/_toctree.yml similarity index 100% rename from docs/source/_toctree.yml rename to docs/source/en/_toctree.yml diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/en/adding-a-custom-task.mdx similarity index 100% rename from docs/source/adding-a-custom-task.mdx rename to docs/source/en/adding-a-custom-task.mdx diff --git a/docs/source/adding-a-new-metric.mdx 
b/docs/source/en/adding-a-new-metric.mdx similarity index 100% rename from docs/source/adding-a-new-metric.mdx rename to docs/source/en/adding-a-new-metric.mdx diff --git a/docs/source/available-tasks.mdx b/docs/source/en/available-tasks.mdx similarity index 100% rename from docs/source/available-tasks.mdx rename to docs/source/en/available-tasks.mdx diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/en/contributing-to-multilingual-evaluations.mdx similarity index 100% rename from docs/source/contributing-to-multilingual-evaluations.mdx rename to docs/source/en/contributing-to-multilingual-evaluations.mdx diff --git a/docs/source/evaluating-a-custom-model.mdx b/docs/source/en/evaluating-a-custom-model.mdx similarity index 100% rename from docs/source/evaluating-a-custom-model.mdx rename to docs/source/en/evaluating-a-custom-model.mdx diff --git a/docs/source/index.mdx b/docs/source/en/index.mdx similarity index 100% rename from docs/source/index.mdx rename to docs/source/en/index.mdx diff --git a/docs/source/installation.mdx b/docs/source/en/installation.mdx similarity index 100% rename from docs/source/installation.mdx rename to docs/source/en/installation.mdx diff --git a/docs/source/metric-list.mdx b/docs/source/en/metric-list.mdx similarity index 100% rename from docs/source/metric-list.mdx rename to docs/source/en/metric-list.mdx diff --git a/docs/source/package_reference/evaluation_tracker.mdx b/docs/source/en/package_reference/evaluation_tracker.mdx similarity index 100% rename from docs/source/package_reference/evaluation_tracker.mdx rename to docs/source/en/package_reference/evaluation_tracker.mdx diff --git a/docs/source/package_reference/logging.mdx b/docs/source/en/package_reference/logging.mdx similarity index 100% rename from docs/source/package_reference/logging.mdx rename to docs/source/en/package_reference/logging.mdx diff --git a/docs/source/package_reference/metrics.mdx 
b/docs/source/en/package_reference/metrics.mdx similarity index 100% rename from docs/source/package_reference/metrics.mdx rename to docs/source/en/package_reference/metrics.mdx diff --git a/docs/source/package_reference/models.mdx b/docs/source/en/package_reference/models.mdx similarity index 100% rename from docs/source/package_reference/models.mdx rename to docs/source/en/package_reference/models.mdx diff --git a/docs/source/package_reference/pipeline.mdx b/docs/source/en/package_reference/pipeline.mdx similarity index 100% rename from docs/source/package_reference/pipeline.mdx rename to docs/source/en/package_reference/pipeline.mdx diff --git a/docs/source/package_reference/tasks.mdx b/docs/source/en/package_reference/tasks.mdx similarity index 100% rename from docs/source/package_reference/tasks.mdx rename to docs/source/en/package_reference/tasks.mdx diff --git a/docs/source/quicktour.mdx b/docs/source/en/quicktour.mdx similarity index 100% rename from docs/source/quicktour.mdx rename to docs/source/en/quicktour.mdx diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/en/saving-and-reading-results.mdx similarity index 100% rename from docs/source/saving-and-reading-results.mdx rename to docs/source/en/saving-and-reading-results.mdx diff --git a/docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx b/docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx similarity index 100% rename from docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx rename to docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx diff --git a/docs/source/use-inference-providers-as-backend.mdx b/docs/source/en/use-inference-providers-as-backend.mdx similarity index 100% rename from docs/source/use-inference-providers-as-backend.mdx rename to docs/source/en/use-inference-providers-as-backend.mdx diff --git a/docs/source/use-litellm-as-backend.mdx b/docs/source/en/use-litellm-as-backend.mdx similarity 
index 100% rename from docs/source/use-litellm-as-backend.mdx rename to docs/source/en/use-litellm-as-backend.mdx diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/en/use-sglang-as-backend.mdx similarity index 100% rename from docs/source/use-sglang-as-backend.mdx rename to docs/source/en/use-sglang-as-backend.mdx diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/en/use-vllm-as-backend.mdx similarity index 100% rename from docs/source/use-vllm-as-backend.mdx rename to docs/source/en/use-vllm-as-backend.mdx diff --git a/docs/source/using-the-python-api.mdx b/docs/source/en/using-the-python-api.mdx similarity index 100% rename from docs/source/using-the-python-api.mdx rename to docs/source/en/using-the-python-api.mdx