From de830f6232f4c465a588630d31dcf9735af67d4c Mon Sep 17 00:00:00 2001 From: David Biagioni Date: Mon, 24 Nov 2025 12:12:44 -0700 Subject: [PATCH 1/4] enable use of data files for custom tasks --- docs/source/adding-a-custom-task.mdx | 38 +++++++ .../custom_yourbench_task_from_files.py | 105 ++++++++++++++++++ src/lighteval/tasks/lighteval_task.py | 6 +- tests/unit/tasks/test_lighteval_task.py | 21 ++++ 4 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 examples/custom_tasks_templates/custom_yourbench_task_from_files.py diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx index c68149a1e..f1bb73c49 100644 --- a/docs/source/adding-a-custom-task.mdx +++ b/docs/source/adding-a-custom-task.mdx @@ -36,6 +36,44 @@ def prompt_fn(line: dict, task_name: str): ) ``` +#### Task Backed by Local `data_files` + +If you are prototyping a task based on files that are not (yet) hosted on the +Hub, you can take advantage of the `hf_data_files` argument to point Lighteval +at local JSON/CSV resources. This makes it easy to evaluate datasets that live +in your repo or that are generated on the fly. + +```python +from pathlib import Path + +from lighteval.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + + +def local_prompt(line: dict, task_name: str) -> Doc: + return Doc(task_name=task_name, query=line["question"], choices=line["choices"], gold_index=line["answer"]) + + +local_data = Path(__file__).parent / "samples" / "faq.jsonl" + +local_task = LightevalTaskConfig( + name="faq_eval", + prompt_function=local_prompt, + hf_repo="json", # Built-in streaming loader for json/jsonl files + hf_subset="default", + hf_data_files=str(local_data), # Can also be a dict mapping split names to paths + evaluation_splits=["train"], + metrics=[Metrics.ACCURACY], +) +``` + +Once the config is registered in `TASKS_TABLE`, running the task with +`--custom-tasks path/to/your_file.py` will automatically load the local data +files. You can also pass a dictionary to `hf_data_files` (e.g. +`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple +splits. + ### Step 3: Choose or Create Metrics You can either use an existing metric (defined in [`lighteval.metrics.metrics.Metrics`]) or [create a custom one](adding-a-new-metric). diff --git a/examples/custom_tasks_templates/custom_yourbench_task_from_files.py b/examples/custom_tasks_templates/custom_yourbench_task_from_files.py new file mode 100644 index 000000000..25d5684a6 --- /dev/null +++ b/examples/custom_tasks_templates/custom_yourbench_task_from_files.py @@ -0,0 +1,105 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import json +import logging +import tempfile +from functools import partial +from pathlib import Path + +from custom_yourbench_task_mcq import yourbench_prompt +from datasets import Dataset, DatasetDict + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig + + +logger = logging.getLogger(__name__) + +save_dir = str(tempfile.mkdtemp()) + +ds = DatasetDict( + { + "train": Dataset.from_dict( + { + "question": ["What is 2+2?", "Capital of France?"], + "choices": [["1", "2", "3", "4"], ["Paris", "Berlin", "Rome", "Madrid"]], + "gold": [[3], [0]], + } + ) + } +) + + +CustomTaskConfig = partial( + LightevalTaskConfig, + prompt_function=yourbench_prompt, + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=16, + metrics=[Metrics.gpqa_instruct_metric], + version=0, +) + +# Example 1: save to disk (huggingface format) #### + +ds.save_to_disk(save_dir) + +yourbench_mcq = CustomTaskConfig( + name="tiny_mcqa_dataset", + hf_repo="arrow", + hf_subset="default", + hf_data_files=f"{save_dir}/**/*.arrow", +) + +task = LightevalTask(yourbench_mcq) +eval_docs = task.eval_docs() + +print("\n>>READING TASK FROM ARROW<<") +for doc in eval_docs: + print(doc) + + +# Example 2: jsonlines format #### + +jsonl_path = Path(save_dir) / "train.jsonl" +with open(jsonl_path, "w") as f: + for row in ds["train"]: + f.write(json.dumps(row) + "\n") + +yourbench_mcq = CustomTaskConfig( + name="tiny_mcqa_dataset", + hf_repo="json", + hf_subset="default", + hf_data_files=str(jsonl_path), +) + +task = LightevalTask(yourbench_mcq) +eval_docs = task.eval_docs() + +print("\n>>READING TASK FROM JSONLINES<<") +for doc in eval_docs: + print(doc) + +# TASKS_TABLE = [yourbench_mcq] diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index f8c1eed9c..1ecd4e375 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -24,7 +24,7 @@ import logging import random from dataclasses import asdict, dataclass, field -from typing import Callable +from typing import Callable, Mapping, Sequence from datasets import DatasetDict, load_dataset from huggingface_hub import TextGenerationInputGrammarType @@ -59,6 +59,7 @@ class LightevalTaskConfig: row to Doc objects for evaluation. Takes a dataset row dict and task name as input. hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset. + hf_data_files (str): Data files to load. hf_subset (str): Dataset subset/configuration name to use for this task. metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task. 
@@ -113,6 +114,7 @@ class LightevalTaskConfig: hf_repo: str hf_subset: str metrics: ListLike[Metric | Metrics] # Accept both Metric objects and Metrics enums + hf_data_files: str | Sequence[str] | Mapping[str, str | Sequence[str]] | None = None # Inspect AI compatible parameters solver: None = None @@ -219,6 +221,7 @@ def __init__( # Dataset info self.dataset_path = config.hf_repo + self.data_files = config.hf_data_files self.dataset_config_name = config.hf_subset self.dataset_revision = config.hf_revision self.dataset_filter = config.hf_filter @@ -454,6 +457,7 @@ def download_dataset_worker( path=task.dataset_path, name=task.dataset_config_name, revision=task.dataset_revision, + data_files=task.data_files, ) if task.dataset_filter is not None: diff --git a/tests/unit/tasks/test_lighteval_task.py b/tests/unit/tasks/test_lighteval_task.py index df2b5ad4a..7cdb7b6f5 100644 --- a/tests/unit/tasks/test_lighteval_task.py +++ b/tests/unit/tasks/test_lighteval_task.py @@ -63,3 +63,24 @@ def test_dataset_filter(): filtered_docs = task.eval_docs() assert len(filtered_docs) == 1 assert filtered_docs[0].query == "hi" + + +def test_hf_data_files(tmp_path): + # create a small jsonl dataset + data_file = tmp_path / "data.jsonl" + src_docs = [f"document {i}" for i in range(3)] + data_file.write_text("\n".join([f'{{"text": "{doc}"}}' for doc in src_docs])) + + cfg = LightevalTaskConfig( + name="test_data_files", + prompt_function=dummy_prompt_function, + hf_repo="json", + hf_subset="default", + metrics=[], + evaluation_splits=["train"], + hf_data_files=str(data_file), + ) + task = LightevalTask(cfg) + + eval_docs = task.eval_docs() + assert [doc.query for doc in eval_docs] == src_docs From cdebb52d994a1f83b67d8be3158663ea56363943 Mon Sep 17 00:00:00 2001 From: David Biagioni Date: Thu, 4 Dec 2025 10:40:00 -0700 Subject: [PATCH 2/4] addressing PR comments, create new doc file, update docstring with types --- docs/source/adding-a-custom-task.mdx | 38 ---------------------- docs/source/offline-evaluation.md | 46 +++++++++++++++++++++++++++ src/lighteval/tasks/lighteval_task.py | 3 +- 3 files changed, 48 insertions(+), 39 deletions(-) create mode 100644 docs/source/offline-evaluation.md diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx index f1bb73c49..c68149a1e 100644 --- a/docs/source/adding-a-custom-task.mdx +++ b/docs/source/adding-a-custom-task.mdx @@ -36,44 +36,6 @@ def prompt_fn(line: dict, task_name: str): ) ``` -#### Task Backed by Local `data_files` - -If you are prototyping a task based on files that are not (yet) hosted on the -Hub, you can take advantage of the `hf_data_files` argument to point Lighteval -at local JSON/CSV resources. This makes it easy to evaluate datasets that live -in your repo or that are generated on the fly. 
-
-```python
-from pathlib import Path
-
-from lighteval.metrics import Metrics
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-
-
-def local_prompt(line: dict, task_name: str) -> Doc:
-    return Doc(task_name=task_name, query=line["question"], choices=line["choices"], gold_index=line["answer"])
-
-
-local_data = Path(__file__).parent / "samples" / "faq.jsonl"
-
-local_task = LightevalTaskConfig(
-    name="faq_eval",
-    prompt_function=local_prompt,
-    hf_repo="json", # Built-in streaming loader for json/jsonl files
-    hf_subset="default",
-    hf_data_files=str(local_data), # Can also be a dict mapping split names to paths
-    evaluation_splits=["train"],
-    metrics=[Metrics.ACCURACY],
-)
-```
-
-Once the config is registered in `TASKS_TABLE`, running the task with
-`--custom-tasks path/to/your_file.py` will automatically load the local data
-files. You can also pass a dictionary to `hf_data_files` (e.g.
-`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple
-splits.
-
 ### Step 3: Choose or Create Metrics
 
 You can either use an existing metric (defined in [`lighteval.metrics.metrics.Metrics`]) or [create a custom one](adding-a-new-metric).
diff --git a/docs/source/offline-evaluation.md b/docs/source/offline-evaluation.md
new file mode 100644
index 000000000..b73b13ff1
--- /dev/null
+++ b/docs/source/offline-evaluation.md
@@ -0,0 +1,46 @@
+# Offline evaluation using local data files
+
+If you are prototyping a task based on files that are not yet hosted on the
+Hub, you can take advantage of the `hf_data_files` argument to point Lighteval
+at local JSON/CSV resources. This makes it easy to evaluate datasets that live
+in your repo or that are generated on the fly.
+
+Internally, `hf_data_files` is passed directly to the `data_files` parameter of `datasets.load_dataset` ([docs](https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset)).
+
+See [adding a custom task](adding-a-custom-task) for more information on how to create a custom task.
+
+```python
+from pathlib import Path
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+def local_prompt(line: dict, task_name: str) -> Doc:
+    return Doc(
+        task_name=task_name,
+        query=line["question"],
+        choices=line["choices"],
+        gold_index=line["answer"]
+    )
+
+
+local_data = Path(__file__).parent / "samples" / "faq.jsonl"
+
+local_task = LightevalTaskConfig(
+    name="faq_eval",
+    prompt_function=local_prompt,
+    hf_repo="json", # Built-in streaming loader for json/jsonl files
+    hf_subset="default",
+    hf_data_files=str(local_data), # Can also be a dict mapping split names to paths
+    evaluation_splits=["train"],
+    metrics=[Metrics.ACCURACY],
+)
+```
+
+Once the config is registered in `TASKS_TABLE`, running the task with
+`--custom-tasks path/to/your_file.py` will automatically load the local data
+files. You can also pass a dictionary to `hf_data_files` (e.g.
+`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple
+splits.
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 1ecd4e375..5e9bac215 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -59,7 +59,8 @@ class LightevalTaskConfig:
             row to Doc objects for evaluation. Takes a dataset row dict and task name
             as input.
hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset. - hf_data_files (str): Data files to load. + hf_data_files (str | Sequence[str] | Mapping[str, str | Sequence[str]] | None): + Data files to load. Same as `data_files` argument of `datasets.load_dataset`. hf_subset (str): Dataset subset/configuration name to use for this task. metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task. From 75c58aa8ab9473dc929b45f1b18bcb98bacaaffa Mon Sep 17 00:00:00 2001 From: Dave Biagioni Date: Mon, 8 Dec 2025 10:30:57 -0700 Subject: [PATCH 3/4] Update docs/source/offline-evaluation.md Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- docs/source/offline-evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/offline-evaluation.md b/docs/source/offline-evaluation.md index b73b13ff1..37c921b61 100644 --- a/docs/source/offline-evaluation.md +++ b/docs/source/offline-evaluation.md @@ -1,7 +1,7 @@ # Offline evaluation using local data files If you are prototyping a task based on files that are not yet hosted on the -Hub, you can take advantage of the `hf_data_files` argument to point Lighteval +Hub, you can take advantage of the `hf_data_files` argument to point lighteval at local JSON/CSV resources. This makes it easy to evaluate datasets that live in your repo or that are generated on the fly. From a577ebf4e94b723039c4acb7537ffc06af847ebb Mon Sep 17 00:00:00 2001 From: Dave Biagioni Date: Mon, 8 Dec 2025 10:42:08 -0700 Subject: [PATCH 4/4] Add offline evaluation section to documentation --- docs/source/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index d3c33cdab..75733d49b 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,6 +17,8 @@ title: Use the Python API - local: adding-a-custom-task title: Add a custom task + - local: offline-evaluation + title: Offline evaluation - local: adding-a-new-metric title: Add a custom metric - local: evaluating-a-custom-model
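
The documentation added above notes that `hf_data_files` also accepts a dict mapping split names to paths, but none of the examples in this series exercise that form. The minimal sketch below illustrates it using only APIs that already appear in these patches (`LightevalTaskConfig`, `LightevalTask(config)`, `task.eval_docs()`, `Metrics.gpqa_instruct_metric`); the directory layout, task name, record fields, and few-shot setup are illustrative assumptions, not part of the PR.

```python
# Sketch only: the dict form of `hf_data_files`, backed by local JSONL files.
# The temp directory, task name, and record fields below are hypothetical.
import json
import tempfile
from pathlib import Path

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
from lighteval.tasks.requests import Doc


def dict_files_prompt(line: dict, task_name: str) -> Doc:
    # One JSON record -> one Doc, mirroring the prompt functions used elsewhere in this PR.
    return Doc(task_name=task_name, query=line["question"], choices=line["choices"], gold_index=line["answer"])


data_dir = Path(tempfile.mkdtemp())
rows = {
    "train": [{"question": "What is 2+2?", "choices": ["3", "4"], "answer": 1}],
    "validation": [{"question": "Capital of France?", "choices": ["Paris", "Rome"], "answer": 0}],
}
for split, records in rows.items():
    with open(data_dir / f"{split}.jsonl", "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

multi_split_task = LightevalTaskConfig(
    name="tiny_mcqa_multi_split",
    prompt_function=dict_files_prompt,
    hf_repo="json",  # built-in loader, as in the examples above
    hf_subset="default",
    # Dict form: each key becomes a dataset split backed by the listed file(s).
    hf_data_files={split: str(data_dir / f"{split}.jsonl") for split in rows},
    hf_avail_splits=["train", "validation"],
    evaluation_splits=["validation"],
    few_shots_split="train",
    metrics=[Metrics.gpqa_instruct_metric],
)

task = LightevalTask(multi_split_task)
for doc in task.eval_docs():
    print(doc)
```

With the dict form, each key of `hf_data_files` becomes a split of the loaded dataset, so `evaluation_splits` and `few_shots_split` can point at different local files.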