From de830f6232f4c465a588630d31dcf9735af67d4c Mon Sep 17 00:00:00 2001 From: David Biagioni Date: Mon, 24 Nov 2025 12:12:44 -0700 Subject: [PATCH 1/4] enable use of data files for custom tasks --- docs/source/adding-a-custom-task.mdx | 38 +++++++ .../custom_yourbench_task_from_files.py | 105 ++++++++++++++++++ src/lighteval/tasks/lighteval_task.py | 6 +- tests/unit/tasks/test_lighteval_task.py | 21 ++++ 4 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 examples/custom_tasks_templates/custom_yourbench_task_from_files.py diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx index c68149a1e..f1bb73c49 100644 --- a/docs/source/adding-a-custom-task.mdx +++ b/docs/source/adding-a-custom-task.mdx @@ -36,6 +36,44 @@ def prompt_fn(line: dict, task_name: str): ) ``` +#### Task Backed by Local `data_files` + +If you are prototyping a task based on files that are not (yet) hosted on the +Hub, you can take advantage of the `hf_data_files` argument to point Lighteval +at local JSON/CSV resources. This makes it easy to evaluate datasets that live +in your repo or that are generated on the fly. + +```python +from pathlib import Path + +from lighteval.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc + + +def local_prompt(line: dict, task_name: str) -> Doc: + return Doc(task_name=task_name, query=line["question"], choices=line["choices"], gold_index=line["answer"]) + + +local_data = Path(__file__).parent / "samples" / "faq.jsonl" + +local_task = LightevalTaskConfig( + name="faq_eval", + prompt_function=local_prompt, + hf_repo="json", # Built-in streaming loader for json/jsonl files + hf_subset="default", + hf_data_files=str(local_data), # Can also be a dict mapping split names to paths + evaluation_splits=["train"], + metrics=[Metrics.ACCURACY], +) +``` + +Once the config is registered in `TASKS_TABLE`, running the task with +`--custom-tasks path/to/your_file.py` will automatically load the local data +files. You can also pass a dictionary to `hf_data_files` (e.g. +`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple +splits. + ### Step 3: Choose or Create Metrics You can either use an existing metric (defined in [`lighteval.metrics.metrics.Metrics`]) or [create a custom one](adding-a-new-metric). diff --git a/examples/custom_tasks_templates/custom_yourbench_task_from_files.py b/examples/custom_tasks_templates/custom_yourbench_task_from_files.py new file mode 100644 index 000000000..25d5684a6 --- /dev/null +++ b/examples/custom_tasks_templates/custom_yourbench_task_from_files.py @@ -0,0 +1,105 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import json +import logging +import tempfile +from functools import partial +from pathlib import Path + +from custom_yourbench_task_mcq import yourbench_prompt +from datasets import Dataset, DatasetDict + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig + + +logger = logging.getLogger(__name__) + +save_dir = str(tempfile.mkdtemp()) + +ds = DatasetDict( + { + "train": Dataset.from_dict( + { + "question": ["What is 2+2?", "Capital of France?"], + "choices": [["1", "2", "3", "4"], ["Paris", "Berlin", "Rome", "Madrid"]], + "gold": [[3], [0]], + } + ) + } +) + + +CustomTaskConfig = partial( + LightevalTaskConfig, + prompt_function=yourbench_prompt, + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=16, + metrics=[Metrics.gpqa_instruct_metric], + version=0, +) + +# Example 1: save to disk (huggingface format) #### + +ds.save_to_disk(save_dir) + +yourbench_mcq = CustomTaskConfig( + name="tiny_mcqa_dataset", + hf_repo="arrow", + hf_subset="default", + hf_data_files=f"{save_dir}/**/*.arrow", +) + +task = LightevalTask(yourbench_mcq) +eval_docs = task.eval_docs() + +print("\n>>READING TASK FROM ARROW<<") +for doc in eval_docs: + print(doc) + + +# Example 2: jsonlines format #### + +jsonl_path = Path(save_dir) / "train.jsonl" +with open(jsonl_path, "w") as f: + for row in ds["train"]: + f.write(json.dumps(row) + "\n") + +yourbench_mcq = CustomTaskConfig( + name="tiny_mcqa_dataset", + hf_repo="json", + hf_subset="default", + hf_data_files=str(jsonl_path), +) + +task = LightevalTask(yourbench_mcq) +eval_docs = task.eval_docs() + +print("\n>>READING TASK FROM JSONLINES<<") +for doc in eval_docs: + print(doc) + +# TASKS_TABLE = [yourbench_mcq] diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index f8c1eed9c..1ecd4e375 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -24,7 +24,7 @@ import logging import random from dataclasses import asdict, dataclass, field -from typing import Callable +from typing import Callable, Mapping, Sequence from datasets import DatasetDict, load_dataset from huggingface_hub import TextGenerationInputGrammarType @@ -59,6 +59,7 @@ class LightevalTaskConfig: row to Doc objects for evaluation. Takes a dataset row dict and task name as input. hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset. + hf_data_files (str): Data files to load. hf_subset (str): Dataset subset/configuration name to use for this task. metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task. 
@@ -113,6 +114,7 @@ class LightevalTaskConfig: hf_repo: str hf_subset: str metrics: ListLike[Metric | Metrics] # Accept both Metric objects and Metrics enums + hf_data_files: str | Sequence[str] | Mapping[str, str | Sequence[str]] | None = None # Inspect AI compatible parameters solver: None = None @@ -219,6 +221,7 @@ def __init__( # Dataset info self.dataset_path = config.hf_repo + self.data_files = config.hf_data_files self.dataset_config_name = config.hf_subset self.dataset_revision = config.hf_revision self.dataset_filter = config.hf_filter @@ -454,6 +457,7 @@ def download_dataset_worker( path=task.dataset_path, name=task.dataset_config_name, revision=task.dataset_revision, + data_files=task.data_files, ) if task.dataset_filter is not None: diff --git a/tests/unit/tasks/test_lighteval_task.py b/tests/unit/tasks/test_lighteval_task.py index df2b5ad4a..7cdb7b6f5 100644 --- a/tests/unit/tasks/test_lighteval_task.py +++ b/tests/unit/tasks/test_lighteval_task.py @@ -63,3 +63,24 @@ def test_dataset_filter(): filtered_docs = task.eval_docs() assert len(filtered_docs) == 1 assert filtered_docs[0].query == "hi" + + +def test_hf_data_files(tmp_path): + # create a small jsonl dataset + data_file = tmp_path / "data.jsonl" + src_docs = [f"document {i}" for i in range(3)] + data_file.write_text("\n".join([f'{{"text": "{doc}"}}' for doc in src_docs])) + + cfg = LightevalTaskConfig( + name="test_data_files", + prompt_function=dummy_prompt_function, + hf_repo="json", + hf_subset="default", + metrics=[], + evaluation_splits=["train"], + hf_data_files=str(data_file), + ) + task = LightevalTask(cfg) + + eval_docs = task.eval_docs() + assert [doc.query for doc in eval_docs] == src_docs From cdebb52d994a1f83b67d8be3158663ea56363943 Mon Sep 17 00:00:00 2001 From: David Biagioni Date: Thu, 4 Dec 2025 10:40:00 -0700 Subject: [PATCH 2/4] addressing PR comments, create new doc file, update docstring with types --- docs/source/adding-a-custom-task.mdx | 38 ---------------------- docs/source/offline-evaluation.md | 46 +++++++++++++++++++++++++++ src/lighteval/tasks/lighteval_task.py | 3 +- 3 files changed, 48 insertions(+), 39 deletions(-) create mode 100644 docs/source/offline-evaluation.md diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx index f1bb73c49..c68149a1e 100644 --- a/docs/source/adding-a-custom-task.mdx +++ b/docs/source/adding-a-custom-task.mdx @@ -36,44 +36,6 @@ def prompt_fn(line: dict, task_name: str): ) ``` -#### Task Backed by Local `data_files` - -If you are prototyping a task based on files that are not (yet) hosted on the -Hub, you can take advantage of the `hf_data_files` argument to point Lighteval -at local JSON/CSV resources. This makes it easy to evaluate datasets that live -in your repo or that are generated on the fly. 
-
-```python
-from pathlib import Path
-
-from lighteval.metrics import Metrics
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-
-
-def local_prompt(line: dict, task_name: str) -> Doc:
-    return Doc(task_name=task_name, query=line["question"], choices=line["choices"], gold_index=line["answer"])
-
-
-local_data = Path(__file__).parent / "samples" / "faq.jsonl"
-
-local_task = LightevalTaskConfig(
-    name="faq_eval",
-    prompt_function=local_prompt,
-    hf_repo="json", # Built-in streaming loader for json/jsonl files
-    hf_subset="default",
-    hf_data_files=str(local_data), # Can also be a dict mapping split names to paths
-    evaluation_splits=["train"],
-    metrics=[Metrics.ACCURACY],
-)
-```
-
-Once the config is registered in `TASKS_TABLE`, running the task with
-`--custom-tasks path/to/your_file.py` will automatically load the local data
-files. You can also pass a dictionary to `hf_data_files` (e.g.
-`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple
-splits.
-
 ### Step 3: Choose or Create Metrics
 
 You can either use an existing metric (defined in [`lighteval.metrics.metrics.Metrics`]) or [create a custom one](adding-a-new-metric).
diff --git a/docs/source/offline-evaluation.md b/docs/source/offline-evaluation.md
new file mode 100644
index 000000000..b73b13ff1
--- /dev/null
+++ b/docs/source/offline-evaluation.md
@@ -0,0 +1,46 @@
+# Offline evaluation using local data files
+
+If you are prototyping a task based on files that are not yet hosted on the
+Hub, you can take advantage of the `hf_data_files` argument to point Lighteval
+at local JSON/CSV resources. This makes it easy to evaluate datasets that live
+in your repo or that are generated on the fly.
+
+Internally, `hf_data_files` is passed directly to the `data_files` parameter of `datasets.load_dataset` ([docs](https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset)).
+
+See [adding a custom task](adding-a-custom-task) for more information on how to create a custom task.
+
+```python
+from pathlib import Path
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+def local_prompt(line: dict, task_name: str) -> Doc:
+    return Doc(
+        task_name=task_name,
+        query=line["question"],
+        choices=line["choices"],
+        gold_index=line["answer"]
+    )
+
+
+local_data = Path(__file__).parent / "samples" / "faq.jsonl"
+
+local_task = LightevalTaskConfig(
+    name="faq_eval",
+    prompt_function=local_prompt,
+    hf_repo="json", # Built-in streaming loader for json/jsonl files
+    hf_subset="default",
+    hf_data_files=str(local_data), # Can also be a dict mapping split names to paths
+    evaluation_splits=["train"],
+    metrics=[Metrics.ACCURACY],
+)
+```
+
+Once the config is registered in `TASKS_TABLE`, running the task with
+`--custom-tasks path/to/your_file.py` will automatically load the local data
+files. You can also pass a dictionary to `hf_data_files` (e.g.
+`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple
+splits.
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 1ecd4e375..5e9bac215 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -59,7 +59,8 @@ class LightevalTaskConfig:
             row to Doc objects for evaluation. Takes a dataset row dict and task name
             as input.
hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset. - hf_data_files (str): Data files to load. + hf_data_files (str | Sequence[str] | Mapping[str, str | Sequence[str]] | None): + Data files to load. Same as `data_files` argument of `datasets.load_dataset`. hf_subset (str): Dataset subset/configuration name to use for this task. metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task. From 75c58aa8ab9473dc929b45f1b18bcb98bacaaffa Mon Sep 17 00:00:00 2001 From: Dave Biagioni Date: Mon, 8 Dec 2025 10:30:57 -0700 Subject: [PATCH 3/4] Update docs/source/offline-evaluation.md Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- docs/source/offline-evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/offline-evaluation.md b/docs/source/offline-evaluation.md index b73b13ff1..37c921b61 100644 --- a/docs/source/offline-evaluation.md +++ b/docs/source/offline-evaluation.md @@ -1,7 +1,7 @@ # Offline evaluation using local data files If you are prototyping a task based on files that are not yet hosted on the -Hub, you can take advantage of the `hf_data_files` argument to point Lighteval +Hub, you can take advantage of the `hf_data_files` argument to point lighteval at local JSON/CSV resources. This makes it easy to evaluate datasets that live in your repo or that are generated on the fly. From a577ebf4e94b723039c4acb7537ffc06af847ebb Mon Sep 17 00:00:00 2001 From: Dave Biagioni Date: Mon, 8 Dec 2025 10:42:08 -0700 Subject: [PATCH 4/4] Add offline evaluation section to documentation --- docs/source/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index d3c33cdab..75733d49b 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,6 +17,8 @@ title: Use the Python API - local: adding-a-custom-task title: Add a custom task + - local: offline-evaluation + title: Offline evaluation - local: adding-a-new-metric title: Add a custom metric - local: evaluating-a-custom-model
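
The documentation added above notes that `hf_data_files` also accepts a dict mapping split names to paths, but none of the examples in this series exercise that form. The minimal sketch below illustrates it using only APIs that already appear in these patches (`LightevalTaskConfig`, `LightevalTask(config)`, `task.eval_docs()`, `Metrics.gpqa_instruct_metric`); the directory layout, task name, record fields, and few-shot setup are illustrative assumptions, not part of the PR.

```python
# Sketch only: the dict form of `hf_data_files`, backed by local JSONL files.
# The temp directory, task name, and record fields below are hypothetical.
import json
import tempfile
from pathlib import Path

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
from lighteval.tasks.requests import Doc


def dict_files_prompt(line: dict, task_name: str) -> Doc:
    # One JSON record -> one Doc, mirroring the prompt functions used elsewhere in this PR.
    return Doc(task_name=task_name, query=line["question"], choices=line["choices"], gold_index=line["answer"])


data_dir = Path(tempfile.mkdtemp())
rows = {
    "train": [{"question": "What is 2+2?", "choices": ["3", "4"], "answer": 1}],
    "validation": [{"question": "Capital of France?", "choices": ["Paris", "Rome"], "answer": 0}],
}
for split, records in rows.items():
    with open(data_dir / f"{split}.jsonl", "w") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

multi_split_task = LightevalTaskConfig(
    name="tiny_mcqa_multi_split",
    prompt_function=dict_files_prompt,
    hf_repo="json",  # built-in loader, as in the examples above
    hf_subset="default",
    # Dict form: each key becomes a dataset split backed by the listed file(s).
    hf_data_files={split: str(data_dir / f"{split}.jsonl") for split in rows},
    hf_avail_splits=["train", "validation"],
    evaluation_splits=["validation"],
    few_shots_split="train",
    metrics=[Metrics.gpqa_instruct_metric],
)

task = LightevalTask(multi_split_task)
for doc in task.eval_docs():
    print(doc)
```

With the dict form, each key of `hf_data_files` becomes a split of the loaded dataset, so `evaluation_splits` and `few_shots_split` can point at different local files.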