Commit de830f6

enable use of data files for custom tasks
1 parent 5803818 commit de830f6

4 files changed: +169 -1 lines changed

docs/source/adding-a-custom-task.mdx

Lines changed: 38 additions & 0 deletions
@@ -36,6 +36,44 @@ def prompt_fn(line: dict, task_name: str):
)
```

#### Task Backed by Local `data_files`

If you are prototyping a task based on files that are not (yet) hosted on the
Hub, you can use the `hf_data_files` argument to point Lighteval at local
JSON/CSV resources. This makes it easy to evaluate datasets that live in your
repo or that are generated on the fly.

```python
from pathlib import Path

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def local_prompt(line: dict, task_name: str) -> Doc:
    return Doc(task_name=task_name, query=line["question"], choices=line["choices"], gold_index=line["answer"])


local_data = Path(__file__).parent / "samples" / "faq.jsonl"

local_task = LightevalTaskConfig(
    name="faq_eval",
    prompt_function=local_prompt,
    hf_repo="json",  # datasets' built-in loader for JSON/JSONL files
    hf_subset="default",
    hf_data_files=str(local_data),  # can also be a dict mapping split names to paths
    evaluation_splits=["train"],
    metrics=[Metrics.loglikelihood_acc],
)
```

Once the config is registered in `TASKS_TABLE`, running the task with
`--custom-tasks path/to/your_file.py` will automatically load the local data
files. You can also pass a dictionary to `hf_data_files` (e.g.
`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple
splits.
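
For example, a config that exposes both a train and a validation split could
look like the following sketch (the file names here are placeholders):

```python
multi_split_task = LightevalTaskConfig(
    name="faq_eval_multi",
    prompt_function=local_prompt,
    hf_repo="json",
    hf_subset="default",
    # Placeholder file names: each split maps to its own local JSON Lines file.
    hf_data_files={"train": "train.jsonl", "validation": "val.jsonl"},
    hf_avail_splits=["train", "validation"],
    evaluation_splits=["validation"],
    metrics=[Metrics.loglikelihood_acc],
)
```
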
### Step 3: Choose or Create Metrics

You can either use an existing metric (defined in [`lighteval.metrics.metrics.Metrics`]) or [create a custom one](adding-a-new-metric).
Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
import logging
import tempfile
from functools import partial
from pathlib import Path

from custom_yourbench_task_mcq import yourbench_prompt
from datasets import Dataset, DatasetDict

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig


logger = logging.getLogger(__name__)

save_dir = str(tempfile.mkdtemp())

ds = DatasetDict(
    {
        "train": Dataset.from_dict(
            {
                "question": ["What is 2+2?", "Capital of France?"],
                "choices": [["1", "2", "3", "4"], ["Paris", "Berlin", "Rome", "Madrid"]],
                "gold": [[3], [0]],
            }
        )
    }
)


CustomTaskConfig = partial(
    LightevalTaskConfig,
    prompt_function=yourbench_prompt,
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=16,
    metrics=[Metrics.gpqa_instruct_metric],
    version=0,
)

# Example 1: save to disk (huggingface format) ####

ds.save_to_disk(save_dir)

yourbench_mcq = CustomTaskConfig(
    name="tiny_mcqa_dataset",
    hf_repo="arrow",
    hf_subset="default",
    hf_data_files=f"{save_dir}/**/*.arrow",
)

task = LightevalTask(yourbench_mcq)
eval_docs = task.eval_docs()

print("\n>>READING TASK FROM ARROW<<")
for doc in eval_docs:
    print(doc)


# Example 2: jsonlines format ####

jsonl_path = Path(save_dir) / "train.jsonl"
with open(jsonl_path, "w") as f:
    for row in ds["train"]:
        f.write(json.dumps(row) + "\n")

yourbench_mcq = CustomTaskConfig(
    name="tiny_mcqa_dataset",
    hf_repo="json",
    hf_subset="default",
    hf_data_files=str(jsonl_path),
)

task = LightevalTask(yourbench_mcq)
eval_docs = task.eval_docs()

print("\n>>READING TASK FROM JSONLINES<<")
for doc in eval_docs:
    print(doc)

# TASKS_TABLE = [yourbench_mcq]

src/lighteval/tasks/lighteval_task.py

Lines changed: 5 additions & 1 deletion
@@ -24,7 +24,7 @@
import logging
import random
from dataclasses import asdict, dataclass, field
-from typing import Callable
+from typing import Callable, Mapping, Sequence

from datasets import DatasetDict, load_dataset
from huggingface_hub import TextGenerationInputGrammarType
@@ -59,6 +59,7 @@ class LightevalTaskConfig:
row to Doc objects for evaluation. Takes a dataset row dict and task
name as input.
hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset.
+hf_data_files (str | Sequence[str] | Mapping[str, str | Sequence[str]] | None): Optional data files passed through to datasets.load_dataset, e.g. a local path, a list of paths, or a mapping from split names to paths.
hf_subset (str): Dataset subset/configuration name to use for this task.
metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task.
@@ -113,6 +114,7 @@ class LightevalTaskConfig:
hf_repo: str
hf_subset: str
metrics: ListLike[Metric | Metrics]  # Accept both Metric objects and Metrics enums
+hf_data_files: str | Sequence[str] | Mapping[str, str | Sequence[str]] | None = None

# Inspect AI compatible parameters
solver: None = None
@@ -219,6 +221,7 @@ def __init__(

# Dataset info
self.dataset_path = config.hf_repo
+self.data_files = config.hf_data_files
self.dataset_config_name = config.hf_subset
self.dataset_revision = config.hf_revision
self.dataset_filter = config.hf_filter
@@ -454,6 +457,7 @@ def download_dataset_worker(
path=task.dataset_path,
name=task.dataset_config_name,
revision=task.dataset_revision,
+data_files=task.data_files,
)

if task.dataset_filter is not None:
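
In effect, `hf_data_files` is forwarded straight to `datasets.load_dataset` as its
`data_files` argument, alongside the existing `path` (from `hf_repo`), `name` (from
`hf_subset`), and `revision` arguments. A minimal standalone sketch of the equivalent
call, with placeholder file names:

```python
from datasets import load_dataset

# Equivalent direct call: "json" is the built-in loader, "default" the config name,
# and data_files maps split names to local files (placeholder paths).
dataset = load_dataset(
    path="json",
    name="default",
    data_files={"train": "train.jsonl", "validation": "val.jsonl"},
)
print(dataset)
```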

tests/unit/tasks/test_lighteval_task.py

Lines changed: 21 additions & 0 deletions
@@ -63,3 +63,24 @@ def test_dataset_filter():
    filtered_docs = task.eval_docs()
    assert len(filtered_docs) == 1
    assert filtered_docs[0].query == "hi"


def test_hf_data_files(tmp_path):
    # create a small jsonl dataset
    data_file = tmp_path / "data.jsonl"
    src_docs = [f"document {i}" for i in range(3)]
    data_file.write_text("\n".join([f'{{"text": "{doc}"}}' for doc in src_docs]))

    cfg = LightevalTaskConfig(
        name="test_data_files",
        prompt_function=dummy_prompt_function,
        hf_repo="json",
        hf_subset="default",
        metrics=[],
        evaluation_splits=["train"],
        hf_data_files=str(data_file),
    )
    task = LightevalTask(cfg)

    eval_docs = task.eval_docs()
    assert [doc.query for doc in eval_docs] == src_docs
