From 7d2809b32024c664f378c1c0c6ca0fcbec93a675 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 24 Nov 2025 13:40:13 +0100 Subject: [PATCH 1/6] use a eval.yaml from the hub --- src/lighteval/__main__.py | 1 + src/lighteval/from_hub.py | 160 ++++++++++++++++++++++++++++++++++ src/lighteval/main_inspect.py | 8 ++ 3 files changed, 169 insertions(+) create mode 100644 src/lighteval/from_hub.py diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index 694a76fe5..01a33b53e 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -71,6 +71,7 @@ app.command(rich_help_panel="Evaluation Backends")(lighteval.main_custom.custom) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_sglang.sglang) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_inspect.eval) +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_inspect.from_hub) app.command(rich_help_panel="EvaluationUtils")(lighteval.main_inspect.bundle) app.add_typer( lighteval.main_endpoint.app, diff --git a/src/lighteval/from_hub.py b/src/lighteval/from_hub.py new file mode 100644 index 000000000..37073f95d --- /dev/null +++ b/src/lighteval/from_hub.py @@ -0,0 +1,160 @@ +import os +from pathlib import Path +from string import ascii_uppercase + +import yaml +from huggingface_hub import hf_hub_download +from inspect_ai import Epochs, Task, task +from inspect_ai.dataset import FieldSpec, Sample, hf_dataset +from inspect_ai.scorer import choice, exact, match, model_graded_fact +from inspect_ai.solver import ( + chain_of_thought, + generate, + multiple_choice, + prompt_template, + system_message, +) + + +def load_config(yaml_path: str = None) -> dict: + """Load and parse the YAML configuration file.""" + if yaml_path is None: + yaml_path = os.getenv("EVAL_YAML", "eval.yaml") + + yaml_path = Path(yaml_path) + if not yaml_path.is_absolute(): + yaml_path = Path(__file__).parent / yaml_path + + with open(yaml_path, "r") as f: + return yaml.safe_load(f) + + +def record_to_sample(record, field_spec: dict): + """Convert a dataset record to a Sample based on field_spec.""" + input_text = record[field_spec["input"]] + + # Handle target - convert numeric labels to letters for multiple choice + target_letter = ascii_uppercase[record[field_spec["target"]]] + + # Get choices if specified + choices_list = None + if "choices" in field_spec: + choices_list = [record[choice_field] for choice_field in field_spec["choices"]] + + sample_kwargs = { + "input": input_text, + "target": target_letter, + } + if choices_list: + sample_kwargs["choices"] = choices_list + + return Sample(**sample_kwargs) + + +def load_dataset(repo_id: str, revision: str = "main", task_config: dict = None, global_config: dict = None): + """Load dataset based on task configuration.""" + subset = task_config.get("subset") + split = task_config.get("splits", "test") + field_spec = task_config["field_spec"] + + # Use custom function if choices are specified (for multiple choice with label conversion) + if "choices" in field_spec: + dataset = hf_dataset( + path=repo_id, + revision=revision, + name=subset, + split=split, + sample_fields=lambda record: record_to_sample(record, field_spec), + ) + else: + # For non-multiple-choice, use FieldSpec + dataset = hf_dataset( + path=repo_id, + revision=revision, + name=subset, + split=split, + sample_fields=FieldSpec( + input=field_spec["input"], + target=field_spec["target"], + **({k: v for k, v in field_spec.items() if k not in ["input", "target"]}), + ), + ) + + return dataset + + 
+def build_solvers(task_config: dict): + """Build solvers list from task configuration.""" + solvers = [] + solver_names = task_config.get("solvers", []) + + for solver_name in solver_names: + if solver_name == "prompt_template": + if "prompt_template" in task_config and task_config["prompt_template"]: + template = task_config["prompt_template"].strip().strip('"') + template = template.replace("{{prompt}}", "{prompt}") + solvers.append(prompt_template(template)) + elif solver_name == "system_message": + if "system_message" in task_config and task_config["system_message"]: + sys_msg = task_config["system_message"].strip().strip('"') + solvers.append(system_message(sys_msg)) + elif solver_name == "chain_of_thought": + solvers.append(chain_of_thought()) + elif solver_name == "multiple_choice": + solvers.append(multiple_choice()) + elif solver_name == "generate": + solvers.append(generate()) + + return solvers + + +def build_scorer(task_config: dict): + """Build scorer from task configuration.""" + scorer_name = task_config.get("scorers", ["choice"])[0] + + if scorer_name == "choice": + return choice() + elif scorer_name == "exact": + return exact() + elif scorer_name == "match": + return match() + elif scorer_name == "model_graded_fact": + return model_graded_fact() + else: + raise ValueError(f"Unknown scorer: {scorer_name}") + + +def create_task_from_config( + repo_id: str, revision: str = "main", task_config: dict = None, global_config: dict = None +): + """Create an inspect.ai Task from a task configuration.""" + dataset = load_dataset(repo_id, revision, task_config, global_config) + solvers = build_solvers(task_config) + scorer = build_scorer(task_config) + epochs = task_config.get("epochs", 1) + epochs_reducer = task_config.get("epochs_reducer", "mean") + + return Task( + dataset=dataset, + solver=solvers, + scorer=scorer, + name=task_config["name"], + epochs=Epochs(epochs, epochs_reducer), + ) + + +def create_task_function(repo_id: str, revision: str = "main"): + """Factory function to create a task function with proper closure.""" + # read yaml from hf filesystem + yaml_path = hf_hub_download(repo_id=repo_id, filename="eval.yaml", repo_type="dataset", revision=revision) + + with open(yaml_path, "r") as f: + global_config = yaml.safe_load(f) + + task_config = global_config["tasks"][0] + + @task + def task_func(): + return create_task_from_config(repo_id, revision, task_config, global_config) + + return task_func diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py index a8402df82..06d874387 100644 --- a/src/lighteval/main_inspect.py +++ b/src/lighteval/main_inspect.py @@ -28,6 +28,7 @@ import requests from huggingface_hub import HfApi from inspect_ai import Epochs, Task, task +from inspect_ai import eval as inspect_ai_eval from inspect_ai import eval_set as inspect_ai_eval_set from inspect_ai.dataset import hf_dataset from inspect_ai.log import bundle_log_dir @@ -37,6 +38,7 @@ from typer import Argument, Option from typing_extensions import Annotated +from lighteval.from_hub import create_task_function from lighteval.models.abstract_model import InspectAIModelConfig from lighteval.tasks.lighteval_task import LightevalTaskConfig @@ -520,6 +522,12 @@ def eval( # noqa C901 print("run 'inspect view' to view the results") +def from_hub(model: str, repo_id: str, limit: int = 100, revision: str = "main"): + task = create_task_function(repo_id, revision) + model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct" + inspect_ai_eval(tasks=[task], model=model, 
limit=100) + + def bundle(log_dir: str, output_dir: str, overwrite: bool = True, repo_id: str | None = None, public: bool = False): bundle_log_dir(log_dir=log_dir, output_dir=output_dir, overwrite=overwrite) From d29845837f7bbfce7849d82cc79a4439a52c70ae Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 2 Dec 2025 14:38:25 +0100 Subject: [PATCH 2/6] use a eval.yaml from the hub --- src/lighteval/from_hub.py | 152 ++++++++++++++++++---------------- src/lighteval/main_inspect.py | 6 +- 2 files changed, 83 insertions(+), 75 deletions(-) diff --git a/src/lighteval/from_hub.py b/src/lighteval/from_hub.py index 37073f95d..44bf8a0b0 100644 --- a/src/lighteval/from_hub.py +++ b/src/lighteval/from_hub.py @@ -1,32 +1,10 @@ -import os -from pathlib import Path +from importlib import import_module from string import ascii_uppercase import yaml from huggingface_hub import hf_hub_download from inspect_ai import Epochs, Task, task from inspect_ai.dataset import FieldSpec, Sample, hf_dataset -from inspect_ai.scorer import choice, exact, match, model_graded_fact -from inspect_ai.solver import ( - chain_of_thought, - generate, - multiple_choice, - prompt_template, - system_message, -) - - -def load_config(yaml_path: str = None) -> dict: - """Load and parse the YAML configuration file.""" - if yaml_path is None: - yaml_path = os.getenv("EVAL_YAML", "eval.yaml") - - yaml_path = Path(yaml_path) - if not yaml_path.is_absolute(): - yaml_path = Path(__file__).parent / yaml_path - - with open(yaml_path, "r") as f: - return yaml.safe_load(f) def record_to_sample(record, field_spec: dict): @@ -51,9 +29,9 @@ def record_to_sample(record, field_spec: dict): return Sample(**sample_kwargs) -def load_dataset(repo_id: str, revision: str = "main", task_config: dict = None, global_config: dict = None): +def load_dataset(repo_id: str, revision: str = "main", task_config: dict = None): """Load dataset based on task configuration.""" - subset = task_config.get("subset") + subset = task_config.get("subset", "default") split = task_config.get("splits", "test") field_spec = task_config["field_spec"] @@ -76,7 +54,7 @@ def load_dataset(repo_id: str, revision: str = "main", task_config: dict = None, sample_fields=FieldSpec( input=field_spec["input"], target=field_spec["target"], - **({k: v for k, v in field_spec.items() if k not in ["input", "target"]}), + metadata=field_spec.get("metadata", []), ), ) @@ -84,66 +62,96 @@ def load_dataset(repo_id: str, revision: str = "main", task_config: dict = None, def build_solvers(task_config: dict): - """Build solvers list from task configuration.""" + """ + Build a list of solvers from the task configuration. + + task_config example: + + ```yaml + solvers: + - name: prompt_template + args: + template: > + You are a helpful assistant. 
+ {prompt} + - name: generate + args: + cache: true + ``` + + + """ solvers = [] - solver_names = task_config.get("solvers", []) - - for solver_name in solver_names: - if solver_name == "prompt_template": - if "prompt_template" in task_config and task_config["prompt_template"]: - template = task_config["prompt_template"].strip().strip('"') - template = template.replace("{{prompt}}", "{prompt}") - solvers.append(prompt_template(template)) - elif solver_name == "system_message": - if "system_message" in task_config and task_config["system_message"]: - sys_msg = task_config["system_message"].strip().strip('"') - solvers.append(system_message(sys_msg)) - elif solver_name == "chain_of_thought": - solvers.append(chain_of_thought()) - elif solver_name == "multiple_choice": - solvers.append(multiple_choice()) - elif solver_name == "generate": - solvers.append(generate()) + solver_configs = task_config.get("solvers", []) + solver_module = import_module("inspect_ai.solver") - return solvers + for solver_config in solver_configs: + solver_name = solver_config["name"] + if not hasattr(solver_module, solver_name): + raise ValueError(f"Unknown solver: {solver_name}") -def build_scorer(task_config: dict): - """Build scorer from task configuration.""" - scorer_name = task_config.get("scorers", ["choice"])[0] - - if scorer_name == "choice": - return choice() - elif scorer_name == "exact": - return exact() - elif scorer_name == "match": - return match() - elif scorer_name == "model_graded_fact": - return model_graded_fact() - else: - raise ValueError(f"Unknown scorer: {scorer_name}") + solver_fn = getattr(solver_module, solver_name) + solvers.append(solver_fn(**solver_config.get("args", {}))) + + return solvers -def create_task_from_config( - repo_id: str, revision: str = "main", task_config: dict = None, global_config: dict = None -): +def build_scorer(task_config: dict): + """ + Build a scorer from the task configuration. 
+ task_config example: + + ```yaml + scorers: + - name: model_graded_fact + args: + template: | + grade this, + + question: + {question} + criterion: + {criterion} + answer: + {answer} + ``` + """ + scorers = [] + scorer_configs = task_config.get("scorers", []) + scorer_module = import_module("inspect_ai.scorer") + + for scorer_config in scorer_configs: + scorer_name = scorer_config["name"] + + if not hasattr(scorer_module, scorer_name): + raise ValueError(f"Unknown scorer: {scorer_name}") + + scorer_fn = getattr(scorer_module, scorer_name) + scorers.append(scorer_fn(**scorer_config.get("args", {}))) + + return scorers + + +@task +def create_task_from_config(repo_id: str, revision: str = "main", task_config: dict = None): """Create an inspect.ai Task from a task configuration.""" - dataset = load_dataset(repo_id, revision, task_config, global_config) + dataset = load_dataset(repo_id, revision, task_config) solvers = build_solvers(task_config) - scorer = build_scorer(task_config) + scorers = build_scorer(task_config) epochs = task_config.get("epochs", 1) epochs_reducer = task_config.get("epochs_reducer", "mean") return Task( dataset=dataset, solver=solvers, - scorer=scorer, + scorer=scorers, name=task_config["name"], epochs=Epochs(epochs, epochs_reducer), ) -def create_task_function(repo_id: str, revision: str = "main"): +def create_task_function(repo_id: str, revision: str = "main") -> list: """Factory function to create a task function with proper closure.""" # read yaml from hf filesystem yaml_path = hf_hub_download(repo_id=repo_id, filename="eval.yaml", repo_type="dataset", revision=revision) @@ -151,10 +159,10 @@ def create_task_function(repo_id: str, revision: str = "main"): with open(yaml_path, "r") as f: global_config = yaml.safe_load(f) - task_config = global_config["tasks"][0] + task_configs = global_config["tasks"] - @task - def task_func(): - return create_task_from_config(repo_id, revision, task_config, global_config) + tasks = [] + for task_config in task_configs: + tasks.append(create_task_from_config(repo_id, revision, task_config)) - return task_func + return tasks diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py index 06d874387..8cd2e03cb 100644 --- a/src/lighteval/main_inspect.py +++ b/src/lighteval/main_inspect.py @@ -522,10 +522,10 @@ def eval( # noqa C901 print("run 'inspect view' to view the results") -def from_hub(model: str, repo_id: str, limit: int = 100, revision: str = "main"): +def from_hub(repo_id: str, models: list[str], limit: int = 100, revision: str = "main"): task = create_task_function(repo_id, revision) - model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct" - inspect_ai_eval(tasks=[task], model=model, limit=100) + + inspect_ai_eval(tasks=task, model=models, limit=limit) def bundle(log_dir: str, output_dir: str, overwrite: bool = True, repo_id: str | None = None, public: bool = False): From 023a311c02019b1c87dcca34e0153b4f1d0a3e5a Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 2 Dec 2025 16:20:09 +0100 Subject: [PATCH 3/6] use a eval.yaml from the hub --- src/lighteval/from_hub.py | 58 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/src/lighteval/from_hub.py b/src/lighteval/from_hub.py index 44bf8a0b0..753f9f4fc 100644 --- a/src/lighteval/from_hub.py +++ b/src/lighteval/from_hub.py @@ -8,23 +8,29 @@ def record_to_sample(record, field_spec: dict): - """Convert a dataset record to a Sample based on field_spec.""" + """ + Used for multiple choice tasks 
because we often need to convert numeric + labels to letters for the target. + """ input_text = record[field_spec["input"]] - # Handle target - convert numeric labels to letters for multiple choice - target_letter = ascii_uppercase[record[field_spec["target"]]] + target = record[field_spec["target"]] + if isinstance(target, int): + target = ascii_uppercase[target] - # Get choices if specified - choices_list = None - if "choices" in field_spec: - choices_list = [record[choice_field] for choice_field in field_spec["choices"]] + choices_list = record[field_spec["choices"]] + + metadata = field_spec.get("metadata", None) + + if metadata: + metadata = {name: record[name] for name in metadata} sample_kwargs = { "input": input_text, - "target": target_letter, + "target": target, + "choices": choices_list, + "metadata": metadata, } - if choices_list: - sample_kwargs["choices"] = choices_list return Sample(**sample_kwargs) @@ -35,28 +41,20 @@ def load_dataset(repo_id: str, revision: str = "main", task_config: dict = None) split = task_config.get("splits", "test") field_spec = task_config["field_spec"] - # Use custom function if choices are specified (for multiple choice with label conversion) if "choices" in field_spec: - dataset = hf_dataset( - path=repo_id, - revision=revision, - name=subset, - split=split, - sample_fields=lambda record: record_to_sample(record, field_spec), - ) + + def sample_fields(record): + return record_to_sample(record, field_spec) else: - # For non-multiple-choice, use FieldSpec - dataset = hf_dataset( - path=repo_id, - revision=revision, - name=subset, - split=split, - sample_fields=FieldSpec( - input=field_spec["input"], - target=field_spec["target"], - metadata=field_spec.get("metadata", []), - ), - ) + sample_fields = FieldSpec(**field_spec) + + dataset = hf_dataset( + path=repo_id, + revision=revision, + name=subset, + split=split, + sample_fields=sample_fields, + ) return dataset From c0e60040965e63dc008920e2477dceb618b22bd5 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 4 Dec 2025 11:51:15 +0100 Subject: [PATCH 4/6] use a eval.yaml from the hub --- src/lighteval/from_hub.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lighteval/from_hub.py b/src/lighteval/from_hub.py index 753f9f4fc..1d28b98de 100644 --- a/src/lighteval/from_hub.py +++ b/src/lighteval/from_hub.py @@ -15,6 +15,7 @@ def record_to_sample(record, field_spec: dict): input_text = record[field_spec["input"]] target = record[field_spec["target"]] + if isinstance(target, int): target = ascii_uppercase[target] From 273d4216c7a2b4ed8fc9b7aedbffd81fe7a96025 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 4 Dec 2025 15:38:06 +0100 Subject: [PATCH 5/6] use a eval.yaml from the hub --- src/lighteval/__main__.py | 1 - src/lighteval/from_hub.py | 14 ++++++++++++-- src/lighteval/main_inspect.py | 28 +++++++++++++++------------- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index 01a33b53e..694a76fe5 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -71,7 +71,6 @@ app.command(rich_help_panel="Evaluation Backends")(lighteval.main_custom.custom) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_sglang.sglang) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_inspect.eval) -app.command(rich_help_panel="Evaluation Backends")(lighteval.main_inspect.from_hub) app.command(rich_help_panel="EvaluationUtils")(lighteval.main_inspect.bundle) app.add_typer( 
lighteval.main_endpoint.app, diff --git a/src/lighteval/from_hub.py b/src/lighteval/from_hub.py index 1d28b98de..4a0018428 100644 --- a/src/lighteval/from_hub.py +++ b/src/lighteval/from_hub.py @@ -14,12 +14,22 @@ def record_to_sample(record, field_spec: dict): """ input_text = record[field_spec["input"]] - target = record[field_spec["target"]] + target = field_spec["target"] + + if target in ascii_uppercase: + target = target + else: + target = record[field_spec["target"]] if isinstance(target, int): target = ascii_uppercase[target] - choices_list = record[field_spec["choices"]] + choices = field_spec["choices"] + + if isinstance(choices, list): + choices_list = [record[choice] for choice in choices] + else: + choices_list = record[choices] metadata = field_spec.get("metadata", None) diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py index 8cd2e03cb..e599a7ac6 100644 --- a/src/lighteval/main_inspect.py +++ b/src/lighteval/main_inspect.py @@ -28,7 +28,6 @@ import requests from huggingface_hub import HfApi from inspect_ai import Epochs, Task, task -from inspect_ai import eval as inspect_ai_eval from inspect_ai import eval_set as inspect_ai_eval_set from inspect_ai.dataset import hf_dataset from inspect_ai.log import bundle_log_dir @@ -215,6 +214,7 @@ def eval( # noqa C901 models: Annotated[list[str], Argument(help="Models to evaluate")], tasks: Annotated[str, Argument(help="Tasks to evaluate")], # model arguments + revision: Annotated[str, Option(help="Revision of the benchmark repo on the hub")] = "main", model_base_url: Annotated[ str | None, Option( @@ -430,15 +430,23 @@ def eval( # noqa C901 ), ] = False, ): + from huggingface_hub import HfApi + from lighteval.tasks.registry import Registry - registry = Registry(tasks=tasks, custom_tasks=None, load_multilingual=False) - task_configs = registry.task_to_configs - inspect_ai_tasks = [] + if "/" in tasks: + api = HfApi() + print(f"Loading tasks from dataset repository {tasks}...") + api.repo_info(repo_id=tasks, repo_type="dataset", revision=revision) + inspect_ai_tasks = create_task_function(tasks, revision) + else: + registry = Registry(tasks=tasks, custom_tasks=None, load_multilingual=False) + task_configs = registry.task_to_configs + inspect_ai_tasks = [] - for task_name, task_configs in task_configs.items(): - for task_config in task_configs: - inspect_ai_tasks.append(get_inspect_ai_task(task_config, epochs=epochs, epochs_reducer=epochs_reducer)) + for task_name, task_configs in task_configs.items(): + for task_config in task_configs: + inspect_ai_tasks.append(get_inspect_ai_task(task_config, epochs=epochs, epochs_reducer=epochs_reducer)) if model_args is not None: model_args = InspectAIModelConfig._parse_args(model_args) @@ -522,12 +530,6 @@ def eval( # noqa C901 print("run 'inspect view' to view the results") -def from_hub(repo_id: str, models: list[str], limit: int = 100, revision: str = "main"): - task = create_task_function(repo_id, revision) - - inspect_ai_eval(tasks=task, model=models, limit=limit) - - def bundle(log_dir: str, output_dir: str, overwrite: bool = True, repo_id: str | None = None, public: bool = False): bundle_log_dir(log_dir=log_dir, output_dir=output_dir, overwrite=overwrite) From e425fc699b7ecc62900b9417bd57f74374f94a3c Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Thu, 4 Dec 2025 15:44:44 +0100 Subject: [PATCH 6/6] use a eval.yaml from the hub --- src/lighteval/from_hub.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lighteval/from_hub.py b/src/lighteval/from_hub.py index 
4a0018428..b30faf60c 100644 --- a/src/lighteval/from_hub.py +++ b/src/lighteval/from_hub.py @@ -67,6 +67,9 @@ def sample_fields(record): sample_fields=sample_fields, ) + if task_config.get("shuffle_choices", False): + dataset.shuffle_choices() + return dataset
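
The patches above expect an `eval.yaml` at the root of the dataset repository on the Hub. Below is a rough sketch of the schema that `create_task_function` and `load_dataset` consume once the whole series is applied; task, column, and repo names are illustrative placeholders, and solver/scorer names are simply resolved against `inspect_ai.solver` and `inspect_ai.scorer`.

```yaml
# eval.yaml — one entry per task; the defaults shown match the .get() fallbacks in from_hub.py
tasks:
  - name: my_mcq_task          # required; becomes the inspect_ai Task name
    subset: default            # dataset config name, defaults to "default"
    splits: test               # split to load, defaults to "test"
    field_spec:
      input: question          # column holding the prompt text
      target: answer           # column with an int or letter label; ints are mapped to A, B, C, ...
      choices: choices         # a single list column, or a list of columns (one per option)
      metadata: [category]     # optional columns copied onto each Sample's metadata
    solvers:                   # each resolved via getattr(inspect_ai.solver, name)(**args)
      - name: multiple_choice
    scorers:                   # each resolved via getattr(inspect_ai.scorer, name)(**args)
      - name: choice
    epochs: 1                  # defaults to 1
    epochs_reducer: mean       # defaults to "mean"
    shuffle_choices: false     # added in the last patch; shuffles answer options when true
```

After the fifth patch, passing a repo id containing a `/` as the tasks argument of the existing `eval` command routes through `create_task_function` instead of the task registry — for example something like `lighteval eval <model> my-org/my-eval --revision main`, assuming the command keeps the `eval` name.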