603 changes: 603 additions & 0 deletions .agents/docs/llm-pool-eval-migration.md

Large diffs are not rendered by default.
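The migration guide itself is not rendered here, but the shape of the new pool-based configuration can be read off the call sites changed below. A minimal sketch, assuming only the classes and fields those call sites actually pass (`evaluation_path` and the task list are placeholders; real steps use `this_output_path()` and concrete `EvalTaskConfig` entries):

```python
from fray.cluster.base import ResourceConfig
from marin.evaluation.evaluation_config import (
    EvaluationConfig,
    InferencePoolConfig,
    ModelConfig,
    infer_device_from_resource_config,
)


def build_helm_eval_config(
    model_name: str, model_path: str, resource_config: ResourceConfig
) -> EvaluationConfig:
    # Device (tpu/gpu/cpu) is derived from the pool's resource config.
    device = infer_device_from_resource_config(resource_config)
    model_config = ModelConfig(
        name=model_name,
        path=model_path,
        device=device,
        engine_kwargs={},
        apply_chat_template=False,
    )
    # The pool owns both the hardware request and the model it serves.
    pool_config = InferencePoolConfig(
        resource_config=resource_config,
        model_config=model_config,
    )
    return EvaluationConfig(
        evaluator="helm",
        pool_config=pool_config,
        model_name=model_name,
        model_path=model_path,
        evaluation_path="/tmp/helm-eval",  # placeholder output path
        evals=[],  # placeholder; pass concrete EvalTaskConfig entries
    )
```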

2 changes: 1 addition & 1 deletion data_browser/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.11"
 dependencies = [
     "py7zr>=0.22.0",
     "fsspec>=2024.9.0",
-    "zstandard>=0.23.0",
+    "zstandard>=0.18.0",
     "flask>=3.1.1",
     "pyarrow>=17.0.0",
     "gcsfs>=2024.9.0.post1",
105 changes: 83 additions & 22 deletions experiments/evals/evals.py
@@ -18,8 +18,26 @@

 import logging
 
+from fray.cluster.base import ResourceConfig
+from marin.evaluation.evaluation_config import (
+    EvalTaskConfig,
+    EvaluationConfig,
+    InferencePoolConfig,
+    ModelConfig,
+    infer_device_from_resource_config,
+)
+from marin.evaluation.run import evaluate
+from marin.execution.executor import (
+    ExecutorStep,
+    InputName,
+    get_executor_step,
+    output_path_of,
+    this_output_path,
+    versioned,
+)
+
 from experiments.evals.engine_configs import DEFAULT_LM_EVAL_MODEL_KWARGS
-from experiments.evals.resource_configs import SINGLE_TPU_V4_8, SINGLE_TPU_V6E_8, ResourceConfig
+from experiments.evals.resource_configs import SINGLE_TPU_V4_8, SINGLE_TPU_V6E_8
 from experiments.evals.task_configs import (
     BASE_GENERATION_TASKS,
     CORE_TASKS,
@@ -32,68 +50,111 @@
     OPEN_LM_LEADERBOARD_GEN,
     OPEN_LM_LEADERBOARD_MCQ,
 )
-from marin.evaluation.evaluation_config import EvalTaskConfig, EvaluationConfig
-from marin.evaluation.run import evaluate
-from marin.execution.executor import (
-    ExecutorStep,
-    InputName,
-    get_executor_step,
-    output_path_of,
-    this_output_path,
-    versioned,
-)
 
 logger = logging.getLogger(__name__)
 
 
-def evaluate_helm(model_name: str, model_path: str, evals: list[EvalTaskConfig]) -> ExecutorStep:
-    """
-    Create an ExecutorStep to evaluate the model using HELM.
+def evaluate_helm(
+    model_name: str,
+    model_path: str,
+    evals: list[EvalTaskConfig],
+    resource_config: ResourceConfig,
+    max_eval_instances: int | None = None,
+    engine_kwargs: dict | None = None,
+) -> ExecutorStep:
+    """Create an ExecutorStep to evaluate the model using HELM.
 
     Args:
-        model_name (str): Name of the model.
-        model_path (str): Path to the model.
-        evals (list[str]): List of evaluations to run with HELM, e.g, ["mmlu", "lite"].
+        model_name: Name of the model
+        model_path: Path to the model
+        evals: List of evaluations to run with HELM, e.g., ["mmlu", "lite"]
+        resource_config: Fray ResourceConfig for pool workers (TPU/GPU resources)
+        max_eval_instances: Maximum number of evaluation instances to run
+        engine_kwargs: Additional keyword arguments to pass to the vLLM engine
     """
+    device = infer_device_from_resource_config(resource_config)
+
+    model_config = ModelConfig(
+        name=model_name,
+        path=model_path,
+        device=device,
+        engine_kwargs=engine_kwargs or {},
+        apply_chat_template=False,
+    )
+
+    pool_config = InferencePoolConfig(
+        resource_config=resource_config,
+        model_config=model_config,
+    )
+
     return ExecutorStep(
         name=f"evaluation/helm/{model_name}",
         fn=evaluate,
         config=EvaluationConfig(
             evaluator="helm",
+            pool_config=pool_config,
             model_name=model_name,
             model_path=model_path,
             evaluation_path=this_output_path(),
             evals=evals,
+            max_eval_instances=max_eval_instances,
         ),
         pip_dependency_groups=["eval"],
     )
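For reference, a minimal invocation of the new signature might look like this sketch; the model identifiers and task mirror the smoke test added later in this PR, and `SINGLE_TPU_V6E_8` is one of the resource configs this module already imports (its choice here is illustrative):

```python
from experiments.evals.evals import evaluate_helm
from experiments.evals.resource_configs import SINGLE_TPU_V6E_8
from experiments.evals.task_configs import EvalTaskConfig

# Values mirror experiments/evals/test_helm_migration.py below;
# the specific resource config is an illustrative assumption.
step = evaluate_helm(
    model_name="HuggingFaceTB/SmolLM2-135M",
    model_path="HuggingFaceTB/SmolLM2-135M",
    evals=[EvalTaskConfig(name="mmlu", num_fewshot=0)],
    resource_config=SINGLE_TPU_V6E_8,
    max_eval_instances=10,
)
```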


 def evaluate_helm_on_step(
-    step: ExecutorStep | InputName, evals: list[EvalTaskConfig], max_eval_instances: int | None = None
+    step: ExecutorStep | InputName,
+    evals: list[EvalTaskConfig],
+    resource_config: ResourceConfig,
+    max_eval_instances: int | None = None,
+    engine_kwargs: dict | None = None,
 ) -> ExecutorStep:
-    """
-    Create an ExecutorStep to evaluate the model using HELM on a step.
+    """Create an ExecutorStep to evaluate the model using HELM on a training step.
 
     Args:
-        step (ExecutorStep | InputName): Executor Step to evaluate.
-        evals (list[str]): List of evaluations to run with HELM, e.g, ["mmlu", "lite"].
+        step: Executor Step to evaluate
+        evals: List of evaluations to run with HELM, e.g., ["mmlu", "lite"]
+        resource_config: Fray ResourceConfig for pool workers (TPU/GPU resources)
+        max_eval_instances: Maximum number of evaluation instances to run
+        engine_kwargs: Additional keyword arguments to pass to the vLLM engine
     """
     # TODO: support evaluating all checkpoints in a run
     executor_step = get_executor_step(step)
     model_step_path = output_path_of(executor_step)
 
+    # Auto-detect device from resource config
+    device = infer_device_from_resource_config(resource_config)
+
+    # Build ModelConfig (model name will be imputed from path)
+    model_config = ModelConfig(
+        name=executor_step.name,
+        path=model_step_path,  # type: ignore
+        device=device,
+        engine_kwargs=engine_kwargs or {},
+        apply_chat_template=False,
+    )
+
+    # Build InferencePoolConfig
+    pool_config = InferencePoolConfig(
+        resource_config=resource_config,
+        model_config=model_config,
+    )
+
     return ExecutorStep(
         name=f"evaluation/helm/{executor_step.name}",
         fn=evaluate,
         config=EvaluationConfig(
             evaluator="helm",
+            pool_config=pool_config,
             model_name=None,
             model_path=model_step_path,  # type: ignore
             evaluation_path=this_output_path(),
             evals=evals,
             discover_latest_checkpoint=True,
             max_eval_instances=max_eval_instances,
         ),
         pip_dependency_groups=["eval"],
     )
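The step-based variant is not exercised by the new smoke test, so here is a hedged sketch of how it might be wired; `train_step` is a hypothetical upstream training `ExecutorStep`, and the other values are illustrative:

```python
from experiments.evals.evals import evaluate_helm_on_step
from experiments.evals.resource_configs import SINGLE_TPU_V4_8
from experiments.evals.task_configs import EvalTaskConfig

# Hypothetical: `train_step` stands in for an existing training ExecutorStep
# produced elsewhere in the pipeline.
eval_step = evaluate_helm_on_step(
    step=train_step,
    evals=[EvalTaskConfig(name="mmlu", num_fewshot=0)],
    resource_config=SINGLE_TPU_V4_8,
    max_eval_instances=100,
)
```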


55 changes: 55 additions & 0 deletions experiments/evals/test_helm_migration.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+# Copyright 2025 The Marin Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test HELM evaluation with new pool-based architecture."""
+
+import os
+
+from fray.cluster.base import CpuConfig, ResourceConfig, TpuConfig
+from marin.execution.executor import executor_main
+
+from experiments.evals.evals import evaluate_helm
+from experiments.evals.task_configs import EvalTaskConfig
+
+backend_type = os.environ.get("backend_type", "tpu")
+
+if backend_type == "cpu":
+    resource_config = ResourceConfig(
+        cpu=1,
+        ram="1g",
+        disk="10g",
+        device=CpuConfig(),
+        replicas=1,
+    )
+else:
+    resource_config = ResourceConfig(
+        cpu=1,
+        ram="16g",
+        disk="10g",
+        device=TpuConfig(type="v5litepod-4", count=4),
+        replicas=1,
+        regions=["eu-west4"],
+    )
+
+step = evaluate_helm(
+    model_name="HuggingFaceTB/SmolLM2-135M",
+    model_path="HuggingFaceTB/SmolLM2-135M",
+    evals=[EvalTaskConfig(name="mmlu", num_fewshot=0)],
+    resource_config=resource_config,
+    max_eval_instances=10,
+)
+
+if __name__ == "__main__":
+    executor_main(steps=[step])
2 changes: 1 addition & 1 deletion lib/fray/README.md
@@ -98,7 +98,7 @@ tpu_request = JobRequest(
         device=TpuConfig(type="v5e-16", count=8),
     ),
     environment=create_environment(
-        extra_dependency_groups=["tpu"],
+        extras=["tpu"],
         env_vars={"WANDB_API_KEY": "your-key"},
     ),
 )
2 changes: 1 addition & 1 deletion lib/fray/pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
"mergedeep",
"pyyaml>=6.0",
"typing-extensions>=4.0",
"zstandard>=0.22.0",
"zstandard>=0.18.0",
]

[project.scripts]
23 changes: 23 additions & 0 deletions lib/fray/src/fray/__init__.py
@@ -14,6 +14,19 @@

"""Fray: Execution contexts for distributed and parallel computing."""

from fray.cluster import (
Cluster,
CpuConfig,
Entrypoint,
EnvironmentConfig,
GpuConfig,
JobId,
JobRequest,
LocalCluster,
ResourceConfig,
TpuConfig,
)
from fray.isolated_env import TemporaryVenv
from fray.job_context import (
ContextConfig,
ExecutionContext,
@@ -24,10 +37,20 @@
 )
 
 __all__ = [
+    "Cluster",
     "ContextConfig",
+    "CpuConfig",
+    "Entrypoint",
+    "EnvironmentConfig",
     "ExecutionContext",
+    "JobId",
+    "JobRequest",
+    "LocalCluster",
     "RayContext",
+    "ResourceConfig",
     "SyncContext",
+    "TemporaryVenv",
     "ThreadContext",
+    "TpuConfig",
     "create_context",
 ]
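With these re-exports, downstream code can pull the cluster primitives from the package root instead of `fray.cluster.base`. A minimal sketch, reusing the CPU configuration shape from the smoke test above:

```python
from fray import CpuConfig, ResourceConfig

# Same fields as the CPU branch of test_helm_migration.py, but imported
# via the new top-level re-exports.
resource_config = ResourceConfig(
    cpu=1,
    ram="1g",
    disk="10g",
    device=CpuConfig(),
    replicas=1,
)
```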
2 changes: 1 addition & 1 deletion lib/fray/src/fray/cli.py
@@ -115,7 +115,7 @@ def submit(ctx, extra, cpus, memory, disk, tpu, gpu, gpu_count, env, auto_stop,

     env_config = EnvironmentConfig(
         workspace=os.getcwd(),
-        extra_dependency_groups=extra_groups,
+        extras=extra_groups,
         env_vars=env_dict,
     )
