603 changes: 603 additions & 0 deletions .agents/docs/llm-pool-eval-migration.md

Large diffs are not rendered by default.
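The migration guide itself is not rendered here, but the shape of the new pool-based configuration can be read off the call sites changed below. A minimal sketch, assuming only the classes and fields those call sites actually pass (`evaluation_path` and the task list are placeholders; real steps use `this_output_path()` and concrete `EvalTaskConfig` entries):

```python
from fray.cluster.base import ResourceConfig
from marin.evaluation.evaluation_config import (
    EvaluationConfig,
    InferencePoolConfig,
    ModelConfig,
    infer_device_from_resource_config,
)


def build_helm_eval_config(
    model_name: str, model_path: str, resource_config: ResourceConfig
) -> EvaluationConfig:
    # Device (tpu/gpu/cpu) is derived from the pool's resource config.
    device = infer_device_from_resource_config(resource_config)
    model_config = ModelConfig(
        name=model_name,
        path=model_path,
        device=device,
        engine_kwargs={},
        apply_chat_template=False,
    )
    # The pool owns both the hardware request and the model it serves.
    pool_config = InferencePoolConfig(
        resource_config=resource_config,
        model_config=model_config,
    )
    return EvaluationConfig(
        evaluator="helm",
        pool_config=pool_config,
        model_name=model_name,
        model_path=model_path,
        evaluation_path="/tmp/helm-eval",  # placeholder output path
        evals=[],  # placeholder; pass concrete EvalTaskConfig entries
    )
```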

2 changes: 1 addition & 1 deletion data_browser/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.11"
 dependencies = [
     "py7zr>=0.22.0",
     "fsspec>=2024.9.0",
-    "zstandard>=0.23.0",
+    "zstandard>=0.18.0",
     "flask>=3.1.1",
     "pyarrow>=17.0.0",
     "gcsfs>=2024.9.0.post1",
105 changes: 83 additions & 22 deletions experiments/evals/evals.py
@@ -18,8 +18,26 @@

 import logging
 
+from fray.cluster.base import ResourceConfig
+from marin.evaluation.evaluation_config import (
+    EvalTaskConfig,
+    EvaluationConfig,
+    InferencePoolConfig,
+    ModelConfig,
+    infer_device_from_resource_config,
+)
+from marin.evaluation.run import evaluate
+from marin.execution.executor import (
+    ExecutorStep,
+    InputName,
+    get_executor_step,
+    output_path_of,
+    this_output_path,
+    versioned,
+)
+
 from experiments.evals.engine_configs import DEFAULT_LM_EVAL_MODEL_KWARGS
-from experiments.evals.resource_configs import SINGLE_TPU_V4_8, SINGLE_TPU_V6E_8, ResourceConfig
+from experiments.evals.resource_configs import SINGLE_TPU_V4_8, SINGLE_TPU_V6E_8
 from experiments.evals.task_configs import (
     BASE_GENERATION_TASKS,
     CORE_TASKS,
@@ -32,68 +50,111 @@
     OPEN_LM_LEADERBOARD_GEN,
     OPEN_LM_LEADERBOARD_MCQ,
 )
-from marin.evaluation.evaluation_config import EvalTaskConfig, EvaluationConfig
-from marin.evaluation.run import evaluate
-from marin.execution.executor import (
-    ExecutorStep,
-    InputName,
-    get_executor_step,
-    output_path_of,
-    this_output_path,
-    versioned,
-)
 
 logger = logging.getLogger(__name__)
 
 
-def evaluate_helm(model_name: str, model_path: str, evals: list[EvalTaskConfig]) -> ExecutorStep:
-    """
-    Create an ExecutorStep to evaluate the model using HELM.
+def evaluate_helm(
+    model_name: str,
+    model_path: str,
+    evals: list[EvalTaskConfig],
+    resource_config: ResourceConfig,
+    max_eval_instances: int | None = None,
+    engine_kwargs: dict | None = None,
+) -> ExecutorStep:
+    """Create an ExecutorStep to evaluate the model using HELM.
 
     Args:
-        model_name (str): Name of the model.
-        model_path (str): Path to the model.
-        evals (list[str]): List of evaluations to run with HELM, e.g, ["mmlu", "lite"].
+        model_name: Name of the model
+        model_path: Path to the model
+        evals: List of evaluations to run with HELM, e.g., ["mmlu", "lite"]
+        resource_config: Fray ResourceConfig for pool workers (TPU/GPU resources)
+        max_eval_instances: Maximum number of evaluation instances to run
+        engine_kwargs: Additional keyword arguments to pass to the vLLM engine
     """
+    device = infer_device_from_resource_config(resource_config)
+
+    model_config = ModelConfig(
+        name=model_name,
+        path=model_path,
+        device=device,
+        engine_kwargs=engine_kwargs or {},
+        apply_chat_template=False,
+    )
+
+    pool_config = InferencePoolConfig(
+        resource_config=resource_config,
+        model_config=model_config,
+    )
+
     return ExecutorStep(
         name=f"evaluation/helm/{model_name}",
         fn=evaluate,
         config=EvaluationConfig(
             evaluator="helm",
+            pool_config=pool_config,
             model_name=model_name,
             model_path=model_path,
             evaluation_path=this_output_path(),
             evals=evals,
+            max_eval_instances=max_eval_instances,
         ),
         pip_dependency_groups=["eval"],
     )
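For reference, a minimal invocation of the new signature might look like this sketch; the model identifiers and task mirror the smoke test added later in this PR, and `SINGLE_TPU_V6E_8` is one of the resource configs this module already imports (its choice here is illustrative):

```python
from experiments.evals.evals import evaluate_helm
from experiments.evals.resource_configs import SINGLE_TPU_V6E_8
from experiments.evals.task_configs import EvalTaskConfig

# Values mirror experiments/evals/test_helm_migration.py below;
# the specific resource config is an illustrative assumption.
step = evaluate_helm(
    model_name="HuggingFaceTB/SmolLM2-135M",
    model_path="HuggingFaceTB/SmolLM2-135M",
    evals=[EvalTaskConfig(name="mmlu", num_fewshot=0)],
    resource_config=SINGLE_TPU_V6E_8,
    max_eval_instances=10,
)
```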


 def evaluate_helm_on_step(
-    step: ExecutorStep | InputName, evals: list[EvalTaskConfig], max_eval_instances: int | None = None
+    step: ExecutorStep | InputName,
+    evals: list[EvalTaskConfig],
+    resource_config: ResourceConfig,
+    max_eval_instances: int | None = None,
+    engine_kwargs: dict | None = None,
 ) -> ExecutorStep:
-    """
-    Create an ExecutorStep to evaluate the model using HELM on a step.
+    """Create an ExecutorStep to evaluate the model using HELM on a training step.
 
     Args:
-        step (ExecutorStep | InputName): Executor Step to evaluate.
-        evals (list[str]): List of evaluations to run with HELM, e.g, ["mmlu", "lite"].
+        step: Executor Step to evaluate
+        evals: List of evaluations to run with HELM, e.g., ["mmlu", "lite"]
+        resource_config: Fray ResourceConfig for pool workers (TPU/GPU resources)
+        max_eval_instances: Maximum number of evaluation instances to run
+        engine_kwargs: Additional keyword arguments to pass to the vLLM engine
     """
     # TODO: support evaluating all checkpoints in a run
     executor_step = get_executor_step(step)
     model_step_path = output_path_of(executor_step)
 
+    # Auto-detect device from resource config
+    device = infer_device_from_resource_config(resource_config)
+
+    # Build ModelConfig (model name will be imputed from path)
+    model_config = ModelConfig(
+        name=executor_step.name,
+        path=model_step_path,  # type: ignore
+        device=device,
+        engine_kwargs=engine_kwargs or {},
+        apply_chat_template=False,
+    )
+
+    # Build InferencePoolConfig
+    pool_config = InferencePoolConfig(
+        resource_config=resource_config,
+        model_config=model_config,
+    )
+
     return ExecutorStep(
         name=f"evaluation/helm/{executor_step.name}",
         fn=evaluate,
         config=EvaluationConfig(
             evaluator="helm",
+            pool_config=pool_config,
             model_name=None,
             model_path=model_step_path,  # type: ignore
             evaluation_path=this_output_path(),
             evals=evals,
             discover_latest_checkpoint=True,
             max_eval_instances=max_eval_instances,
         ),
         pip_dependency_groups=["eval"],
     )
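The step-based variant is not exercised by the new smoke test, so here is a hedged sketch of how it might be wired; `train_step` is a hypothetical upstream training `ExecutorStep`, and the other values are illustrative:

```python
from experiments.evals.evals import evaluate_helm_on_step
from experiments.evals.resource_configs import SINGLE_TPU_V4_8
from experiments.evals.task_configs import EvalTaskConfig

# Hypothetical: `train_step` stands in for an existing training ExecutorStep
# produced elsewhere in the pipeline.
eval_step = evaluate_helm_on_step(
    step=train_step,
    evals=[EvalTaskConfig(name="mmlu", num_fewshot=0)],
    resource_config=SINGLE_TPU_V4_8,
    max_eval_instances=100,
)
```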


55 changes: 55 additions & 0 deletions experiments/evals/test_helm_migration.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+# Copyright 2025 The Marin Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test HELM evaluation with new pool-based architecture."""
+
+import os
+
+from fray.cluster.base import CpuConfig, ResourceConfig, TpuConfig
+from marin.execution.executor import executor_main
+
+from experiments.evals.evals import evaluate_helm
+from experiments.evals.task_configs import EvalTaskConfig
+
+backend_type = os.environ.get("backend_type", "tpu")
+
+if backend_type == "cpu":
+    resource_config = ResourceConfig(
+        cpu=1,
+        ram="1g",
+        disk="10g",
+        device=CpuConfig(),
+        replicas=1,
+    )
+else:
+    resource_config = ResourceConfig(
+        cpu=1,
+        ram="16g",
+        disk="10g",
+        device=TpuConfig(type="v5litepod-4", count=4),
+        replicas=1,
+        regions=["eu-west4"],
+    )
+
+step = evaluate_helm(
+    model_name="HuggingFaceTB/SmolLM2-135M",
+    model_path="HuggingFaceTB/SmolLM2-135M",
+    evals=[EvalTaskConfig(name="mmlu", num_fewshot=0)],
+    resource_config=resource_config,
+    max_eval_instances=10,
+)
+
+if __name__ == "__main__":
+    executor_main(steps=[step])
2 changes: 1 addition & 1 deletion lib/fray/README.md
@@ -98,7 +98,7 @@ tpu_request = JobRequest(
         device=TpuConfig(type="v5e-16", count=8),
     ),
     environment=create_environment(
-        extra_dependency_groups=["tpu"],
+        extras=["tpu"],
         env_vars={"WANDB_API_KEY": "your-key"},
     ),
 )
2 changes: 1 addition & 1 deletion lib/fray/pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
"mergedeep",
"pyyaml>=6.0",
"typing-extensions>=4.0",
"zstandard>=0.22.0",
"zstandard>=0.18.0",
]

[project.scripts]
23 changes: 23 additions & 0 deletions lib/fray/src/fray/__init__.py
@@ -14,6 +14,19 @@

"""Fray: Execution contexts for distributed and parallel computing."""

from fray.cluster import (
Cluster,
CpuConfig,
Entrypoint,
EnvironmentConfig,
GpuConfig,
JobId,
JobRequest,
LocalCluster,
ResourceConfig,
TpuConfig,
)
from fray.isolated_env import TemporaryVenv
from fray.job_context import (
ContextConfig,
ExecutionContext,
@@ -24,10 +37,20 @@
 )
 
 __all__ = [
+    "Cluster",
     "ContextConfig",
+    "CpuConfig",
+    "Entrypoint",
+    "EnvironmentConfig",
     "ExecutionContext",
+    "JobId",
+    "JobRequest",
+    "LocalCluster",
     "RayContext",
+    "ResourceConfig",
     "SyncContext",
+    "TemporaryVenv",
     "ThreadContext",
+    "TpuConfig",
     "create_context",
 ]
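With these re-exports, downstream code can pull the cluster primitives from the package root instead of `fray.cluster.base`. A minimal sketch, reusing the CPU configuration shape from the smoke test above:

```python
from fray import CpuConfig, ResourceConfig

# Same fields as the CPU branch of test_helm_migration.py, but imported
# via the new top-level re-exports.
resource_config = ResourceConfig(
    cpu=1,
    ram="1g",
    disk="10g",
    device=CpuConfig(),
    replicas=1,
)
```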
2 changes: 1 addition & 1 deletion lib/fray/src/fray/cli.py
@@ -115,7 +115,7 @@ def submit(ctx, extra, cpus, memory, disk, tpu, gpu, gpu_count, env, auto_stop,

     env_config = EnvironmentConfig(
         workspace=os.getcwd(),
-        extra_dependency_groups=extra_groups,
+        extras=extra_groups,
         env_vars=env_dict,
     )
