1414
1515import logging
1616import os
17+ import subprocess
1718import tempfile
1819import traceback
1920from pathlib import Path
4546logger = logging .getLogger (__name__ )
4647
4748
def ensure_file_downloaded (url : str , target_path : str ) -> None :
    """Download ``url`` to ``target_path`` unless the file already exists.

    The response body is streamed to a temporary file in the destination
    directory and atomically renamed into place, so a failed or interrupted
    transfer never leaves a truncated file at ``target_path`` (which the
    existence check below would otherwise mistake for a complete download).

    Args:
        url: HTTP(S) URL to fetch.
        target_path: Local filesystem path where the file should end up.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if connecting or reading stalls past the timeout.
    """
    if os .path .exists (target_path ):
        return

    # Lazy import: keeps module import cheap for callers that never download.
    import requests

    # stream=True avoids buffering the whole body in memory; the
    # (connect, read) timeout bounds the wait so a stalled server
    # cannot hang the caller forever.
    with requests .get (url , stream = True , timeout = (10 , 300 )) as response :
        response .raise_for_status ()
        # Write to a sibling temp file, then rename. os.replace is atomic on
        # the same filesystem, so readers never observe a half-written file.
        fd , tmp_path = tempfile .mkstemp (dir = os .path .dirname (target_path ) or "." )
        try :
            with os .fdopen (fd , "wb" ) as f :
                for chunk in response .iter_content (chunk_size = 1 << 20 ):
                    f .write (chunk )
            os .replace (tmp_path , target_path )
        except BaseException :
            # Best-effort cleanup of the partial temp file on any failure.
            try :
                os .remove (tmp_path )
            except OSError :
                pass
            raise
59+
60+
4861def write_model_config_files (model : ModelConfig , base_url : str , prod_env_path : Path ) -> None :
4962 """
5063 Write out the necessary model configuration files for HELM.
@@ -56,9 +69,7 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
5669 model_name : str = model .name
5770 # Use model.path for loading from HuggingFace, fallback to model.name if path is None
5871 model_path_or_name : str = model .path or model .name
59- print (f"Loading tokenizer for model: { model_path_or_name } " , flush = True )
6072 tokenizer = AutoTokenizer .from_pretrained (model_path_or_name , trust_remote_code = True )
61- print (f"Tokenizer loaded, max_length: { tokenizer .model_max_length } " , flush = True )
6273
6374 content : dict = {
6475 "model_deployments" : [
@@ -77,7 +88,6 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
7788 ]
7889 }
7990 deployments_path = prod_env_path / MODEL_DEPLOYMENTS_FILE_PATH
80- print (f"Writing model_deployments to { deployments_path } " , flush = True )
8191 write_yaml (content , deployments_path )
8292
8393 content = {
@@ -94,7 +104,6 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
94104 ]
95105 }
96106 metadata_path = prod_env_path / MODEL_METADATA_FILE_PATH
97- print (f"Writing model_metadata to { metadata_path } " , flush = True )
98107 write_yaml (content , metadata_path )
99108
100109 content = {
@@ -123,9 +132,7 @@ def get_runtime_env(self) -> dict:
123132 """
124133 Returns the runtime environment to run the evaluator on the Ray cluster.
125134 """
126- return build_runtime_env_for_packages (
127- extra = ["eval" , "tpu" ], pip_packages = ["crfm-helm@git+https://github.com/stanford-crfm/helm.git@local_vllm" ]
128- )
135+ return build_runtime_env_for_packages (extra = ["eval" , "tpu" ])
129136
130137 def evaluate (
131138 self ,
@@ -155,9 +162,9 @@ def evaluate(
155162 prod_env_path = Path (results_path ) / "prod_env"
156163 results_folder = Path (results_path ) / "run" / "results"
157164
158- try :
159- from helm .common .general import ensure_file_downloaded
165+ subprocess .check_call (["uv" , "pip" , "install" , "crfm-helm@git+https://github.com/stanford-crfm/helm.git" ])
160166
167+ try :
161168 # Download the run_entries files and schema files for the specified evals
162169 assert len (evals ) > 0 , "Please specify at least one eval to run."
163170 run_entries_files : list [str ] = []
@@ -266,9 +273,7 @@ def evaluate(
266273 },
267274 ),
268275 resources = ResourceConfig (cpu = 1 , ram = "4g" , device = CpuConfig (), replicas = 1 ),
269- environment = create_environment (
270- pip_packages = ["crfm-helm@git+https://github.com/stanford-crfm/helm.git@local_vllm" ], extras = ["eval" ]
271- ),
276+ environment = create_environment (extras = ["eval" ]),
272277 )
273278 job_id = cluster .launch (job_request )
274279 logger .info ("Started Helm task with job id %s" , job_id )
0 commit comments