Skip to content

Commit 5a12d3c

Browse files
committed
Cleanups.
1 parent ce969f6 commit 5a12d3c

File tree

7 files changed

+29
-25
lines changed

7 files changed

+29
-25
lines changed

.agents/docs/llm-pool-eval-migration.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -303,10 +303,8 @@ After local testing passes, test on remote cluster:
303303
uv run lib/marin/src/marin/run/ray_run.py \
304304
--auto-stop \
305305
--cluster eu-west4 \
306-
-e HF_TOKEN=$HF_TOKEN \
307-
-e WANDB_API_KEY=$WANDB_API_KEY \
308306
--extra=eval,tpu \
309-
-- python ./experiments/evals/test_helm_migration.py
307+
-- python ./experiments/evals/test_helm_migration.py --force_run_failed true
310308
```
311309

312310
**Success criteria:**

experiments/evals/evals.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def evaluate_helm(
9999
evals=evals,
100100
max_eval_instances=max_eval_instances,
101101
),
102-
pip_dependency_groups=["eval", "pip:crfm-helm@git+https://github.com/stanford-crfm/helm.git"],
102+
pip_dependency_groups=["eval"],
103103
)
104104

105105

lib/fray/src/fray/cluster/ray/cluster.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,14 +334,16 @@ def _launch_tpu_job(self, request: JobRequest) -> JobId:
334334
runtime_env = self._get_runtime_env(request)
335335

336336
if entrypoint.function_args:
337-
remote_fn = ray.remote(max_calls=1, runtime_env=runtime_env)(entrypoint.callable)(**entrypoint.function_args)
337+
remote_fn = ray.remote(max_calls=1, runtime_env=runtime_env)(
338+
lambda: entrypoint.callable(**entrypoint.function_args)
339+
)
338340
else:
339341
remote_fn = ray.remote(max_calls=1, runtime_env=runtime_env)(entrypoint.callable)
340342

341343
object_ref = run_on_pod_ray.remote(
342344
remote_fn,
343345
tpu_type=device.type,
344-
num_slices=device.num_slices,
346+
num_slices=request.resources.replicas,
345347
max_retries_preemption=10000,
346348
max_retries_failure=10,
347349
)

lib/fray/src/fray/queue/http.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
Queue implemented using a FastAPI server and HTTP requests.
1717
"""
1818

19+
import logging
1920
import pickle
2021
import threading
2122
import time
@@ -25,6 +26,8 @@
2526
from fastapi import Body, FastAPI, Request, Response
2627
from fray.queue.base import Lease, MemoryQueue
2728

29+
logging.getLogger("httpx").setLevel(logging.WARNING)
30+
2831

2932
class ServerThread:
3033
"""Helper class to run uvicorn server in a background thread."""
@@ -72,7 +75,7 @@ def __init__(self, host: str = "0.0.0.0", port: int = 9999):
7275

7376
import uvicorn
7477

75-
config = uvicorn.Config(self.app, host=host, port=port, log_level="error")
78+
config = uvicorn.Config(self.app, host=host, port=port, log_level="error", access_log=False)
7679
self.server = uvicorn.Server(config)
7780
self.server_thread = None
7881

lib/marin/src/marin/evaluation/backends/inference_pool.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ async def handle_inference_request(request: dict[str, Any], endpoint: str) -> di
7171
if lease is None:
7272
if time.time() - start_time > timeout:
7373
raise HTTPException(status_code=504, detail="Request timed out")
74-
time.sleep(0.1)
74+
time.sleep(1.0)
7575
continue
7676

7777
response = lease.item
@@ -116,7 +116,7 @@ async def health() -> dict[str, str]:
116116
return {"status": "ok"}
117117

118118
logger.info(f"Starting OpenAI proxy server at http://{self.host}:{self.port}")
119-
config = uvicorn.Config(app, host=self.host, port=self.port, log_level="info")
119+
config = uvicorn.Config(app, host=self.host, port=self.port, log_level="warning", access_log=False)
120120
self.server = uvicorn.Server(config)
121121
self.server.run()
122122

lib/marin/src/marin/evaluation/evaluators/helm_evaluator.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import logging
1616
import os
17+
import subprocess
1718
import tempfile
1819
import traceback
1920
from pathlib import Path
@@ -45,6 +46,18 @@
4546
logger = logging.getLogger(__name__)
4647

4748

49+
def ensure_file_downloaded(url: str, target_path: str) -> None:
    """Download ``url`` to ``target_path`` unless that path already exists.

    The payload is streamed into a sibling temporary file and moved into place
    only after the download has completed. An interrupted or failed transfer
    therefore never leaves a truncated file at ``target_path`` that a later
    call would mistake for a finished download.

    Args:
        url: HTTP(S) location of the file to fetch.
        target_path: Local destination path. If it already exists the function
            returns immediately without touching the network.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        OSError: If the destination cannot be written.
    """
    if os.path.exists(target_path):
        return

    # Imported lazily so the cached path does not require `requests`.
    import requests

    # The PID suffix keeps concurrent downloaders from clobbering each other's
    # partial files; the final os.replace is atomic on the same filesystem.
    partial_path = f"{target_path}.part.{os.getpid()}"
    try:
        # (connect, read) timeouts: without them a stalled server blocks forever.
        with requests.get(url, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, target_path)
    except BaseException:
        # Best-effort cleanup of the partial file; always re-raise the real error.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise
59+
60+
4861
def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: Path) -> None:
4962
"""
5063
Write out the necessary model configuration files for HELM.
@@ -56,9 +69,7 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
5669
model_name: str = model.name
5770
# Use model.path for loading from HuggingFace, fallback to model.name if path is None
5871
model_path_or_name: str = model.path or model.name
59-
print(f"Loading tokenizer for model: {model_path_or_name}", flush=True)
6072
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name, trust_remote_code=True)
61-
print(f"Tokenizer loaded, max_length: {tokenizer.model_max_length}", flush=True)
6273

6374
content: dict = {
6475
"model_deployments": [
@@ -77,7 +88,6 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
7788
]
7889
}
7990
deployments_path = prod_env_path / MODEL_DEPLOYMENTS_FILE_PATH
80-
print(f"Writing model_deployments to {deployments_path}", flush=True)
8191
write_yaml(content, deployments_path)
8292

8393
content = {
@@ -94,7 +104,6 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
94104
]
95105
}
96106
metadata_path = prod_env_path / MODEL_METADATA_FILE_PATH
97-
print(f"Writing model_metadata to {metadata_path}", flush=True)
98107
write_yaml(content, metadata_path)
99108

100109
content = {
@@ -123,9 +132,7 @@ def get_runtime_env(self) -> dict:
123132
"""
124133
Returns the runtime environment to run the evaluator on the Ray cluster.
125134
"""
126-
return build_runtime_env_for_packages(
127-
extra=["eval", "tpu"], pip_packages=["crfm-helm@git+https://github.com/stanford-crfm/helm.git@local_vllm"]
128-
)
135+
return build_runtime_env_for_packages(extra=["eval", "tpu"])
129136

130137
def evaluate(
131138
self,
@@ -155,9 +162,9 @@ def evaluate(
155162
prod_env_path = Path(results_path) / "prod_env"
156163
results_folder = Path(results_path) / "run" / "results"
157164

158-
try:
159-
from helm.common.general import ensure_file_downloaded
165+
subprocess.check_call(["uv", "pip", "install", "crfm-helm@git+https://github.com/stanford-crfm/helm.git"])
160166

167+
try:
161168
# Download the run_entries files and schema files for the specified evals
162169
assert len(evals) > 0, "Please specify at least one eval to run."
163170
run_entries_files: list[str] = []
@@ -266,9 +273,7 @@ def evaluate(
266273
},
267274
),
268275
resources=ResourceConfig(cpu=1, ram="4g", device=CpuConfig(), replicas=1),
269-
environment=create_environment(
270-
pip_packages=["crfm-helm@git+https://github.com/stanford-crfm/helm.git@local_vllm"], extras=["eval"]
271-
),
276+
environment=create_environment(extras=["eval"]),
272277
)
273278
job_id = cluster.launch(job_request)
274279
logger.info("Started Helm task with job id %s", job_id)

lib/marin/src/marin/run/ray_deps.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,6 @@ def compute_frozen_packages(extra: list[str] | None = None) -> PackageSpec:
109109
# convert to a py_module. this isn't used for now, instead see `build_python_path`
110110
py_modules.append(line[3:].strip())
111111
else:
112-
# Relax version pins for packages with known conflicts
113-
# Replace == with >= for zstandard to allow pip more flexibility
114-
if line.startswith("zstandard=="):
115-
line = line.replace("==", ">=", 1)
116112
package_specs.append(line)
117113

118114
return PackageSpec(package_specs=package_specs, py_modules=py_modules)

0 commit comments

Comments
 (0)