Cleanups.

rjpower · rjpower · commit ce969f65f6f3 · 2025-11-24T19:34:57.000-08:00
diff --git a/data_browser/pyproject.toml b/data_browser/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.11"
 dependencies = [
     "py7zr>=0.22.0",
     "fsspec>=2024.9.0",
-    "zstandard>=0.23.0",
+    "zstandard>=0.18.0",
     "flask>=3.1.1",
     "pyarrow>=17.0.0",
     "gcsfs>=2024.9.0.post1",
diff --git a/experiments/evals/evals.py b/experiments/evals/evals.py
@@ -18,20 +18,6 @@
 
 import logging
 
-from experiments.evals.engine_configs import DEFAULT_LM_EVAL_MODEL_KWARGS
-from experiments.evals.resource_configs import SINGLE_TPU_V4_8, SINGLE_TPU_V6E_8
-from experiments.evals.task_configs import (
-    BASE_GENERATION_TASKS,
-    CORE_TASKS,
-    CORE_TASKS_PLUS_LEADERBOARD,
-    KEY_GENERATION_TASKS,
-    KEY_MULTIPLE_CHOICE_TASKS,
-    MMLU_0_SHOT,
-    MMLU_5_SHOT,
-    MMLU_PRO_5_SHOT,
-    OPEN_LM_LEADERBOARD_GEN,
-    OPEN_LM_LEADERBOARD_MCQ,
-)
 from fray.cluster.base import ResourceConfig
 from marin.evaluation.evaluation_config import (
     EvalTaskConfig,
@@ -50,6 +36,21 @@
     versioned,
 )
 
+from experiments.evals.engine_configs import DEFAULT_LM_EVAL_MODEL_KWARGS
+from experiments.evals.resource_configs import SINGLE_TPU_V4_8, SINGLE_TPU_V6E_8
+from experiments.evals.task_configs import (
+    BASE_GENERATION_TASKS,
+    CORE_TASKS,
+    CORE_TASKS_PLUS_LEADERBOARD,
+    KEY_GENERATION_TASKS,
+    KEY_MULTIPLE_CHOICE_TASKS,
+    MMLU_0_SHOT,
+    MMLU_5_SHOT,
+    MMLU_PRO_5_SHOT,
+    OPEN_LM_LEADERBOARD_GEN,
+    OPEN_LM_LEADERBOARD_MCQ,
+)
+
 logger = logging.getLogger(__name__)
 
 
@@ -71,10 +72,8 @@ def evaluate_helm(
         max_eval_instances: Maximum number of evaluation instances to run
         engine_kwargs: Additional keyword arguments to pass to the vLLM engine
     """
-    # Auto-detect device from resource config
     device = infer_device_from_resource_config(resource_config)
 
-    # Build ModelConfig
     model_config = ModelConfig(
         name=model_name,
         path=model_path,
@@ -83,7 +82,6 @@ def evaluate_helm(
         apply_chat_template=False,
     )
 
-    # Build InferencePoolConfig
     pool_config = InferencePoolConfig(
         resource_config=resource_config,
         model_config=model_config,
@@ -101,7 +99,7 @@ def evaluate_helm(
             evals=evals,
             max_eval_instances=max_eval_instances,
         ),
-        pip_dependency_groups=["eval"],
+        pip_dependency_groups=["eval", "pip:crfm-helm@git+https://github.com/stanford-crfm/helm.git"],
     )
 
 
diff --git a/experiments/evals/test_helm_migration.py b/experiments/evals/test_helm_migration.py
@@ -17,27 +17,31 @@
 
 import os
 
-from fray.cluster.base import ResourceConfig
+from fray.cluster.base import ResourceConfig, TpuConfig
+from marin.execution.executor import executor_main
+
 from experiments.evals.evals import evaluate_helm
 from experiments.evals.task_configs import EvalTaskConfig
-from marin.execution.executor import executor_main
 
-# Set local output prefix if not set
+# Set output prefix if not set
 if "MARIN_PREFIX" not in os.environ:
-    os.environ["MARIN_PREFIX"] = "/tmp/marin-helm-test"
+    os.environ["MARIN_PREFIX"] = "gs://marin-eu-west4/evals/helm-migration-test"
 
-# Local test resource config
-local_config = ResourceConfig(
-    cpu=2,
-    ram="8g",
+# TPU test resource config
+tpu_config = ResourceConfig(
+    cpu=16,
+    ram="64g",
+    disk="10g",
+    device=TpuConfig(type="v5litepod-4", count=4),
     replicas=1,
+    regions=["eu-west4"],
 )
 
 step = evaluate_helm(
-    model_name="test-baby-llama",
+    model_name="timinar/baby-llama-58m",
     model_path="timinar/baby-llama-58m",
     evals=[EvalTaskConfig(name="mmlu", num_fewshot=0)],
-    resource_config=local_config,
+    resource_config=tpu_config,
     max_eval_instances=10,
 )
 
diff --git a/lib/fray/pyproject.toml b/lib/fray/pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
     "mergedeep",
     "pyyaml>=6.0",
     "typing-extensions>=4.0",
-    "zstandard>=0.22.0",
+    "zstandard>=0.18.0",
 ]
 
 [project.scripts]
diff --git a/lib/fray/src/fray/cluster/__init__.py b/lib/fray/src/fray/cluster/__init__.py
@@ -75,29 +75,48 @@ def set_current_cluster(cluster: Cluster) -> None:
 def current_cluster() -> Cluster:
     """Get the current cluster from context.
 
-    If no cluster is set in context but FRAY_CLUSTER_SPEC environment variable is present,
-    automatically creates and caches the cluster for this process.
+    Auto-detection priority:
+    1. Context variable (set via set_current_cluster())
+    2. Ray cluster (if ray.is_initialized())
+    3. FRAY_CLUSTER_SPEC environment variable
+    4. LocalCluster (default fallback)
 
     Returns:
-        The cluster instance set via set_current_cluster() or auto-created from env var
+        The cluster instance; an auto-detected one is also stored via set_current_cluster()
 
     Raises:
-        RuntimeError: If no cluster has been set and FRAY_CLUSTER_SPEC is not present
+        RuntimeError: If cluster creation fails
     """
     cluster = _cluster_context.get()
     if cluster is not None:
         return cluster
 
+    # Use RayCluster when ray is already initialized; a missing ray install is skipped silently
+    try:
+        import ray
+
+        if ray.is_initialized():
+            from fray.cluster.ray.cluster import RayCluster
+
+            cluster = RayCluster()
+            set_current_cluster(cluster)
+            logger.info("Auto-detected Ray cluster from ray.is_initialized()")
+            return cluster
+    except ImportError:
+        pass
+
+    # Otherwise build the cluster described by FRAY_CLUSTER_SPEC, if that variable is set
     cluster_spec = os.environ.get("FRAY_CLUSTER_SPEC")
-    if cluster_spec is None:
-        raise RuntimeError(
-            "No cluster set in current context. Either call set_current_cluster() "
-            "or set FRAY_CLUSTER_SPEC environment variable."
-        )
+    if cluster_spec is not None:
+        cluster = create_cluster(cluster_spec)
+        set_current_cluster(cluster)
+        logger.info(f"Auto-created cluster from FRAY_CLUSTER_SPEC={cluster_spec}")
+        return cluster
 
-    cluster = create_cluster(cluster_spec)
+    # Neither Ray nor FRAY_CLUSTER_SPEC applies: fall back to LocalCluster
+    cluster = LocalCluster()
     set_current_cluster(cluster)
-    logger.info(f"Auto-created cluster from FRAY_CLUSTER_SPEC={cluster_spec}")
+    logger.info("Using default LocalCluster")
     return cluster
 
 
diff --git a/lib/fray/src/fray/queue/http.py b/lib/fray/src/fray/queue/http.py
@@ -58,13 +58,13 @@ class HttpQueueServer:
     """HTTP server that manages multiple named queues.
 
     Example:
-        with HttpQueueServer(host="127.0.0.1", port=9999) as server:
+        with HttpQueueServer(host="0.0.0.0", port=9999) as server:
             queue_a = server.new_queue("tasks")
             queue_b = server.new_queue("results")
             queue_a.push("task1")
     """
 
-    def __init__(self, host: str = "127.0.0.1", port: int = 9999):
+    def __init__(self, host: str = "0.0.0.0", port: int = 9999):
         self.host = host
         self.port = port
         self.queues: dict[str, MemoryQueue] = {}
@@ -116,11 +116,34 @@ def release(queue_name: str, lease_id: str = Body(...), timestamp: float = Body(
 
         return app
 
+    def get_client_host(self) -> str:
+        """Return the host that clients should use to reach this server.
+
+        A wildcard bind ("0.0.0.0") is not dialable, so the local address of the
+        default route is returned instead; any other bind host is returned as-is.
+        Falls back to "127.0.0.1" when that address cannot be determined.
+        """
+        if self.host == "0.0.0.0":
+            import socket
+
+            try:
+                # UDP connect() doesn't actually send anything; it only selects the
+                # outbound interface. `with` closes the socket even on failure.
+                with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+                    s.connect(("8.8.8.8", 80))
+                    return s.getsockname()[0]
+            except OSError:
+                # Fall back to localhost for local testing
+                return "127.0.0.1"
+        return self.host
+
     def new_queue(self, name: str) -> "HttpQueue":
         """Create or get a named queue, returns client."""
         if name not in self.queues:
             self.queues[name] = MemoryQueue()
-        return HttpQueue(host=self.host, port=self.port, queue_name=name)
+        # The bind host may be 0.0.0.0, which clients cannot dial; hand out get_client_host() instead
+        client_host = self.get_client_host()
+        return HttpQueue(host=client_host, port=self.port, queue_name=name)
 
     def __enter__(self):
         self.server_thread = ServerThread(self.server, self.host, self.port)
diff --git a/lib/marin/pyproject.toml b/lib/marin/pyproject.toml
@@ -220,7 +220,7 @@ quality_dedup_consolidate = [
     "nltk>=3.8.1",
     "rbloom_gcs",
     "transformers",
-    "zstandard>=0.23.0",
+    "zstandard>=0.18.0",
 ]
 
 tokenize_train = [
diff --git a/lib/marin/src/marin/evaluation/backends/inference_pool.py b/lib/marin/src/marin/evaluation/backends/inference_pool.py
@@ -202,16 +202,65 @@ def base_url(self) -> str:
         return f"http://{self.config.proxy_host}:{self.config.proxy_port}/v1"
 
     def wait_for_healthy(self, timeout: float = 300) -> None:
+        """Block until the inference pool is healthy, sleeping 2 seconds between polls.
+
+        Each iteration checks, in order, and raises TimeoutError once `timeout` seconds have passed:
+        1. Fray job status (RuntimeError if the job is "failed" or "stopped")
+        2. Proxy server health (GET /health returns 200)
+        3. VLLM worker via a /v1/completions round-trip, attempted only once the proxy is healthy
+        """
         start_time = time.time()
+        proxy_healthy = False
+        worker_healthy = False
+        proxy_url = f"http://{self.config.proxy_host}:{self.config.proxy_port}"
+
         while True:
+            # Always check job status first - fail fast if worker crashed
             info = self.cluster.poll(self.job_id)
-            if info.status == "running":
-                logger.info("Pool job is running")
-                break
-            elif info.status in ["failed", "stopped"]:
-                raise RuntimeError(f"Pool job failed: {info.error_message}")
-
+            if info.status in ["failed", "stopped"]:
+                raise RuntimeError(f"Pool job failed during startup: {info.error_message}")
+
+            # Step 1: Check proxy server health (skipped once it has succeeded)
+            if not proxy_healthy:
+                try:
+                    response = requests.get(f"{proxy_url}/health", timeout=1)
+                    if response.status_code == 200:
+                        logger.info("Proxy server is healthy")
+                        proxy_healthy = True
+                except requests.RequestException:
+                    pass  # Proxy not ready yet
+
+            # Step 2: Check VLLM worker via queue round-trip (requires a healthy proxy)
+            if proxy_healthy and not worker_healthy:
+                try:
+                    # Send a minimal test request through the queues to VLLM worker
+                    response = requests.post(
+                        f"{proxy_url}/v1/completions",
+                        json={
+                            "model": "default",
+                            "prompt": "test",
+                            "max_tokens": 1,
+                            "temperature": 0,
+                        },
+                        timeout=30,
+                    )
+                    if response.status_code == 200:
+                        logger.info("VLLM worker is healthy and responding via queues")
+                        worker_healthy = True
+                        return  # Success - all components healthy!
+                except requests.RequestException as e:
+                    logger.debug(f"VLLM worker health check failed: {e}")
+
+            # Check timeout (evaluated after this iteration's requests have finished)
             if time.time() - start_time > timeout:
-                raise TimeoutError("Pool job failed to start within timeout")
+                issues = []
+                if not proxy_healthy:
+                    issues.append("proxy server not responding")
+                if not worker_healthy:
+                    issues.append("VLLM worker not responding")
+                issues.append(f"job status: {info.status}")
+
+                raise TimeoutError(f"Pool failed to become healthy within {timeout}s. Issues: {', '.join(issues)}")
 
-        logger.info("Pool is healthy")
+            # Wait before next check
+            time.sleep(2)
diff --git a/lib/marin/src/marin/evaluation/backends/vllm.py b/lib/marin/src/marin/evaluation/backends/vllm.py
@@ -87,12 +87,10 @@ def start_vllm_server(
     )
 
     # Add device specification
-    if device != "auto":
-        command += f"--device {device} "
-
-    # Add distributed backend for TPU
+    # NOTE(review): vLLM 0.11.0 is said not to support the --device flag, yet --device tpu is
+    # still passed below for TPU along with the ray executor backend — TODO confirm
     if device == "tpu":
-        command += "--distributed-executor-backend ray "
+        command += "--device tpu --distributed-executor-backend ray "
 
     # Add engine kwargs
     for key, value in engine_kwargs.items():
@@ -175,6 +173,7 @@ def vllm_server_worker(
                 payload["model"] = model.name
 
             url = f"{server_url}{endpoint}"
+            logger.info(f"Sending request to vLLM at {url}")
             http_response = requests.post(
                 url,
                 json=payload,
diff --git a/lib/marin/src/marin/evaluation/evaluators/helm_evaluator.py b/lib/marin/src/marin/evaluation/evaluators/helm_evaluator.py
@@ -54,8 +54,10 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
     os.makedirs(prod_env_path, exist_ok=True)
 
     model_name: str = model.name
-    print(f"Loading tokenizer for model: {model_name}", flush=True)
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    # Use model.path for loading from HuggingFace, fallback to model.name if path is None
+    model_path_or_name: str = model.path or model.name
+    print(f"Loading tokenizer for model: {model_path_or_name}", flush=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_path_or_name, trust_remote_code=True)
     print(f"Tokenizer loaded, max_length: {tokenizer.model_max_length}", flush=True)
 
     content: dict = {
@@ -101,7 +103,7 @@ def write_model_config_files(model: ModelConfig, base_url: str, prod_env_path: P
                 "name": model_name,
                 "tokenizer_spec": {
                     "class_name": "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer",
-                    "args": {"pretrained_model_name_or_path": model_name, "trust_remote_code": True},
+                    "args": {"pretrained_model_name_or_path": model_path_or_name, "trust_remote_code": True},
                 },
                 "prefix_token": tokenizer.bos_token,
                 "end_of_text_token": tokenizer.eos_token,
diff --git a/lib/marin/src/marin/evaluation/run.py b/lib/marin/src/marin/evaluation/run.py
diff --git a/lib/marin/src/marin/execution/executor.py b/lib/marin/src/marin/execution/executor.py
diff --git a/lib/marin/src/marin/run/ray_deps.py b/lib/marin/src/marin/run/ray_deps.py
diff --git a/lib/zephyr/pyproject.toml b/lib/zephyr/pyproject.toml
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ dependencies = [`
`14`	`14`	`"mergedeep",`
`15`	`15`	`"pyyaml>=6.0",`
`16`	`16`	`"typing-extensions>=4.0",`
`17`		`- "zstandard>=0.22.0",`
	`17`	`+ "zstandard>=0.18.0",`
`18`	`18`	`]`
`19`	`19`
`20`	`20`	`[project.scripts]`
Original file line number	Diff line number	Diff line change
`@@ -220,7 +220,7 @@ quality_dedup_consolidate = [`
`220`	`220`	`"nltk>=3.8.1",`
`221`	`221`	`"rbloom_gcs",`
`222`	`222`	`"transformers",`
`223`		`- "zstandard>=0.23.0",`
	`223`	`+ "zstandard>=0.18.0",`
`224`	`224`	`]`
`225`	`225`
`226`	`226`	`tokenize_train = [`