@@ -202,16 +202,65 @@ def base_url(self) -> str:
202202 return f"http://{ self .config .proxy_host } :{ self .config .proxy_port } /v1"
203203
def wait_for_healthy(self, timeout: float = 300) -> None:
    """Block until the whole inference pool is healthy.

    Each polling round checks three things, in this order:

    1. Job status via ``self.cluster.poll`` - fail fast if the job has
       already failed or been stopped.
    2. Proxy server health - ``GET {proxy_url}/health`` must return 200.
       Once it has succeeded it is not probed again.
    3. VLLM worker - a one-token completion request is posted to the
       proxy; a 200 response means a request made the full round trip.

    Args:
        timeout: Overall budget in seconds. Individual HTTP probes and the
            pause between rounds are clamped to the time remaining, so the
            call does not run far past this budget.

    Raises:
        RuntimeError: The job reported status "failed" or "stopped".
        TimeoutError: The pool did not become healthy within ``timeout``
            seconds; the message lists which components were not ready.
    """
    poll_interval = 2.0
    proxy_probe_timeout = 1.0
    worker_probe_timeout = 30.0
    # Floor for a single HTTP timeout: a zero or negative value is not a
    # usable timeout for an HTTP call, and this still allows one real
    # attempt when the budget is nearly exhausted.
    min_request_timeout = 0.1

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    proxy_url = f"http://{self.config.proxy_host}:{self.config.proxy_port}"
    proxy_healthy = False

    def request_timeout(cap: float) -> float:
        """Return the probe timeout, limited to the time left before the deadline."""
        return max(min(cap, deadline - time.monotonic()), min_request_timeout)

    while True:
        # Always check job status first - fail fast if the worker crashed.
        info = self.cluster.poll(self.job_id)
        if info.status in ("failed", "stopped"):
            raise RuntimeError(f"Pool job failed during startup: {info.error_message}")

        # Step 1: proxy server health, checked until it first succeeds.
        if not proxy_healthy:
            try:
                response = requests.get(
                    f"{proxy_url}/health",
                    timeout=request_timeout(proxy_probe_timeout),
                )
            except requests.RequestException as e:
                # Expected while the proxy is still starting up.
                logger.debug("Proxy health check failed: %s", e)
            else:
                if response.status_code == 200:
                    logger.info("Proxy server is healthy")
                    proxy_healthy = True

        # Step 2: VLLM worker via a minimal request through the proxy.
        if proxy_healthy:
            try:
                response = requests.post(
                    f"{proxy_url}/v1/completions",
                    json={
                        "model": "default",
                        "prompt": "test",
                        "max_tokens": 1,
                        "temperature": 0,
                    },
                    timeout=request_timeout(worker_probe_timeout),
                )
            except requests.RequestException as e:
                logger.debug("VLLM worker health check failed: %s", e)
            else:
                if response.status_code == 200:
                    logger.info("VLLM worker is healthy and responding via queues")
                    return  # Success - all components healthy.
                logger.debug("VLLM worker health check returned HTTP %s", response.status_code)

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            issues = []
            if not proxy_healthy:
                issues.append("proxy server not responding")
            # Reaching this point means the worker probe never succeeded.
            issues.append("VLLM worker not responding")
            issues.append(f"job status: {info.status}")
            raise TimeoutError(
                f"Pool failed to become healthy within {timeout}s. Issues: {', '.join(issues)}"
            )

        # Wait before the next round, but never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
0 commit comments