@@ -247,6 +247,23 @@ def get_async_client(self, **kwargs):
             **kwargs,
         )
 
+    def get_client_anthropic(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return anthropic.Anthropic(
+            base_url=self.url_for(),
+            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
+        )
+
+    def get_async_client_anthropic(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
+        return anthropic.AsyncAnthropic(
+            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
+        )
+
 
 class RemoteOpenAIServerCustom(RemoteOpenAIServer):
     """Launch test server with custom child process"""
@@ -293,131 +310,6 @@ def __exit__(self, exc_type, exc_value, traceback):
             self.proc.kill()
 
 
-class RemoteAnthropicServer:
-    DUMMY_API_KEY = "token-abc123"  # vLLM's Anthropic server does not need API key
-
-    def __init__(
-        self,
-        model: str,
-        vllm_serve_args: list[str],
-        *,
-        env_dict: dict[str, str] | None = None,
-        seed: int | None = 0,
-        auto_port: bool = True,
-        max_wait_seconds: float | None = None,
-    ) -> None:
-        if auto_port:
-            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
-                raise ValueError(
-                    "You have manually specified the port when `auto_port=True`."
-                )
-
-            # Don't mutate the input args
-            vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
-        if seed is not None:
-            if "--seed" in vllm_serve_args:
-                raise ValueError(
-                    f"You have manually specified the seed when `seed={seed}`."
-                )
-
-            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]
-
-        parser = FlexibleArgumentParser(description="vLLM's remote Anthropic server.")
-        subparsers = parser.add_subparsers(required=False, dest="subparser")
-        parser = ServeSubcommand().subparser_init(subparsers)
-        args = parser.parse_args(["--model", model, *vllm_serve_args])
-        self.host = str(args.host or "localhost")
-        self.port = int(args.port)
-
-        self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None
-
-        # download the model before starting the server to avoid timeout
-        is_local = os.path.isdir(model)
-        if not is_local:
-            engine_args = AsyncEngineArgs.from_cli_args(args)
-            model_config = engine_args.create_model_config()
-            load_config = engine_args.create_load_config()
-
-            model_loader = get_model_loader(load_config)
-            model_loader.download_model(model_config)
-
-        env = os.environ.copy()
-        # the current process might initialize cuda,
-        # to be safe, we should use spawn method
-        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-        if env_dict is not None:
-            env.update(env_dict)
-        self.proc = subprocess.Popen(
-            [
-                sys.executable,
-                "-m",
-                "vllm.entrypoints.anthropic.api_server",
-                model,
-                *vllm_serve_args,
-            ],
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        )
-        max_wait_seconds = max_wait_seconds or 240
-        self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.proc.terminate()
-        try:
-            self.proc.wait(8)
-        except subprocess.TimeoutExpired:
-            # force kill if needed
-            self.proc.kill()
-
-    def _wait_for_server(self, *, url: str, timeout: float):
-        # run health check
-        start = time.time()
-        while True:
-            try:
-                if requests.get(url).status_code == 200:
-                    break
-            except Exception:
-                # this exception can only be raised by requests.get,
-                # which means the server is not ready yet.
-                # the stack trace is not useful, so we suppress it
-                # by using `raise from None`.
-                result = self.proc.poll()
-                if result is not None and result != 0:
-                    raise RuntimeError("Server exited unexpectedly.") from None
-
-                time.sleep(0.5)
-                if time.time() - start > timeout:
-                    raise RuntimeError("Server failed to start in time.") from None
-
-    @property
-    def url_root(self) -> str:
-        return f"http://{self.host}:{self.port}"
-
-    def url_for(self, *parts: str) -> str:
-        return self.url_root + "/" + "/".join(parts)
-
-    def get_client(self, **kwargs):
-        if "timeout" not in kwargs:
-            kwargs["timeout"] = 600
-        return anthropic.Anthropic(
-            base_url=self.url_for(),
-            api_key=self.DUMMY_API_KEY,
-            max_retries=0,
-            **kwargs,
-        )
-
-    def get_async_client(self, **kwargs):
-        if "timeout" not in kwargs:
-            kwargs["timeout"] = 600
-        return anthropic.AsyncAnthropic(
-            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
-        )
-
-
 def _test_completion(
     client: openai.OpenAI,
     model: str,
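
For reference, a minimal sketch of how the relocated helpers might be exercised in a test. This is not part of the diff: the model name, serve args, and prompt below are placeholders, and it assumes `RemoteOpenAIServer` is used as a context manager as elsewhere in this file.

```python
# Hypothetical usage sketch: drives the new get_client_anthropic() helper
# against a vLLM test server. MODEL and the prompt are illustrative only.
MODEL = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder model

with RemoteOpenAIServer(MODEL, ["--max-model-len", "2048"]) as server:
    client = server.get_client_anthropic()
    message = client.messages.create(
        model=MODEL,
        max_tokens=32,
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(message.content[0].text)
```

The async variant is analogous: obtain a client via `server.get_async_client_anthropic()` and `await` its `messages.create(...)` call.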