tests/entrypoints/anthropic/test_messages.py (31 additions, 32 deletions)
@@ -105,37 +105,36 @@ async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):

print(f"Anthropic response: {resp.model_dump_json()}")

@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
resp = await client.messages.create(
model="claude-3-7-sonnet-latest",
max_tokens=1024,
messages=[
{
"role": "user",
"content": "What's the weather like in New York today?",
}
],
tools=[
{
"name": "get_current_weather",
"description": "Useful for querying the weather "
"in a specified city.",
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City or region, for example: "
"New York, London, Tokyo, etc.",
}
},
"required": ["location"],
@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
resp = await client.messages.create(
model="claude-3-7-sonnet-latest",
max_tokens=1024,
messages=[
{
"role": "user",
"content": "What's the weather like in New York today?",
}
],
tools=[
{
"name": "get_current_weather",
"description": "Useful for querying the weather in a specified city.",
"input_schema": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City or region, for example: "
"New York, London, Tokyo, etc.",
}
},
}
],
stream=True,
)
"required": ["location"],
},
}
],
stream=True,
)

async for chunk in resp:
print(chunk.model_dump_json())
async for chunk in resp:
print(chunk.model_dump_json())
tests/entrypoints/openai/test_messages.py (140 additions, 0 deletions)
@@ -0,0 +1,140 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import anthropic
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def server():  # noqa: F811
    args = [
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--served-model-name",
        "claude-3-7-sonnet-latest",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_simple_messages(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_system_message(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        system="you are a helpful assistant",
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())


@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": "What's the weather like in New York today?"}
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=False,
    )
    assert resp.stop_reason == "tool_use"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
Reviewer comment (Member): In that case let's not copy the tests like this to avoid duplication in CI. We just need to check that the endpoint exists for the OpenAI server, since the same code is used to process the endpoint. (A sketch of such a check follows this file's diff.)

    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in New York today?",
            }
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())
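
A minimal endpoint-existence check along the lines of the review comment above could look like the sketch below. This is illustrative only, not part of the PR: it reuses the `client` fixture and served model name from the file above, and the test name is hypothetical.

@pytest.mark.asyncio
async def test_messages_endpoint_exists(client: anthropic.AsyncAnthropic):
    # One round trip is enough to prove /v1/messages is routed on the
    # OpenAI server; behavior is covered by the Anthropic entrypoint tests.
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=16,
        messages=[{"role": "user", "content": "ping"}],
    )
    assert resp.role == "assistant"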
vllm/entrypoints/openai/api_server.py (86 additions, 0 deletions)
@@ -41,6 +41,13 @@
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import Device, EngineClient
from vllm.entrypoints.anthropic.protocol import (
    AnthropicError,
    AnthropicErrorResponse,
    AnthropicMessagesRequest,
    AnthropicMessagesResponse,
)
from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
@@ -307,6 +314,10 @@
    return request.app.state.openai_serving_responses


def messages(request: Request) -> AnthropicServingMessages:
    return request.app.state.anthropic_serving_messages


def chat(request: Request) -> OpenAIServingChat | None:
    return request.app.state.openai_serving_chat
@@ -590,6 +601,63 @@
    return JSONResponse(content=response.model_dump())


@router.post(
    "/v1/messages",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_messages(request: AnthropicMessagesRequest, raw_request: Request):
    def translate_error_response(response: ErrorResponse) -> JSONResponse:
        anthropic_error = AnthropicErrorResponse(
            error=AnthropicError(
                type=response.error.type,
                message=response.error.message,
            )
        )
        return JSONResponse(
            status_code=response.error.code, content=anthropic_error.model_dump()
        )

    handler = messages(raw_request)
    if handler is None:
        error = base(raw_request).create_error_response(
            message="The model does not support Messages API"
        )
        return translate_error_response(error)

    try:
        generator = await handler.create_messages(request, raw_request)
    except Exception as e:
        logger.exception("Error in create_messages: %s", e, exc_info=True)

Check failure on line 637 in vllm/entrypoints/openai/api_server.py (GitHub Actions / pre-commit): Ruff G202, "Logging statement has redundant `exc_info`". logger.exception() already records the traceback, so the exc_info=True argument can simply be dropped.
        return JSONResponse(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
            content=AnthropicErrorResponse(
                error=AnthropicError(
                    type="internal_error",
                    message=str(e),
                )
            ).model_dump(),
        )

    if isinstance(generator, ErrorResponse):
        return translate_error_response(generator)

    elif isinstance(generator, AnthropicMessagesResponse):
        logger.debug(
            "Anthropic Messages Response: %s", generator.model_dump(exclude_none=True)
        )
        return JSONResponse(content=generator.model_dump(exclude_none=True))

    return StreamingResponse(content=generator, media_type="text/event-stream")



@router.post(
"/v1/chat/completions",
dependencies=[Depends(validate_json_request)],
@@ -1816,6 +1884,24 @@
if "transcription" in supported_tasks
else None
)
state.anthropic_serving_messages = (
AnthropicServingMessages(
engine_client,
state.openai_serving_models,
args.response_role,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
)
if "generate" in supported_tasks
else None
)

state.enable_server_load_tracking = args.enable_server_load_tracking
state.server_load_metrics = 0
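
For reference, once this wiring is in place the new route can be exercised with the Anthropic SDK pointed at a vLLM server. The snippet below is a sketch under stated assumptions: the server runs on localhost:8000 and was launched with the flags used in the test fixture above; the placeholder API key works only because vLLM ignores it unless --api-key is set.

# Sketch: calling vLLM's /v1/messages route via the Anthropic Python SDK.
# Assumes a server started roughly like the test fixture above, e.g.:
#   vllm serve Qwen/Qwen3-0.6B --served-model-name claude-3-7-sonnet-latest \
#       --enable-auto-tool-choice --tool-call-parser hermes
import anthropic

client = anthropic.Anthropic(
    base_url="http://localhost:8000",  # point the SDK at vLLM, not api.anthropic.com
    api_key="EMPTY",  # placeholder; not validated unless --api-key is set
)
resp = client.messages.create(
    model="claude-3-7-sonnet-latest",
    max_tokens=64,
    messages=[{"role": "user", "content": "how are you!"}],
)
print(resp.model_dump_json())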