141 changes: 141 additions & 0 deletions tests/entrypoints/openai/test_messages.py
@@ -0,0 +1,141 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import anthropic
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def server():  # noqa: F811
    args = [
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--served-model-name",
        "claude-3-7-sonnet-latest",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_simple_messages(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_system_message(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        system="you are a helpful assistant",
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())


@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": "What's the weather like in New York today?"}
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=False,
    )
    assert resp.stop_reason == "tool_use"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in New York today?",
            }
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather "
                "in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())
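
The tests above drive the new endpoint through the shared RemoteOpenAIServer fixture. For a quick manual check, the sketch below follows the same flow against an already-running local server; the base URL, port, placeholder API key, and the vllm serve invocation are assumptions that mirror the fixture arguments rather than anything defined in this change.

# Manual sketch (not part of the test suite): exercise /v1/messages against a
# local vLLM server, assuming it was started roughly like the fixture above:
#   vllm serve Qwen/Qwen3-0.6B --enforce-eager --enable-auto-tool-choice \
#       --tool-call-parser hermes --served-model-name claude-3-7-sonnet-latest
import asyncio

import anthropic


async def main() -> None:
    # The Anthropic SDK posts to {base_url}/v1/messages; the API key value is
    # not validated by a default vLLM deployment, but the SDK requires one.
    client = anthropic.AsyncAnthropic(
        base_url="http://localhost:8000", api_key="EMPTY"
    )
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=256,
        messages=[{"role": "user", "content": "how are you!"}],
    )
    print(resp.stop_reason, resp.model_dump_json())


if __name__ == "__main__":
    asyncio.run(main())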
58 changes: 58 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -115,6 +115,12 @@
from vllm.v1.engine.exceptions import EngineDeadError
from vllm.v1.metrics.prometheus import get_prometheus_registry
from vllm.version import __version__ as VLLM_VERSION
from vllm.entrypoints.anthropic.protocol import (
    AnthropicErrorResponse,
    AnthropicMessagesRequest,
    AnthropicMessagesResponse,
)
from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages

prometheus_multiproc_dir: tempfile.TemporaryDirectory

@@ -307,6 +313,10 @@ def responses(request: Request) -> OpenAIServingResponses | None:
    return request.app.state.openai_serving_responses


def messages(request: Request) -> AnthropicServingMessages | None:
    return request.app.state.anthropic_serving_messages


def chat(request: Request) -> OpenAIServingChat | None:
    return request.app.state.openai_serving_chat

@@ -590,6 +600,40 @@ async def cancel_responses(response_id: str, raw_request: Request):
    return JSONResponse(content=response.model_dump())


@router.post(
    "/v1/messages",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_messages(request: AnthropicMessagesRequest, raw_request: Request):
    handler = messages(raw_request)
    if handler is None:
        # Mirror the error path of the other endpoints: the handler is None
        # here, so build the error response through the shared base accessor.
        return base(raw_request).create_error_response(
            message="The model does not support Messages API"
        )

    generator = await handler.create_messages(request, raw_request)

    if isinstance(generator, ErrorResponse):
        return JSONResponse(content=generator.model_dump())

    elif isinstance(generator, AnthropicMessagesResponse):
        logger.debug(
            "Anthropic Messages Response: %s", generator.model_dump(exclude_none=True)
        )
        return JSONResponse(content=generator.model_dump(exclude_none=True))

    return StreamingResponse(content=generator, media_type="text/event-stream")


@router.post(
    "/v1/chat/completions",
    dependencies=[Depends(validate_json_request)],
@@ -1816,6 +1860,20 @@ async def init_app_state(
if "transcription" in supported_tasks
else None
)
state.anthropic_serving_messages = AnthropicServingMessages(
engine_client,
state.openai_serving_models,
args.response_role,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
tool_parser=args.tool_call_parser,
reasoning_parser=args.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
)

state.enable_server_load_tracking = args.enable_server_load_tracking
state.server_load_metrics = 0
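
For reference, the body accepted by the new route follows the Anthropic Messages API shape exercised by the tests, so it can also be hit with a plain HTTP client. The sketch below is illustrative only: the port, the use of httpx, and the expected response fields are assumptions based on the tests and the Anthropic response format, not part of this change.

# Illustrative raw request to the new /v1/messages route, assuming a local
# server on port 8000; httpx is used here only for demonstration.
import httpx

payload = {
    "model": "claude-3-7-sonnet-latest",
    "max_tokens": 128,
    "messages": [{"role": "user", "content": "how are you!"}],
}

with httpx.Client(base_url="http://localhost:8000") as client:
    resp = client.post("/v1/messages", json=payload)
    resp.raise_for_status()
    body = resp.json()
    # A non-streaming reply should report role "assistant" and, for a plain
    # text answer, stop_reason "end_turn", matching the test assertions above.
    print(body.get("role"), body.get("stop_reason"))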