From 6a1d6b3c77511ed29605e58fb143083c4b6f4e55 Mon Sep 17 00:00:00 2001
From: Fabian Schindler
Date: Wed, 26 Nov 2025 15:36:32 +0100
Subject: [PATCH 1/4] feat(integrations): openai-agents: add usage and
 response model reporting for chat and invoke_agent spans

---
 .../openai_agents/_context_vars.py            |  18 +
 .../openai_agents/patches/models.py           |  28 +
 .../openai_agents/spans/ai_client.py          |   4 +
 .../openai_agents/spans/invoke_agent.py       |  13 +-
 .../openai_agents/test_openai_agents.py       | 565 ++++++++++++++++++
 5 files changed, 627 insertions(+), 1 deletion(-)
 create mode 100644 sentry_sdk/integrations/openai_agents/_context_vars.py

diff --git a/sentry_sdk/integrations/openai_agents/_context_vars.py b/sentry_sdk/integrations/openai_agents/_context_vars.py
new file mode 100644
index 0000000000..83746ecee6
--- /dev/null
+++ b/sentry_sdk/integrations/openai_agents/_context_vars.py
@@ -0,0 +1,18 @@
+"""
+Context variables for passing data between nested calls in the OpenAI Agents integration.
+"""
+
+from contextvars import ContextVar
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    pass
+
+# Context variable to pass response model between nested calls (for gen_ai.chat spans)
+_response_model_context = ContextVar("openai_agents_response_model", default=None)  # type: ContextVar[str | None]
+
+# Context variable to store the last response model for invoke_agent spans
+_invoke_agent_response_model_context = ContextVar(
+    "openai_agents_invoke_agent_response_model", default=None
+)  # type: ContextVar[str | None]
diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index e6f24da6a1..aa6371302e 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -2,6 +2,10 @@
 
 from sentry_sdk.integrations import DidNotEnable
 
+from .._context_vars import (
+    _invoke_agent_response_model_context,
+    _response_model_context,
+)
 from ..spans import ai_client_span, update_ai_client_span
 
 from typing import TYPE_CHECKING
@@ -33,12 +37,36 @@ def wrapped_get_model(cls, agent, run_config):
         model = original_get_model(agent, run_config)
 
         original_get_response = model.get_response
 
+        # Wrap _fetch_response if it exists (for OpenAI models) to capture raw response model
+        if hasattr(model, "_fetch_response"):
+            original_fetch_response = model._fetch_response
+
+            @wraps(original_fetch_response)
+            async def wrapped_fetch_response(*args, **kwargs):
+                # type: (*Any, **Any) -> Any
+                response = await original_fetch_response(*args, **kwargs)
+                # Store model from raw response in context variable
+                if hasattr(response, "model"):
+                    _response_model_context.set(str(response.model))
+                return response
+
+            model._fetch_response = wrapped_fetch_response
+
         @wraps(original_get_response)
         async def wrapped_get_response(*args, **kwargs):
             # type: (*Any, **Any) -> Any
             with ai_client_span(agent, kwargs) as span:
                 result = await original_get_response(*args, **kwargs)
 
+                # Retrieve response model from context and attach to ModelResponse
+                response_model = _response_model_context.get(None)
+                if response_model:
+                    result._sentry_response_model = response_model
+                    _response_model_context.set(None)  # Clear context
+
+                    # Also store for invoke_agent span (will be the last one used)
+                    _invoke_agent_response_model_context.set(response_model)
+
                 update_ai_client_span(span, agent, kwargs, result)
 
             return result
diff --git a/sentry_sdk/integrations/openai_agents/spans/ai_client.py b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
index e424e93888..d096cd51c1 100644
--- a/sentry_sdk/integrations/openai_agents/spans/ai_client.py
+++ b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
@@ -40,3 +40,7 @@ def update_ai_client_span(span, agent, get_response_kwargs, result):
     _set_usage_data(span, result.usage)
     _set_output_data(span, result)
     _create_mcp_execute_tool_spans(span, result)
+
+    # Set response model if captured from raw response
+    if hasattr(result, "_sentry_response_model") and result._sentry_response_model:
+        span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, result._sentry_response_model)
diff --git a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
index 2a9c5ebe66..55ea40f0b6 100644
--- a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
+++ b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
@@ -8,8 +8,9 @@
 from sentry_sdk.scope import should_send_default_pii
 from sentry_sdk.utils import safe_serialize
 
+from .._context_vars import _invoke_agent_response_model_context
 from ..consts import SPAN_ORIGIN
-from ..utils import _set_agent_data
+from ..utils import _set_agent_data, _set_usage_data
 
 from typing import TYPE_CHECKING
 
@@ -78,6 +79,16 @@ def update_invoke_agent_span(context, agent, output):
     span = sentry_sdk.get_current_span()
 
     if span:
+        # Add aggregated usage data from context_wrapper
+        if hasattr(context, "usage"):
+            _set_usage_data(span, context.usage)
+
+        # Add response model if available (will be the last model used)
+        response_model = _invoke_agent_response_model_context.get(None)
+        if response_model:
+            span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, response_model)
+            _invoke_agent_response_model_context.set(None)  # Clear after use
+
         if should_send_default_pii():
             set_data_normalized(
                 span, SPANDATA.GEN_AI_RESPONSE_TEXT, output, unpack=False
diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 46197ae855..0b436f2338 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -1219,3 +1219,568 @@ def failing_tool(message: str) -> str:
     # Verify error status was set (this is the key test for our patch)
     # The span should be marked as error because the tool execution failed
     assert execute_tool_span["tags"]["status"] == "internal_error"
+
+
+@pytest.mark.asyncio
+async def test_invoke_agent_span_includes_usage_data(
+    sentry_init, capture_events, test_agent, mock_usage
+):
+    """
+    Test that invoke_agent spans include aggregated usage data from context_wrapper.
+    This verifies the new functionality added to track token usage in invoke_agent spans.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # Create a response with usage data
+            response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_123",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="Response with usage",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=mock_usage,
+                response_id="resp_123",
+            )
+            mock_get_response.return_value = response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                test_agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span, ai_client_span = spans
+
+    # Verify invoke_agent span has usage data from context_wrapper
+    assert invoke_agent_span["description"] == "invoke_agent test_agent"
+    assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"]
+    assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"]
+    assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"]
+
+    # The usage should match the mock_usage values (aggregated across all calls)
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20
+    assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 0
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 5
+
+
+@pytest.mark.asyncio
+async def test_ai_client_span_includes_response_model(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that ai_client spans (gen_ai.chat) include the response model from the actual API response.
+    This verifies the new functionality to capture the model used in the response.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        # Mock the _fetch_response method to return a response with a model field
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # Create a mock OpenAI Response object with a model field
+            mock_response = MagicMock()
+            mock_response.model = "gpt-4.1-2025-04-14"  # The actual response model
+            mock_response.id = "resp_123"
+            mock_response.output = [
+                ResponseOutputMessage(
+                    id="msg_123",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Hello from GPT-4.1",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            mock_response.usage = MagicMock()
+            mock_response.usage.input_tokens = 10
+            mock_response.usage.output_tokens = 20
+            mock_response.usage.total_tokens = 30
+            mock_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            mock_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=5
+            )
+
+            mock_fetch_response.return_value = mock_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                test_agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    _, ai_client_span = spans
+
+    # Verify ai_client span has response model
+    assert ai_client_span["description"] == "chat gpt-4"
+    assert "gen_ai.response.model" in ai_client_span["data"]
+    assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
+@pytest.mark.asyncio
+async def test_ai_client_span_response_model_with_chat_completions(
+    sentry_init, capture_events
+):
+    """
+    Test that response model is captured when using ChatCompletions API (not Responses API).
+    This ensures our implementation works with different OpenAI model types.
+    """
+    # Create agent that uses ChatCompletions model
+    agent = Agent(
+        name="chat_completions_agent",
+        instructions="Test agent using ChatCompletions",
+        model="gpt-4o-mini",
+    )
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        # Mock the get_response method directly since ChatCompletions may use Responses API anyway
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # Create a mock Response object with a model field
+            mock_response = MagicMock()
+            mock_response.model = "gpt-4o-mini-2024-07-18"  # Actual response model
+            mock_response.id = "resp_123"
+            mock_response.output = [
+                ResponseOutputMessage(
+                    id="msg_123",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Response from model",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            mock_response.usage = MagicMock()
+            mock_response.usage.input_tokens = 15
+            mock_response.usage.output_tokens = 25
+            mock_response.usage.total_tokens = 40
+            mock_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            mock_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=0
+            )
+
+            mock_fetch_response.return_value = mock_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    _, ai_client_span = spans
+
+    # Verify response model from Response is captured
+    assert "gen_ai.response.model" in ai_client_span["data"]
+    assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4o-mini-2024-07-18"
+
+
+@pytest.mark.asyncio
+async def test_multiple_llm_calls_aggregate_usage(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that invoke_agent spans show aggregated usage across multiple LLM calls
+    (e.g., when tools are used and multiple API calls are made).
+    """
+
+    @agents.function_tool
+    def calculator(a: int, b: int) -> int:
+        """Add two numbers"""
+        return a + b
+
+    agent_with_tool = test_agent.clone(tools=[calculator])
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # First call: agent decides to use tool (10 input, 5 output tokens)
+            tool_call_response = ModelResponse(
+                output=[
+                    ResponseFunctionToolCall(
+                        id="call_123",
+                        call_id="call_123",
+                        name="calculator",
+                        type="function_call",
+                        arguments='{"a": 5, "b": 3}',
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=10,
+                    output_tokens=5,
+                    total_tokens=15,
+                    input_tokens_details=InputTokensDetails(cached_tokens=0),
+                    output_tokens_details=OutputTokensDetails(reasoning_tokens=0),
+                ),
+                response_id="resp_tool_call",
+            )
+
+            # Second call: agent uses tool result to respond (20 input, 15 output tokens)
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="The result is 8",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=20,
+                    output_tokens=15,
+                    total_tokens=35,
+                    input_tokens_details=InputTokensDetails(cached_tokens=5),
+                    output_tokens_details=OutputTokensDetails(reasoning_tokens=3),
+                ),
+                response_id="resp_final",
+            )
+
+            mock_get_response.side_effect = [tool_call_response, final_response]
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent_with_tool,
+                "What is 5 + 3?",
+                run_config=test_run_config,
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span = spans[0]
+
+    # Verify invoke_agent span has aggregated usage from both API calls
+    # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20
+    assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50
+    # Cached tokens should be aggregated: 0 + 5 = 5
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 5
+    # Reasoning tokens should be aggregated: 0 + 3 = 3
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 3
+
+
+@pytest.mark.asyncio
+async def test_response_model_not_set_when_unavailable(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that response model is not set if the raw response doesn't have a model field.
+    This can happen with custom model implementations.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        # Mock without _fetch_response (simulating custom model without this method)
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_123",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="Response without model field",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=10,
+                    output_tokens=20,
+                    total_tokens=30,
+                ),
+                response_id="resp_123",
+            )
+            # Don't set _sentry_response_model attribute
+            mock_get_response.return_value = response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+            )
+
+            events = capture_events()
+
+            # Remove the _fetch_response method to simulate custom model
+            with patch.object(
+                agents.models.openai_responses.OpenAIResponsesModel,
+                "_fetch_response",
+                None,
+            ):
+                result = await agents.Runner.run(
+                    test_agent, "Test input", run_config=test_run_config
+                )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    _, ai_client_span = spans
+
+    # When response model can't be captured, it shouldn't be in the span data
+    # (we only set it when we can accurately capture it)
+    assert "gen_ai.response.model" not in ai_client_span["data"]
+
+
+@pytest.mark.asyncio
+async def test_invoke_agent_span_includes_response_model(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that invoke_agent spans include the response model.
+    When an agent makes multiple LLM calls, it should report the last model used.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # Create a mock OpenAI Response object with a model field
+            mock_response = MagicMock()
+            mock_response.model = "gpt-4.1-2025-04-14"  # The actual response model
+            mock_response.id = "resp_123"
+            mock_response.output = [
+                ResponseOutputMessage(
+                    id="msg_123",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Response from model",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            mock_response.usage = MagicMock()
+            mock_response.usage.input_tokens = 10
+            mock_response.usage.output_tokens = 20
+            mock_response.usage.total_tokens = 30
+            mock_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            mock_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=5
+            )
+
+            mock_fetch_response.return_value = mock_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                test_agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span, ai_client_span = spans
+
+    # Verify invoke_agent span has response model
+    assert invoke_agent_span["description"] == "invoke_agent test_agent"
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+    # Also verify ai_client span has it
+    assert "gen_ai.response.model" in ai_client_span["data"]
+    assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
+@pytest.mark.asyncio
+async def test_invoke_agent_span_uses_last_response_model(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that when an agent makes multiple LLM calls (e.g., with tools),
+    the invoke_agent span reports the last response model used.
+    """
+
+    @agents.function_tool
+    def calculator(a: int, b: int) -> int:
+        """Add two numbers"""
+        return a + b
+
+    agent_with_tool = test_agent.clone(tools=[calculator])
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # First call: gpt-4 model
+            first_response = MagicMock()
+            first_response.model = "gpt-4-0613"
+            first_response.id = "resp_1"
+            first_response.output = [
+                ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name="calculator",
+                    type="function_call",
+                    arguments='{"a": 5, "b": 3}',
+                )
+            ]
+            first_response.usage = MagicMock()
+            first_response.usage.input_tokens = 10
+            first_response.usage.output_tokens = 5
+            first_response.usage.total_tokens = 15
+            first_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            first_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=0
+            )
+
+            # Second call: different model (e.g., after tool execution)
+            second_response = MagicMock()
+            second_response.model = "gpt-4.1-2025-04-14"  # Different model
+            second_response.id = "resp_2"
+            second_response.output = [
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="The result is 8",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            second_response.usage = MagicMock()
+            second_response.usage.input_tokens = 20
+            second_response.usage.output_tokens = 15
+            second_response.usage.total_tokens = 35
+            second_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=5
+            )
+            second_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=3
+            )
+
+            mock_fetch_response.side_effect = [first_response, second_response]
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent_with_tool,
+                "What is 5 + 3?",
+                run_config=test_run_config,
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span = spans[0]
+    first_ai_client_span = spans[1]
+    second_ai_client_span = spans[3]  # After tool span
+
+    # Verify invoke_agent span uses the LAST response model
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+    # Verify each ai_client span has its own response model
+    assert first_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4-0613"
+    assert (
+        second_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+    )

From 14a111685d3f9a23b0ca3c11578201f3a2dec748 Mon Sep 17 00:00:00 2001
From: Alex Alderman Webb
Date: Tue, 2 Dec 2025 13:52:50 +0100
Subject: [PATCH 2/4] fix(openai-agents): Remove context variables (#5184)

---
 .../openai_agents/_context_vars.py            | 18 ------------------
 .../openai_agents/patches/agent_run.py        |  9 +++++++--
 .../openai_agents/patches/models.py           | 17 ++++++++---------
 .../openai_agents/patches/runner.py           |  4 +++-
 .../openai_agents/spans/invoke_agent.py       |  7 -------
 5 files changed, 18 insertions(+), 37 deletions(-)
 delete mode 100644 sentry_sdk/integrations/openai_agents/_context_vars.py

diff --git a/sentry_sdk/integrations/openai_agents/_context_vars.py b/sentry_sdk/integrations/openai_agents/_context_vars.py
deleted file mode 100644
index 83746ecee6..0000000000
--- a/sentry_sdk/integrations/openai_agents/_context_vars.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""
-Context variables for passing data between nested calls in the OpenAI Agents integration.
-"""
-
-from contextvars import ContextVar
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    pass
-
-# Context variable to pass response model between nested calls (for gen_ai.chat spans)
-_response_model_context = ContextVar("openai_agents_response_model", default=None)  # type: ContextVar[str | None]
-
-# Context variable to store the last response model for invoke_agent spans
-_invoke_agent_response_model_context = ContextVar(
-    "openai_agents_invoke_agent_response_model", default=None
-)  # type: ContextVar[str | None]
diff --git a/sentry_sdk/integrations/openai_agents/patches/agent_run.py b/sentry_sdk/integrations/openai_agents/patches/agent_run.py
index b25bf82ad5..57a68f2f5d 100644
--- a/sentry_sdk/integrations/openai_agents/patches/agent_run.py
+++ b/sentry_sdk/integrations/openai_agents/patches/agent_run.py
@@ -8,6 +8,8 @@
 if TYPE_CHECKING:
     from typing import Any, Optional
 
+    from sentry_sdk.tracing import Span
+
 try:
     import agents
 except ImportError:
@@ -27,13 +29,15 @@ def _patch_agent_run():
     original_execute_final_output = agents._run_impl.RunImpl.execute_final_output
 
     def _start_invoke_agent_span(context_wrapper, agent, kwargs):
-        # type: (agents.RunContextWrapper, agents.Agent, dict[str, Any]) -> None
+        # type: (agents.RunContextWrapper, agents.Agent, dict[str, Any]) -> Span
         """Start an agent invocation span"""
         # Store the agent on the context wrapper so we can access it later
        context_wrapper._sentry_current_agent = agent
         span = invoke_agent_span(context_wrapper, agent, kwargs)
         context_wrapper._sentry_agent_span = span
 
+        return span
+
     def _end_invoke_agent_span(context_wrapper, agent, output=None):
         # type: (agents.RunContextWrapper, agents.Agent, Optional[Any]) -> None
         """End the agent invocation span"""
@@ -73,7 +77,8 @@ async def patched_run_single_turn(cls, *args, **kwargs):
         if current_agent and current_agent != agent:
             _end_invoke_agent_span(context_wrapper, current_agent)
 
-        _start_invoke_agent_span(context_wrapper, agent, kwargs)
+        span = _start_invoke_agent_span(context_wrapper, agent, kwargs)
+        agent._sentry_agent_span = span
 
         # Call original method with all the correct parameters
         result = await original_run_single_turn(*args, **kwargs)
diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index aa6371302e..c7b8ed1ca5 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -2,11 +2,8 @@
 
 from sentry_sdk.integrations import DidNotEnable
 
-from .._context_vars import (
-    _invoke_agent_response_model_context,
-    _response_model_context,
-)
 from ..spans import ai_client_span, update_ai_client_span
+from sentry_sdk.consts import SPANDATA
 
 from typing import TYPE_CHECKING
 
@@ -47,7 +44,7 @@ async def wrapped_fetch_response(*args, **kwargs):
                 response = await original_fetch_response(*args, **kwargs)
                 # Store model from raw response in context variable
                 if hasattr(response, "model"):
-                    _response_model_context.set(str(response.model))
+                    agent._sentry_raw_response_model = str(response.model)
                 return response
 
             model._fetch_response = wrapped_fetch_response
@@ -59,13 +56,15 @@ async def wrapped_get_response(*args, **kwargs):
             with ai_client_span(agent, kwargs) as span:
                 result = await original_get_response(*args, **kwargs)
 
                 # Retrieve response model from context and attach to ModelResponse
-                response_model = _response_model_context.get(None)
+                response_model = getattr(agent, "_sentry_raw_response_model", None)
                 if response_model:
                     result._sentry_response_model = response_model
-                    _response_model_context.set(None)  # Clear context
 
-                    # Also store for invoke_agent span (will be the last one used)
-                    _invoke_agent_response_model_context.set(response_model)
+                    agent_span = getattr(agent, "_sentry_agent_span", None)
+                    if agent_span:
+                        agent_span.set_data(
+                            SPANDATA.GEN_AI_RESPONSE_MODEL, response_model
+                        )
 
                 update_ai_client_span(span, agent, kwargs, result)
diff --git a/sentry_sdk/integrations/openai_agents/patches/runner.py b/sentry_sdk/integrations/openai_agents/patches/runner.py
index 745f30a38e..05c15da4d1 100644
--- a/sentry_sdk/integrations/openai_agents/patches/runner.py
+++ b/sentry_sdk/integrations/openai_agents/patches/runner.py
@@ -26,9 +26,11 @@ async def wrapper(*args, **kwargs):
         # Isolate each workflow so that when agents are run in asyncio tasks they
         # don't touch each other's scopes
         with sentry_sdk.isolation_scope():
-            agent = args[0]
+            # Clone agent because agent invocation spans are attached per run.
+            agent = args[0].clone()
             with agent_workflow_span(agent):
                 result = None
+                args = (agent, *args[1:])
                 try:
                     result = await original_func(*args, **kwargs)
                     return result
diff --git a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
index 9ae18c0451..5d1731f247 100644
--- a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
+++ b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
@@ -9,7 +9,6 @@
 from sentry_sdk.scope import should_send_default_pii
 from sentry_sdk.utils import safe_serialize
 
-from .._context_vars import _invoke_agent_response_model_context
 from ..consts import SPAN_ORIGIN
 from ..utils import _set_agent_data, _set_usage_data
 
@@ -89,12 +88,6 @@ def update_invoke_agent_span(context, agent, output):
         if hasattr(context, "usage"):
             _set_usage_data(span, context.usage)
 
-        # Add response model if available (will be the last model used)
-        response_model = _invoke_agent_response_model_context.get(None)
-        if response_model:
-            span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, response_model)
-            _invoke_agent_response_model_context.set(None)  # Clear after use
-
         if should_send_default_pii():
             set_data_normalized(
                 span, SPANDATA.GEN_AI_RESPONSE_TEXT, output, unpack=False

From ed1f3ed86f7cf20a757349d6a67c2ee9820c1875 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Tue, 2 Dec 2025 14:11:24 +0100
Subject: [PATCH 3/4] remove comments and cleanup attribute

---
 sentry_sdk/integrations/openai_agents/patches/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index c7b8ed1ca5..3ab9211c35 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -42,7 +42,6 @@ def wrapped_get_model(cls, agent, run_config):
             async def wrapped_fetch_response(*args, **kwargs):
                 # type: (*Any, **Any) -> Any
                 response = await original_fetch_response(*args, **kwargs)
-                # Store model from raw response in context variable
                 if hasattr(response, "model"):
                     agent._sentry_raw_response_model = str(response.model)
                 return response
@@ -55,7 +54,6 @@ async def wrapped_get_response(*args, **kwargs):
             with ai_client_span(agent, kwargs) as span:
                 result = await original_get_response(*args, **kwargs)
 
-                # Retrieve response model from context and attach to ModelResponse
                 response_model = getattr(agent, "_sentry_raw_response_model", None)
                 if response_model:
                     result._sentry_response_model = response_model
@@ -66,6 +64,8 @@ async def wrapped_get_response(*args, **kwargs):
                             SPANDATA.GEN_AI_RESPONSE_MODEL, response_model
                         )
 
+                    delattr(agent, "_sentry_raw_response_model")
+
                 update_ai_client_span(span, agent, kwargs, result)
 
             return result

From a34668b62350d0c5c4d883c234b84ff1d99d0be3 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Tue, 2 Dec 2025 14:34:30 +0100
Subject: [PATCH 4/4] pass response model directly to update_ai_client_span

---
 .../integrations/openai_agents/patches/models.py  |  4 +---
 .../integrations/openai_agents/spans/ai_client.py | 12 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index 3ab9211c35..feaa0c33d2 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -56,8 +56,6 @@ async def wrapped_get_response(*args, **kwargs):
 
                 response_model = getattr(agent, "_sentry_raw_response_model", None)
                 if response_model:
-                    result._sentry_response_model = response_model
-
                     agent_span = getattr(agent, "_sentry_agent_span", None)
                     if agent_span:
                         agent_span.set_data(
@@ -66,7 +64,7 @@ async def wrapped_get_response(*args, **kwargs):
                             SPANDATA.GEN_AI_RESPONSE_MODEL, response_model
                         )
 
                     delattr(agent, "_sentry_raw_response_model")
 
-                update_ai_client_span(span, agent, kwargs, result)
+                update_ai_client_span(span, agent, kwargs, result, response_model)
 
             return result
diff --git a/sentry_sdk/integrations/openai_agents/spans/ai_client.py b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
index d096cd51c1..8f233fbc14 100644
--- a/sentry_sdk/integrations/openai_agents/spans/ai_client.py
+++ b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
@@ -14,7 +14,7 @@
 
 if TYPE_CHECKING:
     from agents import Agent
-    from typing import Any
+    from typing import Any, Optional
 
 
 def ai_client_span(agent, get_response_kwargs):
@@ -35,12 +35,14 @@ def ai_client_span(agent, get_response_kwargs):
     return span
 
 
-def update_ai_client_span(span, agent, get_response_kwargs, result):
-    # type: (sentry_sdk.tracing.Span, Agent, dict[str, Any], Any) -> None
+def update_ai_client_span(
+    span, agent, get_response_kwargs, result, response_model=None
+):
+    # type: (sentry_sdk.tracing.Span, Agent, dict[str, Any], Any, Optional[str]) -> None
     _set_usage_data(span, result.usage)
     _set_output_data(span, result)
     _create_mcp_execute_tool_spans(span, result)
 
     # Set response model if captured from raw response
-    if hasattr(result, "_sentry_response_model") and result._sentry_response_model:
-        span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, result._sentry_response_model)
+    if response_model is not None:
+        span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, response_model)