From 6a1d6b3c77511ed29605e58fb143083c4b6f4e55 Mon Sep 17 00:00:00 2001
From: Fabian Schindler
Date: Wed, 26 Nov 2025 15:36:32 +0100
Subject: [PATCH 1/4] feat(integrations): openai-agents: add usage and
 response model reporting for chat and invoke_agent spans

---
 .../openai_agents/_context_vars.py            |  18 +
 .../openai_agents/patches/models.py           |  28 +
 .../openai_agents/spans/ai_client.py          |   4 +
 .../openai_agents/spans/invoke_agent.py       |  13 +-
 .../openai_agents/test_openai_agents.py       | 565 ++++++++++++++++++
 5 files changed, 627 insertions(+), 1 deletion(-)
 create mode 100644 sentry_sdk/integrations/openai_agents/_context_vars.py

diff --git a/sentry_sdk/integrations/openai_agents/_context_vars.py b/sentry_sdk/integrations/openai_agents/_context_vars.py
new file mode 100644
index 0000000000..83746ecee6
--- /dev/null
+++ b/sentry_sdk/integrations/openai_agents/_context_vars.py
@@ -0,0 +1,18 @@
+"""
+Context variables for passing data between nested calls in the OpenAI Agents integration.
+"""
+
+from contextvars import ContextVar
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    pass
+
+# Context variable to pass response model between nested calls (for gen_ai.chat spans)
+_response_model_context = ContextVar("openai_agents_response_model", default=None)  # type: ContextVar[str | None]
+
+# Context variable to store the last response model for invoke_agent spans
+_invoke_agent_response_model_context = ContextVar(
+    "openai_agents_invoke_agent_response_model", default=None
+)  # type: ContextVar[str | None]
diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index e6f24da6a1..aa6371302e 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -2,6 +2,10 @@
 
 from sentry_sdk.integrations import DidNotEnable
 
+from .._context_vars import (
+    _invoke_agent_response_model_context,
+    _response_model_context,
+)
 from ..spans import ai_client_span, update_ai_client_span
 
 from typing import TYPE_CHECKING
@@ -33,12 +37,36 @@ def wrapped_get_model(cls, agent, run_config):
         model = original_get_model(agent, run_config)
 
         original_get_response = model.get_response
 
+        # Wrap _fetch_response if it exists (for OpenAI models) to capture raw response model
+        if hasattr(model, "_fetch_response"):
+            original_fetch_response = model._fetch_response
+
+            @wraps(original_fetch_response)
+            async def wrapped_fetch_response(*args, **kwargs):
+                # type: (*Any, **Any) -> Any
+                response = await original_fetch_response(*args, **kwargs)
+                # Store model from raw response in context variable
+                if hasattr(response, "model"):
+                    _response_model_context.set(str(response.model))
+                return response
+
+            model._fetch_response = wrapped_fetch_response
+
         @wraps(original_get_response)
         async def wrapped_get_response(*args, **kwargs):
             # type: (*Any, **Any) -> Any
             with ai_client_span(agent, kwargs) as span:
                 result = await original_get_response(*args, **kwargs)
 
+                # Retrieve response model from context and attach to ModelResponse
+                response_model = _response_model_context.get(None)
+                if response_model:
+                    result._sentry_response_model = response_model
+                    _response_model_context.set(None)  # Clear context
+
+                    # Also store for invoke_agent span (will be the last one used)
+                    _invoke_agent_response_model_context.set(response_model)
+
                 update_ai_client_span(span, agent, kwargs, result)
 
             return result
diff --git a/sentry_sdk/integrations/openai_agents/spans/ai_client.py b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
index e424e93888..d096cd51c1 100644
--- a/sentry_sdk/integrations/openai_agents/spans/ai_client.py
+++ b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
@@ -40,3 +40,7 @@ def update_ai_client_span(span, agent, get_response_kwargs, result):
     _set_usage_data(span, result.usage)
     _set_output_data(span, result)
     _create_mcp_execute_tool_spans(span, result)
+
+    # Set response model if captured from raw response
+    if hasattr(result, "_sentry_response_model") and result._sentry_response_model:
+        span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, result._sentry_response_model)
diff --git a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
index 2a9c5ebe66..55ea40f0b6 100644
--- a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
+++ b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
@@ -8,8 +8,9 @@
 from sentry_sdk.scope import should_send_default_pii
 from sentry_sdk.utils import safe_serialize
 
+from .._context_vars import _invoke_agent_response_model_context
 from ..consts import SPAN_ORIGIN
-from ..utils import _set_agent_data
+from ..utils import _set_agent_data, _set_usage_data
 
 from typing import TYPE_CHECKING
 
@@ -78,6 +79,16 @@ def update_invoke_agent_span(context, agent, output):
     span = sentry_sdk.get_current_span()
 
     if span:
+        # Add aggregated usage data from context_wrapper
+        if hasattr(context, "usage"):
+            _set_usage_data(span, context.usage)
+
+        # Add response model if available (will be the last model used)
+        response_model = _invoke_agent_response_model_context.get(None)
+        if response_model:
+            span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, response_model)
+            _invoke_agent_response_model_context.set(None)  # Clear after use
+
         if should_send_default_pii():
             set_data_normalized(
                 span, SPANDATA.GEN_AI_RESPONSE_TEXT, output, unpack=False
diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py
index 46197ae855..0b436f2338 100644
--- a/tests/integrations/openai_agents/test_openai_agents.py
+++ b/tests/integrations/openai_agents/test_openai_agents.py
@@ -1219,3 +1219,568 @@ def failing_tool(message: str) -> str:
     # Verify error status was set (this is the key test for our patch)
     # The span should be marked as error because the tool execution failed
     assert execute_tool_span["tags"]["status"] == "internal_error"
+
+
+@pytest.mark.asyncio
+async def test_invoke_agent_span_includes_usage_data(
+    sentry_init, capture_events, test_agent, mock_usage
+):
+    """
+    Test that invoke_agent spans include aggregated usage data from context_wrapper.
+    This verifies the new functionality added to track token usage in invoke_agent spans.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # Create a response with usage data
+            response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_123",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="Response with usage",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=mock_usage,
+                response_id="resp_123",
+            )
+            mock_get_response.return_value = response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                test_agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span, ai_client_span = spans
+
+    # Verify invoke_agent span has usage data from context_wrapper
+    assert invoke_agent_span["description"] == "invoke_agent test_agent"
+    assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"]
+    assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"]
+    assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"]
+
+    # The usage should match the mock_usage values (aggregated across all calls)
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20
+    assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 0
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 5
+
+
+@pytest.mark.asyncio
+async def test_ai_client_span_includes_response_model(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that ai_client spans (gen_ai.chat) include the response model from the actual API response.
+    This verifies the new functionality to capture the model used in the response.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        # Mock the _fetch_response method to return a response with a model field
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # Create a mock OpenAI Response object with a model field
+            mock_response = MagicMock()
+            mock_response.model = "gpt-4.1-2025-04-14"  # The actual response model
+            mock_response.id = "resp_123"
+            mock_response.output = [
+                ResponseOutputMessage(
+                    id="msg_123",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Hello from GPT-4.1",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            mock_response.usage = MagicMock()
+            mock_response.usage.input_tokens = 10
+            mock_response.usage.output_tokens = 20
+            mock_response.usage.total_tokens = 30
+            mock_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            mock_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=5
+            )
+
+            mock_fetch_response.return_value = mock_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                test_agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    _, ai_client_span = spans
+
+    # Verify ai_client span has response model
+    assert ai_client_span["description"] == "chat gpt-4"
+    assert "gen_ai.response.model" in ai_client_span["data"]
+    assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
+@pytest.mark.asyncio
+async def test_ai_client_span_response_model_with_chat_completions(
+    sentry_init, capture_events
+):
+    """
+    Test that response model is captured when using ChatCompletions API (not Responses API).
+    This ensures our implementation works with different OpenAI model types.
+    """
+    # Create agent that uses ChatCompletions model
+    agent = Agent(
+        name="chat_completions_agent",
+        instructions="Test agent using ChatCompletions",
+        model="gpt-4o-mini",
+    )
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        # Mock the get_response method directly since ChatCompletions may use Responses API anyway
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # Create a mock Response object with a model field
+            mock_response = MagicMock()
+            mock_response.model = "gpt-4o-mini-2024-07-18"  # Actual response model
+            mock_response.id = "resp_123"
+            mock_response.output = [
+                ResponseOutputMessage(
+                    id="msg_123",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Response from model",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            mock_response.usage = MagicMock()
+            mock_response.usage.input_tokens = 15
+            mock_response.usage.output_tokens = 25
+            mock_response.usage.total_tokens = 40
+            mock_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            mock_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=0
+            )
+
+            mock_fetch_response.return_value = mock_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    _, ai_client_span = spans
+
+    # Verify response model from Response is captured
+    assert "gen_ai.response.model" in ai_client_span["data"]
+    assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4o-mini-2024-07-18"
+
+
+@pytest.mark.asyncio
+async def test_multiple_llm_calls_aggregate_usage(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that invoke_agent spans show aggregated usage across multiple LLM calls
+    (e.g., when tools are used and multiple API calls are made).
+    """
+
+    @agents.function_tool
+    def calculator(a: int, b: int) -> int:
+        """Add two numbers"""
+        return a + b
+
+    agent_with_tool = test_agent.clone(tools=[calculator])
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            # First call: agent decides to use tool (10 input, 5 output tokens)
+            tool_call_response = ModelResponse(
+                output=[
+                    ResponseFunctionToolCall(
+                        id="call_123",
+                        call_id="call_123",
+                        name="calculator",
+                        type="function_call",
+                        arguments='{"a": 5, "b": 3}',
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=10,
+                    output_tokens=5,
+                    total_tokens=15,
+                    input_tokens_details=InputTokensDetails(cached_tokens=0),
+                    output_tokens_details=OutputTokensDetails(reasoning_tokens=0),
+                ),
+                response_id="resp_tool_call",
+            )
+
+            # Second call: agent uses tool result to respond (20 input, 15 output tokens)
+            final_response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_final",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="The result is 8",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=20,
+                    output_tokens=15,
+                    total_tokens=35,
+                    input_tokens_details=InputTokensDetails(cached_tokens=5),
+                    output_tokens_details=OutputTokensDetails(reasoning_tokens=3),
+                ),
+                response_id="resp_final",
+            )
+
+            mock_get_response.side_effect = [tool_call_response, final_response]
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent_with_tool,
+                "What is 5 + 3?",
+                run_config=test_run_config,
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span = spans[0]
+
+    # Verify invoke_agent span has aggregated usage from both API calls
+    # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20
+    assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50
+    # Cached tokens should be aggregated: 0 + 5 = 5
+    assert invoke_agent_span["data"]["gen_ai.usage.input_tokens.cached"] == 5
+    # Reasoning tokens should be aggregated: 0 + 3 = 3
+    assert invoke_agent_span["data"]["gen_ai.usage.output_tokens.reasoning"] == 3
+
+
+@pytest.mark.asyncio
+async def test_response_model_not_set_when_unavailable(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that response model is not set if the raw response doesn't have a model field.
+    This can happen with custom model implementations.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        # Mock without _fetch_response (simulating custom model without this method)
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel.get_response"
+        ) as mock_get_response:
+            response = ModelResponse(
+                output=[
+                    ResponseOutputMessage(
+                        id="msg_123",
+                        type="message",
+                        status="completed",
+                        content=[
+                            ResponseOutputText(
+                                text="Response without model field",
+                                type="output_text",
+                                annotations=[],
+                            )
+                        ],
+                        role="assistant",
+                    )
+                ],
+                usage=Usage(
+                    requests=1,
+                    input_tokens=10,
+                    output_tokens=20,
+                    total_tokens=30,
+                ),
+                response_id="resp_123",
+            )
+            # Don't set _sentry_response_model attribute
+            mock_get_response.return_value = response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+            )
+
+            events = capture_events()
+
+            # Remove the _fetch_response method to simulate custom model
+            with patch.object(
+                agents.models.openai_responses.OpenAIResponsesModel,
+                "_fetch_response",
+                None,
+            ):
+                result = await agents.Runner.run(
+                    test_agent, "Test input", run_config=test_run_config
+                )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    _, ai_client_span = spans
+
+    # When response model can't be captured, it shouldn't be in the span data
+    # (we only set it when we can accurately capture it)
+    assert "gen_ai.response.model" not in ai_client_span["data"]
+
+
+@pytest.mark.asyncio
+async def test_invoke_agent_span_includes_response_model(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that invoke_agent spans include the response model.
+    When an agent makes multiple LLM calls, it should report the last model used.
+    """
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # Create a mock OpenAI Response object with a model field
+            mock_response = MagicMock()
+            mock_response.model = "gpt-4.1-2025-04-14"  # The actual response model
+            mock_response.id = "resp_123"
+            mock_response.output = [
+                ResponseOutputMessage(
+                    id="msg_123",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="Response from model",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            mock_response.usage = MagicMock()
+            mock_response.usage.input_tokens = 10
+            mock_response.usage.output_tokens = 20
+            mock_response.usage.total_tokens = 30
+            mock_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            mock_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=5
+            )
+
+            mock_fetch_response.return_value = mock_response
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                test_agent, "Test input", run_config=test_run_config
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span, ai_client_span = spans
+
+    # Verify invoke_agent span has response model
+    assert invoke_agent_span["description"] == "invoke_agent test_agent"
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+    # Also verify ai_client span has it
+    assert "gen_ai.response.model" in ai_client_span["data"]
+    assert ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
+@pytest.mark.asyncio
+async def test_invoke_agent_span_uses_last_response_model(
+    sentry_init, capture_events, test_agent
+):
+    """
+    Test that when an agent makes multiple LLM calls (e.g., with tools),
+    the invoke_agent span reports the last response model used.
+    """
+
+    @agents.function_tool
+    def calculator(a: int, b: int) -> int:
+        """Add two numbers"""
+        return a + b
+
+    agent_with_tool = test_agent.clone(tools=[calculator])
+
+    with patch.dict(os.environ, {"OPENAI_API_KEY": "test-key"}):
+        with patch(
+            "agents.models.openai_responses.OpenAIResponsesModel._fetch_response"
+        ) as mock_fetch_response:
+            # First call: gpt-4 model
+            first_response = MagicMock()
+            first_response.model = "gpt-4-0613"
+            first_response.id = "resp_1"
+            first_response.output = [
+                ResponseFunctionToolCall(
+                    id="call_123",
+                    call_id="call_123",
+                    name="calculator",
+                    type="function_call",
+                    arguments='{"a": 5, "b": 3}',
+                )
+            ]
+            first_response.usage = MagicMock()
+            first_response.usage.input_tokens = 10
+            first_response.usage.output_tokens = 5
+            first_response.usage.total_tokens = 15
+            first_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=0
+            )
+            first_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=0
+            )
+
+            # Second call: different model (e.g., after tool execution)
+            second_response = MagicMock()
+            second_response.model = "gpt-4.1-2025-04-14"  # Different model
+            second_response.id = "resp_2"
+            second_response.output = [
+                ResponseOutputMessage(
+                    id="msg_final",
+                    type="message",
+                    status="completed",
+                    content=[
+                        ResponseOutputText(
+                            text="The result is 8",
+                            type="output_text",
+                            annotations=[],
+                        )
+                    ],
+                    role="assistant",
+                )
+            ]
+            second_response.usage = MagicMock()
+            second_response.usage.input_tokens = 20
+            second_response.usage.output_tokens = 15
+            second_response.usage.total_tokens = 35
+            second_response.usage.input_tokens_details = InputTokensDetails(
+                cached_tokens=5
+            )
+            second_response.usage.output_tokens_details = OutputTokensDetails(
+                reasoning_tokens=3
+            )
+
+            mock_fetch_response.side_effect = [first_response, second_response]
+
+            sentry_init(
+                integrations=[OpenAIAgentsIntegration()],
+                traces_sample_rate=1.0,
+                send_default_pii=True,
+            )
+
+            events = capture_events()
+
+            result = await agents.Runner.run(
+                agent_with_tool,
+                "What is 5 + 3?",
+                run_config=test_run_config,
+            )
+
+            assert result is not None
+
+    (transaction,) = events
+    spans = transaction["spans"]
+    invoke_agent_span = spans[0]
+    first_ai_client_span = spans[1]
+    second_ai_client_span = spans[3]  # After tool span
+
+    # Verify invoke_agent span uses the LAST response model
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+    # Verify each ai_client span has its own response model
+    assert first_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4-0613"
+    assert (
+        second_ai_client_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+    )

From 14a111685d3f9a23b0ca3c11578201f3a2dec748 Mon Sep 17 00:00:00 2001
From: Alex Alderman Webb
Date: Tue, 2 Dec 2025 13:52:50 +0100
Subject: [PATCH 2/4] fix(openai-agents): Remove context variables (#5184)

---
 .../openai_agents/_context_vars.py            | 18 ------------------
 .../openai_agents/patches/agent_run.py        |  9 +++++++--
 .../openai_agents/patches/models.py           | 17 ++++++++---------
 .../openai_agents/patches/runner.py           |  4 +++-
 .../openai_agents/spans/invoke_agent.py       |  7 -------
 5 files changed, 18 insertions(+), 37 deletions(-)
 delete mode 100644 sentry_sdk/integrations/openai_agents/_context_vars.py

diff --git a/sentry_sdk/integrations/openai_agents/_context_vars.py b/sentry_sdk/integrations/openai_agents/_context_vars.py
deleted file mode 100644
index 83746ecee6..0000000000
--- a/sentry_sdk/integrations/openai_agents/_context_vars.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""
-Context variables for passing data between nested calls in the OpenAI Agents integration.
-"""
-
-from contextvars import ContextVar
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    pass
-
-# Context variable to pass response model between nested calls (for gen_ai.chat spans)
-_response_model_context = ContextVar("openai_agents_response_model", default=None)  # type: ContextVar[str | None]
-
-# Context variable to store the last response model for invoke_agent spans
-_invoke_agent_response_model_context = ContextVar(
-    "openai_agents_invoke_agent_response_model", default=None
-)  # type: ContextVar[str | None]
diff --git a/sentry_sdk/integrations/openai_agents/patches/agent_run.py b/sentry_sdk/integrations/openai_agents/patches/agent_run.py
index b25bf82ad5..57a68f2f5d 100644
--- a/sentry_sdk/integrations/openai_agents/patches/agent_run.py
+++ b/sentry_sdk/integrations/openai_agents/patches/agent_run.py
@@ -8,6 +8,8 @@
 if TYPE_CHECKING:
     from typing import Any, Optional
 
+    from sentry_sdk.tracing import Span
+
 try:
     import agents
 except ImportError:
@@ -27,13 +29,15 @@ def _patch_agent_run():
     original_execute_final_output = agents._run_impl.RunImpl.execute_final_output
 
     def _start_invoke_agent_span(context_wrapper, agent, kwargs):
-        # type: (agents.RunContextWrapper, agents.Agent, dict[str, Any]) -> None
+        # type: (agents.RunContextWrapper, agents.Agent, dict[str, Any]) -> Span
         """Start an agent invocation span"""
         # Store the agent on the context wrapper so we can access it later
        context_wrapper._sentry_current_agent = agent
         span = invoke_agent_span(context_wrapper, agent, kwargs)
         context_wrapper._sentry_agent_span = span
 
+        return span
+
     def _end_invoke_agent_span(context_wrapper, agent, output=None):
         # type: (agents.RunContextWrapper, agents.Agent, Optional[Any]) -> None
         """End the agent invocation span"""
@@ -73,7 +77,8 @@ async def patched_run_single_turn(cls, *args, **kwargs):
         if current_agent and current_agent != agent:
             _end_invoke_agent_span(context_wrapper, current_agent)
 
-        _start_invoke_agent_span(context_wrapper, agent, kwargs)
+        span = _start_invoke_agent_span(context_wrapper, agent, kwargs)
+        agent._sentry_agent_span = span
 
         # Call original method with all the correct parameters
         result = await original_run_single_turn(*args, **kwargs)
diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index aa6371302e..c7b8ed1ca5 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -2,11 +2,8 @@
 
 from sentry_sdk.integrations import DidNotEnable
 
-from .._context_vars import (
-    _invoke_agent_response_model_context,
-    _response_model_context,
-)
 from ..spans import ai_client_span, update_ai_client_span
+from sentry_sdk.consts import SPANDATA
 
 from typing import TYPE_CHECKING
 
@@ -47,7 +44,7 @@ async def wrapped_fetch_response(*args, **kwargs):
                 response = await original_fetch_response(*args, **kwargs)
                 # Store model from raw response in context variable
                 if hasattr(response, "model"):
-                    _response_model_context.set(str(response.model))
+                    agent._sentry_raw_response_model = str(response.model)
                 return response
 
             model._fetch_response = wrapped_fetch_response
@@ -59,13 +56,15 @@ async def wrapped_get_response(*args, **kwargs):
             with ai_client_span(agent, kwargs) as span:
                 result = await original_get_response(*args, **kwargs)
 
                 # Retrieve response model from context and attach to ModelResponse
-                response_model = _response_model_context.get(None)
+                response_model = getattr(agent, "_sentry_raw_response_model", None)
                 if response_model:
                     result._sentry_response_model = response_model
-                    _response_model_context.set(None)  # Clear context
 
-                    # Also store for invoke_agent span (will be the last one used)
-                    _invoke_agent_response_model_context.set(response_model)
+                    agent_span = getattr(agent, "_sentry_agent_span", None)
+                    if agent_span:
+                        agent_span.set_data(
+                            SPANDATA.GEN_AI_RESPONSE_MODEL, response_model
+                        )
 
                 update_ai_client_span(span, agent, kwargs, result)
diff --git a/sentry_sdk/integrations/openai_agents/patches/runner.py b/sentry_sdk/integrations/openai_agents/patches/runner.py
index 745f30a38e..05c15da4d1 100644
--- a/sentry_sdk/integrations/openai_agents/patches/runner.py
+++ b/sentry_sdk/integrations/openai_agents/patches/runner.py
@@ -26,9 +26,11 @@ async def wrapper(*args, **kwargs):
         # Isolate each workflow so that when agents are run in asyncio tasks they
         # don't touch each other's scopes
         with sentry_sdk.isolation_scope():
-            agent = args[0]
+            # Clone agent because agent invocation spans are attached per run.
+            agent = args[0].clone()
             with agent_workflow_span(agent):
                 result = None
+                args = (agent, *args[1:])
                 try:
                     result = await original_func(*args, **kwargs)
                     return result
diff --git a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
index 9ae18c0451..5d1731f247 100644
--- a/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
+++ b/sentry_sdk/integrations/openai_agents/spans/invoke_agent.py
@@ -9,7 +9,6 @@
 from sentry_sdk.scope import should_send_default_pii
 from sentry_sdk.utils import safe_serialize
 
-from .._context_vars import _invoke_agent_response_model_context
 from ..consts import SPAN_ORIGIN
 from ..utils import _set_agent_data, _set_usage_data
 
@@ -89,12 +88,6 @@ def update_invoke_agent_span(context, agent, output):
         if hasattr(context, "usage"):
             _set_usage_data(span, context.usage)
 
-        # Add response model if available (will be the last model used)
-        response_model = _invoke_agent_response_model_context.get(None)
-        if response_model:
-            span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, response_model)
-            _invoke_agent_response_model_context.set(None)  # Clear after use
-
         if should_send_default_pii():
             set_data_normalized(
                 span, SPANDATA.GEN_AI_RESPONSE_TEXT, output, unpack=False

From ed1f3ed86f7cf20a757349d6a67c2ee9820c1875 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Tue, 2 Dec 2025 14:11:24 +0100
Subject: [PATCH 3/4] remove comments and cleanup attribute

---
 sentry_sdk/integrations/openai_agents/patches/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index c7b8ed1ca5..3ab9211c35 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -42,7 +42,6 @@ def wrapped_get_model(cls, agent, run_config):
             async def wrapped_fetch_response(*args, **kwargs):
                 # type: (*Any, **Any) -> Any
                 response = await original_fetch_response(*args, **kwargs)
-                # Store model from raw response in context variable
                 if hasattr(response, "model"):
                     agent._sentry_raw_response_model = str(response.model)
                 return response
@@ -55,7 +54,6 @@ async def wrapped_get_response(*args, **kwargs):
             with ai_client_span(agent, kwargs) as span:
                 result = await original_get_response(*args, **kwargs)
 
-                # Retrieve response model from context and attach to ModelResponse
                 response_model = getattr(agent, "_sentry_raw_response_model", None)
                 if response_model:
                     result._sentry_response_model = response_model
@@ -66,6 +64,8 @@ async def wrapped_get_response(*args, **kwargs):
                             SPANDATA.GEN_AI_RESPONSE_MODEL, response_model
                         )
 
+                    delattr(agent, "_sentry_raw_response_model")
+
                 update_ai_client_span(span, agent, kwargs, result)
 
             return result

From a34668b62350d0c5c4d883c234b84ff1d99d0be3 Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Tue, 2 Dec 2025 14:34:30 +0100
Subject: [PATCH 4/4] pass response model directly to update_ai_client_span

---
 .../integrations/openai_agents/patches/models.py  |  4 +---
 .../integrations/openai_agents/spans/ai_client.py | 12 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py
index 3ab9211c35..feaa0c33d2 100644
--- a/sentry_sdk/integrations/openai_agents/patches/models.py
+++ b/sentry_sdk/integrations/openai_agents/patches/models.py
@@ -56,8 +56,6 @@ async def wrapped_get_response(*args, **kwargs):
 
                 response_model = getattr(agent, "_sentry_raw_response_model", None)
                 if response_model:
-                    result._sentry_response_model = response_model
-
                     agent_span = getattr(agent, "_sentry_agent_span", None)
                     if agent_span:
                         agent_span.set_data(
@@ -66,7 +64,7 @@ async def wrapped_get_response(*args, **kwargs):
                             SPANDATA.GEN_AI_RESPONSE_MODEL, response_model
                         )
 
                     delattr(agent, "_sentry_raw_response_model")
 
-                update_ai_client_span(span, agent, kwargs, result)
+                update_ai_client_span(span, agent, kwargs, result, response_model)
 
             return result
diff --git a/sentry_sdk/integrations/openai_agents/spans/ai_client.py b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
index d096cd51c1..8f233fbc14 100644
--- a/sentry_sdk/integrations/openai_agents/spans/ai_client.py
+++ b/sentry_sdk/integrations/openai_agents/spans/ai_client.py
@@ -14,7 +14,7 @@
 
 if TYPE_CHECKING:
     from agents import Agent
-    from typing import Any
+    from typing import Any, Optional
 
 
 def ai_client_span(agent, get_response_kwargs):
@@ -35,12 +35,14 @@ def ai_client_span(agent, get_response_kwargs):
     return span
 
 
-def update_ai_client_span(span, agent, get_response_kwargs, result):
-    # type: (sentry_sdk.tracing.Span, Agent, dict[str, Any], Any) -> None
+def update_ai_client_span(
+    span, agent, get_response_kwargs, result, response_model=None
+):
+    # type: (sentry_sdk.tracing.Span, Agent, dict[str, Any], Any, Optional[str]) -> None
     _set_usage_data(span, result.usage)
     _set_output_data(span, result)
     _create_mcp_execute_tool_spans(span, result)
 
     # Set response model if captured from raw response
-    if hasattr(result, "_sentry_response_model") and result._sentry_response_model:
-        span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, result._sentry_response_model)
+    if response_model is not None:
+        span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, response_model)