Skip to content

Commit 6a44721

Browse files
fix(profiling): correctly unwind on-CPU Tasks
1 parent 64d9750 commit 6a44721

File tree

2 files changed

+57
-24
lines changed

2 files changed

+57
-24
lines changed

ddtrace/internal/datadog/profiling/stack_v2/echion/echion/tasks.h

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class TaskInfo
201201
}
202202

203203
[[nodiscard]] static Result<TaskInfo::Ptr> current(PyObject*);
204-
inline size_t unwind(FrameStack&);
204+
inline size_t unwind(FrameStack&, size_t& upper_python_stack_size);
205205
};
206206

207207
inline std::unordered_map<PyObject*, PyObject*> task_link_map;
@@ -343,7 +343,7 @@ inline std::vector<std::unique_ptr<StackInfo>> current_tasks;
343343
// ----------------------------------------------------------------------------
344344

345345
inline size_t
346-
TaskInfo::unwind(FrameStack& stack)
346+
TaskInfo::unwind(FrameStack& stack, size_t& upper_python_stack_size)
347347
{
348348
// TODO: Check for running task.
349349
std::stack<PyObject*> coro_frames;
@@ -354,14 +354,31 @@ TaskInfo::unwind(FrameStack& stack)
354354
coro_frames.push(py_coro->frame);
355355
}
356356

357-
int count = 0;
357+
// Total number of frames added to the Stack
358+
size_t count = 0;
358359

359360
// Unwind the coro frames
360361
while (!coro_frames.empty()) {
361362
PyObject* frame = coro_frames.top();
362363
coro_frames.pop();
363364

364-
count += unwind_frame(frame, stack);
365+
auto new_frames = unwind_frame(frame, stack);
366+
367+
// If this is the first Frame being unwound (we have not added any Frames to the Stack yet),
368+
// use the number of Frames added to the Stack to determine the size of the upper Python stack.
369+
if (count == 0) {
370+
// The first Frame is the coroutine Frame, so the Python stack size is the number of Frames - 1
371+
upper_python_stack_size = new_frames - 1;
372+
373+
// Remove the Python Frames from the Stack (they will be added back later)
374+
// We cannot push those Frames now because otherwise they would be added once per Task,
375+
// we only want to add them once per Leaf Task, and on top of all non-leaf Tasks.
376+
for (size_t i = 0; i < upper_python_stack_size; i++) {
377+
stack.pop_back();
378+
}
379+
}
380+
381+
count += new_frames;
365382
}
366383

367384
return count;

ddtrace/internal/datadog/profiling/stack_v2/echion/echion/threads.h

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -264,31 +264,40 @@ ThreadInfo::unwind_tasks()
264264
}
265265
}
266266

267+
// Make sure the on CPU task is first
268+
std::sort(leaf_tasks.begin(), leaf_tasks.end(), [](const TaskInfo::Ref& a, const TaskInfo::Ref& b) {
269+
return ((a.get().is_on_cpu ? 0 : 1) < (b.get().is_on_cpu ? 0 : 1));
270+
});
271+
272+
// The size of the "pure Python" stack (before asyncio Frames), computed later by TaskInfo::unwind
273+
size_t upper_python_stack_size = 0;
274+
// Unused variable, will be used later by TaskInfo::unwind
275+
size_t unused;
276+
277+
bool on_cpu_task_seen = false;
267278
for (auto& leaf_task : leaf_tasks) {
279+
on_cpu_task_seen = on_cpu_task_seen || leaf_task.get().is_on_cpu;
280+
268281
auto stack_info = std::make_unique<StackInfo>(leaf_task.get().name, leaf_task.get().is_on_cpu);
269282
auto& stack = stack_info->stack;
283+
270284
for (auto current_task = leaf_task;;) {
271285
auto& task = current_task.get();
272286

273-
size_t stack_size = task.unwind(stack);
274-
287+
// The task_stack_size includes both the coroutines frames and the "upper" Python synchronous frames
288+
size_t task_stack_size = task.unwind(stack, task.is_on_cpu ? upper_python_stack_size : unused);
275289
if (task.is_on_cpu) {
276-
// Undo the stack unwinding
277-
// TODO[perf]: not super-efficient :(
278-
for (size_t i = 0; i < stack_size; i++)
279-
stack.pop_back();
280-
281-
// Instead we get part of the thread stack
282-
FrameStack temp_stack;
283-
size_t nframes = (python_stack.size() > stack_size) ? python_stack.size() - stack_size : 0;
284-
for (size_t i = 0; i < nframes; i++) {
285-
auto python_frame = python_stack.front();
286-
temp_stack.push_front(python_frame);
287-
python_stack.pop_front();
288-
}
289-
while (!temp_stack.empty()) {
290-
stack.push_front(temp_stack.front());
291-
temp_stack.pop_front();
290+
// Get the "bottom" part of the Python synchronous Stack, that is to say the
291+
// synchronous functions and coroutines called by the Task's outermost coroutine
292+
// The number of Frames to push is the total number of Frames in the Python stack, from which we
293+
// subtract the number of Frames in the "upper Python stack" (asyncio machinery + sync entrypoint)
294+
// This gives us [outermost coroutine, ... , innermost coroutine, outermost sync function, ... ,
295+
// innermost sync function]
296+
size_t frames_to_push =
297+
(python_stack.size() > task_stack_size) ? python_stack.size() - task_stack_size : 0;
298+
for (size_t i = 0; i < frames_to_push; i++) {
299+
const auto& python_frame = python_stack[frames_to_push - i - 1];
300+
stack.push_front(python_frame);
292301
}
293302
}
294303

@@ -317,8 +326,15 @@ ThreadInfo::unwind_tasks()
317326
}
318327

319328
// Finish off with the remaining thread stack
320-
for (auto p = python_stack.begin(); p != python_stack.end(); p++)
321-
stack.push_back(*p);
329+
// If we have seen an on-CPU Task, then upper_python_stack_size will be set and will include the sync entry
330+
// point and the asyncio machinery Frames. Otherwise, we are in `select` (idle) and we should push all the
331+
// Frames.
332+
for (size_t i = python_stack.size() - (on_cpu_task_seen ? upper_python_stack_size : python_stack.size());
333+
i < python_stack.size();
334+
i++) {
335+
const auto& python_frame = python_stack[i];
336+
stack.push_back(python_frame);
337+
}
322338

323339
current_tasks.push_back(std::move(stack_info));
324340
}

0 commit comments

Comments
 (0)