@@ -264,31 +264,40 @@ ThreadInfo::unwind_tasks()
264264 }
265265 }
266266
267+ // Make sure the on CPU task is first
268+ std::sort (leaf_tasks.begin (), leaf_tasks.end (), [](const TaskInfo::Ref& a, const TaskInfo::Ref& b) {
269+ return ((a.get ().is_on_cpu ? 0 : 1 ) < (b.get ().is_on_cpu ? 0 : 1 ));
270+ });
271+
272+ // The size of the "pure Python" stack (before asyncio Frames), computed later by TaskInfo::unwind
273+ size_t upper_python_stack_size = 0 ;
274+ // Dummy variable passed to TaskInfo::unwind in place of upper_python_stack_size when the Task is not on CPU; its value is not read here
275+ size_t unused;
276+
277+ bool on_cpu_task_seen = false ;
267278 for (auto & leaf_task : leaf_tasks) {
279+ on_cpu_task_seen = on_cpu_task_seen || leaf_task.get ().is_on_cpu ;
280+
268281 auto stack_info = std::make_unique<StackInfo>(leaf_task.get ().name , leaf_task.get ().is_on_cpu );
269282 auto & stack = stack_info->stack ;
283+
270284 for (auto current_task = leaf_task;;) {
271285 auto & task = current_task.get ();
272286
273- size_t stack_size = task. unwind (stack);
274-
287+ // The task_stack_size includes both the coroutine frames and the "upper" Python synchronous frames
288+ size_t task_stack_size = task. unwind (stack, task. is_on_cpu ? upper_python_stack_size : unused);
275289 if (task.is_on_cpu ) {
276- // Undo the stack unwinding
277- // TODO[perf]: not super-efficient :(
278- for (size_t i = 0 ; i < stack_size; i++)
279- stack.pop_back ();
280-
281- // Instead we get part of the thread stack
282- FrameStack temp_stack;
283- size_t nframes = (python_stack.size () > stack_size) ? python_stack.size () - stack_size : 0 ;
284- for (size_t i = 0 ; i < nframes; i++) {
285- auto python_frame = python_stack.front ();
286- temp_stack.push_front (python_frame);
287- python_stack.pop_front ();
288- }
289- while (!temp_stack.empty ()) {
290- stack.push_front (temp_stack.front ());
291- temp_stack.pop_front ();
290+ // Get the "bottom" part of the Python synchronous Stack, that is to say the
291+ // synchronous functions and coroutines called by the Task's outermost coroutine
292+ // The number of Frames to push is the total number of Frames in the Python stack, from which we
293+ // subtract the number of Frames in the "upper Python stack" (asyncio machinery + sync entrypoint)
294+ // This gives us [outermost coroutine, ... , innermost coroutine, outermost sync function, ... ,
295+ // innermost sync function]
296+ size_t frames_to_push =
297+ (python_stack.size () > task_stack_size) ? python_stack.size () - task_stack_size : 0 ;
298+ for (size_t i = 0 ; i < frames_to_push; i++) {
299+ const auto & python_frame = python_stack[frames_to_push - i - 1 ];
300+ stack.push_front (python_frame);
292301 }
293302 }
294303
@@ -317,8 +326,15 @@ ThreadInfo::unwind_tasks()
317326 }
318327
319328 // Finish off with the remaining thread stack
320- for (auto p = python_stack.begin (); p != python_stack.end (); p++)
321- stack.push_back (*p);
329+ // If we have seen an on-CPU Task, then upper_python_stack_size will be set and will include the sync entry
330+ // point and the asyncio machinery Frames. Otherwise, we are in `select` (idle) and we should push all the
331+ // Frames.
332+ for (size_t i = python_stack.size () - (on_cpu_task_seen ? upper_python_stack_size : python_stack.size ());
333+ i < python_stack.size ();
334+ i++) {
335+ const auto & python_frame = python_stack[i];
336+ stack.push_back (python_frame);
337+ }
322338
323339 current_tasks.push_back (std::move (stack_info));
324340 }
0 commit comments