Skip to content

Commit d05582d

Browse files
TheelxTTsangSCErotemic
authored
Speed Up Worst-Case Overhead (#376)
* Speedups from avoiding re-resolution and threading.get_ident() line_profiler/_line_profiler.pyx - Replaced repeated retrival of `prof._c_last_time[ident]` with a stored reference thereto * Optimization: store pointer to the `LineTime` obj * Fix GPT review suggestions 2 and 4 * Add comments for deref and hash_bytecode * Refactor helpers into separate file * add changelog note --------- Co-authored-by: Terence S.-C. Tsang <[email protected]> Co-authored-by: joncrall <[email protected]>
1 parent 2295c1c commit d05582d

File tree

7 files changed

+135
-52
lines changed

7 files changed

+135
-52
lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Changes
55
~~~~~
66
* FIX: ref-count leaks #372
77
* ENH: Add support for building ABI3 wheels
8+
* FIX: mitigate speed regressions introduced in 5.0.0
89

910
5.0.0
1011
~~~~~

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ recursive-include requirements *.txt
99
recursive-include tests *.py
1010
recursive-include line_profiler *.txt
1111
recursive-include line_profiler *.pyx
12+
recursive-include line_profiler *.pyi
1213
recursive-include line_profiler *.pxd
1314
recursive-include line_profiler *.pyd
1415
recursive-include line_profiler *.c

build_wheels.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ if ! which cibuildwheel ; then
1919
fi
2020

2121

22-
#pip wheel -w wheelhouse .
23-
# python -m build --wheel -o wheelhouse # line_profiler: +COMMENT_IF(binpy)
24-
cibuildwheel --config-file pyproject.toml --platform linux --archs x86_64 # line_profiler: +UNCOMMENT_IF(binpy)
22+
# Build ABI3 wheels
23+
CIBW_CONFIG_SETTINGS="--build-option=--py-limited-api=cp38" CIBW_BUILD="cp38-*" cibuildwheel --config-file pyproject.toml --platform linux --archs x86_64
24+
25+
# Build version-pinned wheels
26+
cibuildwheel --config-file pyproject.toml --platform linux --archs x86_64

clean.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ rm -rf _skbuild
55
rm -rf _line_profiler.c
66
rm -rf *.so
77
rm -rf line_profiler/_line_profiler.c
8+
rm -rf line_profiler/_line_profiler.cpp
9+
rm -rf line_profiler/*.html
810
rm -rf line_profiler/*.so
911
rm -rf build
1012
rm -rf line_profiler.egg-info

line_profiler/_line_profiler.pyx

Lines changed: 86 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ from functools import wraps
1818
from sys import byteorder
1919
import sys
2020
cimport cython
21+
from cython.operator cimport dereference as deref
22+
from cpython.object cimport PyObject_Hash
23+
from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
2124
from cpython.version cimport PY_VERSION_HEX
2225
from libc.stdint cimport int64_t
2326

@@ -31,7 +34,13 @@ from warnings import warn
3134
from weakref import WeakSet
3235

3336
from line_profiler._diagnostics import (
34-
WRAP_TRACE, SET_FRAME_LOCAL_TRACE, USE_LEGACY_TRACE)
37+
WRAP_TRACE, SET_FRAME_LOCAL_TRACE, USE_LEGACY_TRACE
38+
)
39+
40+
from ._map_helpers cimport (
41+
last_erase_if_present, line_ensure_entry, LastTime, LastTimeMap,
42+
LineTime, LineTimeMap
43+
)
3544

3645

3746
NOP_VALUE: int = opcode.opmap['NOP']
@@ -69,6 +78,7 @@ cdef extern from "Python_wrapper.h":
6978
ctypedef struct PyObject
7079
ctypedef struct PyCodeObject
7180
ctypedef struct PyFrameObject
81+
ctypedef Py_ssize_t Py_hash_t
7282
ctypedef long long PY_LONG_LONG
7383
ctypedef int (*Py_tracefunc)(
7484
object self, PyFrameObject *py_frame, int what, PyObject *arg)
@@ -93,6 +103,12 @@ cdef extern from "Python_wrapper.h":
93103
cdef int PyFrame_GetLineNumber(PyFrameObject *frame)
94104
cdef void Py_XDECREF(PyObject *o)
95105

106+
cdef unsigned long PyThread_get_thread_ident()
107+
108+
ctypedef PyCodeObject *PyCodeObjectPtr
109+
#ctypedef unordered_map[int64, LastTime] LastTimeMap
110+
#ctypedef unordered_map[int64, LineTime] LineTimeMap
111+
96112
cdef extern from "c_trace_callbacks.c": # Legacy tracing
97113
ctypedef unsigned long long Py_uintptr_t
98114

@@ -114,18 +130,18 @@ cdef extern from "timers.c":
114130
PY_LONG_LONG hpTimer()
115131
double hpTimerUnit()
116132

117-
cdef struct LineTime:
118-
int64 code
119-
int lineno
120-
PY_LONG_LONG total_time
121-
long nhits
133+
#cdef struct LineTime:
134+
# int64 code
135+
# int lineno
136+
# PY_LONG_LONG total_time
137+
# long nhits
122138

123-
cdef struct LastTime:
124-
int f_lineno
125-
PY_LONG_LONG time
139+
#cdef struct LastTime:
140+
# int f_lineno
141+
# PY_LONG_LONG time
126142

127143

128-
cdef inline int64 compute_line_hash(uint64 block_hash, uint64 linenum):
144+
cdef inline int64 compute_line_hash(uint64 block_hash, uint64 linenum) noexcept:
129145
"""
130146
Compute the hash used to store each line timing in an unordered_map.
131147
This is fairly simple, and could use some improvement since linenum
@@ -298,7 +314,7 @@ cpdef _copy_local_sysmon_events(old_code, new_code):
298314
return new_code
299315

300316

301-
cpdef int _patch_events(int events, int before, int after):
317+
cpdef int _patch_events(int events, int before, int after) noexcept:
302318
"""
303319
Patch ``events`` based on the differences between ``before`` and
304320
``after``.
@@ -434,7 +450,7 @@ cdef class _SysMonitoringState:
434450
mon.register_callback(self.tool_id, *wrapped_callbacks.popitem())
435451

436452
cdef void call_callback(self, int event_id, object code,
437-
object loc_args, object other_args):
453+
object loc_args, object other_args) noexcept:
438454
"""
439455
Call the appropriate stored callback. Also take care of the
440456
restoration of :py:mod:`sys.monitoring` callbacks, tool-ID lock,
@@ -550,7 +566,7 @@ sys.monitoring.html#monitoring-event-RERAISE
550566
"""
551567
cdef TraceCallback *legacy_callback
552568
cdef _SysMonitoringState mon_state
553-
cdef public object active_instances # type: set[LineProfiler]
569+
cdef public set active_instances # type: set[LineProfiler]
554570
cdef int _wrap_trace
555571
cdef int _set_frame_local_trace
556572
cdef int recursion_guard
@@ -732,7 +748,7 @@ sys.monitoring.html#monitoring-event-RERAISE
732748
sys.monitoring.events.RERAISE, code, instruction_offset, exception)
733749

734750
cdef void _handle_exit_event(
735-
self, int event_id, object code, int offset, object obj):
751+
self, int event_id, object code, int offset, object obj) noexcept:
736752
"""
737753
Base for the frame-exit-event (e.g. via returning or yielding)
738754
callbacks passed to :py:func:`sys.monitoring.register_callback`.
@@ -746,7 +762,7 @@ sys.monitoring.html#monitoring-event-RERAISE
746762

747763
cdef void _base_callback(
748764
self, int is_line_event, int event_id, object code, int lineno,
749-
object loc_args, object other_args):
765+
object loc_args, object other_args) noexcept:
750766
"""
751767
Base for the various callbacks passed to
752768
:py:func:`sys.monitoring.register_callback`.
@@ -1030,9 +1046,9 @@ cdef class LineProfiler:
10301046
.. _"legacy" trace system: https://github.com/python/cpython/blob/\
10311047
3.13/Python/legacy_tracing.c
10321048
"""
1033-
cdef unordered_map[int64, unordered_map[int64, LineTime]] _c_code_map
1049+
cdef unordered_map[int64, LineTimeMap] _c_code_map
10341050
# Mapping between thread-id and map of LastTime
1035-
cdef unordered_map[int64, unordered_map[int64, LastTime]] _c_last_time
1051+
cdef unordered_map[int64, LastTimeMap] _c_last_time
10361052
cdef public list functions
10371053
cdef public dict code_hash_map, dupes_map
10381054
cdef public double timer_unit
@@ -1239,7 +1255,7 @@ datamodel.html#user-defined-functions
12391255

12401256
property _manager:
12411257
def __get__(self):
1242-
thread_id = threading.get_ident()
1258+
thread_id = PyThread_get_thread_ident()
12431259
try:
12441260
return self._managers[thread_id]
12451261
except KeyError:
@@ -1303,7 +1319,7 @@ datamodel.html#user-defined-functions
13031319
If no profiling data is available on the current thread.
13041320
"""
13051321
try:
1306-
return (<dict>self._c_last_time)[threading.get_ident()]
1322+
return (<dict>self._c_last_time)[PyThread_get_thread_ident()]
13071323
except KeyError as e:
13081324
# We haven't actually profiled anything yet
13091325
raise (KeyError('No profiling data on the current thread '
@@ -1349,7 +1365,7 @@ datamodel.html#user-defined-functions
13491365
return py_last_time
13501366

13511367
cpdef disable(self):
1352-
self._c_last_time[threading.get_ident()].clear()
1368+
self._c_last_time[PyThread_get_thread_ident()].clear()
13531369
self._manager._handle_disable_event(self)
13541370

13551371
def get_stats(self):
@@ -1387,56 +1403,79 @@ datamodel.html#user-defined-functions
13871403

13881404
@cython.boundscheck(False)
13891405
@cython.wraparound(False)
1390-
cdef inline inner_trace_callback(
1391-
int is_line_event, object instances, object code, int lineno):
1406+
cdef inline void inner_trace_callback(
1407+
int is_line_event, set instances, object code, int lineno):
13921408
"""
13931409
The basic building block for the trace callbacks.
13941410
"""
1395-
cdef object prof_
1396-
cdef object bytecode = code.co_code
1411+
cdef LineProfiler prof_
13971412
cdef LineProfiler prof
13981413
cdef LastTime old
13991414
cdef int key
1400-
cdef PY_LONG_LONG time
1401-
cdef int has_time = 0
1415+
cdef PY_LONG_LONG time = 0
1416+
cdef bint has_time = False
1417+
cdef bint has_last
14021418
cdef int64 code_hash
1403-
cdef int64 block_hash
1404-
cdef unordered_map[int64, LineTime] line_entries
1405-
1406-
if any(bytecode):
1407-
block_hash = hash(bytecode)
1408-
else: # Cython functions have empty/zero bytecodes
1419+
cdef object py_bytes_obj = code.co_code
1420+
cdef char* data = PyBytes_AS_STRING(py_bytes_obj)
1421+
cdef Py_ssize_t size = PyBytes_GET_SIZE(py_bytes_obj)
1422+
cdef unsigned long ident
1423+
cdef Py_hash_t block_hash
1424+
cdef LineTime* entry
1425+
cdef LineTimeMap* line_entries
1426+
cdef LastTimeMap* last_map
1427+
1428+
# Loop over every byte to check if any are not NULL
1429+
# if there are any non-NULL, that indicates we're profiling Python code
1430+
for i in range(size):
1431+
if data[i]:
1432+
# because we use Python functions like hash, we CANNOT mark this function as nogil
1433+
block_hash = hash(py_bytes_obj)
1434+
break
1435+
else:
1436+
# fallback for Cython functions
14091437
block_hash = hash(code)
1438+
14101439
code_hash = compute_line_hash(block_hash, lineno)
14111440

14121441
for prof_ in instances:
1442+
# for some reason, doing this is much faster than just combining it into the above
1443+
# like doing "for prof in instances:" is far slower
14131444
prof = <LineProfiler>prof_
14141445
if not prof._c_code_map.count(code_hash):
14151446
continue
14161447
if not has_time:
14171448
time = hpTimer()
1418-
has_time = 1
1419-
ident = threading.get_ident()
1420-
if prof._c_last_time[ident].count(block_hash):
1421-
old = prof._c_last_time[ident][block_hash]
1422-
line_entries = prof._c_code_map[code_hash]
1423-
key = old.f_lineno
1424-
if not line_entries.count(key):
1425-
prof._c_code_map[code_hash][key] = LineTime(
1426-
code_hash, key, 0, 0)
1427-
prof._c_code_map[code_hash][key].nhits += 1
1428-
prof._c_code_map[code_hash][key].total_time += time - old.time
1449+
has_time = True
1450+
ident = PyThread_get_thread_ident()
1451+
last_map = &(prof._c_last_time[ident])
1452+
# deref() is Cython's version of the -> accessor in C++. if we don't use deref then
1453+
# Cython thinks that when we index last_map,
1454+
# we want pointer indexing (which is not the case)
1455+
if deref(last_map).count(block_hash):
1456+
old = deref(last_map)[block_hash]
1457+
line_entries = &(prof._c_code_map[code_hash])
1458+
# Ensure that an entry exists in line_entries before accessing it
1459+
entry = line_ensure_entry(line_entries, old.f_lineno, code_hash)
1460+
# Note: explicitly `deref()`-ing here causes the new values
1461+
# to be assigned to a temp var;
1462+
# meanwhile, directly dot-accessing a pointer causes Cython
1463+
# to correctly write `ptr->attr = (ptr->attr + incr)`
1464+
entry.nhits += 1
1465+
entry.total_time += time - old.time
1466+
has_last = True
1467+
else:
1468+
has_last = False
14291469
if is_line_event:
14301470
# Get the time again. This way, we don't record much time
14311471
# wasted in this function.
1432-
prof._c_last_time[ident][block_hash] = LastTime(lineno, hpTimer())
1433-
elif prof._c_last_time[ident].count(block_hash):
1472+
deref(last_map)[block_hash] = LastTime(lineno, hpTimer())
1473+
elif deref(last_map).count(block_hash):
14341474
# We are returning from a function, not executing a line.
14351475
# Delete the last_time record. It may have already been
14361476
# deleted if we are profiling a generator that is being
14371477
# pumped past its end.
1438-
prof._c_last_time[ident].erase(
1439-
prof._c_last_time[ident].find(block_hash))
1478+
last_erase_if_present(last_map, block_hash)
14401479

14411480

14421481
cdef extern int legacy_trace_callback(

line_profiler/_map_helpers.pxd

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# cython: language_level=3
2+
# cython: legacy_implicit_noexcept=True
3+
# used in _line_profiler.pyx
4+
from libcpp.unordered_map cimport unordered_map
5+
from cython.operator cimport dereference as deref
6+
7+
# long long int is at least 64 bytes assuming c99
8+
ctypedef long long int int64
9+
10+
cdef extern from "Python_wrapper.h":
11+
ctypedef long long PY_LONG_LONG
12+
13+
cdef struct LastTime:
14+
int f_lineno
15+
PY_LONG_LONG time
16+
17+
cdef struct LineTime:
18+
long long code
19+
int lineno
20+
PY_LONG_LONG total_time
21+
long nhits
22+
23+
# Types used for mappings from code hash to last/line times.
24+
ctypedef unordered_map[int64, LastTime] LastTimeMap
25+
ctypedef unordered_map[int64, LineTime] LineTimeMap
26+
27+
cdef inline void last_erase_if_present(LastTimeMap* m, int64 key) noexcept:
28+
cdef LastTimeMap.iterator it = deref(m).find(key)
29+
if it != deref(m).end():
30+
deref(m).erase(it)
31+
32+
cdef inline LineTime* line_ensure_entry(LineTimeMap* m, int lineno, long long code_hash) noexcept:
33+
if not deref(m).count(lineno):
34+
deref(m)[lineno] = LineTime(code_hash, lineno, 0, 0)
35+
return &(deref(m)[lineno])

setup.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,10 @@ def run_cythonize(force=False):
231231
sources=["line_profiler/_line_profiler.pyx",
232232
"line_profiler/timers.c",
233233
"line_profiler/c_trace_callbacks.c"],
234+
# force a recompile if this changes
235+
depends=[
236+
"line_profiler/_map_helpers.pxd",
237+
],
234238
language="c++",
235239
define_macros=[("CYTHON_TRACE", (1 if os.getenv("DEV") == "true" else 0))],
236240
),
@@ -240,7 +244,7 @@ def run_cythonize(force=False):
240244
"legacy_implicit_noexcept": True,
241245
"linetrace": (True if os.getenv("DEV") == "true" else False)
242246
},
243-
include_path=["line_profiler/python25.pxd"],
247+
include_path=["line_profiler/python25.pxd", "line_profiler/_map_helpers.pxd"],
244248
force=force,
245249
nthreads=multiprocessing.cpu_count(),
246250
)
@@ -294,7 +298,6 @@ def run_cythonize(force=False):
294298
setupkw["classifiers"] = [
295299
'Development Status :: 5 - Production/Stable',
296300
'Intended Audience :: Developers',
297-
'License :: OSI Approved :: BSD License',
298301
'Operating System :: OS Independent',
299302
'Programming Language :: C',
300303
'Programming Language :: Python',

0 commit comments

Comments
 (0)