From 47a97643fcf797ddaaf44588c983e30c0f69dc82 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Mon, 16 Feb 2026 17:55:21 +0000 Subject: [PATCH 1/3] Optimize the new uops added when recording values during tracing. * Handle dependencies in the optimizer, not the tracer * Strengthen some checks to avoid relying on optimizer for correctness --- Include/internal/pycore_optimizer.h | 26 +++--- Include/internal/pycore_optimizer_types.h | 6 +- Include/internal/pycore_uop_ids.h | 10 +-- Include/internal/pycore_uop_metadata.h | 32 +++---- Modules/_testinternalcapi/test_cases.c.h | 3 +- Objects/codeobject.c | 1 - Objects/frameobject.c | 1 - Objects/funcobject.c | 3 +- Python/bytecodes.c | 16 ++-- Python/executor_cases.c.h | 54 ++++-------- Python/generated_cases.c.h | 3 +- Python/instrumentation.c | 7 +- Python/optimizer.c | 57 +++--------- Python/optimizer_analysis.c | 25 +++--- Python/optimizer_bytecodes.c | 103 ++++++++++++++-------- Python/optimizer_cases.c.h | 88 ++++++++++-------- Python/optimizer_symbols.c | 82 +++++++++++------ 17 files changed, 274 insertions(+), 243 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 79a2d60eb788ea..d9f7f59de1798e 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -22,6 +22,10 @@ typedef struct _PyJitUopBuffer { _PyUOpInstruction *end; } _PyJitUopBuffer; +typedef struct _JitOptRefBuffer { + JitOptRef *used; + JitOptRef *end; +} _JitOptRefBuffer; typedef struct _JitOptContext { char done; @@ -37,10 +41,15 @@ typedef struct _JitOptContext { // Arena for the symbolic types. ty_arena t_arena; - JitOptRef *n_consumed; - JitOptRef *limit; - JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE]; + /* To do -- We could make this more space efficient + * by using a single array and growing the stack and + * locals toward each other. 
*/ + _JitOptRefBuffer locals; + _JitOptRefBuffer stack; + JitOptRef locals_array[ABSTRACT_INTERP_LOCALS_SIZE]; + JitOptRef stack_array[ABSTRACT_INTERP_STACK_SIZE]; _PyJitUopBuffer out_buffer; + _PyBloomFilter *dependencies; } JitOptContext; @@ -83,13 +92,11 @@ typedef struct _PyJitTracerInitialState { } _PyJitTracerInitialState; typedef struct _PyJitTracerPreviousState { - bool dependencies_still_valid; int instr_oparg; int instr_stacklevel; _Py_CODEUNIT *instr; PyCodeObject *instr_code; // Strong struct _PyInterpreterFrame *instr_frame; - _PyBloomFilter dependencies; PyObject *recorded_value; // Strong, may be NULL } _PyJitTracerPreviousState; @@ -303,25 +310,24 @@ extern void _Py_uop_sym_set_recorded_type(JitOptContext *ctx, JitOptRef sym, PyT extern void _Py_uop_sym_set_recorded_gen_func(JitOptContext *ctx, JitOptRef ref, PyFunctionObject *value); extern PyCodeObject *_Py_uop_sym_get_probable_func_code(JitOptRef sym); extern PyObject *_Py_uop_sym_get_probable_value(JitOptRef sym); +extern JitOptRef *_Py_uop_sym_set_stack_depth(JitOptContext *ctx, int stack_depth, JitOptRef *current_sp); -extern void _Py_uop_abstractcontext_init(JitOptContext *ctx); +extern void _Py_uop_abstractcontext_init(JitOptContext *ctx, _PyBloomFilter *dependencies); extern void _Py_uop_abstractcontext_fini(JitOptContext *ctx); extern _Py_UOpsAbstractFrame *_Py_uop_frame_new( JitOptContext *ctx, PyCodeObject *co, - int curr_stackentries, JitOptRef *args, int arg_len); extern _Py_UOpsAbstractFrame *_Py_uop_frame_new_from_symbol( JitOptContext *ctx, JitOptRef callable, - int curr_stackentries, JitOptRef *args, int arg_len); -extern int _Py_uop_frame_pop(JitOptContext *ctx, PyCodeObject *co, int curr_stackentries); +extern int _Py_uop_frame_pop(JitOptContext *ctx, PyCodeObject *co); PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored); @@ -357,8 +363,6 @@ PyAPI_FUNC(void) _PyJit_FinalizeTracing(PyThreadState *tstate, int err); void _PyPrintExecutor(_PyExecutorObject *executor, const _PyUOpInstruction *marker); void _PyJit_TracerFree(_PyThreadStateImpl *_tstate); -void _PyJit_Tracer_InvalidateDependency(PyThreadState *old_tstate, void *obj); - #ifdef _Py_TIER2 typedef void (*_Py_RecordFuncPtr)(_PyInterpreterFrame *frame, _PyStackRef *stackpointer, int oparg, PyObject **recorded_value); PyAPI_DATA(const _Py_RecordFuncPtr) _PyOpcode_RecordFunctions[]; diff --git a/Include/internal/pycore_optimizer_types.h b/Include/internal/pycore_optimizer_types.h index 57c0c828c2aabd..2958db5b787975 100644 --- a/Include/internal/pycore_optimizer_types.h +++ b/Include/internal/pycore_optimizer_types.h @@ -11,8 +11,9 @@ extern "C" { #include #include "pycore_uop.h" // UOP_MAX_TRACE_LENGTH -// Holds locals, stack, locals, stack ... 
(in that order) -#define MAX_ABSTRACT_INTERP_SIZE 512 +#define ABSTRACT_INTERP_STACK_SIZE 256 +#define ABSTRACT_INTERP_LOCALS_SIZE 512 + #define TY_ARENA_SIZE (UOP_MAX_TRACE_LENGTH * 5) @@ -138,6 +139,7 @@ typedef struct _Py_UOpsAbstractFrame { // Max stacklen int stack_len; int locals_len; + bool caller; // We have made a call from this frame during the trace PyFunctionObject *func; PyCodeObject *code; diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 94b05b736ed277..ebf21b12633c78 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -147,7 +147,7 @@ extern "C" { #define _GUARD_CALLABLE_STR_1 402 #define _GUARD_CALLABLE_TUPLE_1 403 #define _GUARD_CALLABLE_TYPE_1 404 -#define _GUARD_CODE 405 +#define _GUARD_CODE_VERSION 405 #define _GUARD_DORV_NO_DICT 406 #define _GUARD_DORV_VALUES_INST_ATTR_FROM_DICT 407 #define _GUARD_GLOBALS_VERSION 408 @@ -658,10 +658,10 @@ extern "C" { #define _GUARD_CALLABLE_TYPE_1_r13 855 #define _GUARD_CALLABLE_TYPE_1_r23 856 #define _GUARD_CALLABLE_TYPE_1_r33 857 -#define _GUARD_CODE_r00 858 -#define _GUARD_CODE_r11 859 -#define _GUARD_CODE_r22 860 -#define _GUARD_CODE_r33 861 +#define _GUARD_CODE_VERSION_r00 858 +#define _GUARD_CODE_VERSION_r11 859 +#define _GUARD_CODE_VERSION_r22 860 +#define _GUARD_CODE_VERSION_r33 861 #define _GUARD_DORV_NO_DICT_r01 862 #define _GUARD_DORV_NO_DICT_r11 863 #define _GUARD_DORV_NO_DICT_r22 864 diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 5a47eae7a9abb1..7921d229f11db3 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -370,7 +370,7 @@ const uint32_t _PyUop_Flags[MAX_UOP_ID+1] = { [_TIER2_RESUME_CHECK] = HAS_PERIODIC_FLAG, [_COLD_EXIT] = HAS_SYNC_SP_FLAG, [_COLD_DYNAMIC_EXIT] = HAS_SYNC_SP_FLAG, - [_GUARD_CODE] = HAS_EXIT_FLAG, + [_GUARD_CODE_VERSION] = HAS_EXIT_FLAG, [_GUARD_IP__PUSH_FRAME] = HAS_EXIT_FLAG, [_GUARD_IP_YIELD_VALUE] = HAS_EXIT_FLAG, [_GUARD_IP_RETURN_VALUE] = HAS_EXIT_FLAG, @@ -3404,13 +3404,13 @@ const _PyUopCachingInfo _PyUop_Caching[MAX_UOP_ID+1] = { { -1, -1, -1 }, }, }, - [_GUARD_CODE] = { + [_GUARD_CODE_VERSION] = { .best = { 0, 1, 2, 3 }, .entries = { - { 0, 0, _GUARD_CODE_r00 }, - { 1, 1, _GUARD_CODE_r11 }, - { 2, 2, _GUARD_CODE_r22 }, - { 3, 3, _GUARD_CODE_r33 }, + { 0, 0, _GUARD_CODE_VERSION_r00 }, + { 1, 1, _GUARD_CODE_VERSION_r11 }, + { 2, 2, _GUARD_CODE_VERSION_r22 }, + { 3, 3, _GUARD_CODE_VERSION_r33 }, }, }, [_GUARD_IP__PUSH_FRAME] = { @@ -4221,10 +4221,10 @@ const uint16_t _PyUop_Uncached[MAX_UOP_REGS_ID+1] = { [_TIER2_RESUME_CHECK_r33] = _TIER2_RESUME_CHECK, [_COLD_EXIT_r00] = _COLD_EXIT, [_COLD_DYNAMIC_EXIT_r00] = _COLD_DYNAMIC_EXIT, - [_GUARD_CODE_r00] = _GUARD_CODE, - [_GUARD_CODE_r11] = _GUARD_CODE, - [_GUARD_CODE_r22] = _GUARD_CODE, - [_GUARD_CODE_r33] = _GUARD_CODE, + [_GUARD_CODE_VERSION_r00] = _GUARD_CODE_VERSION, + [_GUARD_CODE_VERSION_r11] = _GUARD_CODE_VERSION, + [_GUARD_CODE_VERSION_r22] = _GUARD_CODE_VERSION, + [_GUARD_CODE_VERSION_r33] = _GUARD_CODE_VERSION, [_GUARD_IP__PUSH_FRAME_r00] = _GUARD_IP__PUSH_FRAME, [_GUARD_IP__PUSH_FRAME_r11] = _GUARD_IP__PUSH_FRAME, [_GUARD_IP__PUSH_FRAME_r22] = _GUARD_IP__PUSH_FRAME, @@ -4655,11 +4655,11 @@ const char *const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1] = { [_GUARD_CALLABLE_TYPE_1_r13] = "_GUARD_CALLABLE_TYPE_1_r13", [_GUARD_CALLABLE_TYPE_1_r23] = "_GUARD_CALLABLE_TYPE_1_r23", [_GUARD_CALLABLE_TYPE_1_r33] = "_GUARD_CALLABLE_TYPE_1_r33", - [_GUARD_CODE] = "_GUARD_CODE", 
- [_GUARD_CODE_r00] = "_GUARD_CODE_r00", - [_GUARD_CODE_r11] = "_GUARD_CODE_r11", - [_GUARD_CODE_r22] = "_GUARD_CODE_r22", - [_GUARD_CODE_r33] = "_GUARD_CODE_r33", + [_GUARD_CODE_VERSION] = "_GUARD_CODE_VERSION", + [_GUARD_CODE_VERSION_r00] = "_GUARD_CODE_VERSION_r00", + [_GUARD_CODE_VERSION_r11] = "_GUARD_CODE_VERSION_r11", + [_GUARD_CODE_VERSION_r22] = "_GUARD_CODE_VERSION_r22", + [_GUARD_CODE_VERSION_r33] = "_GUARD_CODE_VERSION_r33", [_GUARD_DORV_NO_DICT] = "_GUARD_DORV_NO_DICT", [_GUARD_DORV_NO_DICT_r01] = "_GUARD_DORV_NO_DICT_r01", [_GUARD_DORV_NO_DICT_r11] = "_GUARD_DORV_NO_DICT_r11", @@ -6070,7 +6070,7 @@ int _PyUop_num_popped(int opcode, int oparg) return 0; case _COLD_DYNAMIC_EXIT: return 0; - case _GUARD_CODE: + case _GUARD_CODE_VERSION: return 0; case _GUARD_IP__PUSH_FRAME: return 0; diff --git a/Modules/_testinternalcapi/test_cases.c.h b/Modules/_testinternalcapi/test_cases.c.h index ddd8fcdc231bf1..53b771b4514750 100644 --- a/Modules/_testinternalcapi/test_cases.c.h +++ b/Modules/_testinternalcapi/test_cases.c.h @@ -5675,7 +5675,8 @@ assert(executor->vm_data.code == code); assert(executor->vm_data.valid); assert(tstate->current_executor == NULL); - if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); + if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) != iversion) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; diff --git a/Objects/codeobject.c b/Objects/codeobject.c index ed3cc41480ab5c..776444a0cc2086 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2433,7 +2433,6 @@ code_dealloc(PyObject *self) PyMem_Free(co_extra); } #ifdef _Py_TIER2 - _PyJit_Tracer_InvalidateDependency(tstate, self); if (co->co_executors != NULL) { clear_executors(co); } diff --git a/Objects/frameobject.c b/Objects/frameobject.c index 9d774a71edb797..9a7abfc0ec26ab 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -262,7 +262,6 @@ framelocalsproxy_setitem(PyObject *self, PyObject *key, PyObject *value) #if _Py_TIER2 _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), co, 1); - _PyJit_Tracer_InvalidateDependency(_PyThreadState_GET(), co); #endif _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); diff --git a/Objects/funcobject.c b/Objects/funcobject.c index ee0c46a95b9708..c47a67384a13d4 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -12,7 +12,7 @@ #include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_stats.h" #include "pycore_weakref.h" // FT_CLEAR_WEAKREFS() -#include "pycore_optimizer.h" // _PyJit_Tracer_InvalidateDependency +#include "pycore_optimizer.h" // _Py_Executors_InvalidateDependency static const char * func_event_name(PyFunction_WatchEvent event) { @@ -1128,7 +1128,6 @@ func_dealloc(PyObject *self) } #if _Py_TIER2 _Py_Executors_InvalidateDependency(_PyInterpreterState_GET(), self, 1); - _PyJit_Tracer_InvalidateDependency(_PyThreadState_GET(), self); #endif _PyObject_GC_UNTRACK(op); FT_CLEAR_WEAKREFS(self, op->func_weakreflist); diff --git a/Python/bytecodes.c b/Python/bytecodes.c index b461f9b5bea8a6..a6767351ef50de 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3125,10 +3125,10 @@ dummy_func( assert(executor->vm_data.code == code); assert(executor->vm_data.valid); assert(tstate->current_executor == NULL); - /* If the eval breaker is set then stay in tier 1. 
- * This avoids any potentially infinite loops - * involving _RESUME_CHECK */ - if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { + /* If the eval breaker is set, or instrumentation is needed, then stay in tier 1. + * This avoids any potentially infinite loops involving _RESUME_CHECK */ + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); + if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) != iversion) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; @@ -5616,9 +5616,9 @@ dummy_func( HANDLE_PENDING_AND_DEOPT_IF(_Py_emscripten_signal_clock == 0); _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); - HANDLE_PENDING_AND_DEOPT_IF(eval_breaker & _PY_EVAL_EVENTS_MASK); - assert(tstate->tracing || eval_breaker == FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version)); + HANDLE_PENDING_AND_DEOPT_IF(eval_breaker != iversion); } tier2 op(_COLD_EXIT, ( -- )) { @@ -5668,9 +5668,9 @@ dummy_func( Py_UNREACHABLE(); } - tier2 op(_GUARD_CODE, (version/2 -- )) { + tier2 op(_GUARD_CODE_VERSION, (version/2 -- )) { PyObject *code = PyStackRef_AsPyObjectBorrow(frame->f_executable); - EXIT_IF(code == Py_None); + assert(PyCode_Check(code)); EXIT_IF(((PyCodeObject *)code)->co_version != version); } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 9dead4eecc7826..0cc40cbbf9f6a6 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -20034,13 +20034,13 @@ } _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); - if (eval_breaker & _PY_EVAL_EVENTS_MASK) { + if (eval_breaker != iversion) { UOP_STAT_INC(uopcode, miss); SET_CURRENT_CACHED_VALUES(0); JUMP_TO_JUMP_TARGET(); } - assert(tstate->tracing || eval_breaker == FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version)); SET_CURRENT_CACHED_VALUES(0); assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); break; @@ -20059,14 +20059,14 @@ } _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); - if (eval_breaker & _PY_EVAL_EVENTS_MASK) { + if (eval_breaker != iversion) { UOP_STAT_INC(uopcode, miss); _tos_cache0 = _stack_item_0; SET_CURRENT_CACHED_VALUES(1); JUMP_TO_JUMP_TARGET(); } - assert(tstate->tracing || eval_breaker == FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version)); _tos_cache0 = _stack_item_0; SET_CURRENT_CACHED_VALUES(1); assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); @@ -20088,15 +20088,15 @@ } _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); - if (eval_breaker & _PY_EVAL_EVENTS_MASK) { + if (eval_breaker != iversion) { UOP_STAT_INC(uopcode, miss); _tos_cache1 = _stack_item_1; 
_tos_cache0 = _stack_item_0; SET_CURRENT_CACHED_VALUES(2); JUMP_TO_JUMP_TARGET(); } - assert(tstate->tracing || eval_breaker == FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version)); _tos_cache1 = _stack_item_1; _tos_cache0 = _stack_item_0; SET_CURRENT_CACHED_VALUES(2); @@ -20121,8 +20121,9 @@ } _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); - if (eval_breaker & _PY_EVAL_EVENTS_MASK) { + if (eval_breaker != iversion) { UOP_STAT_INC(uopcode, miss); _tos_cache2 = _stack_item_2; _tos_cache1 = _stack_item_1; @@ -20130,7 +20131,6 @@ SET_CURRENT_CACHED_VALUES(3); JUMP_TO_JUMP_TARGET(); } - assert(tstate->tracing || eval_breaker == FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version)); _tos_cache2 = _stack_item_2; _tos_cache1 = _stack_item_1; _tos_cache0 = _stack_item_0; @@ -20184,16 +20184,12 @@ GOTO_TIER_ONE(target); } - case _GUARD_CODE_r00: { + case _GUARD_CODE_VERSION_r00: { CHECK_CURRENT_CACHED_VALUES(0); assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); uint32_t version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *code = PyStackRef_AsPyObjectBorrow(frame->f_executable); - if (code == Py_None) { - UOP_STAT_INC(uopcode, miss); - SET_CURRENT_CACHED_VALUES(0); - JUMP_TO_JUMP_TARGET(); - } + assert(PyCode_Check(code)); if (((PyCodeObject *)code)->co_version != version) { UOP_STAT_INC(uopcode, miss); SET_CURRENT_CACHED_VALUES(0); @@ -20204,18 +20200,13 @@ break; } - case _GUARD_CODE_r11: { + case _GUARD_CODE_VERSION_r11: { CHECK_CURRENT_CACHED_VALUES(1); assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); _PyStackRef _stack_item_0 = _tos_cache0; uint32_t version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *code = PyStackRef_AsPyObjectBorrow(frame->f_executable); - if (code == Py_None) { - UOP_STAT_INC(uopcode, miss); - _tos_cache0 = _stack_item_0; - SET_CURRENT_CACHED_VALUES(1); - JUMP_TO_JUMP_TARGET(); - } + assert(PyCode_Check(code)); if (((PyCodeObject *)code)->co_version != version) { UOP_STAT_INC(uopcode, miss); _tos_cache0 = _stack_item_0; @@ -20228,20 +20219,14 @@ break; } - case _GUARD_CODE_r22: { + case _GUARD_CODE_VERSION_r22: { CHECK_CURRENT_CACHED_VALUES(2); assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); _PyStackRef _stack_item_0 = _tos_cache0; _PyStackRef _stack_item_1 = _tos_cache1; uint32_t version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *code = PyStackRef_AsPyObjectBorrow(frame->f_executable); - if (code == Py_None) { - UOP_STAT_INC(uopcode, miss); - _tos_cache1 = _stack_item_1; - _tos_cache0 = _stack_item_0; - SET_CURRENT_CACHED_VALUES(2); - JUMP_TO_JUMP_TARGET(); - } + assert(PyCode_Check(code)); if (((PyCodeObject *)code)->co_version != version) { UOP_STAT_INC(uopcode, miss); _tos_cache1 = _stack_item_1; @@ -20256,7 +20241,7 @@ break; } - case _GUARD_CODE_r33: { + case _GUARD_CODE_VERSION_r33: { CHECK_CURRENT_CACHED_VALUES(3); assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); _PyStackRef _stack_item_0 = _tos_cache0; @@ -20264,14 +20249,7 @@ _PyStackRef _stack_item_2 = _tos_cache2; uint32_t version = (uint32_t)CURRENT_OPERAND0_32(); PyObject *code = PyStackRef_AsPyObjectBorrow(frame->f_executable); - if (code == Py_None) { - UOP_STAT_INC(uopcode, miss); - _tos_cache2 = _stack_item_2; - _tos_cache1 = _stack_item_1; - _tos_cache0 = _stack_item_0; - SET_CURRENT_CACHED_VALUES(3); - JUMP_TO_JUMP_TARGET(); - } + 
assert(PyCode_Check(code)); if (((PyCodeObject *)code)->co_version != version) { UOP_STAT_INC(uopcode, miss); _tos_cache2 = _stack_item_2; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 37fa6d679190dd..044c5efea24add 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -5675,7 +5675,8 @@ assert(executor->vm_data.code == code); assert(executor->vm_data.valid); assert(tstate->current_executor == NULL); - if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { + uintptr_t iversion = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); + if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) != iversion) { opcode = executor->vm_data.opcode; oparg = (oparg & ~255) | executor->vm_data.oparg; next_instr = this_instr; diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 28bbe1d82a3b88..b074d23277878b 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -1785,7 +1785,6 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) _PyCode_Clear_Executors(code); } _Py_Executors_InvalidateDependency(interp, code, 1); - _PyJit_Tracer_InvalidateDependency(PyThreadState_GET(), code); #endif int code_len = (int)Py_SIZE(code); /* Exit early to avoid creating instrumentation @@ -2115,6 +2114,9 @@ int _PyMonitoring_ClearToolId(int tool_id) // Set the new global version so all the code objects can refresh the // instrumentation. set_global_version(_PyThreadState_GET(), version); +#ifdef _Py_TIER2 + _Py_Executors_InvalidateAll(interp, 1); +#endif int res = instrument_all_executing_code_objects(interp); _PyEval_StartTheWorld(interp); return res; @@ -2457,6 +2459,9 @@ monitoring_restart_events_impl(PyObject *module) } interp->last_restart_version = restart_version; set_global_version(tstate, new_version); +#ifdef _Py_TIER2 + _Py_Executors_InvalidateAll(interp, 1); +#endif int res = instrument_all_executing_code_objects(interp); _PyEval_StartTheWorld(interp); diff --git a/Python/optimizer.c b/Python/optimizer.c index 466729b158d345..f075e28d71e0f8 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -160,11 +160,6 @@ _PyOptimizer_Optimize( interp->compiling = false; return 0; } - // One of our dependencies while tracing was invalidated. Not worth compiling. - if (!_tstate->jit_tracer_state->prev_state.dependencies_still_valid) { - interp->compiling = false; - return 0; - } _PyExecutorObject *executor; int err = uop_optimize(frame, tstate, &executor, progress_needed); if (err <= 0) { @@ -615,7 +610,6 @@ _PyJit_translate_single_bytecode_to_trace( _PyJitTracerState *tracer = _tstate->jit_tracer_state; PyCodeObject *old_code = tracer->prev_state.instr_code; bool progress_needed = (tracer->initial_state.chain_depth % MAX_CHAIN_DEPTH) == 0; - _PyBloomFilter *dependencies = &tracer->prev_state.dependencies; _PyJitUopBuffer *trace = &tracer->code_buffer; _Py_CODEUNIT *this_instr = tracer->prev_state.instr; @@ -701,10 +695,6 @@ _PyJit_translate_single_bytecode_to_trace( } #endif - if (!tracer->prev_state.dependencies_still_valid) { - goto done; - } - // This happens when a recursive call happens that we can't trace. Such as Python -> C -> Python calls // If we haven't guarded the IP, then it's untraceable. if (frame != tracer->prev_state.instr_frame && !needs_guard_ip) { @@ -784,11 +774,6 @@ _PyJit_translate_single_bytecode_to_trace( ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)target_instr, target); } - // Can be NULL for the entry frame. 
- if (old_code != NULL) { - _Py_BloomFilter_Add(dependencies, old_code); - } - switch (opcode) { case POP_JUMP_IF_NONE: case POP_JUMP_IF_NOT_NONE: @@ -925,15 +910,6 @@ _PyJit_translate_single_bytecode_to_trace( expansion->uops[i].offset); Py_FatalError("garbled expansion"); } - if (uop == _PUSH_FRAME || uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) { - PyCodeObject *new_code = (PyCodeObject *)PyStackRef_AsPyObjectBorrow(frame->f_executable); - if (new_code != NULL && !Py_IsNone((PyObject*)new_code)) { - _Py_BloomFilter_Add(dependencies, new_code); - } - ADD_TO_TRACE(uop, oparg, operand, target); - uop_buffer_last(trace)->operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame))); - break; - } if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) { assert(i + 1 == nuops); _Py_CODEUNIT *next = target_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]]; @@ -964,7 +940,10 @@ _PyJit_translate_single_bytecode_to_trace( ADD_TO_TRACE(_RECORD_CODE, 0, (uintptr_t)code, 0); ADD_TO_TRACE(guard_ip, 0, (uintptr_t)next_instr, 0); if (PyCode_Check(code)) { - ADD_TO_TRACE(_GUARD_CODE, 0, ((PyCodeObject *)code)->co_version, 0); + /* Record stack depth, in operand1 */ + int stack_depth = (int)(frame->stackpointer - _PyFrame_Stackbase(frame)); + uop_buffer_last(trace)->operand1 = stack_depth; + ADD_TO_TRACE(_GUARD_CODE_VERSION, 0, ((PyCodeObject *)code)->co_version, 0); } } // Loop back to the start @@ -1046,7 +1025,6 @@ _PyJit_TryInitializeTracing( tracer->initial_state.exit = exit; tracer->initial_state.stack_depth = (int)(stack_pointer - _PyFrame_Stackbase(frame)); tracer->initial_state.chain_depth = chain_depth; - tracer->prev_state.dependencies_still_valid = true; tracer->prev_state.instr_code = (PyCodeObject *)Py_NewRef(_PyFrame_GetCode(frame)); tracer->prev_state.instr = curr_instr; tracer->prev_state.instr_frame = frame; @@ -1064,7 +1042,6 @@ _PyJit_TryInitializeTracing( if (_PyOpcode_Caches[_PyOpcode_Deopt[close_loop_instr->op.code]]) { close_loop_instr[1].counter = trigger_backoff_counter(); } - _Py_BloomFilter_Init(&tracer->prev_state.dependencies); tracer->is_tracing = true; return 1; } @@ -1216,7 +1193,7 @@ prepare_for_execution(_PyUOpInstruction *buffer, int length) base_opcode == _GUARD_IP_RETURN_VALUE || base_opcode == _GUARD_IP_YIELD_VALUE || base_opcode == _GUARD_IP_RETURN_GENERATOR || - base_opcode == _GUARD_CODE + base_opcode == _GUARD_CODE_VERSION ) { base_exit_op = _DYNAMIC_EXIT; } @@ -1498,7 +1475,6 @@ uop_optimize( { _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; assert(_tstate->jit_tracer_state != NULL); - _PyBloomFilter *dependencies = &_tstate->jit_tracer_state->prev_state.dependencies; _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer.start; OPT_STAT_INC(attempts); bool is_noopt = !tstate->interp->opt_config.uops_optimize_enabled; @@ -1510,11 +1486,15 @@ uop_optimize( assert(length > 0); assert(length < UOP_MAX_TRACE_LENGTH); OPT_STAT_INC(traces_created); + + _PyBloomFilter dependencies; + _Py_BloomFilter_Init(&dependencies); if (!is_noopt) { _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[UOP_MAX_TRACE_LENGTH]; length = _Py_uop_analyze_and_optimize( _tstate, buffer, length, curr_stackentries, - output, dependencies); + output, &dependencies); + if (length <= 0) { return length; } @@ -1546,7 +1526,7 @@ uop_optimize( length = prepare_for_execution(buffer, length); assert(length <= UOP_MAX_TRACE_LENGTH); _PyExecutorObject *executor = make_executor_from_uops( - 
_tstate, buffer, length, dependencies); + _tstate, buffer, length, &dependencies); if (executor == NULL) { return -1; } @@ -1861,21 +1841,6 @@ _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is _Py_Executors_InvalidateAll(interp, is_invalidation); } -void -_PyJit_Tracer_InvalidateDependency(PyThreadState *tstate, void *obj) -{ - _PyBloomFilter obj_filter; - _Py_BloomFilter_Init(&obj_filter); - _Py_BloomFilter_Add(&obj_filter, obj); - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; - if (_tstate->jit_tracer_state == NULL) { - return; - } - if (bloom_filter_may_contain(&_tstate->jit_tracer_state->prev_state.dependencies, &obj_filter)) - { - _tstate->jit_tracer_state->prev_state.dependencies_still_valid = false; - } -} /* Invalidate all executors */ void _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation) diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index c6a513ad220b63..45dd42c96064bc 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -55,23 +55,21 @@ static void dump_abstract_stack(_Py_UOpsAbstractFrame *frame, JitOptRef *stack_pointer) { - JitOptRef *stack_base = frame->stack; - JitOptRef *locals_base = frame->locals; printf(" locals=["); - for (JitOptRef *ptr = locals_base; ptr < stack_base; ptr++) { - if (ptr != locals_base) { + for (int i = 0 ; i < frame->locals_len; i++) { + if (i > 0) { printf(", "); } - _PyUOpSymPrint(*ptr); + _PyUOpSymPrint(frame->locals[i]); } printf("]\n"); - if (stack_pointer < stack_base) { - printf(" stack=%d\n", (int)(stack_pointer - stack_base)); + if (stack_pointer < frame->stack) { + printf(" stack=%d\n", (int)(stack_pointer - frame->stack)); } else { printf(" stack=["); - for (JitOptRef *ptr = stack_base; ptr < stack_pointer; ptr++) { - if (ptr != stack_base) { + for (JitOptRef *ptr = frame->stack; ptr < stack_pointer; ptr++) { + if (ptr != frame->stack) { printf(", "); } _PyUOpSymPrint(*ptr); @@ -291,6 +289,7 @@ add_op(JitOptContext *ctx, _PyUOpInstruction *this_instr, #define sym_set_recorded_gen_func(SYM, VAL) _Py_uop_sym_set_recorded_gen_func(ctx, SYM, VAL) #define sym_get_probable_func_code _Py_uop_sym_get_probable_func_code #define sym_get_probable_value _Py_uop_sym_get_probable_value +#define sym_set_stack_depth(DEPTH, SP) _Py_uop_sym_set_stack_depth(ctx, DEPTH, SP) /* Comparison oparg masks */ #define COMPARE_LT_MASK 2 @@ -473,14 +472,15 @@ optimize_uops( interp->type_watchers[TYPE_WATCHER_ID] = type_watcher_callback; } - _Py_uop_abstractcontext_init(ctx); - _Py_UOpsAbstractFrame *frame = _Py_uop_frame_new(ctx, (PyCodeObject *)func->func_code, curr_stacklen, NULL, 0); + _Py_uop_abstractcontext_init(ctx, dependencies); + _Py_UOpsAbstractFrame *frame = _Py_uop_frame_new(ctx, (PyCodeObject *)func->func_code, NULL, 0); if (frame == NULL) { return 0; } frame->func = func; ctx->curr_frame_depth++; ctx->frame = frame; + _Py_uop_sym_set_stack_depth(ctx, curr_stacklen, frame->stack_pointer); _PyUOpInstruction *this_instr = NULL; JitOptRef *stack_pointer = ctx->frame->stack_pointer; @@ -718,8 +718,7 @@ _Py_uop_analyze_and_optimize( OPT_STAT_INC(optimizer_attempts); length = optimize_uops( - tstate, buffer, length, curr_stacklen, - output, dependencies); + tstate, buffer, length, curr_stacklen, output, dependencies); if (length == 0) { return length; diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c index 2b35628ad99999..a37402975eb222 100644 --- a/Python/optimizer_bytecodes.c +++ b/Python/optimizer_bytecodes.c @@ 
-46,6 +46,7 @@ typedef struct _Py_UOpsAbstractFrame _Py_UOpsAbstractFrame; #define sym_set_recorded_gen_func(SYM, VAL) _Py_uop_sym_set_recorded_gen_func(ctx, SYM, VAL) #define sym_get_probable_func_code _Py_uop_sym_get_probable_func_code #define sym_get_probable_value _Py_uop_sym_get_probable_value +#define sym_set_stack_depth(DEPTH, SP) _Py_uop_sym_set_stack_depth(ctx, DEPTH, SP) extern int optimize_to_bool( @@ -362,7 +363,7 @@ dummy_func(void) { } op(_BINARY_OP_SUBSCR_INIT_CALL, (container, sub, getitem -- new_frame)) { - _Py_UOpsAbstractFrame *f = frame_new_from_symbol(ctx, getitem, 0, NULL, 0); + _Py_UOpsAbstractFrame *f = frame_new_from_symbol(ctx, getitem, NULL, 0); if (f == NULL) { break; } @@ -833,7 +834,7 @@ dummy_func(void) { // + 1 for _SAVE_RETURN_OFFSET // FIX ME -- This needs a version check and function watcher PyCodeObject *co = (PyCodeObject *)((PyFunctionObject *)fget)->func_code; - _Py_UOpsAbstractFrame *f = frame_new(ctx, co, 0, NULL, 0); + _Py_UOpsAbstractFrame *f = frame_new(ctx, co, NULL, 0); if (f == NULL) { break; } @@ -894,9 +895,9 @@ dummy_func(void) { } if (sym_is_null(self_or_null) || sym_is_not_null(self_or_null)) { - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, args, argcount)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, args, argcount)); } else { - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, NULL, 0)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0)); } } @@ -907,15 +908,15 @@ dummy_func(void) { } op(_PY_FRAME_GENERAL, (callable, self_or_null, args[oparg] -- new_frame)) { - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, NULL, 0)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0)); } op(_PY_FRAME_KW, (callable, self_or_null, args[oparg], kwnames -- new_frame)) { - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, NULL, 0)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0)); } op(_PY_FRAME_EX, (func_st, null, callargs_st, kwargs_st -- ex_frame)) { - ex_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, func_st, 0, NULL, 0)); + ex_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, func_st, NULL, 0)); } op(_CHECK_AND_ALLOCATE_OBJECT, (type_version/2, callable, self_or_null, args[oparg] -- callable, self_or_null, args[oparg])) { @@ -927,18 +928,18 @@ dummy_func(void) { op(_CREATE_INIT_FRAME, (init, self, args[oparg] -- init_frame)) { ctx->frame->stack_pointer = stack_pointer - oparg - 2; - _Py_UOpsAbstractFrame *shim = frame_new(ctx, (PyCodeObject *)&_Py_InitCleanup, 0, NULL, 0); + _Py_UOpsAbstractFrame *shim = frame_new(ctx, (PyCodeObject *)&_Py_InitCleanup, NULL, 0); if (shim == NULL) { break; } /* Push self onto stack of shim */ - shim->stack[0] = self; + shim->stack_pointer[0] = self; shim->stack_pointer++; assert((int)(shim->stack_pointer - shim->stack) == 1); ctx->frame = shim; ctx->curr_frame_depth++; assert((this_instr + 1)->opcode == _PUSH_FRAME); - init_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, init, 0, args-1, oparg+1)); + init_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, init, args-1, oparg+1)); } op(_RETURN_VALUE, (retval -- res)) { @@ -954,15 +955,7 @@ dummy_func(void) { ctx->done = true; break; } - int returning_stacklevel = (int)this_instr->operand1; - if (ctx->curr_frame_depth >= 2) { - PyCodeObject *expected_code = ctx->frames[ctx->curr_frame_depth - 2].code; - if (expected_code == 
returning_code) { - assert(this_instr[2].opcode == _GUARD_IP_RETURN_VALUE); - REPLACE_OP((this_instr + 2), _NOP, 0, 0); - } - } - if (frame_pop(ctx, returning_code, returning_stacklevel)) { + if (frame_pop(ctx, returning_code)) { break; } stack_pointer = ctx->frame->stack_pointer; @@ -976,14 +969,12 @@ dummy_func(void) { ctx->frame->stack_pointer = stack_pointer; assert(this_instr[1].opcode == _RECORD_CODE); PyCodeObject *returning_code = (PyCodeObject *)this_instr[1].operand0; - assert(PyCode_Check(returning_code)); if (returning_code == NULL) { ctx->done = true; break; } - _Py_BloomFilter_Add(dependencies, returning_code); - int returning_stacklevel = (int)this_instr->operand1; - if (frame_pop(ctx, returning_code, returning_stacklevel)) { + assert(PyCode_Check(returning_code)); + if (frame_pop(ctx, returning_code)) { break; } stack_pointer = ctx->frame->stack_pointer; @@ -998,14 +989,12 @@ dummy_func(void) { ctx->frame->stack_pointer = stack_pointer; assert(this_instr[1].opcode == _RECORD_CODE); PyCodeObject *returning_code = (PyCodeObject *)this_instr[1].operand0; - assert(PyCode_Check(returning_code)); if (returning_code == NULL) { ctx->done = true; break; } - _Py_BloomFilter_Add(dependencies, returning_code); - int returning_stacklevel = (int)this_instr->operand1; - if (frame_pop(ctx, returning_code, returning_stacklevel)) { + assert(PyCode_Check(returning_code)); + if (frame_pop(ctx, returning_code)) { break; } stack_pointer = ctx->frame->stack_pointer; @@ -1025,22 +1014,24 @@ dummy_func(void) { } op(_FOR_ITER_GEN_FRAME, (iter, unused -- iter, unused, gen_frame)) { - _Py_UOpsAbstractFrame *new_frame = frame_new_from_symbol(ctx, iter, 1, NULL, 0); + _Py_UOpsAbstractFrame *new_frame = frame_new_from_symbol(ctx, iter, NULL, 0); if (new_frame == NULL) { ctx->done = true; break; } - new_frame->stack[0] = sym_new_const(ctx, Py_None); + new_frame->stack_pointer[0] = sym_new_const(ctx, Py_None); + new_frame->stack_pointer++; gen_frame = PyJitRef_WrapInvalid(new_frame); } op(_SEND_GEN_FRAME, (receiver, v -- receiver, gen_frame)) { - _Py_UOpsAbstractFrame *new_frame = frame_new_from_symbol(ctx, receiver, 1, NULL, 0); + _Py_UOpsAbstractFrame *new_frame = frame_new_from_symbol(ctx, receiver, NULL, 0); if (new_frame == NULL) { ctx->done = true; break; } - new_frame->stack[0] = PyJitRef_StripReferenceInfo(v); + new_frame->stack_pointer[0] = PyJitRef_StripReferenceInfo(v); + new_frame->stack_pointer++; gen_frame = PyJitRef_WrapInvalid(new_frame); } @@ -1062,14 +1053,10 @@ dummy_func(void) { if (!CURRENT_FRAME_IS_INIT_SHIM()) { ctx->frame->stack_pointer = stack_pointer; } + ctx->frame->caller = true; ctx->frame = (_Py_UOpsAbstractFrame *)PyJitRef_Unwrap(new_frame); ctx->curr_frame_depth++; stack_pointer = ctx->frame->stack_pointer; - // Fixed calls don't need IP guards. - if ((this_instr-1)->opcode == _CREATE_INIT_FRAME) { - assert((this_instr+1)->opcode == _GUARD_IP__PUSH_FRAME); - REPLACE_OP(this_instr+1, _NOP, 0, 0); - } assert(ctx->frame->locals != NULL); } @@ -1653,6 +1640,50 @@ dummy_func(void) { sym_set_recorded_gen_func(nos, func); } + op(_GUARD_IP__PUSH_FRAME, (ip/4 --)) { + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); + // TO DO + // Normal function calls to a known functions + // do not need an IP guard. 
+ } + + op(_GUARD_CODE_VERSION, (version/2 -- )) { + PyCodeObject *co = get_current_code_object(ctx); + if (co->co_version == version) { + _Py_BloomFilter_Add(dependencies, co); + REPLACE_OP(this_instr, _NOP, 0, 0); + } + else { + ctx->done = true; + } + } + + op(_GUARD_IP_YIELD_VALUE, (ip/4 --)) { + if (ctx->frame->caller) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); + assert(stack_pointer != NULL); + } + + op(_GUARD_IP_RETURN_VALUE, (ip/4 --)) { + if (ctx->frame->caller) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); + assert(stack_pointer != NULL); + } + + op(_GUARD_IP_RETURN_GENERATOR, (ip/4 --)) { + if (ctx->frame->caller) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); + assert(stack_pointer != NULL); + } + + + // END BYTECODES // } diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 7faa699a058249..c2b9f5aae2d9a3 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -1162,7 +1162,7 @@ getitem = stack_pointer[-1]; sub = stack_pointer[-2]; container = stack_pointer[-3]; - _Py_UOpsAbstractFrame *f = frame_new_from_symbol(ctx, getitem, 0, NULL, 0); + _Py_UOpsAbstractFrame *f = frame_new_from_symbol(ctx, getitem, NULL, 0); if (f == NULL) { break; } @@ -1272,15 +1272,7 @@ ctx->done = true; break; } - int returning_stacklevel = (int)this_instr->operand1; - if (ctx->curr_frame_depth >= 2) { - PyCodeObject *expected_code = ctx->frames[ctx->curr_frame_depth - 2].code; - if (expected_code == returning_code) { - assert(this_instr[2].opcode == _GUARD_IP_RETURN_VALUE); - REPLACE_OP((this_instr + 2), _NOP, 0, 0); - } - } - if (frame_pop(ctx, returning_code, returning_stacklevel)) { + if (frame_pop(ctx, returning_code)) { break; } stack_pointer = ctx->frame->stack_pointer; @@ -1324,12 +1316,13 @@ JitOptRef gen_frame; v = stack_pointer[-1]; receiver = stack_pointer[-2]; - _Py_UOpsAbstractFrame *new_frame = frame_new_from_symbol(ctx, receiver, 1, NULL, 0); + _Py_UOpsAbstractFrame *new_frame = frame_new_from_symbol(ctx, receiver, NULL, 0); if (new_frame == NULL) { ctx->done = true; break; } - new_frame->stack[0] = PyJitRef_StripReferenceInfo(v); + new_frame->stack_pointer[0] = PyJitRef_StripReferenceInfo(v); + new_frame->stack_pointer++; gen_frame = PyJitRef_WrapInvalid(new_frame); stack_pointer[-1] = gen_frame; break; @@ -1346,14 +1339,12 @@ ctx->frame->stack_pointer = stack_pointer; assert(this_instr[1].opcode == _RECORD_CODE); PyCodeObject *returning_code = (PyCodeObject *)this_instr[1].operand0; - assert(PyCode_Check(returning_code)); if (returning_code == NULL) { ctx->done = true; break; } - _Py_BloomFilter_Add(dependencies, returning_code); - int returning_stacklevel = (int)this_instr->operand1; - if (frame_pop(ctx, returning_code, returning_stacklevel)) { + assert(PyCode_Check(returning_code)); + if (frame_pop(ctx, returning_code)) { break; } stack_pointer = ctx->frame->stack_pointer; @@ -2011,7 +2002,7 @@ owner = stack_pointer[-1]; PyObject *fget = (PyObject *)this_instr->operand0; PyCodeObject *co = (PyCodeObject *)((PyFunctionObject *)fget)->func_code; - _Py_UOpsAbstractFrame *f = frame_new(ctx, co, 0, NULL, 0); + _Py_UOpsAbstractFrame *f = frame_new(ctx, co, NULL, 0); if (f == NULL) { break; } @@ -2711,12 +2702,13 @@ JitOptRef iter; JitOptRef gen_frame; iter = stack_pointer[-2]; - _Py_UOpsAbstractFrame *new_frame = 
frame_new_from_symbol(ctx, iter, 1, NULL, 0); + _Py_UOpsAbstractFrame *new_frame = frame_new_from_symbol(ctx, iter, NULL, 0); if (new_frame == NULL) { ctx->done = true; break; } - new_frame->stack[0] = sym_new_const(ctx, Py_None); + new_frame->stack_pointer[0] = sym_new_const(ctx, Py_None); + new_frame->stack_pointer++; gen_frame = PyJitRef_WrapInvalid(new_frame); CHECK_STACK_BOUNDS(1); stack_pointer[0] = gen_frame; @@ -2897,7 +2889,7 @@ JitOptRef callable; JitOptRef new_frame; callable = stack_pointer[-2 - oparg]; - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, NULL, 0)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0)); CHECK_STACK_BOUNDS(-1 - oparg); stack_pointer[-2 - oparg] = new_frame; stack_pointer += -1 - oparg; @@ -3033,9 +3025,9 @@ argcount++; } if (sym_is_null(self_or_null) || sym_is_not_null(self_or_null)) { - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, args, argcount)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, args, argcount)); } else { - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, NULL, 0)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0)); } CHECK_STACK_BOUNDS(-1 - oparg); stack_pointer[-2 - oparg] = new_frame; @@ -3053,13 +3045,10 @@ if (!CURRENT_FRAME_IS_INIT_SHIM()) { ctx->frame->stack_pointer = stack_pointer; } + ctx->frame->caller = true; ctx->frame = (_Py_UOpsAbstractFrame *)PyJitRef_Unwrap(new_frame); ctx->curr_frame_depth++; stack_pointer = ctx->frame->stack_pointer; - if ((this_instr-1)->opcode == _CREATE_INIT_FRAME) { - assert((this_instr+1)->opcode == _GUARD_IP__PUSH_FRAME); - REPLACE_OP(this_instr+1, _NOP, 0, 0); - } assert(ctx->frame->locals != NULL); break; } @@ -3213,17 +3202,17 @@ self = stack_pointer[-1 - oparg]; init = stack_pointer[-2 - oparg]; ctx->frame->stack_pointer = stack_pointer - oparg - 2; - _Py_UOpsAbstractFrame *shim = frame_new(ctx, (PyCodeObject *)&_Py_InitCleanup, 0, NULL, 0); + _Py_UOpsAbstractFrame *shim = frame_new(ctx, (PyCodeObject *)&_Py_InitCleanup, NULL, 0); if (shim == NULL) { break; } - shim->stack[0] = self; + shim->stack_pointer[0] = self; shim->stack_pointer++; assert((int)(shim->stack_pointer - shim->stack) == 1); ctx->frame = shim; ctx->curr_frame_depth++; assert((this_instr + 1)->opcode == _PUSH_FRAME); - init_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, init, 0, args-1, oparg+1)); + init_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, init, args-1, oparg+1)); CHECK_STACK_BOUNDS(-1 - oparg); stack_pointer[-2 - oparg] = init_frame; stack_pointer += -1 - oparg; @@ -3500,7 +3489,7 @@ JitOptRef callable; JitOptRef new_frame; callable = stack_pointer[-3 - oparg]; - new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, 0, NULL, 0)); + new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0)); CHECK_STACK_BOUNDS(-2 - oparg); stack_pointer[-3 - oparg] = new_frame; stack_pointer += -2 - oparg; @@ -3548,7 +3537,7 @@ JitOptRef func_st; JitOptRef ex_frame; func_st = stack_pointer[-4]; - ex_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, func_st, 0, NULL, 0)); + ex_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, func_st, NULL, 0)); CHECK_STACK_BOUNDS(-3); stack_pointer[-4] = ex_frame; stack_pointer += -3; @@ -3592,14 +3581,12 @@ ctx->frame->stack_pointer = stack_pointer; assert(this_instr[1].opcode == _RECORD_CODE); PyCodeObject *returning_code = (PyCodeObject *)this_instr[1].operand0; - 
assert(PyCode_Check(returning_code)); if (returning_code == NULL) { ctx->done = true; break; } - _Py_BloomFilter_Add(dependencies, returning_code); - int returning_stacklevel = (int)this_instr->operand1; - if (frame_pop(ctx, returning_code, returning_stacklevel)) { + assert(PyCode_Check(returning_code)); + if (frame_pop(ctx, returning_code)) { break; } stack_pointer = ctx->frame->stack_pointer; @@ -4157,23 +4144,52 @@ break; } - case _GUARD_CODE: { + case _GUARD_CODE_VERSION: { + uint32_t version = (uint32_t)this_instr->operand0; + PyCodeObject *co = get_current_code_object(ctx); + if (co->co_version == version) { + _Py_BloomFilter_Add(dependencies, co); + REPLACE_OP(this_instr, _NOP, 0, 0); + } + else { + ctx->done = true; + } break; } case _GUARD_IP__PUSH_FRAME: { + PyObject *ip = (PyObject *)this_instr->operand0; + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); break; } case _GUARD_IP_YIELD_VALUE: { + PyObject *ip = (PyObject *)this_instr->operand0; + if (ctx->frame->caller) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); + assert(stack_pointer != NULL); break; } case _GUARD_IP_RETURN_VALUE: { + PyObject *ip = (PyObject *)this_instr->operand0; + if (ctx->frame->caller) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); + assert(stack_pointer != NULL); break; } case _GUARD_IP_RETURN_GENERATOR: { + PyObject *ip = (PyObject *)this_instr->operand0; + if (ctx->frame->caller) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); + assert(stack_pointer != NULL); break; } diff --git a/Python/optimizer_symbols.c b/Python/optimizer_symbols.c index 635ce622c3c589..46fef55f7b06df 100644 --- a/Python/optimizer_symbols.c +++ b/Python/optimizer_symbols.c @@ -1284,7 +1284,6 @@ _Py_UOpsAbstractFrame * _Py_uop_frame_new_from_symbol( JitOptContext *ctx, JitOptRef callable, - int curr_stackentries, JitOptRef *args, int arg_len) { @@ -1293,7 +1292,7 @@ _Py_uop_frame_new_from_symbol( ctx->done = true; return NULL; } - _Py_UOpsAbstractFrame *frame = _Py_uop_frame_new(ctx, co, curr_stackentries, args, arg_len); + _Py_UOpsAbstractFrame *frame = _Py_uop_frame_new(ctx, co, args, arg_len); if (frame == NULL) { return NULL; } @@ -1311,7 +1310,6 @@ _Py_UOpsAbstractFrame * _Py_uop_frame_new( JitOptContext *ctx, PyCodeObject *co, - int curr_stackentries, JitOptRef *args, int arg_len) { @@ -1324,17 +1322,21 @@ _Py_uop_frame_new( } _Py_UOpsAbstractFrame *frame = &ctx->frames[ctx->curr_frame_depth]; frame->code = co; - frame->stack_len = co->co_stacksize; + + frame->locals = ctx->locals.used; + ctx->locals.used += co->co_nlocalsplus; frame->locals_len = co->co_nlocalsplus; - frame->locals = ctx->n_consumed; - frame->stack = frame->locals + co->co_nlocalsplus; - frame->stack_pointer = frame->stack + curr_stackentries; + frame->stack = ctx->stack.used; + ctx->stack.used += co->co_stacksize; + frame->stack_len = co->co_stacksize; + + frame->stack_pointer = frame->stack; frame->globals_checked_version = 0; frame->globals_watched = false; frame->func = NULL; - ctx->n_consumed = ctx->n_consumed + (co->co_nlocalsplus + co->co_stacksize); - if (ctx->n_consumed >= ctx->limit) { + frame->caller = false; + if (ctx->locals.used > ctx->locals.end || ctx->stack.used > ctx->stack.end) { ctx->done = true; ctx->out_of_space = true; return NULL; @@ -1354,16 +1356,40 @@ _Py_uop_frame_new( frame->locals[i] = 
local; } - // Initialize the stack as well - for (int i = 0; i < curr_stackentries; i++) { - JitOptRef stackvar = _Py_uop_sym_new_unknown(ctx); - frame->stack[i] = stackvar; - } + /* Most optimizations rely on code objects being immutable (including sys._getframe modifications), + * and up to date for instrumentation. */ + _Py_BloomFilter_Add(ctx->dependencies, co); assert(frame->locals != NULL); return frame; } +JitOptRef * +_Py_uop_sym_set_stack_depth(JitOptContext *ctx, int stack_depth, JitOptRef *current_sp) { + _Py_UOpsAbstractFrame *frame = ctx->frame; + assert(frame->stack != NULL); + JitOptRef *new_stack_pointer = frame->stack + stack_depth; + if (current_sp > new_stack_pointer) { + ctx->done = true; + ctx->contradiction = "inconsistent recorded stack depths"; + return NULL; + } + int delta = (int)(new_stack_pointer - current_sp); + assert(delta >= 0); + if (delta) { + /* Shift existing stack elements up */ + for (JitOptRef *p = current_sp-1; p >= frame->stack; p--) { + p[delta] = *p; + } + /* Fill rest of stack with unknowns */ + for (int i = 0; i < delta; i++) { + frame->stack[i] = _Py_uop_sym_new_unknown(ctx); + } + } + return frame->stack_pointer = new_stack_pointer; +} + + void _Py_uop_abstractcontext_fini(JitOptContext *ctx) { @@ -1381,14 +1407,20 @@ _Py_uop_abstractcontext_fini(JitOptContext *ctx) } void -_Py_uop_abstractcontext_init(JitOptContext *ctx) +_Py_uop_abstractcontext_init(JitOptContext *ctx, _PyBloomFilter *dependencies) { static_assert(sizeof(JitOptSymbol) <= 3 * sizeof(uint64_t), "JitOptSymbol has grown"); - ctx->limit = ctx->locals_and_stack + MAX_ABSTRACT_INTERP_SIZE; - ctx->n_consumed = ctx->locals_and_stack; + + ctx->stack.used = ctx->stack_array; + ctx->stack.end = &ctx->stack_array[ABSTRACT_INTERP_STACK_SIZE]; + ctx->locals.used = ctx->locals_array; + ctx->locals.end = &ctx->locals_array[ABSTRACT_INTERP_LOCALS_SIZE]; #ifdef Py_DEBUG // Aids debugging a little. There should never be NULL in the abstract interpreter. - for (int i = 0 ; i < MAX_ABSTRACT_INTERP_SIZE; i++) { - ctx->locals_and_stack[i] = PyJitRef_NULL; + for (int i = 0 ; i < ABSTRACT_INTERP_STACK_SIZE; i++) { + ctx->stack_array[i] = PyJitRef_NULL; + } + for (int i = 0 ; i < ABSTRACT_INTERP_LOCALS_SIZE; i++) { + ctx->locals_array[i] = PyJitRef_NULL; } #endif @@ -1406,13 +1438,15 @@ _Py_uop_abstractcontext_init(JitOptContext *ctx) ctx->out_of_space = false; ctx->contradiction = false; ctx->builtins_watched = false; + ctx->dependencies = dependencies; } int -_Py_uop_frame_pop(JitOptContext *ctx, PyCodeObject *co, int curr_stackentries) +_Py_uop_frame_pop(JitOptContext *ctx, PyCodeObject *co) { _Py_UOpsAbstractFrame *frame = ctx->frame; - ctx->n_consumed = frame->locals; + ctx->stack.used = frame->stack; + ctx->locals.used = frame->locals; ctx->curr_frame_depth--; @@ -1436,9 +1470,7 @@ _Py_uop_frame_pop(JitOptContext *ctx, PyCodeObject *co, int curr_stackentries) // Else: trace stack underflow. // This handles swapping out frames. - assert(curr_stackentries >= 1); - // -1 to stackentries as we push to the stack our return value after this. 
- _Py_UOpsAbstractFrame *new_frame = _Py_uop_frame_new(ctx, co, curr_stackentries - 1, NULL, 0); + _Py_UOpsAbstractFrame *new_frame = _Py_uop_frame_new(ctx, co, NULL, 0); if (new_frame == NULL) { ctx->done = true; return 1; @@ -1474,7 +1506,7 @@ _Py_uop_symbols_test(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(ignored)) { JitOptContext context; JitOptContext *ctx = &context; - _Py_uop_abstractcontext_init(ctx); + _Py_uop_abstractcontext_init(ctx, NULL); PyObject *val_42 = NULL; PyObject *val_43 = NULL; PyObject *val_big = NULL; From a3054cda47d94a52438b1993abbfc9992da318ff Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Wed, 18 Feb 2026 18:32:36 +0000 Subject: [PATCH 2/3] Address review comments --- Python/optimizer_bytecodes.c | 5 +---- Python/optimizer_cases.c.h | 3 --- Python/optimizer_symbols.c | 18 +++++++++++++----- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c index a37402975eb222..228bd51a28bb69 100644 --- a/Python/optimizer_bytecodes.c +++ b/Python/optimizer_bytecodes.c @@ -1643,7 +1643,7 @@ dummy_func(void) { op(_GUARD_IP__PUSH_FRAME, (ip/4 --)) { stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); // TO DO - // Normal function calls to a known functions + // Normal function calls to known functions // do not need an IP guard. } @@ -1663,7 +1663,6 @@ dummy_func(void) { REPLACE_OP(this_instr, _NOP, 0, 0); } stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); - assert(stack_pointer != NULL); } op(_GUARD_IP_RETURN_VALUE, (ip/4 --)) { @@ -1671,7 +1670,6 @@ dummy_func(void) { REPLACE_OP(this_instr, _NOP, 0, 0); } stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); - assert(stack_pointer != NULL); } op(_GUARD_IP_RETURN_GENERATOR, (ip/4 --)) { @@ -1679,7 +1677,6 @@ dummy_func(void) { REPLACE_OP(this_instr, _NOP, 0, 0); } stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); - assert(stack_pointer != NULL); } diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index c2b9f5aae2d9a3..a93e85329297cd 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -4169,7 +4169,6 @@ REPLACE_OP(this_instr, _NOP, 0, 0); } stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); - assert(stack_pointer != NULL); break; } @@ -4179,7 +4178,6 @@ REPLACE_OP(this_instr, _NOP, 0, 0); } stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); - assert(stack_pointer != NULL); break; } @@ -4189,7 +4187,6 @@ REPLACE_OP(this_instr, _NOP, 0, 0); } stack_pointer = sym_set_stack_depth(this_instr->operand1, stack_pointer); - assert(stack_pointer != NULL); break; } diff --git a/Python/optimizer_symbols.c b/Python/optimizer_symbols.c index 46fef55f7b06df..dcbe093fd6d74c 100644 --- a/Python/optimizer_symbols.c +++ b/Python/optimizer_symbols.c @@ -1371,9 +1371,14 @@ _Py_uop_sym_set_stack_depth(JitOptContext *ctx, int stack_depth, JitOptRef *curr JitOptRef *new_stack_pointer = frame->stack + stack_depth; if (current_sp > new_stack_pointer) { ctx->done = true; - ctx->contradiction = "inconsistent recorded stack depths"; - return NULL; - } + ctx->contradiction = true; + return NULL; + } + if (new_stack_pointer > ctx->stack.end) { + ctx->done = true; + ctx->out_of_space = true; + return NULL; + } int delta = (int)(new_stack_pointer - current_sp); assert(delta >= 0); if (delta) { @@ -1406,15 +1411,18 @@ _Py_uop_abstractcontext_fini(JitOptContext *ctx) } } +// Leave a bit of space to push 
values before checking that there is space for a new frame +#define STACK_HEADROOM 2 + void _Py_uop_abstractcontext_init(JitOptContext *ctx, _PyBloomFilter *dependencies) { static_assert(sizeof(JitOptSymbol) <= 3 * sizeof(uint64_t), "JitOptSymbol has grown"); ctx->stack.used = ctx->stack_array; - ctx->stack.end = &ctx->stack_array[ABSTRACT_INTERP_STACK_SIZE]; + ctx->stack.end = &ctx->stack_array[ABSTRACT_INTERP_STACK_SIZE-STACK_HEADROOM]; ctx->locals.used = ctx->locals_array; - ctx->locals.end = &ctx->locals_array[ABSTRACT_INTERP_LOCALS_SIZE]; + ctx->locals.end = &ctx->locals_array[ABSTRACT_INTERP_LOCALS_SIZE-STACK_HEADROOM]; #ifdef Py_DEBUG // Aids debugging a little. There should never be NULL in the abstract interpreter. for (int i = 0 ; i < ABSTRACT_INTERP_STACK_SIZE; i++) { ctx->stack_array[i] = PyJitRef_NULL; From 47775faa37001f04447d23db108c36bd78d10669 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 20 Feb 2026 00:16:58 +0000 Subject: [PATCH 3/3] Skip optimizer test if optimizer is off --- Lib/test/test_capi/test_opt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 2cad53d9c0728b..7ac71fbfab1fe0 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -110,6 +110,7 @@ def f{n}(): for exe in executors[:i]: self.assertTrue(exe.is_valid()) + @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") def test_uop_optimizer_invalidation(self): # Generate a new function at each call ns = {}
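
The series hinges on CPython's bloom-filter dependency mechanism: PATCH 1 makes `_Py_uop_frame_new` add each frame's code object to `ctx->dependencies` during abstract interpretation (rather than having the tracer maintain a filter and a `dependencies_still_valid` flag), and `_Py_Executors_InvalidateDependency` later invalidates any executor whose filter may contain a mutated object. The sketch below is a self-contained illustration of that pattern only, not CPython's `_PyBloomFilter` implementation; the `BloomFilter`, `bloom_add`, and `bloom_may_contain` names and the hash mixing are invented for the example, and real filters would use more hash functions and bits.

/*
 * Minimal sketch (assumed names, not CPython's _PyBloomFilter): while
 * optimizing, each code object whose version guard is proven is added to a
 * per-trace filter; invalidating an object then kills every executor whose
 * filter may contain it. False positives are allowed, false negatives not.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FILTER_WORDS 4

typedef struct {
    uint64_t bits[FILTER_WORDS];
} BloomFilter;

/* Set a few bits derived from the object's address. */
static void
bloom_add(BloomFilter *f, const void *obj)
{
    uintptr_t h = (uintptr_t)obj;
    for (int i = 0; i < 3; i++) {
        h = h * 2654435761u + 1;   /* cheap multiplicative hash mixing */
        unsigned bit = (unsigned)(h % (FILTER_WORDS * 64));
        f->bits[bit / 64] |= (uint64_t)1 << (bit % 64);
    }
}

/* True if every bit of obj_filter is set in f (possible false positive). */
static bool
bloom_may_contain(const BloomFilter *f, const BloomFilter *obj_filter)
{
    for (int i = 0; i < FILTER_WORDS; i++) {
        if ((f->bits[i] & obj_filter->bits[i]) != obj_filter->bits[i]) {
            return false;
        }
    }
    return true;
}

int
main(void)
{
    int code_a, code_b;                 /* stand-ins for two code objects */
    BloomFilter trace_deps = {0};
    bloom_add(&trace_deps, &code_a);    /* optimizer proved a guard on code_a */

    BloomFilter invalidated = {0};
    bloom_add(&invalidated, &code_b);   /* code_b was mutated/instrumented */

    /* Executor survives: its dependency set cannot contain code_b. */
    printf("invalidate? %s\n",
           bloom_may_contain(&trace_deps, &invalidated) ? "yes" : "no");
    return 0;
}

Under this scheme the tracer no longer needs per-thread invalidation hooks (hence the removal of `_PyJit_Tracer_InvalidateDependency`): a stale dependency simply causes `_GUARD_CODE_VERSION` to fail during optimization, or invalidates the finished executor through the usual `_Py_Executors_InvalidateDependency` path.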