diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index a97b53028c8f59..363845106e40dc 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -283,6 +283,7 @@ PyAPI_FUNC(PyObject *) _PyEval_LoadName(PyThreadState *tstate, _PyInterpreterFra
 #define _PY_GC_SCHEDULED_BIT (1U << 4)
 #define _PY_EVAL_PLEASE_STOP_BIT (1U << 5)
 #define _PY_EVAL_EXPLICIT_MERGE_BIT (1U << 6)
+#define _PY_EVAL_JIT_INVALIDATE_COLD_BIT (1U << 7)  /* Set by _MAKE_WARM; _Py_HandlePending then calls _Py_Executors_InvalidateCold() */
 
 /* Reserve a few bits for future use */
 #define _PY_EVAL_EVENTS_BITS 8
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 36366429e8db25..a1898d926ac39f 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -261,7 +261,7 @@ struct _is {
     struct callable_cache callable_cache;
     _PyOptimizerObject *optimizer;
     _PyExecutorObject *executor_list_head;
-
+    size_t trace_run_counter;  // Incremented by _MAKE_WARM; reset to 0 once it exceeds JIT_CLEANUP_THRESHOLD.
     _rare_events rare_events;
     PyDict_WatchCallback builtins_dict_watcher;
 
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index 19e54bf122a8bb..f92c0a0cddf906 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -29,9 +29,10 @@ typedef struct {
 typedef struct {
     uint8_t opcode;
     uint8_t oparg;
-    uint16_t valid:1;
-    uint16_t linked:1;
-    uint16_t chain_depth:14;  // Must be big engough for MAX_CHAIN_DEPTH - 1.
+    uint8_t valid:1;
+    uint8_t linked:1;
+    uint8_t chain_depth:6;  // Must be big enough for MAX_CHAIN_DEPTH - 1.
+    bool warm;  // Set by _MAKE_WARM when the trace runs; cleared by _Py_Executors_InvalidateCold().
     int index;           // Index of ENTER_EXECUTOR (if code isn't NULL, below).
     _PyBloomFilter bloom;
     _PyExecutorLinkListNode links;
@@ -123,11 +124,18 @@ PyAPI_FUNC(PyObject *) _PyOptimizer_NewUOpOptimizer(void);
 #ifdef _Py_TIER2
 PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation);
 PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation);
+PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
+
 #else
 # define _Py_Executors_InvalidateDependency(A, B, C) ((void)0)
 # define _Py_Executors_InvalidateAll(A, B) ((void)0)
+# define _Py_Executors_InvalidateCold(A) ((void)0)
+
 #endif
 
+// Once trace_run_counter exceeds this value, _MAKE_WARM requests invalidation
+// of cold executors (via the eval breaker) and resets the counter to zero.
+#define JIT_CLEANUP_THRESHOLD 100000
 
 // This is the length of the trace we project initially.
 #define UOP_MAX_TRACE_LENGTH 800
diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h
index b950f760d74ac7..927dae88c1fa73 100644
--- a/Include/internal/pycore_uop_ids.h
+++ b/Include/internal/pycore_uop_ids.h
@@ -222,64 +222,65 @@ extern "C" {
 #define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD
 #define _MAKE_CELL MAKE_CELL
 #define _MAKE_FUNCTION MAKE_FUNCTION
+#define _MAKE_WARM 439
 #define _MAP_ADD MAP_ADD
 #define _MATCH_CLASS MATCH_CLASS
 #define _MATCH_KEYS MATCH_KEYS
 #define _MATCH_MAPPING MATCH_MAPPING
 #define _MATCH_SEQUENCE MATCH_SEQUENCE
-#define _MAYBE_EXPAND_METHOD 439
-#define _MONITOR_CALL 440
-#define _MONITOR_JUMP_BACKWARD 441
-#define _MONITOR_RESUME 442
+#define _MAYBE_EXPAND_METHOD 440
+#define _MONITOR_CALL 441
+#define _MONITOR_JUMP_BACKWARD 442
+#define _MONITOR_RESUME 443
 #define _NOP NOP
 #define _POP_EXCEPT POP_EXCEPT
-#define _POP_JUMP_IF_FALSE 443
-#define _POP_JUMP_IF_TRUE 444
+#define _POP_JUMP_IF_FALSE 444
+#define _POP_JUMP_IF_TRUE 445
 #define _POP_TOP POP_TOP
-#define _POP_TOP_LOAD_CONST_INLINE_BORROW 445
+#define _POP_TOP_LOAD_CONST_INLINE_BORROW 446
 #define _PUSH_EXC_INFO PUSH_EXC_INFO
-#define _PUSH_FRAME 446
+#define _PUSH_FRAME 447
 #define _PUSH_NULL PUSH_NULL
-#define _PY_FRAME_GENERAL 447
-#define _PY_FRAME_KW 448
-#define _QUICKEN_RESUME 449
-#define _REPLACE_WITH_TRUE 450
+#define _PY_FRAME_GENERAL 448
+#define _PY_FRAME_KW 449
+#define _QUICKEN_RESUME 450
+#define _REPLACE_WITH_TRUE 451
 #define _RESUME_CHECK RESUME_CHECK
 #define _RETURN_GENERATOR RETURN_GENERATOR
 #define _RETURN_VALUE RETURN_VALUE
-#define _SAVE_RETURN_OFFSET 451
-#define _SEND 452
-#define _SEND_GEN_FRAME 453
+#define _SAVE_RETURN_OFFSET 452
+#define _SEND 453
+#define _SEND_GEN_FRAME 454
 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS
 #define _SET_ADD SET_ADD
 #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE
 #define _SET_UPDATE SET_UPDATE
-#define _START_EXECUTOR 454
-#define _STORE_ATTR 455
-#define _STORE_ATTR_INSTANCE_VALUE 456
-#define _STORE_ATTR_SLOT 457
-#define _STORE_ATTR_WITH_HINT 458
+#define _START_EXECUTOR 455
+#define _STORE_ATTR 456
+#define _STORE_ATTR_INSTANCE_VALUE 457
+#define _STORE_ATTR_SLOT 458
+#define _STORE_ATTR_WITH_HINT 459
 #define _STORE_DEREF STORE_DEREF
-#define _STORE_FAST 459
-#define _STORE_FAST_0 460
-#define _STORE_FAST_1 461
-#define _STORE_FAST_2 462
-#define _STORE_FAST_3 463
-#define _STORE_FAST_4 464
-#define _STORE_FAST_5 465
-#define _STORE_FAST_6 466
-#define _STORE_FAST_7 467
+#define _STORE_FAST 460
+#define _STORE_FAST_0 461
+#define _STORE_FAST_1 462
+#define _STORE_FAST_2 463
+#define _STORE_FAST_3 464
+#define _STORE_FAST_4 465
+#define _STORE_FAST_5 466
+#define _STORE_FAST_6 467
+#define _STORE_FAST_7 468
 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST
 #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST
 #define _STORE_GLOBAL STORE_GLOBAL
 #define _STORE_NAME STORE_NAME
-#define _STORE_SLICE 468
-#define _STORE_SUBSCR 469
+#define _STORE_SLICE 469
+#define _STORE_SUBSCR 470
 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT
 #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT
 #define _SWAP SWAP
-#define _TIER2_RESUME_CHECK 470
-#define _TO_BOOL 471
+#define _TIER2_RESUME_CHECK 471
+#define _TO_BOOL 472
 #define _TO_BOOL_BOOL TO_BOOL_BOOL
 #define _TO_BOOL_INT TO_BOOL_INT
 #define _TO_BOOL_LIST TO_BOOL_LIST
@@ -289,14 +290,14 @@ extern "C" {
 #define _UNARY_NEGATIVE UNARY_NEGATIVE
 #define _UNARY_NOT UNARY_NOT
 #define _UNPACK_EX UNPACK_EX
-#define _UNPACK_SEQUENCE 472
+#define _UNPACK_SEQUENCE 473
 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST
 #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE
 #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE
 #define _WITH_EXCEPT_START WITH_EXCEPT_START
 #define _YIELD_VALUE YIELD_VALUE
 #define __DO_CALL_FUNCTION_EX _DO_CALL_FUNCTION_EX
-#define MAX_UOP_ID 472
+#define MAX_UOP_ID 473
 
 #ifdef __cplusplus
 }
diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h
index 4d0ab22e6aa8f3..07606135d7a356 100644
--- a/Include/internal/pycore_uop_metadata.h
+++ b/Include/internal/pycore_uop_metadata.h
@@ -274,6 +274,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = {
     [_INTERNAL_INCREMENT_OPT_COUNTER] = 0,
     [_DYNAMIC_EXIT] = HAS_ESCAPES_FLAG,
     [_START_EXECUTOR] = 0,
+    [_MAKE_WARM] = 0,
     [_FATAL_ERROR] = 0,
     [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG,
     [_DEOPT] = 0,
@@ -481,6 +482,7 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = {
     [_LOAD_SUPER_ATTR_METHOD] = "_LOAD_SUPER_ATTR_METHOD",
     [_MAKE_CELL] = "_MAKE_CELL",
     [_MAKE_FUNCTION] = "_MAKE_FUNCTION",
+    [_MAKE_WARM] = "_MAKE_WARM",
     [_MAP_ADD] = "_MAP_ADD",
     [_MATCH_CLASS] = "_MATCH_CLASS",
     [_MATCH_KEYS] = "_MATCH_KEYS",
@@ -1062,6 +1064,8 @@ int _PyUop_num_popped(int opcode, int oparg)
             return 0;
         case _START_EXECUTOR:
             return 0;
+        case _MAKE_WARM:
+            return 0;
         case _FATAL_ERROR:
             return 0;
         case _CHECK_VALIDITY_AND_SET_IP:
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst
new file mode 100644
index 00000000000000..de62875e16475d
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst
@@ -0,0 +1,2 @@
+Improved JIT memory consumption by periodically freeing memory used by infrequently-executed code.
+This change is especially likely to improve the memory footprint of long-running programs.
diff --git a/Python/bytecodes.c b/Python/bytecodes.c
index 5f194aec0073c8..7c88a50b6266a9 100644
--- a/Python/bytecodes.c
+++ b/Python/bytecodes.c
@@ -4828,6 +4828,14 @@ dummy_func(
             assert(((_PyExecutorObject *)executor)->vm_data.valid);
         }
 
+        tier2 op(_MAKE_WARM, (--)) {
+            current_executor->vm_data.warm = true;
+            if (++tstate->interp->trace_run_counter > JIT_CLEANUP_THRESHOLD) {
+                _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
+                tstate->interp->trace_run_counter = 0;
+            }
+        }
+
         tier2 op(_FATAL_ERROR, (--)) {
             assert(0);
             Py_FatalError("Fatal error uop executed.");
diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c
index 6f4476d055b5ec..b1f06c513ad98e 100644
--- a/Python/ceval_gil.c
+++ b/Python/ceval_gil.c
@@ -1289,6 +1289,11 @@ _Py_HandlePending(PyThreadState *tstate)
         _Py_RunGC(tstate);
     }
 
+    if ((breaker & _PY_EVAL_JIT_INVALIDATE_COLD_BIT) != 0) {
+        _Py_unset_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
+        _Py_Executors_InvalidateCold(tstate->interp);
+    }
+
     /* GIL drop request */
     if ((breaker & _PY_GIL_DROP_REQUEST_BIT) != 0) {
         /* Give another thread a chance */
diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
index 7285acec0bacaf..656ee2a69b9c77 100644
--- a/Python/executor_cases.c.h
+++ b/Python/executor_cases.c.h
@@ -5435,6 +5435,15 @@
             break;
         }
 
+        case _MAKE_WARM: {
+            current_executor->vm_data.warm = true;
+            if (++tstate->interp->trace_run_counter > JIT_CLEANUP_THRESHOLD) {
+                _Py_set_eval_breaker_bit(tstate, _PY_EVAL_JIT_INVALIDATE_COLD_BIT);
+                tstate->interp->trace_run_counter = 0;
+            }
+            break;
+        }
+
         case _FATAL_ERROR: {
             assert(0);
             Py_FatalError("Fatal error uop executed.");
diff --git a/Python/optimizer.c b/Python/optimizer.c
index bb7a90b3204f40..978649faa04d45 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -565,6 +565,7 @@ translate_bytecode_to_trace(
             code->co_firstlineno,
             2 * INSTR_IP(initial_instr, code));
     ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code));
+    ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);
     uint32_t target = 0;
 
     for (;;) {
@@ -1194,6 +1195,9 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
     executor->jit_code = NULL;
     executor->jit_side_entry = NULL;
     executor->jit_size = 0;
+    // This is initialized to true so we can prevent the executor
+    // from being immediately detected as cold and invalidated.
+    executor->vm_data.warm = true;
     if (_PyJIT_Compile(executor, executor->trace, length)) {
         Py_DECREF(executor);
         return NULL;
@@ -1659,4 +1663,52 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
     }
 }
 
+/* Clear every executor that was neither run nor created since the last sweep.
+ *
+ * An executor whose `warm` flag is false is cleared with executor_clear(); a
+ * warm one is kept but has its flag reset, so it has to run again (_MAKE_WARM
+ * sets the flag) to survive the next sweep. If the list of executors to clear
+ * cannot be built, every executor is invalidated instead. */
+void
+_Py_Executors_InvalidateCold(PyInterpreterState *interp)
+{
+    /* Walk the list of executors */
+    /* TO DO -- Use a tree to avoid traversing as many objects */
+    PyObject *invalidate = PyList_New(0);
+    if (invalidate == NULL) {
+        goto error;
+    }
+
+    /* Clearing an executor can deallocate others, so we need to make a list of
+     * executors to invalidate first */
+    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
+        assert(exec->vm_data.valid);
+        _PyExecutorObject *next = exec->vm_data.links.next;
+
+        if (!exec->vm_data.warm) {
+            // Cold since the last sweep: queue it for clearing.
+            if (PyList_Append(invalidate, (PyObject *)exec) < 0) {
+                goto error;
+            }
+        }
+        else {
+            // Warm: keep it, but reset the flag for the next sweep.
+            exec->vm_data.warm = false;
+        }
+
+        exec = next;
+    }
+    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
+        _PyExecutorObject *exec = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i);
+        executor_clear(exec);
+    }
+    Py_DECREF(invalidate);
+    return;
+error:
+    PyErr_Clear();
+    Py_XDECREF(invalidate);
+    // If we're truly out of memory, wiping out everything is a fine fallback
+    _Py_Executors_InvalidateAll(interp, 0);
+}
+
 #endif /* _Py_TIER2 */
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index a6cfa271ae6758..4d172e3c762704 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -2381,6 +2381,10 @@
             break;
         }
 
+        case _MAKE_WARM: {
+            break;
+        }
+
         case _FATAL_ERROR: {
             break;
         }
diff --git a/Python/pystate.c b/Python/pystate.c
index 6bf7ebeb75ff73..6a617ed5a36c69 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -660,6 +660,7 @@ init_interpreter(PyInterpreterState *interp,
 #ifdef _Py_TIER2
     (void)_Py_SetOptimizer(interp, NULL);
     interp->executor_list_head = NULL;
+    interp->trace_run_counter = 0;
 #endif
     if (interp != &runtime->_main_interpreter) {
         /* Fix the self-referential, statically initialized fields. */
diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py
index 0680c21a3c24c5..b69dd21df64a1a 100644
--- a/Tools/cases_generator/analyzer.py
+++ b/Tools/cases_generator/analyzer.py
@@ -539,6 +539,7 @@ def has_error_without_pop(op: parser.InstDef) -> bool:
     "_PyList_FromStackRefSteal",
     "_PyTuple_FromArraySteal",
     "_PyTuple_FromStackRefSteal",
+    "_Py_set_eval_breaker_bit",
 )
 
 ESCAPING_FUNCTIONS = (