diff --git a/Include/internal/pycore_stackref.h b/Include/internal/pycore_stackref.h index ca4a7c216eda53..bfd200d94011f0 100644 --- a/Include/internal/pycore_stackref.h +++ b/Include/internal/pycore_stackref.h @@ -212,6 +212,32 @@ _PyStackRef_FromPyObjectBorrow(PyObject *obj, const char *filename, int linenumb } #define PyStackRef_FromPyObjectBorrow(obj) _PyStackRef_FromPyObjectBorrow(_PyObject_CAST(obj), __FILE__, __LINE__) +/* Tag a PyObject pointer as a borrowed operand for BORROW variants. */ +static inline uintptr_t +PyStackRef_TagBorrow(PyObject *obj) +{ + return (uintptr_t)obj | Py_TAG_REFCNT; +} + +/* Strip tag bits from a pre-tagged operand to recover the PyObject pointer. */ +static inline PyObject * +PyStackRef_UntagBorrow(uintptr_t tagged) +{ + return (PyObject *)(tagged & ~Py_TAG_BITS); +} + +/* Create a stackref from a pre-tagged operand (tag bits already set). + Used by _LOAD_CONST_INLINE_BORROW variants where the operand is + tagged at trace creation time to avoid tagging on every execution. */ +static inline _PyStackRef +_PyStackRef_FromPreTagged(uintptr_t tagged, const char *filename, int linenumber) +{ + assert(tagged & Py_TAG_REFCNT); + PyObject *obj = (PyObject *)(tagged & ~Py_TAG_BITS); + return _Py_stackref_create(obj, Py_TAG_REFCNT, filename, linenumber); +} +#define PyStackRef_FromPreTagged(tagged) _PyStackRef_FromPreTagged((tagged), __FILE__, __LINE__) + static inline void _PyStackRef_CLOSE(_PyStackRef ref, const char *filename, int linenumber) { @@ -617,6 +643,30 @@ PyStackRef_FromPyObjectBorrow(PyObject *obj) return (_PyStackRef){ .bits = (uintptr_t)obj | Py_TAG_REFCNT}; } +/* Tag a PyObject pointer as a borrowed operand for BORROW variants. */ +static inline uintptr_t +PyStackRef_TagBorrow(PyObject *obj) +{ + return (uintptr_t)obj | Py_TAG_REFCNT; +} + +/* Strip tag bits from a pre-tagged operand to recover the PyObject pointer. 
*/ +static inline PyObject * +PyStackRef_UntagBorrow(uintptr_t tagged) +{ + return (PyObject *)(tagged & ~Py_TAG_BITS); +} + +/* Create a stackref from a pre-tagged operand (tag bits already set). + Used by _LOAD_CONST_INLINE_BORROW variants where the operand is + tagged at trace creation time to avoid tagging on every execution. */ +static inline _PyStackRef +PyStackRef_FromPreTagged(uintptr_t tagged) +{ + assert(tagged & Py_TAG_REFCNT); + return (_PyStackRef){ .bits = tagged }; +} + /* WARNING: This macro evaluates its argument more than once */ #ifdef _WIN32 #define PyStackRef_DUP(REF) \ diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 8f543dbeeb8bc9..a7e7a1ba68cc50 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -18,6 +18,7 @@ extern const ReplicationRange _PyUop_Replication[MAX_UOP_ID+1]; extern const char * const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1]; extern int _PyUop_num_popped(int opcode, int oparg); +extern uint64_t _PyUop_PrepareOperand0(int opcode, uint64_t operand0); typedef struct _pyuop_tos_cache_entry { /* input depth is implicit in position */ @@ -6892,6 +6893,16 @@ int _PyUop_num_popped(int opcode, int oparg) } } +uint64_t _PyUop_PrepareOperand0(int opcode, uint64_t operand0) +{ + switch(opcode) { + case _LOAD_CONST_INLINE_BORROW: + return PyStackRef_TagBorrow((PyObject *)operand0); + default: + return operand0; + } +} + #endif // NEED_OPCODE_METADATA diff --git a/Lib/test/test_generated_cases.py b/Lib/test/test_generated_cases.py index 748309b54593a1..cc2ce90d9067d3 100644 --- a/Lib/test/test_generated_cases.py +++ b/Lib/test/test_generated_cases.py @@ -560,6 +560,34 @@ def test_cache_effect(self): """ self.run_cases_test(input, output) + def test_pretagged_cache_effect(self): + input = """ + inst(OP, (ptr/4^, value --)) { + DEAD(value); + } + """ + output = """ + TARGET(OP) { + #if _Py_TAIL_CALL_INTERP + int opcode = OP; + (void)(opcode); + 
#endif + _Py_CODEUNIT* const this_instr = next_instr; + (void)this_instr; + frame->instr_ptr = next_instr; + next_instr += 5; + INSTRUCTION_STATS(OP); + _PyStackRef value; + value = stack_pointer[-1]; + uintptr_t ptr = read_u64(&this_instr[1].cache); + (void)ptr; + stack_pointer += -1; + ASSERT_WITHIN_STACK_BOUNDS(__FILE__, __LINE__); + DISPATCH(); + } + """ + self.run_cases_test(input, output) + def test_suppress_dispatch(self): input = """ label(somewhere) { diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-04-19-35-32.gh-issue-145742.SU9RYL.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-04-19-35-32.gh-issue-145742.SU9RYL.rst new file mode 100644 index 00000000000000..0909ffe44bd331 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-04-19-35-32.gh-issue-145742.SU9RYL.rst @@ -0,0 +1,2 @@ +Optimize _LOAD_CONST_INLINE_BORROW by pre-tagging operands at trace +creation. Patch by Donghee Na. diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 963391e7598fb6..9f42852335989f 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -6105,8 +6105,8 @@ dummy_func( value = PyStackRef_FromPyObjectNew(ptr); } - tier2 pure op(_LOAD_CONST_INLINE_BORROW, (ptr/4 -- value)) { - value = PyStackRef_FromPyObjectBorrow(ptr); + tier2 pure op(_LOAD_CONST_INLINE_BORROW, (ptr/4^ -- value)) { + value = PyStackRef_FromPreTagged(ptr); } tier2 pure op(_RROT_3, (bottom, middle, top -- bottom, middle, top)) { diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index f8fc35de9d7957..9d5b8814292767 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -22299,8 +22299,8 @@ CHECK_CURRENT_CACHED_VALUES(0); assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); _PyStackRef value; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); - value = PyStackRef_FromPyObjectBorrow(ptr); + uintptr_t ptr = (uintptr_t)CURRENT_OPERAND0_64(); + value = PyStackRef_FromPreTagged(ptr); _tos_cache0 = value; SET_CURRENT_CACHED_VALUES(1); 
assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); @@ -22312,8 +22312,8 @@ assert(WITHIN_STACK_BOUNDS_IGNORING_CACHE()); _PyStackRef value; _PyStackRef _stack_item_0 = _tos_cache0; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); - value = PyStackRef_FromPyObjectBorrow(ptr); + uintptr_t ptr = (uintptr_t)CURRENT_OPERAND0_64(); + value = PyStackRef_FromPreTagged(ptr); _tos_cache1 = value; _tos_cache0 = _stack_item_0; SET_CURRENT_CACHED_VALUES(2); @@ -22327,8 +22327,8 @@ _PyStackRef value; _PyStackRef _stack_item_0 = _tos_cache0; _PyStackRef _stack_item_1 = _tos_cache1; - PyObject *ptr = (PyObject *)CURRENT_OPERAND0_64(); - value = PyStackRef_FromPyObjectBorrow(ptr); + uintptr_t ptr = (uintptr_t)CURRENT_OPERAND0_64(); + value = PyStackRef_FromPreTagged(ptr); _tos_cache2 = value; _tos_cache1 = _stack_item_1; _tos_cache0 = _stack_item_0; diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index 1dc3a248f45f0c..eab6bab36907b8 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -187,12 +187,8 @@ convert_global_to_const(_PyUOpInstruction *inst, PyObject *obj) if (res == NULL) { return NULL; } - if (_Py_IsImmortal(res)) { - inst->opcode = _LOAD_CONST_INLINE_BORROW; - } else { - inst->opcode = _LOAD_CONST_INLINE; - } - inst->operand0 = (uint64_t)res; + inst->opcode = _Py_IsImmortal(res) ? 
_LOAD_CONST_INLINE_BORROW : _LOAD_CONST_INLINE; + inst->operand0 = _PyUop_PrepareOperand0(inst->opcode, (uint64_t)res); return res; } @@ -243,7 +239,7 @@ add_op(JitOptContext *ctx, _PyUOpInstruction *this_instr, out->format = this_instr->format; out->oparg = (oparg); out->target = this_instr->target; - out->operand0 = (operand0); + out->operand0 = _PyUop_PrepareOperand0(opcode, (uint64_t)operand0); out->operand1 = this_instr->operand1; #ifdef Py_STATS out->fitness = this_instr->fitness; diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c index 0837d57b61b29d..9c8455d160058a 100644 --- a/Python/optimizer_bytecodes.c +++ b/Python/optimizer_bytecodes.c @@ -892,8 +892,8 @@ dummy_func(void) { value = sym_new_const(ctx, ptr); } - op(_LOAD_CONST_INLINE_BORROW, (ptr/4 -- value)) { - value = PyJitRef_Borrow(sym_new_const(ctx, ptr)); + op(_LOAD_CONST_INLINE_BORROW, (ptr/4^ -- value)) { + value = PyJitRef_Borrow(sym_new_const(ctx, PyStackRef_UntagBorrow(ptr))); } op(_POP_TOP_OPARG, (args[oparg] --)) { diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 1ade86f64b2b20..5e3fede1065f8c 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -5479,8 +5479,8 @@ case _LOAD_CONST_INLINE_BORROW: { JitOptRef value; - PyObject *ptr = (PyObject *)this_instr->operand0; - value = PyJitRef_Borrow(sym_new_const(ctx, ptr)); + uintptr_t ptr = (uintptr_t)this_instr->operand0; + value = PyJitRef_Borrow(sym_new_const(ctx, PyStackRef_UntagBorrow(ptr))); CHECK_STACK_BOUNDS(1); stack_pointer[0] = value; stack_pointer += 1; diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py index 414ca18be4654c..d5a34d778a36ca 100644 --- a/Tools/cases_generator/analyzer.py +++ b/Tools/cases_generator/analyzer.py @@ -180,9 +180,21 @@ def __str__(self) -> str: class CacheEntry: name: str size: int + pretagged: bool = False + + @classmethod + def from_parsed(cls, effect: parser.CacheEffect) -> "CacheEntry": + if 
effect.pretagged and effect.size != 4: + raise analysis_error( + f"'^' (pretagged) marker requires size /4, " + f"got /{effect.size}", + effect.tokens[0], + ) + return cls(effect.name, effect.size, effect.pretagged) def __str__(self) -> str: - return f"{self.name}/{self.size}" + suffix = "^" if self.pretagged else "" + return f"{self.name}/{self.size}{suffix}" @dataclass @@ -433,7 +445,7 @@ def analyze_caches(inputs: list[parser.InputEffect]) -> list[CacheEntry]: position = "First" if index == 0 else "Last" msg = f"{position} cache entry in op is unused. Move to enclosing macro." raise analysis_error(msg, cache.tokens[0]) - return [CacheEntry(i.name, int(i.size)) for i in caches] + return [CacheEntry.from_parsed(i) for i in caches] def find_variable_stores(node: parser.InstDef) -> list[lexer.Token]: @@ -600,6 +612,7 @@ def has_error_without_pop(op: parser.CodeDef) -> bool: "PyStackRef_CLOSE_SPECIALIZED", "PyStackRef_DUP", "PyStackRef_False", + "PyStackRef_FromPreTagged", "PyStackRef_FromPyObjectBorrow", "PyStackRef_FromPyObjectNew", "PyStackRef_FromPyObjectSteal", @@ -766,7 +779,7 @@ def escaping_call_in_simple_stmt(stmt: SimpleStmt, result: dict[SimpleStmt, Esca continue #if not tkn.text.startswith(("Py", "_Py", "monitor")): # continue - if tkn.text.startswith(("sym_", "optimize_", "PyJitRef")): + if tkn.text.startswith(("sym_", "optimize_", "PyJitRef", "PyStackRef_Tag", "PyStackRef_Untag")): # Optimize functions continue if tkn.text.endswith("Check"): @@ -1111,6 +1124,11 @@ def desugar_inst( # Move unused cache entries to the Instruction, removing them from the Uop. 
for input in inst.inputs: if isinstance(input, parser.CacheEffect) and input.name == "unused": + if input.pretagged: + raise analysis_error( + "'unused' cache slot cannot carry a '^' marker", + input.tokens[0], + ) parts.append(Skip(input.size)) else: op_inputs.append(input) diff --git a/Tools/cases_generator/interpreter_definition.md b/Tools/cases_generator/interpreter_definition.md index 29e4e74da72154..7710159d2122b0 100644 --- a/Tools/cases_generator/interpreter_definition.md +++ b/Tools/cases_generator/interpreter_definition.md @@ -112,7 +112,7 @@ and a piece of C code describing its semantics: NAME ["*"] stream: - NAME "/" size + NAME "/" size ["^"] size: INTEGER @@ -162,6 +162,10 @@ instruction stream. It returns a 16, 32 or 64 bit value. If the name is `unused` the size can be any value and that many codeunits will be skipped in the instruction stream. +A `/4` slot trailed by `^` is read as raw `uintptr_t` instead of `PyObject *`, +indicating its bits are already tagged as a borrowed `_PyStackRef`. Only +allowed on `/4` and not on `unused`. + By convention cache effects (`stream`) must precede the input effects. The name `oparg` is pre-defined as a 32 bit value fetched from the instruction stream. @@ -313,6 +317,15 @@ This might become (if it was an instruction): } ``` +### Pre-tagged cache effect +```C + op ( LOAD_CONST_INLINE_BORROW, (ptr/4^ -- value) ) { + value = PyStackRef_FromPreTagged(ptr); + } +``` +The `^` marks `ptr` as a pre-tagged borrowed `_PyStackRef`; it is declared +`uintptr_t` and read with `read_u64` rather than `read_obj`. + ### More examples For explanations see "Generating the interpreter" below. 
diff --git a/Tools/cases_generator/optimizer_generator.py b/Tools/cases_generator/optimizer_generator.py index aa914783f7cdc2..1cd5775b733158 100644 --- a/Tools/cases_generator/optimizer_generator.py +++ b/Tools/cases_generator/optimizer_generator.py @@ -414,7 +414,10 @@ def write_uop( idx = 0 for cache in uop.caches: if cache.name != "unused": - if cache.size == 4: + if cache.pretagged: + type = "uintptr_t " + cast = "uintptr_t" + elif cache.size == 4: type = cast = "PyObject *" else: type = f"uint{cache.size*16}_t " diff --git a/Tools/cases_generator/parsing.py b/Tools/cases_generator/parsing.py index c7fe0d162ac6e4..341d612cf55f55 100644 --- a/Tools/cases_generator/parsing.py +++ b/Tools/cases_generator/parsing.py @@ -266,6 +266,7 @@ class Expression(Node): class CacheEffect(Node): name: str size: int + pretagged: bool = False @dataclass @@ -449,7 +450,9 @@ def output(self) -> OutputEffect | None: @contextual def cache_effect(self) -> CacheEffect | None: - # IDENTIFIER '/' NUMBER + # IDENTIFIER '/' NUMBER ['^'] + # The optional '^' marks the slot's bits as a pre-tagged _PyStackRef + # (see PyStackRef_FromPreTagged in pycore_stackref.h). 
if tkn := self.expect(lx.IDENTIFIER): if self.expect(lx.DIVIDE): num = self.require(lx.NUMBER).text @@ -457,8 +460,8 @@ def cache_effect(self) -> CacheEffect | None: size = int(num) except ValueError: raise self.make_syntax_error(f"Expected integer, got {num!r}") - else: - return CacheEffect(tkn.text, size) + pretagged = bool(self.expect(lx.XOR)) + return CacheEffect(tkn.text, size, pretagged) return None @contextual diff --git a/Tools/cases_generator/tier1_generator.py b/Tools/cases_generator/tier1_generator.py index d2fa749e1417f5..e3e84042632ccb 100644 --- a/Tools/cases_generator/tier1_generator.py +++ b/Tools/cases_generator/tier1_generator.py @@ -95,7 +95,12 @@ def write_uop( for cache in uop.caches: if cache.name != "unused": - if cache.size == 4: + if cache.pretagged: + # Read raw bits; read_obj would falsely declare PyObject* for + # what is actually a tagged uintptr_t. + type = "uintptr_t " + reader = "read_u64" + elif cache.size == 4: type = "PyObject *" reader = "read_obj" else: diff --git a/Tools/cases_generator/tier2_generator.py b/Tools/cases_generator/tier2_generator.py index 12da5bff254957..65f7f96a0ed35a 100644 --- a/Tools/cases_generator/tier2_generator.py +++ b/Tools/cases_generator/tier2_generator.py @@ -223,7 +223,10 @@ def write_uop(uop: Uop, emitter: Tier2Emitter, stack: Stack, cached_items: int = for cache in uop.caches: if cache.name != "unused": bits = cache.size*16 - if cache.size == 4: + if cache.pretagged: + type = "uintptr_t " + cast = "uintptr_t" + elif cache.size == 4: type = cast = "PyObject *" else: type = f"uint{bits}_t " diff --git a/Tools/cases_generator/uop_metadata_generator.py b/Tools/cases_generator/uop_metadata_generator.py index 4c24435cbdbe05..7884bd265a50e1 100644 --- a/Tools/cases_generator/uop_metadata_generator.py +++ b/Tools/cases_generator/uop_metadata_generator.py @@ -69,6 +69,7 @@ def generate_names_and_flags(analysis: Analysis, out: CWriter) -> None: out.emit("extern const ReplicationRange 
_PyUop_Replication[MAX_UOP_ID+1];\n") out.emit("extern const char * const _PyOpcode_uop_name[MAX_UOP_REGS_ID+1];\n\n") out.emit("extern int _PyUop_num_popped(int opcode, int oparg);\n") + out.emit("extern uint64_t _PyUop_PrepareOperand0(int opcode, uint64_t operand0);\n") out.emit(CACHING_INFO_DECL) out.emit(f"extern const uint16_t _PyUop_SpillsAndReloads[{MAX_CACHED_REGISTER+1}][{MAX_CACHED_REGISTER+1}];\n") out.emit("extern const uint16_t _PyUop_Uncached[MAX_UOP_REGS_ID+1];\n\n") @@ -133,6 +134,17 @@ def generate_names_and_flags(analysis: Analysis, out: CWriter) -> None: out.emit(" return -1;\n") out.emit("}\n") out.emit("}\n\n") + out.emit("uint64_t _PyUop_PrepareOperand0(int opcode, uint64_t operand0)\n{\n") + out.emit("switch(opcode) {\n") + for uop in analysis.uops.values(): + if uop.is_viable() and uop.properties.tier != 1 and not uop.is_super(): + if any(cache.pretagged for cache in uop.caches): + out.emit(f"case {uop.name}:\n") + out.emit(f" return PyStackRef_TagBorrow((PyObject *)operand0);\n") + out.emit("default:\n") + out.emit(" return operand0;\n") + out.emit("}\n") + out.emit("}\n\n") out.emit("#endif // NEED_OPCODE_METADATA\n\n") def generate_uop_metadata(