From 7acec08c71db17191d5b3ad19d97fd9e255d894a Mon Sep 17 00:00:00 2001 From: Andreas Scharf Date: Sat, 3 Jan 2026 20:10:34 +0100 Subject: [PATCH 1/3] Use py::object (reinterpret_steal) for PyObject_CallObject return values to avoid leaking Python objects during scalar UDF execution. --- src/duckdb_py/python_udf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/duckdb_py/python_udf.cpp b/src/duckdb_py/python_udf.cpp index 41daf4e2..fd6775e0 100644 --- a/src/duckdb_py/python_udf.cpp +++ b/src/duckdb_py/python_udf.cpp @@ -326,7 +326,7 @@ static scalar_function_t CreateNativeFunction(PyObject *function, PythonExceptio } // Call the function - auto ret = PyObject_CallObject(function, bundled_parameters.ptr()); + auto ret = py::reinterpret_steal<py::object>(PyObject_CallObject(function, bundled_parameters.ptr())); if (ret == nullptr && PyErr_Occurred()) { if (exception_handling == PythonExceptionHandling::FORWARD_ERROR) { auto exception = py::error_already_set(); From cc274dab47fba6eddf751acd1bfb5960c7908547 Mon Sep 17 00:00:00 2001 From: Andreas Scharf Date: Sat, 3 Jan 2026 20:24:01 +0100 Subject: [PATCH 2/3] Add regression test for Python UDF return value refcount leak.
--- tests/fast/udf/test_udf_refcount_leak.py | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/fast/udf/test_udf_refcount_leak.py diff --git a/tests/fast/udf/test_udf_refcount_leak.py b/tests/fast/udf/test_udf_refcount_leak.py new file mode 100644 index 00000000..6a5f8dad --- /dev/null +++ b/tests/fast/udf/test_udf_refcount_leak.py @@ -0,0 +1,38 @@ +import sys +import gc +import platform +import duckdb +import pytest + + +@pytest.mark.parametrize("rows, iters", [(1000, 20)]) +def test_python_scalar_udf_return_value_refcount_does_not_leak(rows, iters): + if platform.python_implementation() != "CPython": + pytest.skip("refcount-based test requires CPython") + + payload = (b"processed_data_" + b"x" * 8192) # large-ish bytes to mimic the reported issue + + def udf_bytes(_): + return payload # Always return the exact same object so we can track its refcount. + + # Baseline refcount (note: getrefcount adds a temporary ref) + baseline = sys.getrefcount(payload) + + con = duckdb.connect() + con.create_function("udf_bytes", udf_bytes, ["BIGINT"], "VARCHAR") + + for _ in range(iters): + con.execute(f"SELECT udf_bytes(range) FROM range({rows})") + res = con.fetchall() + # Drop the result ASAP so we don't keep any refs alive in Python + del res + gc.collect() + + # Re-check refcount. In the buggy version this grows by rows*iters (huge). + after = sys.getrefcount(payload) + + # Allow a tiny tolerance for transient references/caches. + # In the presence of the leak, this will be thousands+ higher. + assert after <= baseline + 10, (baseline, after) + + con.close() From f97248fd6742ebd6260457a9ea357d4979ce4f43 Mon Sep 17 00:00:00 2001 From: Andreas Scharf Date: Sat, 3 Jan 2026 21:22:00 +0100 Subject: [PATCH 3/3] Fix formatting. 
--- tests/fast/udf/test_udf_refcount_leak.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/fast/udf/test_udf_refcount_leak.py b/tests/fast/udf/test_udf_refcount_leak.py index 6a5f8dad..3c1201cf 100644 --- a/tests/fast/udf/test_udf_refcount_leak.py +++ b/tests/fast/udf/test_udf_refcount_leak.py @@ -1,19 +1,21 @@ -import sys import gc import platform -import duckdb +import sys + import pytest +import duckdb + -@pytest.mark.parametrize("rows, iters", [(1000, 20)]) +@pytest.mark.parametrize(("rows", "iters"), [(1000, 20)]) def test_python_scalar_udf_return_value_refcount_does_not_leak(rows, iters): if platform.python_implementation() != "CPython": pytest.skip("refcount-based test requires CPython") - payload = (b"processed_data_" + b"x" * 8192) # large-ish bytes to mimic the reported issue + payload = b"processed_data_" + b"x" * 8192 # large-ish bytes to mimic the reported issue def udf_bytes(_): - return payload # Always return the exact same object so we can track its refcount. + return payload # Always return the exact same object so we can track its refcount. # Baseline refcount (note: getrefcount adds a temporary ref) baseline = sys.getrefcount(payload)