From 44516ce9e4df590519aff365a1ba4dd241b14f73 Mon Sep 17 00:00:00 2001 From: kunchenguid Date: Sat, 23 May 2026 00:46:31 -0700 Subject: [PATCH] fix(container): kill orphaned in-container process on execute() timeout When subprocess.run([docker exec ...], timeout=N) hits TimeoutExpired, Python SIGKILLs the local docker exec CLI. The docker daemon does not propagate that into the container (only SIGINT/SIGTERM propagate gracefully), so the test process spawned inside the container keeps running. With sleep-as-PID-1 cleanrooms it never gets reaped: it just orphans onto PID 1 and keeps burning CPU while the next docker exec call competes for the same resources. On a multi-arm evaluation run over the full task set, this surfaces as: a TUI-style task hits the 3600s outer timeout, returns rc=-1, programbench retries the branch, the retry SIGKILLs another docker exec, the second orphan accumulates alongside the first, and so on. Observed cost on one run: ~2 hr wasted per arm per hang-prone task, plus measurable CPU contention slowing surrounding evals in the same stripe. Fix: on TimeoutExpired, send a follow-up docker exec bash -c "kill -KILL -1; sleep 0.2; kill -KILL -1; true" which signals every non-PID-1 process in the container's PID namespace. Safe here because ContainerEnvironment starts cleanrooms with a long-lived sleep as PID 1; the only non-PID-1 processes are the test subprocess tree we just spawned. Bounded with a 30s timeout on the kill exec itself, and logs a warning if the kill exec itself raises (for example, by timing out) so the failure is debuggable instead of silent. Behavior on successful runs is unchanged. 
--- src/programbench/container.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/programbench/container.py b/src/programbench/container.py index 4c24e57..8c419a8 100644 --- a/src/programbench/container.py +++ b/src/programbench/container.py @@ -89,6 +89,37 @@ def execute(self, command: str, *, timeout: int | None = None) -> dict[str, Any] "exception_info": "", } except subprocess.TimeoutExpired: + # When subprocess.run hits its host-side timeout, Python SIGKILLs + # the local `docker exec` CLI. The docker daemon does not + # propagate that into the container (only SIGINT/SIGTERM + # propagate gracefully), so the process spawned inside the + # container keeps running. With sleep-as-PID-1 cleanrooms it + # never gets reaped and competes with the next docker exec call. + # + # Sweep the in-container processes by SIGKILL-ing every non- + # PID-1 process. Safe here because the cleanroom's only PID-1 + # process is the long-lived `sleep` that keeps the container + # alive; everything else is the test subprocess tree we just + # spawned. Bounded with a short timeout so a stuck daemon + # cannot wedge us a second time. + try: + subprocess.run( + [ + self.executable, + "exec", + self.container_id, + "bash", + "-c", + "kill -KILL -1 2>/dev/null; sleep 0.2; kill -KILL -1 2>/dev/null; true", + ], + capture_output=True, + text=True, + timeout=30, + ) + except Exception as e: + log.warning( + "in-container teardown after execute() timeout failed: %s", e + ) return { "output": "", "returncode": -1,