Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -736,6 +736,13 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() {
let tls = test_tls(&server);
install_fake_ssh(&fake_ssh_dir);

// Bind to port 0 to get an available port from the OS, then release it for
// the sandbox_create call to use. There is a small TOCTOU window here, but
// it is sufficient for testing purposes.
let probe = TcpListener::bind("127.0.0.1:0").await.unwrap();
let free_port = probe.local_addr().unwrap().port();
drop(probe);

run::sandbox_create(
&server.endpoint,
Some("persistent-forward"),
Expand All @@ -750,7 +757,7 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() {
None,
&[],
None,
Some(openshell_core::forward::ForwardSpec::new(8080)),
Some(openshell_core::forward::ForwardSpec::new(free_port)),
&["echo".to_string(), "OK".to_string()],
Some(false),
Some(false),
Expand Down
120 changes: 87 additions & 33 deletions crates/openshell-driver-podman/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -377,36 +377,61 @@ pub fn build_container_spec(sandbox: &DriverSandbox, config: &PodmanComputeConfi
// proxy, and configure Landlock/seccomp. This matches the K8s
// driver's runAsUser: 0.
user: "0:0".into(),
// The sandbox supervisor needs these capabilities during startup:
// SYS_ADMIN – seccomp filter installation, namespace creation, Landlock
// NET_ADMIN – network namespace veth setup, IP/route configuration
// SYS_PTRACE – reading /proc/<pid>/exe and ancestor walk for policy
// SYSLOG – reading /dev/kmsg for bypass-detection diagnostics
// SETUID – drop_privileges(): setuid() to the sandbox user
// SETGID – drop_privileges(): setgid() + initgroups() to the sandbox group
// DAC_READ_SEARCH – reading /proc/<pid>/fd/ across UIDs for process
// identity resolution. In rootless Podman the supervisor
// runs as UID 0 inside a user namespace while sandbox
// processes run as the sandbox user. The kernel's
// proc_fd_permission() calls generic_permission() which
// denies cross-UID access to the dr-x------ fd directory
// unless CAP_DAC_READ_SEARCH is present. Without it the
// proxy cannot determine which binary made each outbound
// connection and all traffic is denied.
// SETUID/SETGID are needed in rootless Podman: cap_drop:ALL removes them from
// the bounding set even though uid=0 owns the user namespace. Without them,
// setuid/setgid fail EPERM and the supervisor cannot drop to the sandbox user.
cap_drop: vec!["ALL".into()],
// Podman's default container capability set is already restricted:
// CHOWN DAC_OVERRIDE FOWNER FSETID KILL SETGID SETUID SETPCAP
// NET_BIND_SERVICE SYS_CHROOT SETFCAP
// We add what the supervisor needs and drop what it doesn't.
cap_drop: vec![
// Not needed: standard file permission bits are sufficient; dropping
// prevents the supervisor from bypassing DAC checks it shouldn't need.
"DAC_OVERRIDE".into(),
// Not needed: the supervisor does not create setuid/setgid executables.
"FSETID".into(),
// Not needed: the supervisor does not send signals to arbitrary processes.
"KILL".into(),
// Not needed: the supervisor does not bind privileged ports (<1024).
"NET_BIND_SERVICE".into(),
// Not in Podman's default set but explicitly denied in case the image
// or runtime adds it; raw sockets are not required.
"NET_RAW".into(),
// Not needed: the supervisor does not manipulate file capabilities.
"SETFCAP".into(),
// Not needed: the supervisor does not manage its own capability bounding set.
"SETPCAP".into(),
// Not needed: the supervisor does not call chroot().
"SYS_CHROOT".into(),
],
cap_add: vec![
// seccomp filter installation, namespace creation, Landlock setup.
"SYS_ADMIN".into(),
// Network namespace veth setup, IP/route configuration.
"NET_ADMIN".into(),
// Reading /proc/<pid>/exe and ancestor walk for process identity in policy.
"SYS_PTRACE".into(),
// Reading /dev/kmsg for bypass-detection diagnostics.
"SYSLOG".into(),
"SETUID".into(),
"SETGID".into(),
// Reading /proc/<pid>/fd/ across UIDs for process identity resolution.
// In rootless Podman the supervisor runs as UID 0 inside a user namespace
// while sandbox processes run as the sandbox user. The kernel's
// proc_fd_permission() calls generic_permission() which denies cross-UID
// access to the dr-x------ fd directory unless this cap is present.
// Without it the proxy cannot determine which binary made each outbound
// connection and all traffic is denied.
"DAC_READ_SEARCH".into(),
],
// Disable the container-level seccomp profile. The sandbox supervisor
// SETUID, SETGID, CHOWN, and FOWNER are intentionally kept from Podman's
// default set and not dropped:
// SETUID/SETGID – drop_privileges(): setuid()/setgid()/initgroups() to the
// sandbox user. In rootless Podman cap_drop:ALL removes them
// from the bounding set even though uid=0 owns the user
// namespace — so we keep them by not dropping them explicitly.
// CHOWN – prepare_filesystem(): chown(path, uid, gid) on newly
// created read_write directories so the sandbox user can
// write to them.
// FOWNER – chown on files where the supervisor is not the owner
// (e.g. pre-existing directories owned by another user).
//
// Disable the container-level seccomp profile. The sandbox supervisor The sandbox supervisor
// installs its own policy-aware BPF seccomp filter at runtime via
// seccompiler (two-phase: clone3 blocker + main filter). The runtime
// filter is more restrictive than Podman's default — it blocks 20+
Expand Down Expand Up @@ -596,17 +621,46 @@ mod tests {
let config = test_config();
let spec = build_container_spec(&sandbox, &config);

let cap_add = spec["cap_add"]
let added: Vec<&str> = spec["cap_add"]
.as_array()
.expect("cap_add should be an array")
.iter()
.filter_map(|v| v.as_str())
.collect();
assert!(added.contains(&"SYS_ADMIN"), "missing SYS_ADMIN");
assert!(added.contains(&"NET_ADMIN"), "missing NET_ADMIN");
assert!(added.contains(&"SYS_PTRACE"), "missing SYS_PTRACE");
assert!(added.contains(&"SYSLOG"), "missing SYSLOG");
assert!(
added.contains(&"DAC_READ_SEARCH"),
"missing DAC_READ_SEARCH"
);

// SETUID and SETGID are NOT in cap_add — they remain available from the
// default bounding set because we no longer use cap_drop:ALL. Verify they
// are also not explicitly dropped. Similarly CHOWN and FOWNER must not be
// dropped because prepare_filesystem() calls chown() on newly created
// read_write directories before the supervisor drops privileges.
let dropped: Vec<&str> = spec["cap_drop"]
.as_array()
.expect("cap_add should be an array");
let caps: Vec<&str> = cap_add.iter().filter_map(|v| v.as_str()).collect();
assert!(caps.contains(&"SYS_ADMIN"), "missing SYS_ADMIN");
assert!(caps.contains(&"NET_ADMIN"), "missing NET_ADMIN");
assert!(caps.contains(&"SYS_PTRACE"), "missing SYS_PTRACE");
assert!(caps.contains(&"SYSLOG"), "missing SYSLOG");
assert!(caps.contains(&"SETUID"), "missing SETUID");
assert!(caps.contains(&"SETGID"), "missing SETGID");
assert!(caps.contains(&"DAC_READ_SEARCH"), "missing DAC_READ_SEARCH");
.expect("cap_drop should be an array")
.iter()
.filter_map(|v| v.as_str())
.collect();
assert!(!dropped.contains(&"SETUID"), "SETUID must not be dropped");
assert!(!dropped.contains(&"SETGID"), "SETGID must not be dropped");
assert!(
!dropped.contains(&"CHOWN"),
"CHOWN must not be dropped (needed for prepare_filesystem chown)"
);
assert!(
!dropped.contains(&"FOWNER"),
"FOWNER must not be dropped (needed for chown on non-owned files)"
);
assert!(
!dropped.contains(&"ALL"),
"must not use cap_drop:ALL in rootless Podman"
);
}

#[test]
Expand Down
12 changes: 8 additions & 4 deletions e2e/rust/e2e-podman.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ if [ -z "${PORT}" ]; then
PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')
fi

# Allocate a separate port for the unauthenticated health endpoint.
HEALTH_PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()')

# ── Pre-flight checks ───────────────────────────────────────────────

if ! command -v podman &>/dev/null; then
Expand Down Expand Up @@ -73,7 +76,7 @@ if ! podman image exists "${SUPERVISOR_IMAGE}" 2>/dev/null; then
fi

# ── Generate a unique handshake secret ───────────────────────────────
HANDSHAKE_SECRET="e2e-podman-$(head -c 16 /dev/urandom | xxd -p)"
HANDSHAKE_SECRET="e2e-podman-$(python3 -c 'import secrets; print(secrets.token_hex(16))')"

# ── Start the gateway ────────────────────────────────────────────────
GW_LOG=$(mktemp /tmp/openshell-gw-podman-e2e.XXXXXX)
Expand Down Expand Up @@ -118,6 +121,8 @@ OPENSHELL_SSH_HANDSHAKE_SECRET="${HANDSHAKE_SECRET}" \
OPENSHELL_SUPERVISOR_IMAGE="${SUPERVISOR_IMAGE}" \
"${GATEWAY_BIN}" \
--port "${PORT}" \
--health-port "${HEALTH_PORT}" \
--ssh-gateway-port "${PORT}" \
--drivers podman \
--disable-tls \
--db-url "sqlite::memory:" \
Expand All @@ -137,9 +142,8 @@ while [ "${elapsed}" -lt "${TIMEOUT}" ]; do
exit 1
fi

# Use curl to check the gateway's gRPC health endpoint.
# The gateway serves both gRPC and HTTP on the same port.
if curl -sf "http://127.0.0.1:${PORT}/healthz" >/dev/null 2>&1; then
# Poll the dedicated health port (--health-port).
if curl -sf "http://127.0.0.1:${HEALTH_PORT}/healthz" >/dev/null 2>&1; then
healthy=true
break
fi
Expand Down
Loading