Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .air.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ tmp_dir = "tmp"
include_dir = []
include_ext = ["go", "tpl", "tmpl", "html", "yaml"]
include_file = []
kill_delay = "5s"
log = "build-errors.log"
poll = false
poll_interval = 0
post_cmd = []
kill_delay = '1s'
rerun = false
rerun_delay = 500
send_interrupt = true
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ build-all: build build-exec

# Run in development mode with hot reload
dev: ensure-ch-binaries ensure-caddy-binaries lib/system/exec_agent/exec-agent $(AIR)
@rm -f ./tmp/main
$(AIR) -c .air.toml

# Run tests (as root for network capabilities, enables caching and parallelism)
Expand Down
23 changes: 18 additions & 5 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func main() {
slog.Error("application terminated", "error", err)
os.Exit(1)
}
slog.Info("main() exiting normally")
}

func run() error {
Expand All @@ -61,11 +62,13 @@ func run() error {
}
if otelShutdown != nil {
defer func() {
slog.Info("shutting down OpenTelemetry")
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := otelShutdown(shutdownCtx); err != nil {
slog.Warn("error shutting down OpenTelemetry", "error", err)
}
slog.Info("OpenTelemetry shutdown complete")
}()
}

Expand All @@ -91,10 +94,18 @@ func run() error {
if err != nil {
return fmt.Errorf("initialize application: %w", err)
}
defer cleanup()
defer func() {
slog.Info("cleaning up application resources")
cleanup()
slog.Info("application cleanup complete")
}()

ctx, stop := signal.NotifyContext(app.Ctx, os.Interrupt, syscall.SIGTERM)
defer stop()
defer func() {
slog.Info("stopping signal handler")
stop()
slog.Info("signal handler stopped")
}()

logger := app.Logger

Expand Down Expand Up @@ -150,7 +161,7 @@ func run() error {
logger.Error("failed to initialize ingress manager", "error", err)
return fmt.Errorf("initialize ingress manager: %w", err)
}
logger.Info("Ingress manager initialized", "listen_addr", cfg.CaddyListenAddress, "admin", fmt.Sprintf("%s:%d", cfg.CaddyAdminAddress, cfg.CaddyAdminPort))
logger.Info("Ingress manager initialized", "listen_addr", cfg.CaddyListenAddress, "admin", app.IngressManager.AdminURL())

// Create router
r := chi.NewRouter()
Expand Down Expand Up @@ -308,7 +319,7 @@ func run() error {
logger.Info("http server shutdown complete")

// Shutdown ingress manager (stops Caddy if CADDY_STOP_ON_SHUTDOWN=true)
if err := app.IngressManager.Shutdown(); err != nil {
if err := app.IngressManager.Shutdown(shutdownCtx); err != nil {
logger.Error("failed to shutdown ingress manager", "error", err)
// Don't return error - continue with shutdown
} else {
Expand Down Expand Up @@ -338,7 +349,9 @@ func run() error {
}
})

return grp.Wait()
err = grp.Wait()
slog.Info("all goroutines finished")
return err
}

// getRunningInstanceIDs returns IDs of instances currently in Running state
Expand Down
102 changes: 98 additions & 4 deletions lib/ingress/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ package ingress
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"os"
"os/exec"
Expand Down Expand Up @@ -32,12 +34,13 @@ const (
type CaddyDaemon struct {
	// paths resolves on-disk locations used by the daemon, such as the Caddy
	// config file and the Caddy PID file.
	paths *paths.Paths
	// adminAddress is the host part of the Caddy admin API endpoint.
	adminAddress string
	// adminPort is the actual admin API port to use (resolved from config or
	// picked fresh). A value of 0 means the port has not been resolved yet.
	adminPort int
	// pid is the PID tracked for the Caddy process; it is reset to 0 once the
	// process has been stopped.
	pid int
	// stopOnShutdown is the value reported by StopOnShutdown.
	stopOnShutdown bool
}

// NewCaddyDaemon creates a new CaddyDaemon manager.
// If adminPort is 0, it will be resolved later from existing config or picked fresh.
func NewCaddyDaemon(p *paths.Paths, adminAddress string, adminPort int, stopOnShutdown bool) *CaddyDaemon {
return &CaddyDaemon{
paths: p,
Expand All @@ -52,8 +55,15 @@ func (d *CaddyDaemon) StopOnShutdown() bool {
return d.stopOnShutdown
}

// AdminPort reports the admin API port currently stored on the daemon.
// When the port was configured as 0 (random), the value returned here is the
// concrete port once it has been resolved and written back to the daemon.
func (d *CaddyDaemon) AdminPort() int {
	port := d.adminPort
	return port
}

// Start starts the Caddy daemon. If Caddy is already running (discovered via PID file
// or admin API), this is a no-op and returns the existing PID.
// Note: adminPort must be resolved (non-zero) before calling Start.
func (d *CaddyDaemon) Start(ctx context.Context) (int, error) {
// Check if already running
if pid, running := d.DiscoverRunning(); running {
Expand All @@ -64,6 +74,48 @@ func (d *CaddyDaemon) Start(ctx context.Context) (int, error) {
return d.startCaddy(ctx)
}

// pickAvailablePort finds an available TCP port by briefly binding to port 0
// on address and reporting the port the operating system assigned.
//
// address is a bare host: an IPv4 address, an IPv6 literal (with or without
// surrounding brackets), a hostname, or "" for all interfaces. The host and
// port are combined with net.JoinHostPort so that IPv6 literals such as "::1"
// are bracketed correctly instead of producing an unparseable "::1:0".
//
// The listener is closed before returning, so the port is only known to have
// been free at the time of the call; the caller should bind it promptly.
//
// It returns 0 and a non-nil error if the address cannot be bound.
func pickAvailablePort(address string) (int, error) {
	// Accept an already-bracketed IPv6 literal ("[::1]"); JoinHostPort adds
	// the brackets itself and would otherwise double them.
	if n := len(address); n >= 2 && address[0] == '[' && address[n-1] == ']' {
		address = address[1 : n-1]
	}

	listener, err := net.Listen("tcp", net.JoinHostPort(address, "0"))
	if err != nil {
		return 0, err
	}
	// Release the port on every return path. The Close error is deliberately
	// ignored: the listener was only a probe and nothing can be done about it.
	defer listener.Close()

	tcpAddr, ok := listener.Addr().(*net.TCPAddr)
	if !ok {
		return 0, fmt.Errorf("unexpected listener address type %T", listener.Addr())
	}
	return tcpAddr.Port, nil
}

// readPortFromConfig extracts the admin API port from the Caddy config file
// already on disk. It looks at the "admin.listen" value (for example
// "127.0.0.1:2019") and returns its port component. Any failure along the way
// (unreadable file, invalid JSON, missing or malformed listen address,
// non-numeric port) yields 0.
func (d *CaddyDaemon) readPortFromConfig() int {
	raw, readErr := os.ReadFile(d.paths.CaddyConfig())
	if readErr != nil {
		return 0
	}

	// Only the admin listen address is of interest; everything else in the
	// config is ignored by the decoder.
	type adminSection struct {
		Listen string `json:"listen"`
	}
	var parsed struct {
		Admin adminSection `json:"admin"`
	}
	if json.Unmarshal(raw, &parsed) != nil {
		return 0
	}

	listen := parsed.Admin.Listen
	if listen == "" {
		return 0
	}

	// Expected shape is "host:port".
	if _, portText, splitErr := net.SplitHostPort(listen); splitErr == nil {
		if n, convErr := strconv.Atoi(portText); convErr == nil {
			return n
		}
	}
	return 0
}

// startCaddy starts a new Caddy process.
func (d *CaddyDaemon) startCaddy(ctx context.Context) (int, error) {
// Get binary path (extracts if needed)
Expand Down Expand Up @@ -152,27 +204,37 @@ func (d *CaddyDaemon) startCaddy(ctx context.Context) (int, error) {
}

// Stop gracefully stops the Caddy daemon.
func (d *CaddyDaemon) Stop() error {
func (d *CaddyDaemon) Stop(ctx context.Context) error {
log := logger.FromContext(ctx)

pid, running := d.DiscoverRunning()
if !running {
log.InfoContext(ctx, "Caddy not running, nothing to stop")
return nil
}

// Try graceful shutdown via admin API first
client := &http.Client{Timeout: 5 * time.Second}
adminURL := fmt.Sprintf("http://%s:%d/stop", d.adminAddress, d.adminPort)
resp, err := client.Post(adminURL, "", nil)
if err == nil {
if err != nil {
log.InfoContext(ctx, "Caddy admin API stop request failed, will try SIGTERM", "error", err)
} else {
resp.Body.Close()
log.InfoContext(ctx, "Caddy admin API stop request sent")
}

// Wait for process to exit after admin API stop (up to 5s)
log.InfoContext(ctx, "waiting for Caddy process to exit", "pid", pid)
if d.waitForProcessExit(pid, 5*time.Second) {
os.Remove(d.paths.CaddyPIDFile())
d.pid = 0
log.InfoContext(ctx, "Caddy stopped via admin API")
return nil
}

log.InfoContext(ctx, "Caddy still running after admin API stop, sending SIGTERM")

// Send SIGTERM if still running
if proc, err := os.FindProcess(pid); err == nil {
proc.Signal(syscall.SIGTERM)
Expand All @@ -182,9 +244,12 @@ func (d *CaddyDaemon) Stop() error {
if d.waitForProcessExit(pid, 2*time.Second) {
os.Remove(d.paths.CaddyPIDFile())
d.pid = 0
log.InfoContext(ctx, "Caddy stopped via SIGTERM")
return nil
}

log.InfoContext(ctx, "Caddy still running after SIGTERM, sending SIGKILL")

// Final resort: SIGKILL
if proc, err := os.FindProcess(pid); err == nil {
proc.Signal(syscall.SIGKILL)
Expand All @@ -197,6 +262,7 @@ func (d *CaddyDaemon) Stop() error {
os.Remove(d.paths.CaddyPIDFile())
d.pid = 0

log.InfoContext(ctx, "Caddy stopped via SIGKILL")
return nil
}

Expand Down Expand Up @@ -303,6 +369,9 @@ func (d *CaddyDaemon) waitForAdmin(ctx context.Context) error {

// isAdminResponding checks if the admin API is responding.
func (d *CaddyDaemon) isAdminResponding() bool {
if d.adminPort == 0 {
return false
}
client := &http.Client{Timeout: 1 * time.Second}
adminURL := fmt.Sprintf("http://%s:%d/config/", d.adminAddress, d.adminPort)
resp, err := client.Get(adminURL)
Expand All @@ -315,13 +384,38 @@ func (d *CaddyDaemon) isAdminResponding() bool {
}

// isProcessRunning checks if a process with the given PID is running.
// Returns false for zombie processes (which have exited but not been reaped)
// and for any PID whose /proc/[pid]/stat entry cannot be read or parsed.
func (d *CaddyDaemon) isProcessRunning(pid int) bool {
	proc, err := os.FindProcess(pid)
	if err != nil {
		return false
	}

	// Probe the process with signal 0. A failure means it cannot be signalled,
	// so treat it as not running. Success alone is not enough: the zombie check
	// below must still run.
	if err := proc.Signal(syscall.Signal(0)); err != nil {
		return false
	}

	// Check if it's a zombie process by reading /proc/[pid]/stat.
	// A zombie has state 'Z' and shouldn't be considered "running".
	data, err := os.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
	if err != nil {
		// Can't read stat, process might have exited.
		return false
	}

	// Format: pid (comm) state ...
	// comm may itself contain ')' characters, so locate the LAST closing
	// parenthesis and read the state character that follows ") ".
	statStr := string(data)
	closeParenIdx := strings.LastIndex(statStr, ")")
	if closeParenIdx == -1 || closeParenIdx+2 >= len(statStr) {
		return false
	}
	state := statStr[closeParenIdx+2]

	// Z = zombie, X = dead
	return state != 'Z' && state != 'X'
}

// findCaddyPID tries to find the Caddy process PID by scanning /proc.
Expand Down
54 changes: 46 additions & 8 deletions lib/ingress/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ type Manager interface {
Delete(ctx context.Context, idOrName string) error

// Shutdown gracefully stops the ingress subsystem.
Shutdown() error
Shutdown(ctx context.Context) error

// AdminURL returns the Caddy admin API URL.
// Only valid after Initialize() has been called.
AdminURL() string
}

// DefaultDNSPort is the default port for the internal DNS server.
Expand Down Expand Up @@ -152,13 +156,31 @@ func (m *manager) Initialize(ctx context.Context) error {
return fmt.Errorf("start DNS server: %w", err)
}

// Create config generator now that DNS server is running and we know the actual port
// (important when DNSPort was configured as 0 for random port)
// Resolve the admin port before creating config generator.
// If configured as 0, try to read from existing config or pick a new port.
adminPort := m.config.AdminPort
if adminPort == 0 {
// Try to read port from existing Caddy config
if existingPort := m.daemon.readPortFromConfig(); existingPort > 0 {
adminPort = existingPort
} else {
// Pick a new available port
port, err := pickAvailablePort(m.config.AdminAddress)
if err != nil {
return fmt.Errorf("pick admin port: %w", err)
}
adminPort = port
}
// Update daemon with resolved port
m.daemon.adminPort = adminPort
}

// Create config generator with resolved ports
m.configGenerator = NewCaddyConfigGenerator(
m.paths,
m.config.ListenAddress,
m.config.AdminAddress,
m.config.AdminPort,
adminPort,
m.config.ACME,
m.dnsServer.Port(),
)
Expand Down Expand Up @@ -463,31 +485,47 @@ func (m *manager) Delete(ctx context.Context, idOrName string) error {
}

// Shutdown gracefully stops the ingress subsystem.
func (m *manager) Shutdown() error {
func (m *manager) Shutdown(ctx context.Context) error {
m.mu.Lock()
defer m.mu.Unlock()

log := logger.FromContext(ctx)

// Stop log forwarder
if m.logForwarder != nil {
m.logForwarder.Stop()
}

// Stop DNS server
if m.dnsServer != nil {
log.InfoContext(ctx, "stopping DNS server")
if err := m.dnsServer.Stop(); err != nil {
// Log but don't fail - continue with shutdown
slog.Warn("failed to stop DNS server", "error", err)
log.WarnContext(ctx, "failed to stop DNS server", "error", err)
} else {
log.InfoContext(ctx, "stopped DNS server")
}
}

// Only stop Caddy if configured to do so
if m.daemon.StopOnShutdown() {
return m.daemon.Stop()
log.InfoContext(ctx, "stopping Caddy daemon")
if err := m.daemon.Stop(ctx); err != nil {
log.ErrorContext(ctx, "failed to stop Caddy daemon", "error", err)
return err
}
log.InfoContext(ctx, "stopped Caddy daemon")
return nil
}

log.InfoContext(ctx, "leaving Caddy daemon running (CADDY_STOP_ON_SHUTDOWN=false)")
return nil
}

// AdminURL reports the Caddy admin API URL by delegating to the underlying
// daemon. Per the Manager interface contract, the value is only valid after
// Initialize() has been called.
func (m *manager) AdminURL() string {
	adminURL := m.daemon.AdminURL()
	return adminURL
}

// loadAllIngresses loads all ingresses and converts them to the Ingress type.
func (m *manager) loadAllIngresses() ([]Ingress, error) {
storedList, err := loadAllIngresses(m.paths)
Expand Down
Loading