diff --git a/.air.toml b/.air.toml index 5e97b31b..9dd4d2ab 100644 --- a/.air.toml +++ b/.air.toml @@ -16,11 +16,11 @@ tmp_dir = "tmp" include_dir = [] include_ext = ["go", "tpl", "tmpl", "html", "yaml"] include_file = [] - kill_delay = "5s" log = "build-errors.log" poll = false poll_interval = 0 post_cmd = [] + kill_delay = '1s' rerun = false rerun_delay = 500 send_interrupt = true diff --git a/Makefile b/Makefile index 56d65fff..6a485fc1 100644 --- a/Makefile +++ b/Makefile @@ -174,6 +174,7 @@ build-all: build build-exec # Run in development mode with hot reload dev: ensure-ch-binaries ensure-caddy-binaries lib/system/exec_agent/exec-agent $(AIR) + @rm -f ./tmp/main $(AIR) -c .air.toml # Run tests (as root for network capabilities, enables caching and parallelism) diff --git a/cmd/api/main.go b/cmd/api/main.go index d5a9b67b..13d3ca84 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -37,6 +37,7 @@ func main() { slog.Error("application terminated", "error", err) os.Exit(1) } + slog.Info("main() exiting normally") } func run() error { @@ -61,11 +62,13 @@ func run() error { } if otelShutdown != nil { defer func() { + slog.Info("shutting down OpenTelemetry") shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() if err := otelShutdown(shutdownCtx); err != nil { slog.Warn("error shutting down OpenTelemetry", "error", err) } + slog.Info("OpenTelemetry shutdown complete") }() } @@ -91,10 +94,18 @@ func run() error { if err != nil { return fmt.Errorf("initialize application: %w", err) } - defer cleanup() + defer func() { + slog.Info("cleaning up application resources") + cleanup() + slog.Info("application cleanup complete") + }() ctx, stop := signal.NotifyContext(app.Ctx, os.Interrupt, syscall.SIGTERM) - defer stop() + defer func() { + slog.Info("stopping signal handler") + stop() + slog.Info("signal handler stopped") + }() logger := app.Logger @@ -150,7 +161,7 @@ func run() error { logger.Error("failed to initialize ingress manager", "error", err) return fmt.Errorf("initialize ingress manager: %w", err) } - logger.Info("Ingress manager initialized", "listen_addr", cfg.CaddyListenAddress, "admin", fmt.Sprintf("%s:%d", cfg.CaddyAdminAddress, cfg.CaddyAdminPort)) + logger.Info("Ingress manager initialized", "listen_addr", cfg.CaddyListenAddress, "admin", app.IngressManager.AdminURL()) // Create router r := chi.NewRouter() @@ -308,7 +319,7 @@ func run() error { logger.Info("http server shutdown complete") // Shutdown ingress manager (stops Caddy if CADDY_STOP_ON_SHUTDOWN=true) - if err := app.IngressManager.Shutdown(); err != nil { + if err := app.IngressManager.Shutdown(shutdownCtx); err != nil { logger.Error("failed to shutdown ingress manager", "error", err) // Don't return error - continue with shutdown } else { @@ -338,7 +349,9 @@ func run() error { } }) - return grp.Wait() + err = grp.Wait() + slog.Info("all goroutines finished") + return err } // getRunningInstanceIDs returns IDs of instances currently in Running state diff --git a/lib/ingress/daemon.go b/lib/ingress/daemon.go index 86fe5101..f31511e1 100644 --- a/lib/ingress/daemon.go +++ b/lib/ingress/daemon.go @@ -3,8 +3,10 @@ package ingress import ( "bytes" "context" + "encoding/json" "fmt" "io" + "net" "net/http" "os" "os/exec" @@ -32,12 +34,13 @@ const ( type CaddyDaemon struct { paths *paths.Paths adminAddress string - adminPort int + adminPort int // actual port to use (resolved from config or picked fresh) pid int stopOnShutdown bool } // NewCaddyDaemon creates a new CaddyDaemon manager. +// If adminPort is 0, it will be resolved later from existing config or picked fresh. func NewCaddyDaemon(p *paths.Paths, adminAddress string, adminPort int, stopOnShutdown bool) *CaddyDaemon { return &CaddyDaemon{ paths: p, @@ -52,8 +55,15 @@ func (d *CaddyDaemon) StopOnShutdown() bool { return d.stopOnShutdown } +// AdminPort returns the admin port. If it was configured as 0 (random), +// this returns the actual port after it's been resolved. +func (d *CaddyDaemon) AdminPort() int { + return d.adminPort +} + // Start starts the Caddy daemon. If Caddy is already running (discovered via PID file // or admin API), this is a no-op and returns the existing PID. +// Note: adminPort must be resolved (non-zero) before calling Start. func (d *CaddyDaemon) Start(ctx context.Context) (int, error) { // Check if already running if pid, running := d.DiscoverRunning(); running { @@ -64,6 +74,48 @@ func (d *CaddyDaemon) Start(ctx context.Context) (int, error) { return d.startCaddy(ctx) } +// pickAvailablePort finds an available TCP port by briefly binding to port 0. +func pickAvailablePort(address string) (int, error) { + listener, err := net.Listen("tcp", address+":0") + if err != nil { + return 0, err + } + port := listener.Addr().(*net.TCPAddr).Port + listener.Close() + return port, nil +} + +// readPortFromConfig reads the admin port from the existing Caddy config file. +func (d *CaddyDaemon) readPortFromConfig() int { + data, err := os.ReadFile(d.paths.CaddyConfig()) + if err != nil { + return 0 + } + + var config struct { + Admin struct { + Listen string `json:"listen"` + } `json:"admin"` + } + if err := json.Unmarshal(data, &config); err != nil { + return 0 + } + + // Parse "127.0.0.1:2019" format + if config.Admin.Listen == "" { + return 0 + } + _, portStr, err := net.SplitHostPort(config.Admin.Listen) + if err != nil { + return 0 + } + port, err := strconv.Atoi(portStr) + if err != nil { + return 0 + } + return port +} + // startCaddy starts a new Caddy process. func (d *CaddyDaemon) startCaddy(ctx context.Context) (int, error) { // Get binary path (extracts if needed) @@ -152,9 +204,12 @@ func (d *CaddyDaemon) startCaddy(ctx context.Context) (int, error) { } // Stop gracefully stops the Caddy daemon. -func (d *CaddyDaemon) Stop() error { +func (d *CaddyDaemon) Stop(ctx context.Context) error { + log := logger.FromContext(ctx) + pid, running := d.DiscoverRunning() if !running { + log.InfoContext(ctx, "Caddy not running, nothing to stop") return nil } @@ -162,17 +217,24 @@ func (d *CaddyDaemon) Stop() error { client := &http.Client{Timeout: 5 * time.Second} adminURL := fmt.Sprintf("http://%s:%d/stop", d.adminAddress, d.adminPort) resp, err := client.Post(adminURL, "", nil) - if err == nil { + if err != nil { + log.InfoContext(ctx, "Caddy admin API stop request failed, will try SIGTERM", "error", err) + } else { resp.Body.Close() + log.InfoContext(ctx, "Caddy admin API stop request sent") } // Wait for process to exit after admin API stop (up to 5s) + log.InfoContext(ctx, "waiting for Caddy process to exit", "pid", pid) if d.waitForProcessExit(pid, 5*time.Second) { os.Remove(d.paths.CaddyPIDFile()) d.pid = 0 + log.InfoContext(ctx, "Caddy stopped via admin API") return nil } + log.InfoContext(ctx, "Caddy still running after admin API stop, sending SIGTERM") + // Send SIGTERM if still running if proc, err := os.FindProcess(pid); err == nil { proc.Signal(syscall.SIGTERM) @@ -182,9 +244,12 @@ func (d *CaddyDaemon) Stop() error { if d.waitForProcessExit(pid, 2*time.Second) { os.Remove(d.paths.CaddyPIDFile()) d.pid = 0 + log.InfoContext(ctx, "Caddy stopped via SIGTERM") return nil } + log.InfoContext(ctx, "Caddy still running after SIGTERM, sending SIGKILL") + // Final resort: SIGKILL if proc, err := os.FindProcess(pid); err == nil { proc.Signal(syscall.SIGKILL) @@ -197,6 +262,7 @@ func (d *CaddyDaemon) Stop() error { os.Remove(d.paths.CaddyPIDFile()) d.pid = 0 + log.InfoContext(ctx, "Caddy stopped via SIGKILL") return nil } @@ -303,6 +369,9 @@ func (d *CaddyDaemon) waitForAdmin(ctx context.Context) error { // isAdminResponding checks if the admin API is responding. func (d *CaddyDaemon) isAdminResponding() bool { + if d.adminPort == 0 { + return false + } client := &http.Client{Timeout: 1 * time.Second} adminURL := fmt.Sprintf("http://%s:%d/config/", d.adminAddress, d.adminPort) resp, err := client.Get(adminURL) @@ -315,13 +384,38 @@ func (d *CaddyDaemon) isAdminResponding() bool { } // isProcessRunning checks if a process with the given PID is running. +// Returns false for zombie processes (which have exited but not been reaped). func (d *CaddyDaemon) isProcessRunning(pid int) bool { + // Check if process exists proc, err := os.FindProcess(pid) if err != nil { return false } err = proc.Signal(syscall.Signal(0)) - return err == nil + if err != nil { + return false + } + + // Check if it's a zombie process by reading /proc/[pid]/stat + // A zombie has state 'Z' and shouldn't be considered "running" + statPath := fmt.Sprintf("/proc/%d/stat", pid) + data, err := os.ReadFile(statPath) + if err != nil { + // Can't read stat, process might have exited + return false + } + + // Format: pid (comm) state ... + // Find the state character after the closing parenthesis + statStr := string(data) + closeParenIdx := strings.LastIndex(statStr, ")") + if closeParenIdx == -1 || closeParenIdx+2 >= len(statStr) { + return false + } + state := statStr[closeParenIdx+2] // Skip ") " to get state char + + // Z = zombie, X = dead + return state != 'Z' && state != 'X' } // findCaddyPID tries to find the Caddy process PID by scanning /proc. diff --git a/lib/ingress/manager.go b/lib/ingress/manager.go index 5cdbf19e..8622e5a6 100644 --- a/lib/ingress/manager.go +++ b/lib/ingress/manager.go @@ -51,7 +51,11 @@ type Manager interface { Delete(ctx context.Context, idOrName string) error // Shutdown gracefully stops the ingress subsystem. - Shutdown() error + Shutdown(ctx context.Context) error + + // AdminURL returns the Caddy admin API URL. + // Only valid after Initialize() has been called. + AdminURL() string } // DefaultDNSPort is the default port for the internal DNS server. @@ -152,13 +156,31 @@ func (m *manager) Initialize(ctx context.Context) error { return fmt.Errorf("start DNS server: %w", err) } - // Create config generator now that DNS server is running and we know the actual port - // (important when DNSPort was configured as 0 for random port) + // Resolve the admin port before creating config generator. + // If configured as 0, try to read from existing config or pick a new port. + adminPort := m.config.AdminPort + if adminPort == 0 { + // Try to read port from existing Caddy config + if existingPort := m.daemon.readPortFromConfig(); existingPort > 0 { + adminPort = existingPort + } else { + // Pick a new available port + port, err := pickAvailablePort(m.config.AdminAddress) + if err != nil { + return fmt.Errorf("pick admin port: %w", err) + } + adminPort = port + } + // Update daemon with resolved port + m.daemon.adminPort = adminPort + } + + // Create config generator with resolved ports m.configGenerator = NewCaddyConfigGenerator( m.paths, m.config.ListenAddress, m.config.AdminAddress, - m.config.AdminPort, + adminPort, m.config.ACME, m.dnsServer.Port(), ) @@ -463,10 +485,12 @@ func (m *manager) Delete(ctx context.Context, idOrName string) error { } // Shutdown gracefully stops the ingress subsystem. -func (m *manager) Shutdown() error { +func (m *manager) Shutdown(ctx context.Context) error { m.mu.Lock() defer m.mu.Unlock() + log := logger.FromContext(ctx) + // Stop log forwarder if m.logForwarder != nil { m.logForwarder.Stop() @@ -474,20 +498,34 @@ func (m *manager) Shutdown() error { // Stop DNS server if m.dnsServer != nil { + log.InfoContext(ctx, "stopping DNS server") if err := m.dnsServer.Stop(); err != nil { - // Log but don't fail - continue with shutdown - slog.Warn("failed to stop DNS server", "error", err) + log.WarnContext(ctx, "failed to stop DNS server", "error", err) + } else { + log.InfoContext(ctx, "stopped DNS server") } } // Only stop Caddy if configured to do so if m.daemon.StopOnShutdown() { - return m.daemon.Stop() + log.InfoContext(ctx, "stopping Caddy daemon") + if err := m.daemon.Stop(ctx); err != nil { + log.ErrorContext(ctx, "failed to stop Caddy daemon", "error", err) + return err + } + log.InfoContext(ctx, "stopped Caddy daemon") + return nil } + log.InfoContext(ctx, "leaving Caddy daemon running (CADDY_STOP_ON_SHUTDOWN=false)") return nil } +// AdminURL returns the Caddy admin API URL. +func (m *manager) AdminURL() string { + return m.daemon.AdminURL() +} + // loadAllIngresses loads all ingresses and converts them to the Ingress type. func (m *manager) loadAllIngresses() ([]Ingress, error) { storedList, err := loadAllIngresses(m.paths) diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index fe2e0f30..7e0b2dcf 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -376,7 +376,7 @@ func TestBasicEndToEnd(t *testing.T) { // Ensure we clean up Caddy - use t.Cleanup for guaranteed cleanup even on test failures t.Cleanup(func() { t.Log("Shutting down Caddy...") - if err := ingressManager.Shutdown(); err != nil { + if err := ingressManager.Shutdown(context.Background()); err != nil { t.Logf("Warning: failed to shutdown ingress manager: %v", err) } }) @@ -502,7 +502,7 @@ func TestBasicEndToEnd(t *testing.T) { // Use t.Cleanup for guaranteed cleanup even on test failures t.Cleanup(func() { t.Log("Shutting down TLS Caddy...") - if err := tlsIngressManager.Shutdown(); err != nil { + if err := tlsIngressManager.Shutdown(context.Background()); err != nil { t.Logf("Warning: failed to shutdown TLS ingress manager: %v", err) } })