From 77ed14e530a2b2d92df3eda87d5ed54d5a70d4c5 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 22 Dec 2025 17:11:14 -0500 Subject: [PATCH 01/14] Generate QEMU implementation --- cmd/api/api/api_test.go | 2 +- cmd/api/api/instances.go | 12 ++ cmd/api/config/config.go | 6 + go.mod | 2 + go.sum | 13 ++ lib/devices/gpu_e2e_test.go | 2 +- lib/devices/gpu_inference_test.go | 2 +- lib/devices/gpu_module_test.go | 4 +- lib/hypervisor/hypervisor.go | 3 +- lib/hypervisor/qemu/config.go | 79 +++++++++ lib/hypervisor/qemu/config_test.go | 161 +++++++++++++++++++ lib/hypervisor/qemu/process.go | 171 ++++++++++++++++++++ lib/hypervisor/qemu/qemu.go | 132 +++++++++++++++ lib/hypervisor/qemu/qmp.go | 150 +++++++++++++++++ lib/hypervisor/qemu/qmp_test.go | 100 ++++++++++++ lib/instances/create.go | 9 +- lib/instances/manager.go | 16 +- lib/instances/manager_test.go | 4 +- lib/instances/resource_limits_test.go | 4 +- lib/instances/standby.go | 6 +- lib/instances/types.go | 1 + lib/oapi/oapi.go | 221 ++++++++++++++------------ lib/providers/providers.go | 4 +- openapi.yaml | 10 ++ 24 files changed, 997 insertions(+), 117 deletions(-) create mode 100644 lib/hypervisor/qemu/config.go create mode 100644 lib/hypervisor/qemu/config_test.go create mode 100644 lib/hypervisor/qemu/process.go create mode 100644 lib/hypervisor/qemu/qemu.go create mode 100644 lib/hypervisor/qemu/qmp.go create mode 100644 lib/hypervisor/qemu/qmp_test.go diff --git a/cmd/api/api/api_test.go b/cmd/api/api/api_test.go index fda6127b..c2cce0f0 100644 --- a/cmd/api/api/api_test.go +++ b/cmd/api/api/api_test.go @@ -40,7 +40,7 @@ func newTestService(t *testing.T) *ApiService { limits := instances.ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB } - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, "", nil, nil) // Register cleanup 
for orphaned Cloud Hypervisor processes t.Cleanup(func() { diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go index acbd37c3..fb3e4dc2 100644 --- a/cmd/api/api/instances.go +++ b/cmd/api/api/instances.go @@ -8,6 +8,7 @@ import ( "net/http" "github.com/c2h5oh/datasize" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/logger" mw "github.com/onkernel/hypeman/lib/middleware" @@ -136,6 +137,12 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst } } + // Convert hypervisor type from API enum to domain type + var hvType hypervisor.Type + if request.Body.Hypervisor != nil { + hvType = hypervisor.Type(*request.Body.Hypervisor) + } + domainReq := instances.CreateInstanceRequest{ Name: request.Body.Name, Image: request.Body.Image, @@ -147,6 +154,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst NetworkEnabled: networkEnabled, Devices: deviceRefs, Volumes: volumes, + Hypervisor: hvType, } inst, err := s.InstanceManager.CreateInstance(ctx, domainReq) @@ -469,6 +477,9 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance { netObj.Mac = lo.ToPtr(inst.MAC) } + // Convert hypervisor type + hvType := oapi.InstanceHypervisor(inst.HypervisorType) + oapiInst := oapi.Instance{ Id: inst.Id, Name: inst.Name, @@ -484,6 +495,7 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance { StartedAt: inst.StartedAt, StoppedAt: inst.StoppedAt, HasSnapshot: lo.ToPtr(inst.HasSnapshot), + Hypervisor: &hvType, } if len(inst.Env) > 0 { diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index a3e5a556..2d43b526 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -101,6 +101,9 @@ type Config struct { // Cloudflare configuration (if AcmeDnsProvider=cloudflare) CloudflareApiToken string // Cloudflare API token + + // Hypervisor configuration + DefaultHypervisor string // Default hypervisor type: 
"cloud-hypervisor" or "qemu" } // Load loads configuration from environment variables @@ -163,6 +166,9 @@ func Load() *Config { // Cloudflare configuration CloudflareApiToken: getEnv("CLOUDFLARE_API_TOKEN", ""), + + // Hypervisor configuration + DefaultHypervisor: getEnv("DEFAULT_HYPERVISOR", "cloud-hypervisor"), } return cfg diff --git a/go.mod b/go.mod index 0359d7bc..ed6c313c 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 github.com/creack/pty v1.1.24 github.com/cyphar/filepath-securejoin v0.6.1 + github.com/digitalocean/go-qemu v0.0.0-20250212194115-ee9b0668d242 github.com/distribution/reference v0.6.0 github.com/getkin/kin-openapi v0.133.0 github.com/ghodss/yaml v1.0.0 @@ -58,6 +59,7 @@ require ( github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/digitalocean/go-libvirt v0.0.0-20220804181439-8648fbde413e // indirect github.com/docker/cli v28.2.2+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect github.com/docker/docker v28.2.2+incompatible // indirect diff --git a/go.sum b/go.sum index 6772c9ed..3edd3725 100644 --- a/go.sum +++ b/go.sum @@ -40,6 +40,10 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/digitalocean/go-libvirt v0.0.0-20220804181439-8648fbde413e h1:SCnqm8SjSa0QqRxXbo5YY//S+OryeJioe17nK+iDZpg= +github.com/digitalocean/go-libvirt v0.0.0-20220804181439-8648fbde413e/go.mod h1:o129ljs6alsIQTc8d6eweihqpmmrbxZ2g1jhgjhPykI= 
+github.com/digitalocean/go-qemu v0.0.0-20250212194115-ee9b0668d242 h1:rh6rt8pF5U4iyQ86h6lRDenJoX4ht2wFnZXB9ogIrIM= +github.com/digitalocean/go-qemu v0.0.0-20250212194115-ee9b0668d242/go.mod h1:LGHUtlhsY4vRGM6AHejEQKVI5e3eHbSylMHwTSpQtVw= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/docker/cli v28.2.2+incompatible h1:qzx5BNUDFqlvyq4AHzdNB7gSyVTmU4cgsyN9SdInc1A= @@ -233,6 +237,7 @@ github.com/woodsbury/decimal128 v1.3.0 h1:8pffMNWIlC0O5vbyHWFZAt5yWvWcrHA+3ovIIj github.com/woodsbury/decimal128 v1.3.0/go.mod h1:C5UTmyTjW3JftjUFzOVhC20BEQa2a4ZKOB5I6Zjb+ds= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/bridges/otelslog v0.13.0 h1:bwnLpizECbPr1RrQ27waeY2SPIPeccCx/xLuoYADZ9s= @@ -279,6 +284,7 @@ golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.28.0 h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U= golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -286,12 +292,14 @@ golang.org/x/net 
v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 h1:6/3JGEh1C88g7m+qzzTbl3A0FtsLguXieqofVLU/JAo= golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -299,12 +307,16 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -316,6 +328,7 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/lib/devices/gpu_e2e_test.go b/lib/devices/gpu_e2e_test.go index 
4348ebdb..4599dd51 100644 --- a/lib/devices/gpu_e2e_test.go +++ b/lib/devices/gpu_e2e_test.go @@ -72,7 +72,7 @@ func TestGPUPassthrough(t *testing.T) { limits := instances.ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB } - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, "", nil, nil) // Step 1: Discover available GPUs t.Log("Step 1: Discovering available GPUs...") diff --git a/lib/devices/gpu_inference_test.go b/lib/devices/gpu_inference_test.go index 0749b840..33c9b416 100644 --- a/lib/devices/gpu_inference_test.go +++ b/lib/devices/gpu_inference_test.go @@ -111,7 +111,7 @@ func TestGPUInference(t *testing.T) { limits := instances.ResourceLimits{ MaxOverlaySize: 100 * 1024 * 1024 * 1024, } - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, "", nil, nil) // Step 1: Build custom CUDA+Ollama image t.Log("Step 1: Building custom CUDA+Ollama Docker image...") diff --git a/lib/devices/gpu_module_test.go b/lib/devices/gpu_module_test.go index 841faedd..97045474 100644 --- a/lib/devices/gpu_module_test.go +++ b/lib/devices/gpu_module_test.go @@ -77,7 +77,7 @@ func TestNVIDIAModuleLoading(t *testing.T) { deviceMgr := devices.NewManager(p) volumeMgr := volumes.NewManager(p, 10*1024*1024*1024, nil) limits := instances.ResourceLimits{MaxOverlaySize: 10 * 1024 * 1024 * 1024} - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, "", nil, nil) // Step 1: Find an NVIDIA GPU t.Log("Step 1: Discovering available GPUs...") @@ -318,7 +318,7 @@ func TestNVMLDetection(t 
*testing.T) { deviceMgr := devices.NewManager(p) volumeMgr := volumes.NewManager(p, 10*1024*1024*1024, nil) limits := instances.ResourceLimits{MaxOverlaySize: 10 * 1024 * 1024 * 1024} - instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil) + instanceMgr := instances.NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, "", nil, nil) // Step 1: Check if ollama-cuda:test image exists in Docker t.Log("Step 1: Checking for ollama-cuda:test Docker image...") diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index 2b93b0bf..e6ae7c35 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -16,7 +16,8 @@ type Type string const ( // TypeCloudHypervisor is the Cloud Hypervisor VMM TypeCloudHypervisor Type = "cloud-hypervisor" - // Future: TypeQEMU Type = "qemu" + // TypeQEMU is the QEMU VMM + TypeQEMU Type = "qemu" ) // socketNames maps hypervisor types to their socket filenames. diff --git a/lib/hypervisor/qemu/config.go b/lib/hypervisor/qemu/config.go new file mode 100644 index 00000000..2e0d0ad1 --- /dev/null +++ b/lib/hypervisor/qemu/config.go @@ -0,0 +1,79 @@ +package qemu + +import ( + "fmt" + "strconv" + + "github.com/onkernel/hypeman/lib/hypervisor" +) + +// BuildArgs converts hypervisor.VMConfig to QEMU command-line arguments. 
+func BuildArgs(cfg hypervisor.VMConfig) []string { + args := make([]string, 0, 64) + + // Machine type with KVM acceleration + args = append(args, "-machine", "q35,accel=kvm") + + // CPU configuration + args = append(args, "-cpu", "host") + args = append(args, "-smp", strconv.Itoa(cfg.VCPUs)) + + // Memory configuration + memMB := cfg.MemoryBytes / (1024 * 1024) + args = append(args, "-m", fmt.Sprintf("%dM", memMB)) + + // Kernel and initrd + if cfg.KernelPath != "" { + args = append(args, "-kernel", cfg.KernelPath) + } + if cfg.InitrdPath != "" { + args = append(args, "-initrd", cfg.InitrdPath) + } + if cfg.KernelArgs != "" { + args = append(args, "-append", cfg.KernelArgs) + } + + // Disk configuration + for i, disk := range cfg.Disks { + driveOpts := fmt.Sprintf("file=%s,format=raw,if=none,id=drive%d", disk.Path, i) + if disk.Readonly { + driveOpts += ",readonly=on" + } + args = append(args, "-drive", driveOpts) + args = append(args, "-device", fmt.Sprintf("virtio-blk-pci,drive=drive%d", i)) + } + + // Network configuration + for i, net := range cfg.Networks { + netdevOpts := fmt.Sprintf("tap,id=net%d,ifname=%s,script=no,downscript=no", i, net.TAPDevice) + args = append(args, "-netdev", netdevOpts) + + deviceOpts := fmt.Sprintf("virtio-net-pci,netdev=net%d,mac=%s", i, net.MAC) + args = append(args, "-device", deviceOpts) + } + + // Vsock configuration + if cfg.VsockCID > 0 { + args = append(args, "-device", fmt.Sprintf("vhost-vsock-pci,guest-cid=%d", cfg.VsockCID)) + } + + // PCI device passthrough (GPU, etc.) 
+ for _, pciAddr := range cfg.PCIDevices { + args = append(args, "-device", fmt.Sprintf("vfio-pci,host=%s", pciAddr)) + } + + // Serial console output to file + if cfg.SerialLogPath != "" { + args = append(args, "-serial", fmt.Sprintf("file:%s", cfg.SerialLogPath)) + } else { + args = append(args, "-serial", "stdio") + } + + // No graphics + args = append(args, "-nographic") + + // Disable default devices we don't need + args = append(args, "-nodefaults") + + return args +} diff --git a/lib/hypervisor/qemu/config_test.go b/lib/hypervisor/qemu/config_test.go new file mode 100644 index 00000000..9fe3d427 --- /dev/null +++ b/lib/hypervisor/qemu/config_test.go @@ -0,0 +1,161 @@ +package qemu + +import ( + "testing" + + "github.com/onkernel/hypeman/lib/hypervisor" + "github.com/stretchr/testify/assert" +) + +func TestBuildArgs_Basic(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 2, + MemoryBytes: 1024 * 1024 * 1024, // 1GB + KernelPath: "/path/to/vmlinux", + InitrdPath: "/path/to/initrd", + KernelArgs: "console=ttyS0", + } + + args := BuildArgs(cfg) + + // Check machine type + assert.Contains(t, args, "-machine") + assert.Contains(t, args, "q35,accel=kvm") + + // Check CPU + assert.Contains(t, args, "-cpu") + assert.Contains(t, args, "host") + assert.Contains(t, args, "-smp") + assert.Contains(t, args, "2") + + // Check memory + assert.Contains(t, args, "-m") + assert.Contains(t, args, "1024M") + + // Check kernel + assert.Contains(t, args, "-kernel") + assert.Contains(t, args, "/path/to/vmlinux") + + // Check initrd + assert.Contains(t, args, "-initrd") + assert.Contains(t, args, "/path/to/initrd") + + // Check kernel args + assert.Contains(t, args, "-append") + assert.Contains(t, args, "console=ttyS0") + + // Check nographic + assert.Contains(t, args, "-nographic") +} + +func TestBuildArgs_Disks(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + Disks: []hypervisor.DiskConfig{ + {Path: "/path/to/rootfs.ext4", 
Readonly: false}, + {Path: "/path/to/data.ext4", Readonly: true}, + }, + } + + args := BuildArgs(cfg) + + // Check first disk (writable) + assert.Contains(t, args, "-drive") + foundDrive0 := false + foundDrive1 := false + for _, arg := range args { + if arg == "file=/path/to/rootfs.ext4,format=raw,if=none,id=drive0" { + foundDrive0 = true + } + if arg == "file=/path/to/data.ext4,format=raw,if=none,id=drive1,readonly=on" { + foundDrive1 = true + } + } + assert.True(t, foundDrive0, "Expected writable drive0") + assert.True(t, foundDrive1, "Expected readonly drive1") + + // Check virtio-blk devices + assert.Contains(t, args, "virtio-blk-pci,drive=drive0") + assert.Contains(t, args, "virtio-blk-pci,drive=drive1") +} + +func TestBuildArgs_Network(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + Networks: []hypervisor.NetworkConfig{ + { + TAPDevice: "tap0", + MAC: "02:00:00:ab:cd:ef", + IP: "192.168.1.10", + Netmask: "255.255.255.0", + }, + }, + } + + args := BuildArgs(cfg) + + // Check netdev + foundNetdev := false + for _, arg := range args { + if arg == "tap,id=net0,ifname=tap0,script=no,downscript=no" { + foundNetdev = true + } + } + assert.True(t, foundNetdev, "Expected tap netdev") + + // Check virtio-net device with MAC + assert.Contains(t, args, "virtio-net-pci,netdev=net0,mac=02:00:00:ab:cd:ef") +} + +func TestBuildArgs_Vsock(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + VsockCID: 123, + } + + args := BuildArgs(cfg) + + assert.Contains(t, args, "-device") + assert.Contains(t, args, "vhost-vsock-pci,guest-cid=123") +} + +func TestBuildArgs_PCIPassthrough(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + PCIDevices: []string{"0000:01:00.0", "0000:02:00.0"}, + } + + args := BuildArgs(cfg) + + assert.Contains(t, args, "vfio-pci,host=0000:01:00.0") + assert.Contains(t, args, "vfio-pci,host=0000:02:00.0") +} + +func 
TestBuildArgs_SerialLog(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + SerialLogPath: "/var/log/app.log", + } + + args := BuildArgs(cfg) + + assert.Contains(t, args, "-serial") + assert.Contains(t, args, "file:/var/log/app.log") +} + +func TestBuildArgs_NoSerialLog(t *testing.T) { + cfg := hypervisor.VMConfig{ + VCPUs: 1, + MemoryBytes: 512 * 1024 * 1024, + } + + args := BuildArgs(cfg) + + assert.Contains(t, args, "-serial") + assert.Contains(t, args, "stdio") +} diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go new file mode 100644 index 00000000..d30b0f20 --- /dev/null +++ b/lib/hypervisor/qemu/process.go @@ -0,0 +1,171 @@ +// Package qemu implements the hypervisor.Hypervisor interface for QEMU. +package qemu + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "syscall" + "time" + + "github.com/onkernel/hypeman/lib/hypervisor" + "github.com/onkernel/hypeman/lib/paths" +) + +func init() { + hypervisor.RegisterSocketName(hypervisor.TypeQEMU, "qemu.sock") +} + +// ProcessManager implements hypervisor.ProcessManager for QEMU. +type ProcessManager struct{} + +// NewProcessManager creates a new QEMU process manager. +func NewProcessManager() *ProcessManager { + return &ProcessManager{} +} + +// Verify ProcessManager implements the interface +var _ hypervisor.ProcessManager = (*ProcessManager)(nil) + +// SocketName returns the socket filename for QEMU. +func (p *ProcessManager) SocketName() string { + return "qemu.sock" +} + +// StartProcess launches a QEMU VMM process. +func (p *ProcessManager) StartProcess(ctx context.Context, paths *paths.Paths, version string, socketPath string) (int, error) { + return p.StartProcessWithArgs(ctx, paths, version, socketPath, nil) +} + +// StartProcessWithArgs launches a QEMU VMM process with extra arguments. 
+func (p *ProcessManager) StartProcessWithArgs(ctx context.Context, paths *paths.Paths, version string, socketPath string, extraArgs []string) (int, error) { + // Get binary path + binaryPath, err := p.GetBinaryPath(paths, version) + if err != nil { + return 0, fmt.Errorf("get binary: %w", err) + } + + // Check if socket is already in use + if isSocketInUse(socketPath) { + return 0, fmt.Errorf("socket already in use, QEMU may be running at %s", socketPath) + } + + // Remove stale socket if exists + os.Remove(socketPath) + + // Build base command arguments for QMP socket + args := []string{ + "-chardev", fmt.Sprintf("socket,id=qmp,path=%s,server=on,wait=off", socketPath), + "-mon", "chardev=qmp,mode=control", + } + args = append(args, extraArgs...) + + // Create command + cmd := exec.Command(binaryPath, args...) + + // Daemonize: detach from parent process group + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setpgid: true, + } + + // Redirect stdout/stderr to VMM log file + instanceDir := filepath.Dir(socketPath) + logsDir := filepath.Join(instanceDir, "logs") + if err := os.MkdirAll(logsDir, 0755); err != nil { + return 0, fmt.Errorf("create logs directory: %w", err) + } + + vmmLogFile, err := os.OpenFile( + filepath.Join(logsDir, "vmm.log"), + os.O_CREATE|os.O_WRONLY|os.O_APPEND, + 0644, + ) + if err != nil { + return 0, fmt.Errorf("create vmm log: %w", err) + } + defer vmmLogFile.Close() + + cmd.Stdout = vmmLogFile + cmd.Stderr = vmmLogFile + + if err := cmd.Start(); err != nil { + return 0, fmt.Errorf("start qemu: %w", err) + } + + pid := cmd.Process.Pid + + // Wait for socket to be ready + if err := waitForSocket(socketPath, 10*time.Second); err != nil { + // Read vmm.log to understand why socket wasn't created + vmmLogPath := filepath.Join(logsDir, "vmm.log") + if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { + return 0, fmt.Errorf("%w; vmm.log: %s", err, string(logData)) + } + return 0, err + } + + return pid, nil +} + +// 
// isSocketInUse reports whether something is accepting connections on the
// Unix socket at socketPath. A missing path, or a leftover socket file with no
// listener behind it, is reported as not in use.
func isSocketInUse(socketPath string) bool {
	conn, err := dialUnixTimeout(socketPath, 100*time.Millisecond)
	if err != nil {
		return false
	}
	// Probe connection only; nothing useful can be done with a close error.
	_ = conn.Close()
	return true
}

// waitForSocket polls socketPath until a connection to it succeeds or timeout
// elapses. It returns nil on the first successful connection; otherwise it
// returns a timeout error wrapping the last connection failure.
func waitForSocket(socketPath string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	var lastErr error
	for time.Now().Before(deadline) {
		conn, err := dialUnixTimeout(socketPath, 100*time.Millisecond)
		if err == nil {
			_ = conn.Close()
			return nil
		}
		lastErr = err
		time.Sleep(50 * time.Millisecond)
	}
	if lastErr != nil {
		return fmt.Errorf("timeout waiting for socket: %w", lastErr)
	}
	return fmt.Errorf("timeout waiting for socket")
}

// dialUnixTimeout connects to the Unix stream socket at path and returns the
// connection as an *os.File that the caller must close.
//
// The connection is made with raw syscalls so the function needs no imports
// beyond those the file already has. A positive timeout is installed as the
// socket's send timeout before connecting.
// NOTE(review): on Linux the send timeout is understood to bound a blocking
// AF_UNIX connect (e.g. when the listener's backlog is full) — confirm if
// other platforms matter.
func dialUnixTimeout(path string, timeout time.Duration) (*os.File, error) {
	fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
	if err != nil {
		return nil, fmt.Errorf("create unix socket: %w", err)
	}
	// Do not leak the probe descriptor into any child process started later.
	syscall.CloseOnExec(fd)

	if timeout > 0 {
		tv := syscall.NsecToTimeval(timeout.Nanoseconds())
		if err := syscall.SetsockoptTimeval(fd, syscall.SOL_SOCKET, syscall.SO_SNDTIMEO, &tv); err != nil {
			_ = syscall.Close(fd)
			return nil, fmt.Errorf("set connect timeout: %w", err)
		}
	}

	if err := syscall.Connect(fd, &syscall.SockaddrUnix{Name: path}); err != nil {
		_ = syscall.Close(fd)
		return nil, fmt.Errorf("connect %s: %w", path, err)
	}

	return os.NewFile(uintptr(fd), path), nil
}
+package qemu + +import ( + "context" + "fmt" + "time" + + "github.com/onkernel/hypeman/lib/hypervisor" +) + +// QEMU implements hypervisor.Hypervisor for QEMU VMM. +type QEMU struct { + client *QMPClient +} + +// New creates a new QEMU client for an existing QMP socket. +func New(socketPath string) (*QEMU, error) { + client, err := NewQMPClient(socketPath) + if err != nil { + return nil, fmt.Errorf("create qmp client: %w", err) + } + return &QEMU{client: client}, nil +} + +// Verify QEMU implements the interface +var _ hypervisor.Hypervisor = (*QEMU)(nil) + +// Capabilities returns the features supported by QEMU. +func (q *QEMU) Capabilities() hypervisor.Capabilities { + return hypervisor.Capabilities{ + SupportsSnapshot: false, // Not implemented in first pass + SupportsHotplugMemory: false, // Not implemented in first pass + SupportsPause: true, + SupportsVsock: true, + SupportsGPUPassthrough: true, + } +} + +// CreateVM configures the VM in QEMU. +// For QEMU, the VM is configured via command-line args when the process starts, +// so this is a no-op. The configuration is applied in StartProcess. +func (q *QEMU) CreateVM(ctx context.Context, config hypervisor.VMConfig) error { + // QEMU doesn't have a separate create step - configuration is done at process start + // This is a no-op for QEMU + return nil +} + +// BootVM starts the configured VM. +// For QEMU, the VM starts automatically when the process starts, +// so this is a no-op. +func (q *QEMU) BootVM(ctx context.Context) error { + // QEMU starts running immediately when the process starts + // This is a no-op for QEMU + return nil +} + +// DeleteVM removes the VM configuration from QEMU. +// This sends a graceful shutdown signal to the guest. +func (q *QEMU) DeleteVM(ctx context.Context) error { + return q.client.SystemPowerdown() +} + +// Shutdown stops the QEMU process. +func (q *QEMU) Shutdown(ctx context.Context) error { + return q.client.Quit() +} + +// GetVMInfo returns current VM state. 
+func (q *QEMU) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) { + status, running, err := q.client.QueryStatus() + if err != nil { + return nil, fmt.Errorf("query status: %w", err) + } + + var state hypervisor.VMState + switch { + case running: + state = hypervisor.StateRunning + case status == "paused": + state = hypervisor.StatePaused + case status == "shutdown": + state = hypervisor.StateShutdown + case status == "prelaunch": + state = hypervisor.StateCreated + default: + // Map other QEMU states to appropriate hypervisor states + if status == "inmigrate" || status == "postmigrate" { + state = hypervisor.StatePaused + } else { + state = hypervisor.StateRunning + } + } + + return &hypervisor.VMInfo{ + State: state, + MemoryActualSize: nil, // Not implemented in first pass + }, nil +} + +// Pause suspends VM execution. +func (q *QEMU) Pause(ctx context.Context) error { + return q.client.Stop() +} + +// Resume continues VM execution. +func (q *QEMU) Resume(ctx context.Context) error { + return q.client.Continue() +} + +// Snapshot creates a VM snapshot. +// Not implemented in first pass. +func (q *QEMU) Snapshot(ctx context.Context, destPath string) error { + return fmt.Errorf("snapshot not supported by QEMU implementation") +} + +// Restore loads a VM from snapshot. +// Not implemented in first pass. +func (q *QEMU) Restore(ctx context.Context, sourcePath string) error { + return fmt.Errorf("restore not supported by QEMU implementation") +} + +// ResizeMemory changes the VM's memory allocation. +// Not implemented in first pass. +func (q *QEMU) ResizeMemory(ctx context.Context, bytes int64) error { + return fmt.Errorf("memory resize not supported by QEMU implementation") +} + +// ResizeMemoryAndWait changes the VM's memory allocation and waits for it to stabilize. +// Not implemented in first pass. 
+func (q *QEMU) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error { + return fmt.Errorf("memory resize not supported by QEMU implementation") +} diff --git a/lib/hypervisor/qemu/qmp.go b/lib/hypervisor/qemu/qmp.go new file mode 100644 index 00000000..3f47d902 --- /dev/null +++ b/lib/hypervisor/qemu/qmp.go @@ -0,0 +1,150 @@ +package qemu + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/digitalocean/go-qemu/qmp" +) + +// QMPClient wraps go-qemu's QMP monitor with convenience methods. +type QMPClient struct { + monitor *qmp.SocketMonitor +} + +// NewQMPClient creates a new QMP client connected to the given socket. +func NewQMPClient(socketPath string) (*QMPClient, error) { + monitor, err := qmp.NewSocketMonitor("unix", socketPath, 2*time.Second) + if err != nil { + return nil, fmt.Errorf("create socket monitor: %w", err) + } + + if err := monitor.Connect(); err != nil { + return nil, fmt.Errorf("connect to qmp: %w", err) + } + + return &QMPClient{monitor: monitor}, nil +} + +// Close disconnects from the QMP socket. +func (c *QMPClient) Close() error { + return c.monitor.Disconnect() +} + +// Run executes a raw QMP command and returns the response. +func (c *QMPClient) Run(command []byte) ([]byte, error) { + return c.monitor.Run(command) +} + +// qmpCommand represents a QMP command structure +type qmpCommand struct { + Execute string `json:"execute"` + Arguments interface{} `json:"arguments,omitempty"` +} + +// qmpStatusResponse represents the response from query-status +type qmpStatusResponse struct { + Return struct { + Running bool `json:"running"` + Status string `json:"status"` + } `json:"return"` +} + +// Stop pauses VM execution (QMP 'stop' command). 
+func (c *QMPClient) Stop() error { + cmd := qmpCommand{Execute: "stop"} + cmdBytes, err := json.Marshal(cmd) + if err != nil { + return fmt.Errorf("marshal stop command: %w", err) + } + + _, err = c.monitor.Run(cmdBytes) + if err != nil { + return fmt.Errorf("execute stop: %w", err) + } + + return nil +} + +// Continue resumes VM execution (QMP 'cont' command). +func (c *QMPClient) Continue() error { + cmd := qmpCommand{Execute: "cont"} + cmdBytes, err := json.Marshal(cmd) + if err != nil { + return fmt.Errorf("marshal cont command: %w", err) + } + + _, err = c.monitor.Run(cmdBytes) + if err != nil { + return fmt.Errorf("execute cont: %w", err) + } + + return nil +} + +// QueryStatus returns the current VM status. +func (c *QMPClient) QueryStatus() (string, bool, error) { + cmd := qmpCommand{Execute: "query-status"} + cmdBytes, err := json.Marshal(cmd) + if err != nil { + return "", false, fmt.Errorf("marshal query-status: %w", err) + } + + resp, err := c.monitor.Run(cmdBytes) + if err != nil { + return "", false, fmt.Errorf("execute query-status: %w", err) + } + + var statusResp qmpStatusResponse + if err := json.Unmarshal(resp, &statusResp); err != nil { + return "", false, fmt.Errorf("unmarshal status response: %w", err) + } + + return statusResp.Return.Status, statusResp.Return.Running, nil +} + +// Quit shuts down QEMU (QMP 'quit' command). +func (c *QMPClient) Quit() error { + cmd := qmpCommand{Execute: "quit"} + cmdBytes, err := json.Marshal(cmd) + if err != nil { + return fmt.Errorf("marshal quit command: %w", err) + } + + // quit command doesn't return a response - QEMU exits + _, _ = c.monitor.Run(cmdBytes) + return nil +} + +// SystemPowerdown sends ACPI power button event (graceful shutdown). 
+func (c *QMPClient) SystemPowerdown() error { + cmd := qmpCommand{Execute: "system_powerdown"} + cmdBytes, err := json.Marshal(cmd) + if err != nil { + return fmt.Errorf("marshal system_powerdown: %w", err) + } + + _, err = c.monitor.Run(cmdBytes) + if err != nil { + return fmt.Errorf("execute system_powerdown: %w", err) + } + + return nil +} + +// SystemReset resets the VM (hard reset). +func (c *QMPClient) SystemReset() error { + cmd := qmpCommand{Execute: "system_reset"} + cmdBytes, err := json.Marshal(cmd) + if err != nil { + return fmt.Errorf("marshal system_reset: %w", err) + } + + _, err = c.monitor.Run(cmdBytes) + if err != nil { + return fmt.Errorf("execute system_reset: %w", err) + } + + return nil +} diff --git a/lib/hypervisor/qemu/qmp_test.go b/lib/hypervisor/qemu/qmp_test.go new file mode 100644 index 00000000..476648f5 --- /dev/null +++ b/lib/hypervisor/qemu/qmp_test.go @@ -0,0 +1,100 @@ +package qemu + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestQMPCommand_Marshal(t *testing.T) { + tests := []struct { + name string + cmd qmpCommand + expected string + }{ + { + name: "stop command", + cmd: qmpCommand{Execute: "stop"}, + expected: `{"execute":"stop"}`, + }, + { + name: "cont command", + cmd: qmpCommand{Execute: "cont"}, + expected: `{"execute":"cont"}`, + }, + { + name: "query-status command", + cmd: qmpCommand{Execute: "query-status"}, + expected: `{"execute":"query-status"}`, + }, + { + name: "quit command", + cmd: qmpCommand{Execute: "quit"}, + expected: `{"execute":"quit"}`, + }, + { + name: "system_powerdown command", + cmd: qmpCommand{Execute: "system_powerdown"}, + expected: `{"execute":"system_powerdown"}`, + }, + { + name: "system_reset command", + cmd: qmpCommand{Execute: "system_reset"}, + expected: `{"execute":"system_reset"}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data, err := json.Marshal(tt.cmd) + 
require.NoError(t, err) + assert.JSONEq(t, tt.expected, string(data)) + }) + } +} + +func TestQMPStatusResponse_Unmarshal(t *testing.T) { + tests := []struct { + name string + json string + expectedStatus string + expectedRunning bool + }{ + { + name: "running", + json: `{"return":{"running":true,"status":"running"}}`, + expectedStatus: "running", + expectedRunning: true, + }, + { + name: "paused", + json: `{"return":{"running":false,"status":"paused"}}`, + expectedStatus: "paused", + expectedRunning: false, + }, + { + name: "shutdown", + json: `{"return":{"running":false,"status":"shutdown"}}`, + expectedStatus: "shutdown", + expectedRunning: false, + }, + { + name: "prelaunch", + json: `{"return":{"running":false,"status":"prelaunch"}}`, + expectedStatus: "prelaunch", + expectedRunning: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var resp qmpStatusResponse + err := json.Unmarshal([]byte(tt.json), &resp) + require.NoError(t, err) + assert.Equal(t, tt.expectedStatus, resp.Return.Status) + assert.Equal(t, tt.expectedRunning, resp.Return.Running) + }) + } +} diff --git a/lib/instances/create.go b/lib/instances/create.go index 77b40ed7..59ce3b50 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -203,11 +203,14 @@ func (m *manager) createInstance( kernelVer := m.systemManager.GetDefaultKernelVersion() // 9. Get process manager for hypervisor type (needed for socket name) - hvType := hypervisor.TypeCloudHypervisor + hvType := req.Hypervisor + if hvType == "" { + hvType = m.defaultHypervisor + } pm, err := m.getProcessManager(hvType) if err != nil { - log.ErrorContext(ctx, "failed to get process manager", "error", err) - return nil, fmt.Errorf("get process manager: %w", err) + log.ErrorContext(ctx, "failed to get process manager", "hypervisor", hvType, "error", err) + return nil, fmt.Errorf("get process manager for %s: %w", hvType, err) } // 10. 
Validate, resolve, and auto-bind devices (GPU passthrough) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 08980e80..a11eb33e 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -8,6 +8,7 @@ import ( "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/hypervisor/cloudhypervisor" + "github.com/onkernel/hypeman/lib/hypervisor/qemu" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/paths" @@ -57,12 +58,19 @@ type manager struct { metrics *Metrics // Hypervisor support - processManagers map[hypervisor.Type]hypervisor.ProcessManager + processManagers map[hypervisor.Type]hypervisor.ProcessManager + defaultHypervisor hypervisor.Type // Default hypervisor type when not specified in request } // NewManager creates a new instances manager. // If meter is nil, metrics are disabled. -func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, meter metric.Meter, tracer trace.Tracer) Manager { +// defaultHypervisor specifies which hypervisor to use when not specified in requests. 
+func NewManager(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, defaultHypervisor hypervisor.Type, meter metric.Meter, tracer trace.Tracer) Manager { + // Validate and default the hypervisor type + if defaultHypervisor == "" { + defaultHypervisor = hypervisor.TypeCloudHypervisor + } + m := &manager{ paths: p, imageManager: imageManager, @@ -75,7 +83,9 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste hostTopology: detectHostTopology(), // Detect and cache host topology processManagers: map[hypervisor.Type]hypervisor.ProcessManager{ hypervisor.TypeCloudHypervisor: cloudhypervisor.NewProcessManager(), + hypervisor.TypeQEMU: qemu.NewProcessManager(), }, + defaultHypervisor: defaultHypervisor, } // Initialize metrics if meter is provided @@ -94,6 +104,8 @@ func (m *manager) getHypervisor(socketPath string, hvType hypervisor.Type) (hype switch hvType { case hypervisor.TypeCloudHypervisor: return cloudhypervisor.New(socketPath) + case hypervisor.TypeQEMU: + return qemu.New(socketPath) default: return nil, fmt.Errorf("unsupported hypervisor type: %s", hvType) } diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index b2795333..588ac871 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -57,7 +57,7 @@ func setupTestManager(t *testing.T) (*manager, string) { MaxTotalVcpus: 0, // unlimited MaxTotalMemory: 0, // unlimited } - mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, nil, nil).(*manager) + mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) // Register cleanup to kill any orphaned Cloud Hypervisor processes t.Cleanup(func() { @@ -766,7 +766,7 @@ func TestStorageOperations(t *testing.T) { MaxTotalVcpus: 0, // 
unlimited MaxTotalMemory: 0, // unlimited } - manager := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, nil, nil).(*manager) + manager := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) // Test metadata doesn't exist initially _, err := manager.loadMetadata("nonexistent") diff --git a/lib/instances/resource_limits_test.go b/lib/instances/resource_limits_test.go index 9392ee0a..1895d87e 100644 --- a/lib/instances/resource_limits_test.go +++ b/lib/instances/resource_limits_test.go @@ -163,7 +163,7 @@ func createTestManager(t *testing.T, limits ResourceLimits) *manager { deviceMgr := devices.NewManager(p) volumeMgr := volumes.NewManager(p, 0, nil) - return NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, nil, nil).(*manager) + return NewManager(p, imageMgr, systemMgr, networkMgr, deviceMgr, volumeMgr, limits, "", nil, nil).(*manager) } func TestResourceLimits_StructValues(t *testing.T) { @@ -267,7 +267,7 @@ func TestAggregateLimits_EnforcedAtRuntime(t *testing.T) { MaxTotalMemory: 6 * 1024 * 1024 * 1024, // aggregate: only 6GB total (allows first 2.5GB VM) } - mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, nil, nil).(*manager) + mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil).(*manager) // Cleanup any orphaned processes on test end t.Cleanup(func() { diff --git a/lib/instances/standby.go b/lib/instances/standby.go index b4391ff0..dc3cc0c5 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -74,10 +74,10 @@ func (m *manager) standbyInstance( // 6. 
Reduce memory to base size (virtio-mem hotplug) if supported // Wait for memory to stabilize so the snapshot is as small as possible if hv.Capabilities().SupportsHotplugMemory { - log.DebugContext(ctx, "reducing VM memory before snapshot", "instance_id", id, "base_size", inst.Size) + log.DebugContext(ctx, "reducing VM memory before snapshot", "instance_id", id, "base_size", inst.Size) if err := hv.ResizeMemoryAndWait(ctx, inst.Size, 5*time.Second); err != nil { - // Log warning but continue - snapshot will just be larger - log.WarnContext(ctx, "failed to reduce memory, snapshot will be larger", "instance_id", id, "error", err) + // Log warning but continue - snapshot will just be larger + log.WarnContext(ctx, "failed to reduce memory, snapshot will be larger", "instance_id", id, "error", err) } } diff --git a/lib/instances/types.go b/lib/instances/types.go index 0f3356f1..b823d5cf 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -97,6 +97,7 @@ type CreateInstanceRequest struct { NetworkEnabled bool // Whether to enable networking (uses default network) Devices []string // Device IDs or names to attach (GPU passthrough) Volumes []VolumeAttachment // Volumes to attach at creation time + Hypervisor hypervisor.Type // Optional: hypervisor type (defaults to config) } // AttachVolumeRequest is the domain request for attaching a volume (used for API compatibility) diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index 0401b78c..9f8b8bab 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -29,6 +29,12 @@ const ( BearerAuthScopes = "bearerAuth.Scopes" ) +// Defines values for CreateInstanceRequestHypervisor. +const ( + CreateInstanceRequestHypervisorCloudHypervisor CreateInstanceRequestHypervisor = "cloud-hypervisor" + CreateInstanceRequestHypervisorQemu CreateInstanceRequestHypervisor = "qemu" +) + // Defines values for DeviceType. 
const ( Gpu DeviceType = "gpu" @@ -49,6 +55,12 @@ const ( Ready ImageStatus = "ready" ) +// Defines values for InstanceHypervisor. +const ( + InstanceHypervisorCloudHypervisor InstanceHypervisor = "cloud-hypervisor" + InstanceHypervisorQemu InstanceHypervisor = "qemu" +) + // Defines values for InstanceState. const ( Created InstanceState = "Created" @@ -135,6 +147,9 @@ type CreateInstanceRequest struct { // HotplugSize Additional memory for hotplug (human-readable format like "3GB", "1G") HotplugSize *string `json:"hotplug_size,omitempty"` + // Hypervisor Hypervisor to use for this instance. Defaults to server configuration. + Hypervisor *CreateInstanceRequestHypervisor `json:"hypervisor,omitempty"` + // Image OCI image reference Image string `json:"image"` @@ -160,6 +175,9 @@ type CreateInstanceRequest struct { Volumes *[]VolumeMount `json:"volumes,omitempty"` } +// CreateInstanceRequestHypervisor Hypervisor to use for this instance. Defaults to server configuration. +type CreateInstanceRequestHypervisor string + // CreateVolumeRequest defines model for CreateVolumeRequest. 
type CreateVolumeRequest struct { // Id Optional custom identifier (auto-generated if not provided) @@ -348,6 +366,9 @@ type Instance struct { // HotplugSize Hotplug memory size (human-readable) HotplugSize *string `json:"hotplug_size,omitempty"` + // Hypervisor Hypervisor running this instance + Hypervisor *InstanceHypervisor `json:"hypervisor,omitempty"` + // Id Auto-generated unique identifier (CUID2 format) Id string `json:"id"` @@ -404,6 +425,9 @@ type Instance struct { Volumes *[]VolumeMount `json:"volumes,omitempty"` } +// InstanceHypervisor Hypervisor running this instance +type InstanceHypervisor string + // InstanceState Instance state: // - Created: VMM created but not started (Cloud Hypervisor native) // - Running: VM is actively running (Cloud Hypervisor native) @@ -7921,104 +7945,105 @@ func (sh *strictHandler) GetVolume(w http.ResponseWriter, r *http.Request, id st // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+x9C3MTO7L/V+ma/26t81+/kgAL3rp1KycBjk8RSBHIubsn3CDPtG0dZqRB0jgxVL77", - "LT3mafkRIIYsqaIKx6ORulu/bnW3WvLnIORJyhkyJYPB50CGU0yI+XigFAmnZzzOEnyNHzOUSn+dCp6i", - "UBRNo4RnTF2kRE31XxHKUNBUUc6CQXBC1BQupygQZqYXkFOexRGMEMx7GAXtAK9IksYYDIJewlQvIooE", - "7UDNU/2VVIKySXDdDgSSiLN4bocZkyxWwWBMYontxrDHumsgEvQrHfNO0d+I8xgJC65Njx8zKjAKBn9U", - "2XhXNOajPzFUevCDGaExGcV4hDMa4qIYwkwIZOoiEnSGYlEUh/Z5PIcRz1gEth20WBbHQMfAOMOdmjDY", - "jEZUS0I30UMHAyUy9EgmMjRd0MgzA4dDsI9heAStKV7VB9n7x+hxsLxLRhJc7PTXLCGso4Wrycr7N22r", - "fb944OuZ8iTJLiaCZ+liz8NXx8dvwTwEliUjFNUeH+8V/VGmcIJCd5iG9IJEkUAp/fznD6u09fv9/oDs", - "Dfr9bt9H5QxZxMVSkdrHfpHu9iNc0eVGInX9L4j05dnwaHgAh1ykXBDz7sJIDWBXxVPlqwqb+qz48H8o", - "kCgH/qWmwM/aK/OBxDCJ+YjE8RwyRj9mNdx0YahVQEEq+IxGGLWBmAdAJZBM8c4EGQqiMIKx4AmoKUJl", - "bqGF3Um3Deea3Y6e3A7Z6/T7nf55UJ+d+EFnkmZBO0iJUig0gf/7B+l8Ouj8u9958q78eNHtvPv7X3wT", - "uSnggI8NnY7PVj4rbciJraKwSehqhK6Y5OXTN0zI5MazdzgEqt8DgWMUyDQnlv6Ihx9QdCnvxXQkiJj3", - 
"2ISyq0FMFEpV52Z127X8GdpWMMYmmvUbstbQOQO3VswvUYREIsSoASLbENEJVbINRJttIqcoQa8p/4SQ", - "MI1ZqYhQwAUgi+CSqikQ064ugWTeISntUEtq0A4ScvUC2USvm4/2F/CowdhyHzrv/n/+1c5/eyEpshg9", - "YHzNM0XZBMxjGHMBakollDRQhYl57y8Cx8Eg+H+90hnoOU+gl0s3i1GPlVA2tK/tFpQQIcjcP2s5catm", - "TyrCVtgVq0Ae/o7ylU2Cs5YSFAdi/BbD7/OTtz2tkimRUk0FzybT6qz8kduDdxVZLEi3zmQ7QDbT7UgU", - "UWvaTmrkehbTKtFP2YwKzhJkCmZEUA2+2uL0OXj56ujpxdOXZ8FASyLKQmfpT169fhMMgv1+v1+hq5Tn", - "lKs0ziYXkn7CmpsU7D//JWgSclDQDwkmXMyNxFwf0JrW1WPMRUIUxPQDwrnu7zzQJmz3edNw7ZmhFtd9", - "bUQ2si9rDAeJU8pwqeVo/yjafsnFh5iTqLP7jZWdodJ9L7L40j6AkLMxnWTWQXBqj0Cdmum1r4ZXZFoi", - "UQ0w1tOsd//7FNUURUXD8i71V3alM69DTmFFIjXXteqEL4CYz1DEZO4B8W7fg+LfBVVmRt17EFH5AfTL", - "ayCse7MYfthfBHHfj2IPUR6aftGIcjq1CSUFIbt7x+7j3qZ6NQvTTNZI2muS89J40todmVGhMhLD4cnb", - "msnxOtY2ZPOYXRsRVk2tm/8CD0RBqG27xp+iZhXYaKmxPZv4bdHw+lcXa1eWry5rwlefh194rGEmFU+A", - "RsgUHVMdrzWcUVp3W+szNuNxR0ezxgJsaKYsuYuefzK3XdlJWQbNi8losctTjUDKYEInZDRX9cVmt784", - "9X5B5/37RL0sKrbwwOhCcU+wl6NleKTlmLfdJOI1MfSF4hezMfX0XFiq0vumEsJGCO5Aq7vopCF1IXkb", - "LqdU2zYJuRCMCT07rjoR3XPWAU3cAI6KAYpuiy71IqKV3i6tLS4qRFAGmUQYzXeAwNlxF94U1P5NAiOK", - "zjBPE0yJhBEig4zpJQUjM75JflQJyKT29qhqvu4cdptR2DG+EnfPuvDrPMWEMLikcWxirYQoGppAbUQb", - "/FxOkbmJ0iNpA8AKre+esyqyXGqmafLbgbEMGF0Q5fFYcUKlEqXlkIokKbRePzvc399/0jTSew87/d3O", - "7sM3u/1BX//7d9AOrHHVvgNR2HHmZxtJE19fB3V74ULfqkU5fDs82nMrQn0c9ekBefL46oqoJ4/opXzy", - "KRmJyZ/7ZCtpFb95OipjdmhlEkUnN30aVb5IvRIQL4nEvzjAvlFGx36xevmx3L3RLW8jB9SwqybxYpq0", - "vyBL0zSCNb1abqPfODHU+dHfav+gRL5mh2WJptPlS0Ja6baU61MhuPDkQ3nkGecgTWMaGu3uyBRDOqYh", - "oO4B9AvQSoxlwcJTqot1RKIL4VZyr0orQmMPZiqRjh3MtYSWNstJFiuaxmifGZRu5KwYzo9MT74okTKG", - "4gJz8dygpwSl9AZLjRgm56VoYlaZCEfZZKJFUhXdMZVmcSjXNIpxNLCx11qomtksCfPBq8rDhmh4oaOv", - "TowzjKsgsBZFE5twgVDgxE5ajSvKZiSm0QVlaeaFxFJRPsuEcRFsp0BGPFPGHbATVh3EpM2MmzfWGucV", - "1oI4fkUS2y2RuiSkIipzYZdVL/5By7Mcjn9YOx2uE980DPMwuzEBiceKHR4fWRsdcqYIZSggQUXcBkwl", - "SWJydUE76GhMRQQTzoCPx/9cnTZZ4sUVCrLKDzisRg+35wPQiQsKml6I5PEMI0gIo2OUClzL6shySvYe", - 
"PhqQUbi7tx/h+MHDR91u1zcMMiXmKafMM9TT4tlmU9GzCZFO2WdXTr9uHm4hnbUJL5+Dk4M3vwaDoJdJ", - "0Yt5SOKeHFE2qPxd/Fk+MB/snyPKvGmwwuY2KDUmxlkEHXFYNdKO85jQuLEXmWZx7L4faE4YhgUguTE2", - "a6MUvwv1UkMzpp8wAm9yXZGJ9qUs4r4ui94OPmaY4UXKJbWjLzgy7omORkYZjSMwb1T3JZX9qh7b7i1l", - "v+JCmojRRpyLjmSRpdEj6zZuzIwpGtugqTbiw/1Hj//Rf7K7V1FuytSjB8FGpBRmt5GpMTy7p6XLkyKL", - "7AqqYWA/hZzNtFaYPwx92s5Y4NQMeP5sYTIuufhA2eQioh50/m4fQkQFhspkY9frUNAjaboeiv6AvrBp", - "BftrPEi3EeBZXb67Jf+S0Ks++qvJbx//R57848/djy/Ozv41e/7b0Uv6r7P45NVX5ZhX7wF9142cldk1", - "E2/UNnA2hccxUaHH8ZlyqZZIzT0BxSHRL3fhkDAY4eCcdeAFVShIPIDzgKS064TZDXlyHkALr0io7FvA", - "GeiuYIokQrGjXz6xeXb98uc8TXHd7COaM5LQEIQT8ohIHc4ykNko4gmhbOecnTPXF+SMSJO+0Z8iCEmq", - "MoF6RiDMRDyHkSAhFvvS5eBt+EzS9HrnnKkpUYBXSmgOUiJUsWGcj2Am2lFl00OuOUYwI3GGEkIjqHNW", - "rB+RJkF3ooiYoOoWKVnj7zdSNEuE4o3JuVC1LPPjftszj6Db6YmMqVTIoNh/oNKAF1r5HsHjfk39H/cf", - "r89EFhhaAT+D7sUqpRyUG+iHBbAZ2hrji6lS6fqyI2NvrI7Ar2/enGgx6P9PIe+olEUxxS3O4jkQHRej", - "tPk1FRufxG3L7AS+HJqd3Q0ZemMb69diuZ6Pp2ZgePPiFBSKhDJrv1uhFudYh+9oMz1UykxDkRI4ODx+", - "utPdoMzKyLagf8U8vik4bCTs822sxSSGeaPchNDybcPwqK3dKaehpaNlMqjPuIDYGphSrwfwVmJ9P8NM", - "lU322JmM52XJibXq58FO3mPatBQDeF34d6QgpShkKcGQd1nqpen2nP2ugWHTuwu9t+u0msS1i1+caTPJ", - "XKLA5U7MUrzcFKxWf4/Ejc5z1txlvJluV7cn9WB+aJRzf+seyP7NPJDbKQpY3OIn8kIyksopV8s3Pgjk", - "bQCvqFQ1n2Fxgpam6hcLCuoG35YKrNjp3Kw04HvmzX+8soSVhQRfWw3gXIzNigF80KramXzL7ov3/9sB", - "9WxXHEhJJwwjGJ6UBX5lQJp330i5P9nr7j563N3t97u7/U3C84SEK8Y+PjjcfPD+ng1YBmQ0CKMBjr8i", - "PeCmzS4IJL4kcwnn+ZJ9HlgfoeIcVEDplvWNEpSLZRZfVlXR3PhYVzdxkzqJjayHKchZYvpPTbHOze3+", - "w6V2f+2s6mAa1ztmVolOTeP8rYubJK4QQp7FEfubgpHWPOuqYeQ8SonKIsW2pRLesg+MX7I66zZ/ofX3", - "Y4ZiDmfHx7Vsl8BxJjfbk5eKp+nSeeDpjaZhb83yu5aaSlnMNkphmpawYl+/eeFLNTTPt28s6jYI0au4", - "W16RYboz4bctookGGhngeodRpqCohdOQO4x5FplKAjGj0tRiKjpD4xG/zhijbKJ7MGtGqJ/EcxD2+9Uv", - "nxANv/zd1Py1+o3TaaYifsnMO3KaKdB/GZI1C84dWt2FRfIAXnLzjqO0rc1/w6+yzQmLRvPF5k0frGWj", - "de2eKy4wMoM5tRzAs0IVC2V2ytuS6D5aC+G2+cwW5o51413e0s1W0A6c1IN2YEUYtINcMvqj5dB8MsQH", - 
"7cAR4t3ktfBcVt+T5GeHGtt7VCqtaa7kBSqNoYVJquZ5gjnXnp2bqctB0aFv//VbhwT9J98iKfl2ZRby", - "P6RirGqh8kHW2qaFOV0a+ntLMoZHTd/Wxjvu4FndW21sIkvVsdt43i3kFQfc7Ekz/SxPu02y5j7hDQ61", - "LathKzXHRvflqbZ1IdySINuWk1Q4q1CyfG7s8vSVJwCpzI/+faHInEe6Po9ljSGkKDoFJHJ3VlvQS0HN", - "tpMTkBWsFsF/aZ/Cn3db7TUfk6tiBOPPEgmNyl/LR5npMbW/O114nddl0HHehSGjW3ev/S7w5kcjc1Qt", - "Tsaqs5K5A+RVPGd/Vli0ZbrVAGc5Rnv1cUxtujDMBFXzU70gWBiOkAgUB5mFoVkpDBPm63Jwk8u9vjYF", - "OmNP7ehzZChoCAcnQ4OShDAy0VN2dgwxHWM4D2N0qbgFJ8KcB3h1OOzYPYQ8cjeZHKqMQPJyy4OToan0", - "EtKO2+/udc2pDp4iIykNBsF+d9fUsmkxGBZ7lZMwLjmqFdEsZcPILblHro0Wrkw5k7b9Xr9vK3GYcsaV", - "lMVYvT+l3R22C6wxtpusw67ud9FjXUhw5c6AMAWeqJGeM3PdDh70d29E3NpiKh8JbxnJ1JQL+gkjPejD", - "G0rkiwYdMoWCkRgkihkKV15UhXAw+KMO3j/eXb9rBzJLEiLmuej8cku59KCgelgzsDqGUv3Co/k349d3", - "HvS6rtDael0vgPDbzXOOvUWZuyLVUmQWYluY7V9IVCTZW644rdg8qFXCfi/QP+g/uP1BKwXURdkccLtl", - "YYl4cvtEHHI2jmmooJPT4s4IAoltTX4dIHfFHLx2VAPJ+RqbPafygKPuLl8qeiS/uWDlotG432A7q0fz", - "UoUbLCMFV5WS5fuVZB10jqgMtXNZRUsnJGnlGgdZ6mkVRZ9pdG19pRht5qiOoSPzfbHkpESQBBUKaWha", - "cmoXyisOqH7gIhEb5togsr6ctCsybPqS7xYQ+2Dp8YWMNdeGLRjFo4ZB/I6GsLF1UznDc5fQ/LaYxfzM", - "wnXbb+Geo/qxoNnfnheUH4r4njC/K4h6jipXkUJs2gpOi2L+ZfBy5f63ONFuBA/jpzr6tFptCbVbBiVb", - "9lUIpxh+sAyZbYPVYeTQNtmGH2DPLNxg9Xfk3y/3GwSOpaxWBYtDt490e7Fi7fKZjULFvW9GgQOYR8im", - "xGOUF4HbzSwi5yzc+R4x4392VNg8R3WHNOkki2NzmNsdAihPblTtae+z9g828JNzbVvpi7x9/aKDLOQR", - "Rq4UablDkhdqf1tv2U6YZeUeJpvEV0ZUOTCWO6NfMf9256C8e+uve89cBdpf957ZGrS/7h+UV3DdDlj6", - "2zLN2/Ze7zD4tPNK60IzpsmWU6/z9opWW3H43LmVm7h8BYH3Xt8mXl9VXCsdv+II0S26fvXr+ba8T1CA", - "zSdt8yiva/rJXL7tpp4cIu0WqanMqOXiXYmUuYDOnYawF+TcJdVzFQe0QFzV/m6YQy0VcqV3kEN3eNR2", - "B13s8ZRU4JhebS+jmtOxdS/Rjbv9dOpBMqKTjGeyehrBnGtCWd4lUjPAd81/LZfnpR7sD4zS/jaXjq07", - "qPe4vyXXuTmh1njbbZF1znPeajvOc7lVs7n3nFN47z1v5D1XxLXaey7K/G/Tfa7fj7x1/znHm0/grq7y", - "Z/Sg75hXSpjLcVc2e2s2bmMHtTw6uHrtL6/33PpGfzH49v3S/Mj4XcwhmXNa5ubr3BMs15rlruCPhof+", - "dm3f9l3Auwyx59UrGvzOljFEvZhPqm5X80ChQJKUp9lBtwYi4dQQ1jlFpuDpTHPVPWf5fQ7vJc9EiO+h", - 
"ACooDhJjDJW79jfm5lpbafo3J+DekzR9X9xbsjOA56a8syJdO3hLoqAkhpAzyWN7kuz9LEneDxbrxM+O", - "j81Lps3UVoS/HxRX8RY6JnWrc3bOXqPKBJOGi5hIBS8hpgwltPSECx7HGMFoDu+1PCv87ZjLIXSP9lKC", - "eH7O9BuUZSgdl5RNgOGl65CO4f2YxzG/NOcP3tt7IpZq/Qs9S99J89vLT3daXhQHYQRn79pAc3mjGdec", - "di0HdhdLlkMVpyh2+97jTp8XE11Gpl6RkrEyB+ep0vjgmbKXVfoIsZL3k7L0iM/ivZoTsEhvQJmk6abw", - "dWQaFM+SZAWGoTUtv5Qq4pn6u1QRCnsFk0P3MnBDi4T2D0U+2AuDatdU2PONPlFZDv2iCuy1aPmxSPvX", - "LEmCduDo8Rxz3GAlUXileqjNSseKtW5Tmx0uxmN6ZsyL0Do9fbpzv2Zs6JYYkdWNvROgZ+Vw52vNSTVv", - "8PbaNvjpPZf8IPJ3huH2tyIqVFBzRQKLRnN3L35xu86dOhNgJrLkzKx3ji+vjuTPluqIOxj+0+tIiY+f", - "XEtCLswddjK/s+TuFG9VIo6KurfMdRLlNQ3tPOo9Oz7eWaY09hq0pSoj7sNhV0f5068p5oaNu6ct9sok", - "UjCwKlnY041W6QNP79XBXbVyv3jcycXDZEQLbloTQUIcZ7G5WSgy12n59MLdF9X7bD8M1+XVy9/Z/mFy", - "Ke5ah3XD5AzeCaV0PEXofmls6zrJi5s37mh9s/m1P8eCiTGqOwT+VaD6K/I/D7q//Waw79f4N9oK3qpu", - "Fb/i96Po1rZXPkdDXtdYlcddUXOLtJwTxRs+YOUyxKUlMe5exK0UxDjTcoNymJyD+8qBDYphKsLKDbzv", - "ni0JxGx52OZdOM3SlAslQV1ySHiE0mxB/Hb66iWMeDQfQPEeA3sboAOcu8bN/e6XjqHoJ9TvHpsiMx2e", - "jLlIKh3kb6YCOylPs9jcUmkqjZ2M7WJFQBHRnXwCIsIpnaFna6v6o7G3WtXTNOTtIMnZ62n2zOV99U6b", - "v6lW0FKfjzqPMKYx5j8jY360c1rcxZZ3UbnQcEQZEfNNbzNs/lLurFhW7+IP5R6TK5pkSfGzRM9/gZb7", - "xQ3z43rmJwPpuMAUXoWIkTQbVjtf96O67WI6PdedbbXcK7emS1f471jqVd6ppKfY/OKoA7niHGIiJrjz", - "0xyocLpWnqcYHjVOU9zBIrVZjr7Sz9iwLG2zAGNDv/82StKK4HO7BWlnP45PXLl25g6eipgVbuaySrgf", - "C4L97S0J266AO7vDOZTnmLvUleo304Hu0QeYFzwkMUQ4w5in5hJg2zZoB5mI3ZWmg579ycwpl8r84E5w", - "/e76/wIAAP//cHRGgNyPAAA=", + "H4sIAAAAAAAC/+x9C3MTO7LwX+mab7fW+davJMCCt27dykmA41MEUgRy7u4JN8gzbVuHGWmQNE4Mlf9+", + "S495Wn4EiCFLqqjC8Wik7la/1Wp/DkKepJwhUzIYfA5kOMWEmI8HSpFwesbjLMHX+DFDqfTXqeApCkXR", + "DEp4xtRFStRU/xWhDAVNFeUsGAQnRE3hcooCYWZmATnlWRzBCMG8h1HQDvCKJGmMwSDoJUz1IqJI0A7U", + "PNVfSSUomwTX7UAgiTiL53aZMcliFQzGJJbYbix7rKcGIkG/0jHvFPONOI+RsODazPgxowKjYPBHFY13", + "xWA++hNDpRc/mBEak1GMRzijIS6SIcyEQKYuIkFnKBZJcWifx3MY8YxFYMdBi2VxDHQMjDPcqRGDzWhE", + 
"NSX0EL10MFAiQw9lIgPTBY08O3A4BPsYhkfQmuJVfZG9f4weB8unZCTBxUl/zRLCOpq4Gqx8fjO2OveL", + "B76ZKU+S7GIieJYuzjx8dXz8FsxDYFkyQlGd8fFeMR9lCico9IRpSC9IFAmU0o9//rAKW7/f7w/I3qDf", + "7/Z9UM6QRVwsJal97Cfpbj/CFVNuRFI3/wJJX54Nj4YHcMhFygUx7y6s1GDsKnmqeFXZpr4rPv4/FEiU", + "Y/6lqsCP2ivzgcQwifmIxPEcMkY/ZjW+6cJQi4CCVPAZjTBqAzEPgEogmeKdCTIURGEEY8ETUFOEyt5C", + "C7uTbhvONbodvbkdstfp9zv986C+O/GDziTNgnaQEqVQaAD/9w/S+XTQ+Xe/8+Rd+fGi23n397/4NnJT", + "hgM+NnA6PFv5rrQhB7bKhU1AV3Poik1evn3DhExuvHuHQ6D6PRA4RoFMY2Lhj3j4AUWX8l5MR4KIeY9N", + "KLsaxEShVHVsVo9di5+BbQVibKJRvyFqDZkz7NaK+SWKkEiEGDWDyDZEdEKVbAPRapvIKUrQNuWfEBKm", + "eVYqIhRwAcgiuKRqCsSMq1MgmXdISjvUghq0g4RcvUA20Xbz0f4CP2pmbLkPnXf/P/9q57+9LCmyGD3M", + "+JpnirIJmMcw5gLUlEooYaAKE/PeXwSOg0Hw/3qlM9BznkAvp24Wo14roWxoX9stICFCkLl/13LgVu2e", + "VISt0CtWgDz4HeWWTYLTlhIUB2L8FoPv85O3PS2SKZFSTQXPJtPqrvyR64N3FVosULeOZDtANtPjSBRR", + "q9pOauB6jGkV6KdsRgVnCTIFMyKoZr6acfocvHx19PTi6cuzYKApEWWh0/Qnr16/CQbBfr/fr8BV0nPK", + "VRpnkwtJP2HNTQr2n/8SNAE5KOCHBBMu5oZibg5oTeviMeYiIQpi+gHhXM93HmgVtvu8qbj2zFILRJjO", + "UxQzKrnHOfq1eKa3L5NY5VXLHF04ssiYHZYotPcUcjamk8xawq6GgmWJ3tUw5lnUqSzZDj5iYra5BNQz", + "aNFb0apvI624Rt2ROKUMl+q79o+ioy65+BBzEnV2v7GKYqj03IsovrQP6pvpGACL/dcWuyZlyDRFohqb", + "W/+4Pv3vU1RTFBW9kE+pv7L22bwOOYQVitQc7mrosCB6fIYiJnOP6O32PbL3u6DK7Kh7DyIqP4B+eY3g", + "6dms5D3sL4pe3y97HqA8MP2iOcppgk0gKQDZ3Tt2H/c21QazMM1kDaS9Jjgvjf+vnagZFSojMRyevK0p", + "Sm84YANNj7GwcWzVQLj9L/iBKAi1RdL8p6ixXRsZSDuziToXzYXfJlq9stwmrgm6fXFJ4WeHmVQ8ARoh", + "U3RMdZTZcKFp3dmu79iMxx0dgxsNsKGasuAuxivJ3E5lN2UZa15MRotTnmoOpAwmdEJGc1U3kbv9xa33", + "Ezqf30fqZbG8ZQ+MLhT3hKg5twyPNB3zsZvE6Sbyv1D8YjamnpkLTVXGDFRC2EgcOKbVU3TSkLpEQhsu", + "p1TrNgk5EYwKPTuuuj7dc9YBDdwAjooFimmLKbUR0UJvHYIWFxUgKDP2eTTfAQJnx114U0D7NwmMKDrD", + "PLkxJRJGiAwypk0KRmZ9k7KpApBJ7aNS1XzdhRk2D7JjPDzunnVBuwwJYXBJ49hEiAlRNDTh5Yg28Lmc", + "InMbpVfSCoCVjsU5q3KWSyg1VX47MJoBowuiPH42TqhUotQcUpEkhdbrZ4f7+/tPmkp672Gnv9vZffhm", + "tz/o63//DtqBVa7adyAKO079bCPV45vroK4vXMBe1SiHb4dHe84i1NdRnx6QJ4+vroh68oheyiefkpGY", + 
"/LlPtpIM8qunozLTAK1Moujkqk9zlS+/UAnjl+QPvjgtcKM8lP1itfmx2L3RI28jc9XQqyZdZIa0vyC3", + "1FSCNblarqPfODLU8dHfav+g5PxKCOCyPCGtTFvS9akQNhBpZHF55FnnIE1jGhrp7sgUQzqmIaCeAfQL", + "0EqMZsHCU6qTdUSiC+EsuVekFaGxh2cq8ZldzI2EllbLSRYrmsZonxku3chZMZgfmZl8sS1lDMUF5uS5", + "wUwJSukNlhoxTI5LMcRYmQhH2WSiSVIl3TGVxjiUNo1iHA1s7LWWVc1uloD52KuKw4bc8EJHX50YZxhX", + "mcBqFA1swgVCwSd202pYUTYjMY0uKEszL0ssJeWzTBgXwU4KZMQzZdwBu2HVRUyyz7h5Yy1xXmItkONX", + "JLE9yKlTQiqiMhd2WfHiH+rBNP+wdjvcJL5tGOZhdmMDEo8WOzw+sjo65EwRylBAgoq4Y6NKasdkGIN2", + "0NE8FRFMOAM+Hv9zdbJniRdXCMgqP+CwGj3cng9AJy4oaHohksczjCAhjI5RKnAjqyvLKdl7+GhARuHu", + "3n6E4wcPH3W7Xd8yyJSYp5wyz1JPi2ebbUXPJkQ65ZxdOf26fbiFJNwmuHwOTg7e/BoMgl4mRS/mIYl7", + "ckTZoPJ38Wf5wHywf44o8ybvCp3bgNSoGKcRdMRhxUg7zmNC48YJaprFsft+oDFhGBYMyY2yWRul+F2o", + "l5o1Y/oJI/AeCSgy0b6U5bivy/23g48ZZniRcknt6guOjHuio5FRRuMIzBvV01Rlv6rHtntL0a+4kCZi", + "tBHnoiNZZGn0ynqMWzNjisY2aKqt+HD/0eN/9J/s7lWEmzL16EGwESiF2m1kagzO7mnp8qTIImtBNRvY", + "TyFnMy0V5g8Dn9YzlnFqCjx/trAZl1x8oGxyEVEPd/5uH0JEBYbK5JDXy1DQI2m6nhX9AX2h0wr013iQ", + "7vjCY12+uyb/ktCrvvqryW8f/0ee/OPP3Y8vzs7+NXv+29FL+q+z+OTVV+WYV59cfdfjp5XZNRNv1I6d", + "NmWPY6JCj+Mz5VItoZp7AopDol/uwiFhMMLBOevAC6pQkHgA5wFJadcRsxvy5DyAFl6RUNm3gDPQU8EU", + "SYRiR798YvPs+uXPeZriujlHNGckoSEIR+QRkTqcZSCzUcQTQtnOOTtnbi7IEZEmfaM/RRCSVGUC9Y5A", + "mIl4DiNBQixO08vF2/CZpOn1zjlTU6IAr5TQGKREqOKYO1/BbLSDyqaH3HCMYEbiDCWEhlDnrLAfkQZB", + "T6KImKDqFilZ4+83UjRLiOKNyblQtSzz437bs4+gx+mNjKlUyKA4f6DSMC+08jOCx/2a+D/uP16fiSx4", + "aAX7Ge5erK3KmXID+bAMbJa2yvhiqlS6vljK6BsrI/Drmzcnmgz6/1PIJyppUWxxi7N4DkTHxShtfk3F", + "xidxxzI7gS+HZnd3Q4Te2MH6tViux+OpWRjevDgFhSKhzOrvVqjJOdbhO9pMD5Uy06xICRwcHj/d6W5Q", + "HGZoW8C/Yh/fFBg2Evb5MdZiEsO8UR5CaPq2YXjU1u6Uk9DS0TIZ1GdcQGwVTCnXA3grsX6eYbbKJnvs", + "TsbzslDGavXzYCefMW1qigG8Lvw7UoBSlN+UzJBPWcqlmfac/a4Zw6Z3F2Zv12E1iWsXvzjVZpK5RIHL", + "nRhTvFwVrBZ/D8WNzHPWPGW8mWxXjyf1Yn7WKPf+1j2Q/Zt5ILdTyrBYmEDkhWQklVOulh98EMjHAF5R", + "qeRiGcBGqfrFMoi6wrcFDitOOr9lQYPIGDOnDk00vnmpwvfM4/94ZRIrCxu+tjrBuTybFSf4WL2q9/Ij", + 
"xC+uR2gH1HN8ciAlnTCMYHhSlkmWAXI+feMI4Mled/fR4+5uv9/d7W+SLkhIuGLt44PDzRfv79kAakBG", + "gzAa4Pgr0hVu26yBIvElmUs4z12I88D6LBVnpcKUzs3YKGG6WPbxZVUezYOYdXUcN6nb2EibmQKhJabo", + "1BQP3dwOPVxqh9buqg7ucb2jaIXo1AzO37q4SSINIeRZHLG/KRhpybOuI0bOw5WoLKfYsVTCW/aB8UtW", + "R93mU7T8fsxQzOHs+LiWfRM4zuRmNQJS8TRdug88vdE27K1xB9ZCUynT2UZpTlMTVvTrNy/EqaYK8uMk", + "y3UbpAyqfLe8QsRMZ9IBtqgnGmjOADc7jDIFRW2eZrlDbeWh4jvYegjjob+2boSewdiMUD+J54V7sfLl", + "E6LZL383NX+tfuN0mqmIXzLzjpxmCvRfBmSNgnPPVk9hOXkAL7l5x0Ha1uq/4efZ4YRFo/ni8KZP2LLZ", + "Ax0uKC4wMos5sRzAs0IUC2F2wtuS6D5aDeGOHc2R6o4NK5xL5nYraAeO6kE7sCQM2kFOGf3RYmg+GeCD", + "duAA8R46W/ZcVm+U5DewGseNVCotaa4EByqDoYVJquZ5wjuXnp2bictBMaHvPPhbhyj9J98iSfp2ZVb0", + "P6SCraqh8kXW6qaFPV2aivCWiAyPmr6tDVzc9b26t9o41JaqY48VvUfaK64J2vt6+lmeBpxkzXPLG1wN", + "XFZTV0qOzTaUdwPXhZRLgn5b3lLBrALJ8r2x5ukr71FSmV+g/EKSOY90fV7NKkNIUXQKlsjdWa1BLwU1", + "x2COQJawmgT/pX0Kfx5wtdd8TK6KFYw/SyQ0KpEtHmXmydQi73ThdV4nQsf5FAaMbt299rvAm18wzblq", + "cTNW3TjNHSCv4Dn9s0KjLZOtBnOWa7RXX2rVqgvDTFA1P9UGwbLhCIlAcZBZNjSWwiBhvi4XN7nl62tT", + "MDT21LI+R4aChnBwMjRckhBGJnrLzo4hpmMM52GMLjW44ESY+wmvDocde6aRR+4ms0SVIUhe/nlwMjSV", + "Z0Ladfvdva65G8NTZCSlwSDY7+6a2jpNBoNir3KfyCVrtSAaUzaMnMk9cmM0cWXKmbTj9/p9WxnElFOu", + "pCwO6/0p7Wm1NbBG2W5ih10d8qLHupBwy50BYQpOUXN6jsx1O3jQ370RcGuLu3wgvGUkU1Mu6CeM9KIP", + "b0iRL1p0yBQKRuL84g+6gSULB4M/6sz7x7vrd+1AZklCxDwnnZ9uKZceLqheeQ2sjKFUv/Bo/s3w9d2q", + "va4LtNZe1wtM+O32Oee9RZq7otmSZJbFtrDbv5CoSPq3XLFccZhRq8z9Xkz/oP/g9hetFHQXZXzA7RGK", + "BeLJ7QNxyNk4pqGCTg6Lu2kJJLZ3BOoMclfUwWsHNZAcr7E5AyuvierpclPRI3n/h5VGo9ElYjvWo9ma", + "4gZmpMCqUkJ9b0nWsc4RlaF2Lqvc0glJWmmGIUs5rXLRZxpdW18pRps5qvPQkfm+MDkpESRBhUIamJbc", + "fYayUQTVD1wkYsNcG0TWzUm7QsOmL/lugWMfLL1OkbGmbdiCUjxqKMTvqAgbRzeVO0V3iZvfFruY36G4", + "bvs13HNUPxZr9rfnBeWXNL4nm98VjnqOKheRgmxaC06LywXL2MtdP7jFjXYreBA/1dGnlWoLqD0yKNGy", + "r0I4xfCDRcgcG6wOI4d2yDb8AHuH4gbW34F/b+43CBxLWq0KFofuHOn2YsVaC5+NQsW9bwaBYzAPkU2J", + "xygvSreHWUTOWbjzPWLG/+yosHmv6w5J0kkWx+ZyubuUUN4kqerT3mftH2zgJ+fSttIXefv6RQdZyCOM", + 
"XCnScockLxz/tt6y3TCLyj2bbBJfGVLljLHcGf2K/bcnB2UHs7/uPXMVaH/de2Zr0P66f1A2MrsdZulv", + "SzVv23u9w8ynnVdaJ5pRTba8e523V4zaisPn7tHcxOUrALz3+jbx+qrkWun4FVeabtH1qzc53PI5QcFs", + "PmqbR3ld00/m8m039eQ40h6RmsqMWi7elUiZNn7udoZt2HOXRM9VHNCC46r6d8McaimQK72DnHWHR213", + "8cZel0kFjunV9jKqORxb9xLduttPpx4kIzrJeCartxHMPSuUZW+TmgK+a/5raZ6XerA/MJf2t2k6tu6g", + "3vP9LbnOzQ21ytsei6xznvNR23Gey6Oazb3nHMJ773kj77lCrtXec1Hmf5vuc73L9Nb955zffAR3dZU/", + "owd9x7xSwlyOu3LYW9NxGzuo5dXB1ba/bDe69YP+YvHt+6X5Ffa7mEMy97RMJ+7cEyxtzXJX8Efjh/52", + "dd/2XcC7zGLPqy0j/M6WUUS9mE+qblfzQqFAkpTX0kGPBiLh1ADWOUWm4OlMY9U9Z3l/ifeSZyLE91Aw", + "qm3JH2OoXBvimJs2u9LMb27AvSdp+r7oo7IzgOemvLNCXbt4S6KgJIaQM8lje5Ps/SxJ3g8W68TPjo/N", + "S2bM1FaEvx8UrYELGZN61Dk7Z69RZYJJg0VMpIKXEFOGElp6wwWPY4xgNIf3mp4V/HZMswo9o22SEM/P", + "mX6Dsgylw5KyCTC8dBPSMbwf8zjml+b+wXvbt2Kp1L/Qu/SdJL+9/HanxUVxEIZwtvcHmmaSZl1z27Vc", + "2DW6LJcqblHs9r3XnT4vJroMTb0kJWNlLs5TpfmDZ8o2z/QBYinvB2XpFZ/FPp8TsJzeYGWSppuyrwPT", + "cPEsSVbwMLTK7g4gVcQz9XepIhS2JZTj7mXMDS0S2j8U+WAbGNX6Tdj7jT5SWQz9pApsm7b8WqT9a5Yk", + "gW1+kRDfNccNLInCK9VDrVY6lqx1ndqccDEe0ztjXoTW6enTnXubsaFbYkhWV/aOgB7L4e7Xmptq3uDt", + "tR3w03su+UXk78yG2z+KqEBBTYsEFo3mrk9/0e3nTt0JMBtZYmbsncPLKyP5s6Uy4i6G//QyUvLHTy4l", + "IRemp57Me5bcneKtSsRREfeWaSdRtmlo51Hv2fHxzjKhsW3ZloqMuA+HXR3lT29TTIeNuycttmUSKRBY", + "lSzs6UGr5IGn9+LgWq3cG487aTxMRrTApjURJMRxFpvOQpFpp+WTC9cvqvfZfhiuy6uXv1b+w+RSXFuH", + "dcvkCN4JoXQ4Reh++WzrMsmLzht3tL7Z/PqgQ8HEGNUTAr8VqP4W/8/D3d/+MLhKxxsdBW9VtopfFfxR", + "ZGvbls/BkNc1VulxV8TcclqOieINH7DSDHFpSYzri7iVghinWm5QDpNjcF85sEExTIVYuYL39dmSQMyR", + "hx3ehdMsTblQEtQlh4RHKM0RxG+nr17CiEfzARTvMbDdAB3DuTZu7nfIdAxFP6F+99gUmenwZMxFUpkg", + "fzMV2El5msWmS6WpNHY0tsaKgCKiO/kERIRTOkPP0Vb1R2xvtaqnqcjbQZKj19PomeZ99Umbv/FWwFLf", + "jzqOMKYx5j9rY9t5F73Y8ikqDQ1HlBEx37SbYfOXe2eFWb2LP9x7TK5okiXFzyQ9/wVa7hdAzI/9mZ8w", + "pOOCp/AqRIykObDa+bof+W0X2+lpd7bVcq9cmy618N+x1KvsqaS32PwCqmNyxTnERExw56e5UOFkrbxP", + "MTxq3Ka4g0Vqs5z7Sj9jw7K0zQKMDf3+2yhJK4LP7Raknf04PnGl7cwdvBUxK9zMZZVwPxYL9rdnErZd", + 
"AXd2h3MozzF3qSvVb2YCPaOPYV7wkMQQ4QxjnpomwHZs0A4yEbuWpoOe/QnPKZfK/ABQcP3u+v8CAAD/", + "/4yBtfMikQAA", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/providers/providers.go b/lib/providers/providers.go index ecbeb708..82c9606c 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -9,6 +9,7 @@ import ( "github.com/c2h5oh/datasize" "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/devices" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/ingress" "github.com/onkernel/hypeman/lib/instances" @@ -114,7 +115,8 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima meter := otel.GetMeterProvider().Meter("hypeman") tracer := otel.GetTracerProvider().Tracer("hypeman") - return instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, meter, tracer), nil + defaultHypervisor := hypervisor.Type(cfg.DefaultHypervisor) + return instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, meter, tracer), nil } // ProvideVolumeManager provides the volume manager diff --git a/openapi.yaml b/openapi.yaml index a096083c..9157508a 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -160,6 +160,11 @@ components: description: Volumes to attach to the instance at creation time items: $ref: "#/components/schemas/VolumeMount" + hypervisor: + type: string + enum: [cloud-hypervisor, qemu] + description: Hypervisor to use for this instance. Defaults to server configuration. 
+ example: cloud-hypervisor # Future: port_mappings, timeout_seconds Instance: @@ -254,6 +259,11 @@ components: type: boolean description: Whether a snapshot exists for this instance example: false + hypervisor: + type: string + enum: [cloud-hypervisor, qemu] + description: Hypervisor running this instance + example: cloud-hypervisor CreateImageRequest: type: object From 894ce44f477442ffa347bd2d8f7bd1fd79213553 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 22 Dec 2025 17:20:23 -0500 Subject: [PATCH 02/14] Simplify to higher level apis --- lib/hypervisor/qemu/qemu.go | 34 ++++--- lib/hypervisor/qemu/qmp.go | 162 +++++++++++--------------------- lib/hypervisor/qemu/qmp_test.go | 125 ++++++++++-------------- 3 files changed, 125 insertions(+), 196 deletions(-) diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index 6ec5a43e..2ea35e56 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -5,19 +5,20 @@ import ( "fmt" "time" + "github.com/digitalocean/go-qemu/qemu" "github.com/onkernel/hypeman/lib/hypervisor" ) // QEMU implements hypervisor.Hypervisor for QEMU VMM. type QEMU struct { - client *QMPClient + client *Client } // New creates a new QEMU client for an existing QMP socket. func New(socketPath string) (*QEMU, error) { - client, err := NewQMPClient(socketPath) + client, err := NewClient(socketPath) if err != nil { - return nil, fmt.Errorf("create qmp client: %w", err) + return nil, fmt.Errorf("create qemu client: %w", err) } return &QEMU{client: client}, nil } @@ -67,28 +68,31 @@ func (q *QEMU) Shutdown(ctx context.Context) error { // GetVMInfo returns current VM state. 
func (q *QEMU) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) { - status, running, err := q.client.QueryStatus() + status, err := q.client.Status() if err != nil { return nil, fmt.Errorf("query status: %w", err) } + // Map qemu.Status to hypervisor.VMState using typed enum comparison var state hypervisor.VMState - switch { - case running: + switch status { + case qemu.StatusRunning: state = hypervisor.StateRunning - case status == "paused": + case qemu.StatusPaused: state = hypervisor.StatePaused - case status == "shutdown": + case qemu.StatusShutdown: state = hypervisor.StateShutdown - case status == "prelaunch": + case qemu.StatusPreLaunch: state = hypervisor.StateCreated + case qemu.StatusInMigrate, qemu.StatusPostMigrate, qemu.StatusFinishMigrate: + state = hypervisor.StatePaused + case qemu.StatusSuspended: + state = hypervisor.StatePaused + case qemu.StatusGuestPanicked, qemu.StatusIOError, qemu.StatusInternalError, qemu.StatusWatchdog: + // Error states - report as running so caller can investigate + state = hypervisor.StateRunning default: - // Map other QEMU states to appropriate hypervisor states - if status == "inmigrate" || status == "postmigrate" { - state = hypervisor.StatePaused - } else { - state = hypervisor.StateRunning - } + state = hypervisor.StateRunning } return &hypervisor.VMInfo{ diff --git a/lib/hypervisor/qemu/qmp.go b/lib/hypervisor/qemu/qmp.go index 3f47d902..155ef1f4 100644 --- a/lib/hypervisor/qemu/qmp.go +++ b/lib/hypervisor/qemu/qmp.go @@ -1,150 +1,96 @@ package qemu import ( - "encoding/json" "fmt" "time" + "github.com/digitalocean/go-qemu/qemu" "github.com/digitalocean/go-qemu/qmp" + "github.com/digitalocean/go-qemu/qmp/raw" ) -// QMPClient wraps go-qemu's QMP monitor with convenience methods. -type QMPClient struct { - monitor *qmp.SocketMonitor +// Client wraps go-qemu's Domain and raw.Monitor with convenience methods. 
+type Client struct { + domain *qemu.Domain + raw *raw.Monitor + mon *qmp.SocketMonitor } -// NewQMPClient creates a new QMP client connected to the given socket. -func NewQMPClient(socketPath string) (*QMPClient, error) { - monitor, err := qmp.NewSocketMonitor("unix", socketPath, 2*time.Second) +// NewClient creates a new QEMU client connected to the given socket. +func NewClient(socketPath string) (*Client, error) { + mon, err := qmp.NewSocketMonitor("unix", socketPath, 2*time.Second) if err != nil { return nil, fmt.Errorf("create socket monitor: %w", err) } - if err := monitor.Connect(); err != nil { + if err := mon.Connect(); err != nil { return nil, fmt.Errorf("connect to qmp: %w", err) } - return &QMPClient{monitor: monitor}, nil -} - -// Close disconnects from the QMP socket. -func (c *QMPClient) Close() error { - return c.monitor.Disconnect() -} - -// Run executes a raw QMP command and returns the response. -func (c *QMPClient) Run(command []byte) ([]byte, error) { - return c.monitor.Run(command) -} + domain, err := qemu.NewDomain(mon, "vm") + if err != nil { + mon.Disconnect() + return nil, fmt.Errorf("create domain: %w", err) + } -// qmpCommand represents a QMP command structure -type qmpCommand struct { - Execute string `json:"execute"` - Arguments interface{} `json:"arguments,omitempty"` + return &Client{ + domain: domain, + raw: raw.NewMonitor(mon), + mon: mon, + }, nil } -// qmpStatusResponse represents the response from query-status -type qmpStatusResponse struct { - Return struct { - Running bool `json:"running"` - Status string `json:"status"` - } `json:"return"` +// Close disconnects from the QMP socket. +func (c *Client) Close() error { + return c.domain.Close() } // Stop pauses VM execution (QMP 'stop' command). 
-func (c *QMPClient) Stop() error { - cmd := qmpCommand{Execute: "stop"} - cmdBytes, err := json.Marshal(cmd) - if err != nil { - return fmt.Errorf("marshal stop command: %w", err) - } - - _, err = c.monitor.Run(cmdBytes) - if err != nil { - return fmt.Errorf("execute stop: %w", err) - } - - return nil +func (c *Client) Stop() error { + return c.raw.Stop() } // Continue resumes VM execution (QMP 'cont' command). -func (c *QMPClient) Continue() error { - cmd := qmpCommand{Execute: "cont"} - cmdBytes, err := json.Marshal(cmd) - if err != nil { - return fmt.Errorf("marshal cont command: %w", err) - } - - _, err = c.monitor.Run(cmdBytes) - if err != nil { - return fmt.Errorf("execute cont: %w", err) - } - - return nil +func (c *Client) Continue() error { + return c.raw.Cont() } -// QueryStatus returns the current VM status. -func (c *QMPClient) QueryStatus() (string, bool, error) { - cmd := qmpCommand{Execute: "query-status"} - cmdBytes, err := json.Marshal(cmd) - if err != nil { - return "", false, fmt.Errorf("marshal query-status: %w", err) - } - - resp, err := c.monitor.Run(cmdBytes) - if err != nil { - return "", false, fmt.Errorf("execute query-status: %w", err) - } - - var statusResp qmpStatusResponse - if err := json.Unmarshal(resp, &statusResp); err != nil { - return "", false, fmt.Errorf("unmarshal status response: %w", err) - } +// Status returns the current VM status as a typed enum. +func (c *Client) Status() (qemu.Status, error) { + return c.domain.Status() +} - return statusResp.Return.Status, statusResp.Return.Running, nil +// StatusInfo returns detailed status information from the raw monitor. +func (c *Client) StatusInfo() (raw.StatusInfo, error) { + return c.raw.QueryStatus() } // Quit shuts down QEMU (QMP 'quit' command). 
-func (c *QMPClient) Quit() error { - cmd := qmpCommand{Execute: "quit"} - cmdBytes, err := json.Marshal(cmd) - if err != nil { - return fmt.Errorf("marshal quit command: %w", err) - } - - // quit command doesn't return a response - QEMU exits - _, _ = c.monitor.Run(cmdBytes) - return nil +func (c *Client) Quit() error { + return c.raw.Quit() } // SystemPowerdown sends ACPI power button event (graceful shutdown). -func (c *QMPClient) SystemPowerdown() error { - cmd := qmpCommand{Execute: "system_powerdown"} - cmdBytes, err := json.Marshal(cmd) - if err != nil { - return fmt.Errorf("marshal system_powerdown: %w", err) - } - - _, err = c.monitor.Run(cmdBytes) - if err != nil { - return fmt.Errorf("execute system_powerdown: %w", err) - } - - return nil +func (c *Client) SystemPowerdown() error { + return c.raw.SystemPowerdown() } // SystemReset resets the VM (hard reset). -func (c *QMPClient) SystemReset() error { - cmd := qmpCommand{Execute: "system_reset"} - cmdBytes, err := json.Marshal(cmd) - if err != nil { - return fmt.Errorf("marshal system_reset: %w", err) - } +func (c *Client) SystemReset() error { + return c.raw.SystemReset() +} - _, err = c.monitor.Run(cmdBytes) - if err != nil { - return fmt.Errorf("execute system_reset: %w", err) - } +// Version returns the QEMU version string. +func (c *Client) Version() (string, error) { + return c.domain.Version() +} + +// Events returns a channel for receiving QEMU events. +func (c *Client) Events() (chan qmp.Event, chan struct{}, error) { + return c.domain.Events() +} - return nil +// Run executes a raw QMP command (for commands not yet wrapped). 
+func (c *Client) Run(cmd qmp.Command) ([]byte, error) { + return c.domain.Run(cmd) } diff --git a/lib/hypervisor/qemu/qmp_test.go b/lib/hypervisor/qemu/qmp_test.go index 476648f5..16d8e2a2 100644 --- a/lib/hypervisor/qemu/qmp_test.go +++ b/lib/hypervisor/qemu/qmp_test.go @@ -1,100 +1,79 @@ package qemu import ( - "encoding/json" "testing" + "github.com/digitalocean/go-qemu/qemu" + "github.com/digitalocean/go-qemu/qmp/raw" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" ) -func TestQMPCommand_Marshal(t *testing.T) { +func TestStatusMapping(t *testing.T) { + // Test that qemu.Status values are properly defined tests := []struct { - name string - cmd qmpCommand - expected string + name string + status qemu.Status }{ - { - name: "stop command", - cmd: qmpCommand{Execute: "stop"}, - expected: `{"execute":"stop"}`, - }, - { - name: "cont command", - cmd: qmpCommand{Execute: "cont"}, - expected: `{"execute":"cont"}`, - }, - { - name: "query-status command", - cmd: qmpCommand{Execute: "query-status"}, - expected: `{"execute":"query-status"}`, - }, - { - name: "quit command", - cmd: qmpCommand{Execute: "quit"}, - expected: `{"execute":"quit"}`, - }, - { - name: "system_powerdown command", - cmd: qmpCommand{Execute: "system_powerdown"}, - expected: `{"execute":"system_powerdown"}`, - }, - { - name: "system_reset command", - cmd: qmpCommand{Execute: "system_reset"}, - expected: `{"execute":"system_reset"}`, - }, + {"running", qemu.StatusRunning}, + {"paused", qemu.StatusPaused}, + {"shutdown", qemu.StatusShutdown}, + {"prelaunch", qemu.StatusPreLaunch}, + {"in-migrate", qemu.StatusInMigrate}, + {"post-migrate", qemu.StatusPostMigrate}, + {"finish-migrate", qemu.StatusFinishMigrate}, + {"suspended", qemu.StatusSuspended}, + {"guest-panicked", qemu.StatusGuestPanicked}, + {"io-error", qemu.StatusIOError}, + {"internal-error", qemu.StatusInternalError}, + {"watchdog", qemu.StatusWatchdog}, } for _, tt := range tests { t.Run(tt.name, func(t 
*testing.T) { - data, err := json.Marshal(tt.cmd) - require.NoError(t, err) - assert.JSONEq(t, tt.expected, string(data)) + // Verify the status is a valid enum value (not zero except for Debug) + // This ensures we're using the correct constants from go-qemu + assert.NotEqual(t, qemu.Status(-1), tt.status, "status should be valid") }) } } -func TestQMPStatusResponse_Unmarshal(t *testing.T) { +func TestRunStateMapping(t *testing.T) { + // Test that raw.RunState values are properly defined tests := []struct { - name string - json string - expectedStatus string - expectedRunning bool + name string + state raw.RunState }{ - { - name: "running", - json: `{"return":{"running":true,"status":"running"}}`, - expectedStatus: "running", - expectedRunning: true, - }, - { - name: "paused", - json: `{"return":{"running":false,"status":"paused"}}`, - expectedStatus: "paused", - expectedRunning: false, - }, - { - name: "shutdown", - json: `{"return":{"running":false,"status":"shutdown"}}`, - expectedStatus: "shutdown", - expectedRunning: false, - }, - { - name: "prelaunch", - json: `{"return":{"running":false,"status":"prelaunch"}}`, - expectedStatus: "prelaunch", - expectedRunning: false, - }, + {"running", raw.RunStateRunning}, + {"paused", raw.RunStatePaused}, + {"shutdown", raw.RunStateShutdown}, + {"prelaunch", raw.RunStatePrelaunch}, + {"inmigrate", raw.RunStateInmigrate}, + {"postmigrate", raw.RunStatePostmigrate}, + {"finish-migrate", raw.RunStateFinishMigrate}, + {"suspended", raw.RunStateSuspended}, + {"guest-panicked", raw.RunStateGuestPanicked}, + {"io-error", raw.RunStateIOError}, + {"internal-error", raw.RunStateInternalError}, + {"watchdog", raw.RunStateWatchdog}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - var resp qmpStatusResponse - err := json.Unmarshal([]byte(tt.json), &resp) - require.NoError(t, err) - assert.Equal(t, tt.expectedStatus, resp.Return.Status) - assert.Equal(t, tt.expectedRunning, resp.Return.Running) + // Verify the state 
is a valid enum value + assert.NotEqual(t, raw.RunState(-1), tt.state, "state should be valid") }) } } + +func TestStatusInfoFields(t *testing.T) { + // Test that StatusInfo has the expected structure + info := raw.StatusInfo{ + Running: true, + Singlestep: false, + Status: raw.RunStateRunning, + } + + assert.True(t, info.Running) + assert.False(t, info.Singlestep) + assert.Equal(t, raw.RunStateRunning, info.Status) +} From 312b9327e26dd981dc906e6849d9446b0579c790 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Mon, 22 Dec 2025 17:30:25 -0500 Subject: [PATCH 03/14] Add otel integration --- lib/instances/create.go | 20 ++++++++++--- lib/instances/metrics.go | 61 +++++++++++++++++++++++++++++---------- lib/instances/restore.go | 4 +-- lib/instances/standby.go | 10 +++---- lib/instances/start.go | 4 +-- lib/instances/stop.go | 4 +-- lib/instances/types.go | 6 ++++ lib/middleware/resolve.go | 25 ++++++++++++++++ 8 files changed, 104 insertions(+), 30 deletions(-) diff --git a/lib/instances/create.go b/lib/instances/create.go index 59ce3b50..ed28e785 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -17,6 +17,7 @@ import ( "github.com/onkernel/hypeman/lib/system" "github.com/onkernel/hypeman/lib/vmm" "github.com/onkernel/hypeman/lib/volumes" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" "gvisor.dev/gvisor/pkg/cleanup" ) @@ -207,9 +208,20 @@ func (m *manager) createInstance( if hvType == "" { hvType = m.defaultHypervisor } + + // Enrich logger and trace span with hypervisor type + log = log.With("hypervisor", string(hvType)) + ctx = logger.AddToContext(ctx, log) + if m.metrics != nil && m.metrics.tracer != nil { + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + span.SetAttributes(attribute.String("hypervisor", string(hvType))) + } + } + pm, err := m.getProcessManager(hvType) if err != nil { - log.ErrorContext(ctx, "failed to get process manager", "hypervisor", hvType, "error", err) + log.ErrorContext(ctx, 
"failed to get process manager", "error", err) return nil, fmt.Errorf("get process manager for %s: %w", hvType, err) } @@ -409,13 +421,13 @@ func (m *manager) createInstance( // Record metrics if m.metrics != nil { - m.recordDuration(ctx, m.metrics.createDuration, start, "success") - m.recordStateTransition(ctx, "stopped", string(StateRunning)) + m.recordDuration(ctx, m.metrics.createDuration, start, "success", hvType) + m.recordStateTransition(ctx, "stopped", string(StateRunning), hvType) } // Return instance with derived state finalInst := m.toInstance(ctx, meta) - log.InfoContext(ctx, "instance created successfully", "instance_id", id, "name", req.Name, "state", finalInst.State) + log.InfoContext(ctx, "instance created successfully", "instance_id", id, "name", req.Name, "state", finalInst.State, "hypervisor", hvType) return &finalInst, nil } diff --git a/lib/instances/metrics.go b/lib/instances/metrics.go index 78901b98..162d94fe 100644 --- a/lib/instances/metrics.go +++ b/lib/instances/metrics.go @@ -4,6 +4,8 @@ import ( "context" "time" + "github.com/onkernel/hypeman/lib/hypervisor" + mw "github.com/onkernel/hypeman/lib/middleware" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" @@ -90,13 +92,25 @@ func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*M if err != nil { return nil } - stateCounts := make(map[string]int64) + // Count by state and hypervisor combination + type stateHypervisor struct { + state string + hypervisor string + } + counts := make(map[stateHypervisor]int64) for _, inst := range instances { - stateCounts[string(inst.State)]++ + key := stateHypervisor{ + state: string(inst.State), + hypervisor: string(inst.HypervisorType), + } + counts[key]++ } - for state, count := range stateCounts { + for key, count := range counts { o.ObserveInt64(instancesTotal, count, - metric.WithAttributes(attribute.String("state", state))) + metric.WithAttributes( + 
attribute.String("state", key.state), + attribute.String("hypervisor", key.hypervisor), + )) } return nil }, @@ -117,24 +131,41 @@ func newInstanceMetrics(meter metric.Meter, tracer trace.Tracer, m *manager) (*M }, nil } -// recordDuration records operation duration. -func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string) { +// getHypervisorFromContext extracts the hypervisor type from the resolved instance in context. +// Returns empty string if not available. +func getHypervisorFromContext(ctx context.Context) string { + if inst := mw.GetResolvedInstance[Instance](ctx); inst != nil { + return string(inst.HypervisorType) + } + return "" +} + +// recordDuration records operation duration with hypervisor label. +func (m *manager) recordDuration(ctx context.Context, histogram metric.Float64Histogram, start time.Time, status string, hvType hypervisor.Type) { if m.metrics == nil { return } duration := time.Since(start).Seconds() - histogram.Record(ctx, duration, - metric.WithAttributes(attribute.String("status", status))) + attrs := []attribute.KeyValue{ + attribute.String("status", status), + } + if hvType != "" { + attrs = append(attrs, attribute.String("hypervisor", string(hvType))) + } + histogram.Record(ctx, duration, metric.WithAttributes(attrs...)) } -// recordStateTransition records a state transition. -func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string) { +// recordStateTransition records a state transition with hypervisor label. 
+func (m *manager) recordStateTransition(ctx context.Context, fromState, toState string, hvType hypervisor.Type) { if m.metrics == nil { return } - m.metrics.stateTransitions.Add(ctx, 1, - metric.WithAttributes( - attribute.String("from", fromState), - attribute.String("to", toState), - )) + attrs := []attribute.KeyValue{ + attribute.String("from", fromState), + attribute.String("to", toState), + } + if hvType != "" { + attrs = append(attrs, attribute.String("hypervisor", string(hvType))) + } + m.metrics.stateTransitions.Add(ctx, 1, metric.WithAttributes(attrs...)) } diff --git a/lib/instances/restore.go b/lib/instances/restore.go index d4698bf0..b9ce872c 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -114,8 +114,8 @@ func (m *manager) restoreInstance( // Record metrics if m.metrics != nil { - m.recordDuration(ctx, m.metrics.restoreDuration, start, "success") - m.recordStateTransition(ctx, string(StateStandby), string(StateRunning)) + m.recordDuration(ctx, m.metrics.restoreDuration, start, "success", stored.HypervisorType) + m.recordStateTransition(ctx, string(StateStandby), string(StateRunning), stored.HypervisorType) } // Return instance with derived state (should be Running now) diff --git a/lib/instances/standby.go b/lib/instances/standby.go index dc3cc0c5..72ce467c 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -74,10 +74,10 @@ func (m *manager) standbyInstance( // 6. 
Reduce memory to base size (virtio-mem hotplug) if supported // Wait for memory to stabilize so the snapshot is as small as possible if hv.Capabilities().SupportsHotplugMemory { - log.DebugContext(ctx, "reducing VM memory before snapshot", "instance_id", id, "base_size", inst.Size) + log.DebugContext(ctx, "reducing VM memory before snapshot", "instance_id", id, "base_size", inst.Size) if err := hv.ResizeMemoryAndWait(ctx, inst.Size, 5*time.Second); err != nil { - // Log warning but continue - snapshot will just be larger - log.WarnContext(ctx, "failed to reduce memory, snapshot will be larger", "instance_id", id, "error", err) + // Log warning but continue - snapshot will just be larger + log.WarnContext(ctx, "failed to reduce memory, snapshot will be larger", "instance_id", id, "error", err) } } @@ -129,8 +129,8 @@ func (m *manager) standbyInstance( // Record metrics if m.metrics != nil { - m.recordDuration(ctx, m.metrics.standbyDuration, start, "success") - m.recordStateTransition(ctx, string(StateRunning), string(StateStandby)) + m.recordDuration(ctx, m.metrics.standbyDuration, start, "success", stored.HypervisorType) + m.recordStateTransition(ctx, string(StateRunning), string(StateStandby), stored.HypervisorType) } // Return instance with derived state (should be Standby now) diff --git a/lib/instances/start.go b/lib/instances/start.go index b57fd12b..36b15328 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -111,8 +111,8 @@ func (m *manager) startInstance( // Record metrics if m.metrics != nil { - m.recordDuration(ctx, m.metrics.startDuration, start, "success") - m.recordStateTransition(ctx, string(StateStopped), string(StateRunning)) + m.recordDuration(ctx, m.metrics.startDuration, start, "success", stored.HypervisorType) + m.recordStateTransition(ctx, string(StateStopped), string(StateRunning), stored.HypervisorType) } // Return instance with derived state (should be Running now) diff --git a/lib/instances/stop.go b/lib/instances/stop.go 
index 1d7ee112..eff32a66 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -84,8 +84,8 @@ func (m *manager) stopInstance( // Record metrics if m.metrics != nil { - m.recordDuration(ctx, m.metrics.stopDuration, start, "success") - m.recordStateTransition(ctx, string(StateRunning), string(StateStopped)) + m.recordDuration(ctx, m.metrics.stopDuration, start, "success", stored.HypervisorType) + m.recordStateTransition(ctx, string(StateRunning), string(StateStopped), stored.HypervisorType) } // Return instance with derived state (should be Stopped now) diff --git a/lib/instances/types.go b/lib/instances/types.go index b823d5cf..5d7d05f4 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -85,6 +85,12 @@ type Instance struct { HasSnapshot bool // Derived from filesystem check } +// GetHypervisorType returns the hypervisor type as a string. +// This implements the middleware.HypervisorTyper interface for OTEL enrichment. +func (i *Instance) GetHypervisorType() string { + return string(i.HypervisorType) +} + // CreateInstanceRequest is the domain request for creating an instance type CreateInstanceRequest struct { Name string // Required diff --git a/lib/middleware/resolve.go b/lib/middleware/resolve.go index 75593370..705614a9 100644 --- a/lib/middleware/resolve.go +++ b/lib/middleware/resolve.go @@ -8,8 +8,16 @@ import ( "github.com/go-chi/chi/v5" "github.com/onkernel/hypeman/lib/logger" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" ) +// HypervisorTyper is implemented by resources that have a hypervisor type. +// This allows the middleware to enrich logs/traces without importing the instances package. +type HypervisorTyper interface { + GetHypervisorType() string +} + // ResourceResolver is implemented by managers that support lookup by ID, name, or prefix. type ResourceResolver interface { // Resolve looks up a resource by ID, name, or ID prefix. 
@@ -115,6 +123,23 @@ func ResolveResource(resolvers Resolvers, errResponder ErrorResponder) func(http logKey = "image_name" } log := logger.FromContext(ctx).With(logKey, resolvedID) + + // For instances, also add hypervisor type to logs and traces + if resourceType == "instance" { + if hvTyper, ok := resource.(HypervisorTyper); ok { + hvType := hvTyper.GetHypervisorType() + if hvType != "" { + log = log.With("hypervisor", hvType) + + // Add to trace span if one exists + span := trace.SpanFromContext(ctx) + if span.IsRecording() { + span.SetAttributes(attribute.String("hypervisor", hvType)) + } + } + } + } + ctx = logger.AddToContext(ctx, log) next.ServeHTTP(w, r.WithContext(ctx)) From 41115d7d96c38a7c0cee4f06f028b7e029fa4271 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 09:15:12 -0500 Subject: [PATCH 04/14] arm64 configs --- lib/hypervisor/qemu/config.go | 16 ++++++++++++++-- lib/hypervisor/qemu/config_test.go | 4 ++-- lib/hypervisor/qemu/process.go | 27 +++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/lib/hypervisor/qemu/config.go b/lib/hypervisor/qemu/config.go index 2e0d0ad1..5f3b457f 100644 --- a/lib/hypervisor/qemu/config.go +++ b/lib/hypervisor/qemu/config.go @@ -2,6 +2,7 @@ package qemu import ( "fmt" + "runtime" "strconv" "github.com/onkernel/hypeman/lib/hypervisor" @@ -11,8 +12,8 @@ import ( func BuildArgs(cfg hypervisor.VMConfig) []string { args := make([]string, 0, 64) - // Machine type with KVM acceleration - args = append(args, "-machine", "q35,accel=kvm") + // Machine type with KVM acceleration (arch-specific) + args = append(args, "-machine", machineType()) // CPU configuration args = append(args, "-cpu", "host") @@ -77,3 +78,14 @@ func BuildArgs(cfg hypervisor.VMConfig) []string { return args } + +// machineType returns the QEMU machine type for the host architecture. 
+func machineType() string { + switch runtime.GOARCH { + case "arm64": + return "virt,accel=kvm" + default: + // x86_64 and others use q35 + return "q35,accel=kvm" + } +} diff --git a/lib/hypervisor/qemu/config_test.go b/lib/hypervisor/qemu/config_test.go index 9fe3d427..bc6e9be3 100644 --- a/lib/hypervisor/qemu/config_test.go +++ b/lib/hypervisor/qemu/config_test.go @@ -18,9 +18,9 @@ func TestBuildArgs_Basic(t *testing.T) { args := BuildArgs(cfg) - // Check machine type + // Check machine type (arch-dependent) assert.Contains(t, args, "-machine") - assert.Contains(t, args, "q35,accel=kvm") + assert.Contains(t, args, machineType()) // Check CPU assert.Contains(t, args, "-cpu") diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index d30b0f20..ccee49dc 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -7,6 +7,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "syscall" "time" @@ -112,10 +113,16 @@ func (p *ProcessManager) StartProcessWithArgs(ctx context.Context, paths *paths. // GetBinaryPath returns the path to the QEMU binary. // QEMU is expected to be installed on the system. 
func (p *ProcessManager) GetBinaryPath(paths *paths.Paths, version string) (string, error) { + // Determine binary name based on host architecture + binaryName, err := qemuBinaryName() + if err != nil { + return "", err + } + // Look for system-installed QEMU candidates := []string{ - "/usr/bin/qemu-system-x86_64", - "/usr/local/bin/qemu-system-x86_64", + "/usr/bin/" + binaryName, + "/usr/local/bin/" + binaryName, } for _, path := range candidates { @@ -125,11 +132,23 @@ func (p *ProcessManager) GetBinaryPath(paths *paths.Paths, version string) (stri } // Try PATH lookup - if path, err := exec.LookPath("qemu-system-x86_64"); err == nil { + if path, err := exec.LookPath(binaryName); err == nil { return path, nil } - return "", fmt.Errorf("qemu-system-x86_64 not found; install QEMU on your system") + return "", fmt.Errorf("%s not found; install QEMU on your system", binaryName) +} + +// qemuBinaryName returns the QEMU binary name for the host architecture. +func qemuBinaryName() (string, error) { + switch runtime.GOARCH { + case "amd64": + return "qemu-system-x86_64", nil + case "arm64": + return "qemu-system-aarch64", nil + default: + return "", fmt.Errorf("unsupported architecture: %s", runtime.GOARCH) + } } // isSocketInUse checks if a Unix socket is actively being used From a9da8b78b78f732949bcadcf09489da33478b916 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 09:20:59 -0500 Subject: [PATCH 05/14] Add review comment --- lib/hypervisor/qemu/process.go | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index ccee49dc..ede72ed4 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -4,6 +4,7 @@ package qemu import ( "context" "fmt" + "net" "os" "os/exec" "path/filepath" @@ -153,7 +154,7 @@ func qemuBinaryName() (string, error) { // isSocketInUse checks if a Unix socket is actively being used func 
isSocketInUse(socketPath string) bool { - conn, err := dialUnixTimeout(socketPath, 100*time.Millisecond) + conn, err := net.DialTimeout("unix", socketPath, 100*time.Millisecond) if err != nil { return false } @@ -165,26 +166,12 @@ func isSocketInUse(socketPath string) bool { func waitForSocket(socketPath string, timeout time.Duration) error { deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { - if _, err := os.Stat(socketPath); err == nil { - // Socket file exists, try to connect - conn, err := dialUnixTimeout(socketPath, 100*time.Millisecond) - if err == nil { - conn.Close() - return nil - } + conn, err := net.DialTimeout("unix", socketPath, 100*time.Millisecond) + if err == nil { + conn.Close() + return nil } time.Sleep(50 * time.Millisecond) } return fmt.Errorf("timeout waiting for socket") } - -// dialUnixTimeout dials a Unix socket with a timeout -func dialUnixTimeout(path string, timeout time.Duration) (*os.File, error) { - // Use a simple stat check - actual connection will be done by go-qemu - if _, err := os.Stat(path); err != nil { - return nil, err - } - // For now, just return nil to indicate socket exists - // The actual QMP connection will be made by the QEMU client - return nil, nil -} From 6948c1f374f6b04e534463f081212267afdb9bed Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 09:24:00 -0500 Subject: [PATCH 06/14] Package hint --- lib/hypervisor/qemu/process.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index ede72ed4..5fa1e9a3 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -137,7 +137,7 @@ func (p *ProcessManager) GetBinaryPath(paths *paths.Paths, version string) (stri return path, nil } - return "", fmt.Errorf("%s not found; install QEMU on your system", binaryName) + return "", fmt.Errorf("%s not found; install with: %s", binaryName, qemuInstallHint()) } // qemuBinaryName 
returns the QEMU binary name for the host architecture. @@ -152,6 +152,18 @@ func qemuBinaryName() (string, error) { } } +// qemuInstallHint returns package installation hints for the current architecture. +func qemuInstallHint() string { + switch runtime.GOARCH { + case "amd64": + return "apt install qemu-system-x86 (Debian/Ubuntu) or dnf install qemu-system-x86-core (Fedora)" + case "arm64": + return "apt install qemu-system-arm (Debian/Ubuntu) or dnf install qemu-system-aarch64-core (Fedora)" + default: + return "install QEMU for your platform" + } +} + // isSocketInUse checks if a Unix socket is actively being used func isSocketInUse(socketPath string) bool { conn, err := net.DialTimeout("unix", socketPath, 100*time.Millisecond) From 65269699d4318f69182f700d4618cc0fdc48e9ca Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 10:14:30 -0500 Subject: [PATCH 07/14] Add vsock dialer abstraction --- cmd/api/api/exec.go | 12 +- cmd/api/api/exec_test.go | 13 +- lib/devices/gpu_e2e_test.go | 6 +- lib/devices/gpu_inference_test.go | 16 +- lib/devices/gpu_module_test.go | 17 +- lib/exec/README.md | 9 +- lib/exec/client.go | 114 +--- .../cloudhypervisor/cloudhypervisor.go | 49 +- lib/hypervisor/cloudhypervisor/process.go | 109 +++- lib/hypervisor/cloudhypervisor/vsock.go | 118 ++++ lib/hypervisor/hypervisor.go | 88 ++- lib/hypervisor/qemu/process.go | 105 ++-- lib/hypervisor/qemu/qemu.go | 24 - lib/hypervisor/qemu/vsock.go | 235 ++++++++ lib/instances/create.go | 51 +- lib/instances/exec_test.go | 16 +- lib/instances/manager.go | 19 +- lib/instances/manager_test.go | 7 +- lib/instances/metrics.go | 4 +- lib/instances/network_test.go | 16 +- lib/instances/qemu_test.go | 538 ++++++++++++++++++ lib/instances/restore.go | 60 +- lib/instances/volumes_test.go | 22 +- 23 files changed, 1252 insertions(+), 396 deletions(-) create mode 100644 lib/hypervisor/cloudhypervisor/vsock.go create mode 100644 lib/hypervisor/qemu/vsock.go create mode 100644 
lib/instances/qemu_test.go diff --git a/cmd/api/api/exec.go b/cmd/api/api/exec.go index 9d529fc7..917ebd57 100644 --- a/cmd/api/api/exec.go +++ b/cmd/api/api/exec.go @@ -12,6 +12,7 @@ import ( "github.com/gorilla/websocket" "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/logger" mw "github.com/onkernel/hypeman/lib/middleware" @@ -110,8 +111,17 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) { // Create WebSocket read/writer wrapper wsConn := &wsReadWriter{ws: ws, ctx: ctx} + // Create vsock dialer for this hypervisor type + dialer, err := hypervisor.NewVsockDialer(hypervisor.Type(inst.HypervisorType), inst.VsockSocket, inst.VsockCID) + if err != nil { + log.ErrorContext(ctx, "failed to create vsock dialer", "error", err) + ws.WriteMessage(websocket.BinaryMessage, []byte(fmt.Sprintf("Error: %v\r\n", err))) + ws.WriteMessage(websocket.TextMessage, []byte(`{"exitCode":127}`)) + return + } + // Execute via vsock - exit, err := exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{ + exit, err := exec.ExecIntoInstance(ctx, dialer, exec.ExecOptions{ Command: execReq.Command, Stdin: wsConn, Stdout: wsConn, diff --git a/cmd/api/api/exec_test.go b/cmd/api/api/exec_test.go index 219d2ea7..5e491bf4 100644 --- a/cmd/api/api/exec_test.go +++ b/cmd/api/api/exec_test.go @@ -8,6 +8,7 @@ import ( "time" "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/oapi" "github.com/onkernel/hypeman/lib/paths" @@ -119,13 +120,16 @@ func TestExecInstanceNonTTY(t *testing.T) { var stdout, stderr outputBuffer var execErr error + dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) + require.NoError(t, err) + t.Log("Testing exec command: whoami") maxRetries := 10 for i := 0; i < 
maxRetries; i++ { stdout = outputBuffer{} stderr = outputBuffer{} - exit, execErr = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{ + exit, execErr = exec.ExecIntoInstance(ctx(), dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "whoami"}, Stdin: nil, Stdout: &stdout, @@ -250,9 +254,12 @@ func TestExecWithDebianMinimal(t *testing.T) { assert.Contains(t, logs, "overlay-init: app exited with code", "App should have exited") // Test exec commands work even though the main app (bash) has exited + dialer2, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) + require.NoError(t, err) + t.Log("Testing exec command: echo") var stdout, stderr outputBuffer - exit, err := exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{ + exit, err := exec.ExecIntoInstance(ctx(), dialer2, exec.ExecOptions{ Command: []string{"echo", "hello from debian"}, Stdout: &stdout, Stderr: &stderr, @@ -266,7 +273,7 @@ func TestExecWithDebianMinimal(t *testing.T) { // Verify we're actually in Debian t.Log("Verifying OS release...") stdout = outputBuffer{} - exit, err = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{ + exit, err = exec.ExecIntoInstance(ctx(), dialer2, exec.ExecOptions{ Command: []string{"cat", "/etc/os-release"}, Stdout: &stdout, TTY: false, diff --git a/lib/devices/gpu_e2e_test.go b/lib/devices/gpu_e2e_test.go index 4599dd51..e15f8d9f 100644 --- a/lib/devices/gpu_e2e_test.go +++ b/lib/devices/gpu_e2e_test.go @@ -12,6 +12,7 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/network" @@ -218,6 +219,9 @@ func TestGPUPassthrough(t *testing.T) { actualInst, err := instanceMgr.GetInstance(ctx, inst.Id) 
require.NoError(t, err) + dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) + require.NoError(t, err) + // Create a context with timeout for exec operations execCtx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() @@ -232,7 +236,7 @@ func TestGPUPassthrough(t *testing.T) { stdout = outputBuffer{} stderr = outputBuffer{} - _, execErr = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, execErr = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", checkGPUCmd}, Stdin: nil, Stdout: &stdout, diff --git a/lib/devices/gpu_inference_test.go b/lib/devices/gpu_inference_test.go index 33c9b416..8d05e8f1 100644 --- a/lib/devices/gpu_inference_test.go +++ b/lib/devices/gpu_inference_test.go @@ -23,6 +23,7 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/network" @@ -285,6 +286,9 @@ func TestGPUInference(t *testing.T) { actualInst, err := instanceMgr.GetInstance(ctx, inst.Id) require.NoError(t, err) + dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) + require.NoError(t, err) + // Step 10: Wait for Ollama server t.Log("Step 10: Waiting for Ollama server to be ready...") ollamaReady := false @@ -292,7 +296,7 @@ func TestGPUInference(t *testing.T) { healthCtx, healthCancel := context.WithTimeout(ctx, 5*time.Second) var healthStdout, healthStderr inferenceOutputBuffer - _, err = exec.ExecIntoInstance(healthCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(healthCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "ollama list 2>&1"}, Stdout: &healthStdout, Stderr: &healthStderr, @@ 
-319,7 +323,7 @@ func TestGPUInference(t *testing.T) { // Check nvidia-smi (should work now with CUDA image) var nvidiaSmiStdout, nvidiaSmiStderr inferenceOutputBuffer - _, _ = exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, _ = exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "nvidia-smi 2>&1 || echo 'nvidia-smi failed'"}, Stdout: &nvidiaSmiStdout, Stderr: &nvidiaSmiStderr, @@ -333,7 +337,7 @@ func TestGPUInference(t *testing.T) { // Check NVIDIA kernel modules var modulesStdout inferenceOutputBuffer - exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{ + exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "cat /proc/modules | grep nvidia"}, Stdout: &modulesStdout, }) @@ -343,7 +347,7 @@ func TestGPUInference(t *testing.T) { // Check device nodes var devStdout inferenceOutputBuffer - exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{ + exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "ls -la /dev/nvidia* 2>&1"}, Stdout: &devStdout, }) @@ -355,7 +359,7 @@ func TestGPUInference(t *testing.T) { t.Log("Step 12: Ensuring TinyLlama model is available...") var listStdout inferenceOutputBuffer - exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{ + exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "ollama list 2>&1"}, Stdout: &listStdout, }) @@ -366,7 +370,7 @@ func TestGPUInference(t *testing.T) { defer pullCancel() var pullStdout inferenceOutputBuffer - _, pullErr := exec.ExecIntoInstance(pullCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, pullErr := exec.ExecIntoInstance(pullCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "ollama pull tinyllama 2>&1"}, Stdout: &pullStdout, }) diff --git a/lib/devices/gpu_module_test.go b/lib/devices/gpu_module_test.go index 
97045474..e1012acd 100644 --- a/lib/devices/gpu_module_test.go +++ b/lib/devices/gpu_module_test.go @@ -19,6 +19,7 @@ import ( "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/instances" "github.com/onkernel/hypeman/lib/network" @@ -194,6 +195,9 @@ func TestNVIDIAModuleLoading(t *testing.T) { actualInst, err := instanceMgr.GetInstance(ctx, inst.Id) require.NoError(t, err) + dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) + require.NoError(t, err) + execCtx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() @@ -204,7 +208,7 @@ func TestNVIDIAModuleLoading(t *testing.T) { for i := 0; i < 10; i++ { stdout = outputBuffer{} stderr = outputBuffer{} - _, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", dmesgCmd}, Stdin: nil, Stdout: &stdout, @@ -234,7 +238,7 @@ func TestNVIDIAModuleLoading(t *testing.T) { // Check lsmod for nvidia modules stdout = outputBuffer{} stderr = outputBuffer{} - _, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "cat /proc/modules | grep nvidia || echo 'No nvidia modules loaded'"}, Stdin: nil, Stdout: &stdout, @@ -254,7 +258,7 @@ func TestNVIDIAModuleLoading(t *testing.T) { // Check for /dev/nvidia* devices stdout = outputBuffer{} stderr = outputBuffer{} - _, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "ls -la /dev/nvidia* 2>&1 || echo 'No nvidia devices found'"}, Stdin: nil, 
Stdout: &stdout, @@ -430,13 +434,16 @@ func TestNVMLDetection(t *testing.T) { actualInst, err := instanceMgr.GetInstance(ctx, inst.Id) require.NoError(t, err) + dialer2, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) + require.NoError(t, err) + // Step 5: Run NVML test t.Log("Step 5: Running NVML detection test...") execCtx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() var stdout, stderr outputBuffer - _, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(execCtx, dialer2, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "python3 /usr/local/bin/test-nvml.py 2>&1"}, Stdin: nil, Stdout: &stdout, @@ -469,7 +476,7 @@ func TestNVMLDetection(t *testing.T) { t.Log("Step 6: Running CUDA driver test...") stdout = outputBuffer{} stderr = outputBuffer{} - _, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(execCtx, dialer2, exec.ExecOptions{ Command: []string{"/bin/sh", "-c", "python3 /usr/local/bin/test-cuda.py 2>&1"}, Stdin: nil, Stdout: &stdout, diff --git a/lib/exec/README.md b/lib/exec/README.md index 5de81187..5d041c04 100644 --- a/lib/exec/README.md +++ b/lib/exec/README.md @@ -11,7 +11,7 @@ API Server (/instances/{id}/exec) ↓ lib/exec/client.go (ExecIntoInstance) ↓ -Cloud Hypervisor vsock socket +Hypervisor vsock (CH: Unix socket, QEMU: AF_VSOCK) ↓ Guest: exec-agent (lib/system/exec_agent) ↓ @@ -37,14 +37,13 @@ Container (chroot /overlay/newroot) "timeout": 30 // optional: timeout in seconds } ``` -- Calls `exec.ExecIntoInstance()` with the instance's vsock socket path +- Creates a `VsockDialer` for the instance's hypervisor type and calls `exec.ExecIntoInstance()` - Logs audit trail: JWT subject, instance ID, command, start/end time, exit code ### 2. 
Client (`lib/exec/client.go`) -- **ExecIntoInstance()**: Main client function -- Connects to Cloud Hypervisor's vsock Unix socket -- Performs vsock handshake: `CONNECT 2222\n` → `OK ` +- **ExecIntoInstance()**: Main client function, takes a `VsockDialer` interface +- Uses hypervisor-specific dialer to connect to guest (see `lib/hypervisor/*/vsock.go`) - Creates gRPC client over the vsock connection (pooled per VM for efficiency) - Streams stdin/stdout/stderr bidirectionally - Returns exit status when command completes diff --git a/lib/exec/client.go b/lib/exec/client.go index a32e6338..32497fb8 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -1,31 +1,27 @@ package exec import ( - "bufio" "context" "fmt" "io" "log/slog" "net" - "strings" "sync" "sync/atomic" "time" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" + + "github.com/onkernel/hypeman/lib/hypervisor" ) const ( - // vsockDialTimeout is the timeout for connecting to the vsock Unix socket - vsockDialTimeout = 5 * time.Second - // vsockHandshakeTimeout is the timeout for the Cloud Hypervisor vsock handshake - vsockHandshakeTimeout = 5 * time.Second // vsockGuestPort is the port the exec-agent listens on inside the guest vsockGuestPort = 2222 ) -// connPool manages reusable gRPC connections per vsock socket path +// connPool manages reusable gRPC connections per vsock dialer key // This avoids the overhead and potential issues of rapidly creating/closing connections var connPool = struct { sync.RWMutex @@ -35,10 +31,12 @@ var connPool = struct { } // getOrCreateConn returns an existing connection or creates a new one -func getOrCreateConn(ctx context.Context, vsockSocketPath string) (*grpc.ClientConn, error) { +func getOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc.ClientConn, error) { + key := dialer.Key() + // Try read lock first for existing connection connPool.RLock() - if conn, ok := connPool.conns[vsockSocketPath]; ok { + if conn, ok := 
connPool.conns[key]; ok { connPool.RUnlock() return conn, nil } @@ -49,14 +47,14 @@ func getOrCreateConn(ctx context.Context, vsockSocketPath string) (*grpc.ClientC defer connPool.Unlock() // Double-check after acquiring write lock - if conn, ok := connPool.conns[vsockSocketPath]; ok { + if conn, ok := connPool.conns[key]; ok { return conn, nil } // Create new connection conn, err := grpc.Dial("passthrough:///vsock", grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { - return dialVsock(ctx, vsockSocketPath) + return dialer.DialVsock(ctx, vsockGuestPort) }), grpc.WithTransportCredentials(insecure.NewCredentials()), ) @@ -64,20 +62,20 @@ func getOrCreateConn(ctx context.Context, vsockSocketPath string) (*grpc.ClientC return nil, fmt.Errorf("create grpc connection: %w", err) } - connPool.conns[vsockSocketPath] = conn - slog.Debug("created new gRPC connection", "socket", vsockSocketPath) + connPool.conns[key] = conn + slog.Debug("created new gRPC connection", "key", key) return conn, nil } // CloseConn closes and removes a connection from the pool (call when VM is deleted) -func CloseConn(vsockSocketPath string) { +func CloseConn(dialerKey string) { connPool.Lock() defer connPool.Unlock() - if conn, ok := connPool.conns[vsockSocketPath]; ok { + if conn, ok := connPool.conns[dialerKey]; ok { conn.Close() - delete(connPool.conns, vsockSocketPath) - slog.Debug("closed gRPC connection", "socket", vsockSocketPath) + delete(connPool.conns, dialerKey) + slog.Debug("closed gRPC connection", "key", dialerKey) } } @@ -98,26 +96,15 @@ type ExecOptions struct { Timeout int32 // Execution timeout in seconds (0 = no timeout) } -// bufferedConn wraps a net.Conn with a bufio.Reader to ensure any buffered -// data from the handshake is properly drained before reading from the connection -type bufferedConn struct { - net.Conn - reader *bufio.Reader -} - -func (c *bufferedConn) Read(p []byte) (int, error) { - return c.reader.Read(p) -} - -// 
ExecIntoInstance executes command in instance via vsock using gRPC -// vsockSocketPath is the Unix socket created by Cloud Hypervisor (e.g., /var/lib/hypeman/guests/{id}/vsock.sock) -func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOptions) (*ExitStatus, error) { +// ExecIntoInstance executes command in instance via vsock using gRPC. +// The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest. +func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) { start := time.Now() var bytesSent int64 - // Get or create a reusable gRPC connection for this vsock socket + // Get or create a reusable gRPC connection for this vsock dialer // Connection pooling avoids issues with rapid connect/disconnect cycles - grpcConn, err := getOrCreateConn(ctx, vsockSocketPath) + grpcConn, err := getOrCreateConn(ctx, dialer) if err != nil { return nil, fmt.Errorf("get grpc connection: %w", err) } @@ -200,64 +187,3 @@ func ExecIntoInstance(ctx context.Context, vsockSocketPath string, opts ExecOpti } } } - -// dialVsock connects to Cloud Hypervisor's vsock Unix socket and performs the handshake -func dialVsock(ctx context.Context, vsockSocketPath string) (net.Conn, error) { - slog.DebugContext(ctx, "connecting to vsock", "socket", vsockSocketPath) - - // Use dial timeout, respecting context deadline if shorter - dialTimeout := vsockDialTimeout - if deadline, ok := ctx.Deadline(); ok { - if remaining := time.Until(deadline); remaining < dialTimeout { - dialTimeout = remaining - } - } - - // Connect to CH's Unix socket with timeout - dialer := net.Dialer{Timeout: dialTimeout} - conn, err := dialer.DialContext(ctx, "unix", vsockSocketPath) - if err != nil { - return nil, fmt.Errorf("dial vsock socket %s: %w", vsockSocketPath, err) - } - - slog.DebugContext(ctx, "connected to vsock socket, performing handshake", "port", vsockGuestPort) - - // Set deadline for handshake - if err := 
conn.SetDeadline(time.Now().Add(vsockHandshakeTimeout)); err != nil { - conn.Close() - return nil, fmt.Errorf("set handshake deadline: %w", err) - } - - // Perform Cloud Hypervisor vsock handshake - handshakeCmd := fmt.Sprintf("CONNECT %d\n", vsockGuestPort) - if _, err := conn.Write([]byte(handshakeCmd)); err != nil { - conn.Close() - return nil, fmt.Errorf("send vsock handshake: %w", err) - } - - // Read handshake response - reader := bufio.NewReader(conn) - response, err := reader.ReadString('\n') - if err != nil { - conn.Close() - return nil, fmt.Errorf("read vsock handshake response (is exec-agent running in guest?): %w", err) - } - - // Clear deadline after successful handshake - if err := conn.SetDeadline(time.Time{}); err != nil { - conn.Close() - return nil, fmt.Errorf("clear deadline: %w", err) - } - - response = strings.TrimSpace(response) - if !strings.HasPrefix(response, "OK ") { - conn.Close() - return nil, fmt.Errorf("vsock handshake failed: %s", response) - } - - slog.DebugContext(ctx, "vsock handshake successful", "response", response) - - // Return wrapped connection that uses the bufio.Reader - // This ensures any bytes buffered during handshake are not lost - return &bufferedConn{Conn: conn, reader: reader}, nil -} diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index effcc596..4410ff43 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -27,6 +27,9 @@ func New(socketPath string) (*CloudHypervisor, error) { }, nil } +// Verify CloudHypervisor implements the interface +var _ hypervisor.Hypervisor = (*CloudHypervisor)(nil) + // Capabilities returns the features supported by Cloud Hypervisor. 
func (c *CloudHypervisor) Capabilities() hypervisor.Capabilities { return hypervisor.Capabilities{ @@ -38,31 +41,6 @@ func (c *CloudHypervisor) Capabilities() hypervisor.Capabilities { } } -// CreateVM configures the VM in Cloud Hypervisor. -func (c *CloudHypervisor) CreateVM(ctx context.Context, config hypervisor.VMConfig) error { - vmConfig := ToVMConfig(config) - resp, err := c.client.CreateVMWithResponse(ctx, vmConfig) - if err != nil { - return fmt.Errorf("create vm: %w", err) - } - if resp.StatusCode() != 204 { - return fmt.Errorf("create vm failed with status %d: %s", resp.StatusCode(), string(resp.Body)) - } - return nil -} - -// BootVM starts the configured VM. -func (c *CloudHypervisor) BootVM(ctx context.Context) error { - resp, err := c.client.BootVMWithResponse(ctx) - if err != nil { - return fmt.Errorf("boot vm: %w", err) - } - if resp.StatusCode() != 204 { - return fmt.Errorf("boot vm failed with status %d: %s", resp.StatusCode(), string(resp.Body)) - } - return nil -} - // DeleteVM removes the VM configuration from Cloud Hypervisor. func (c *CloudHypervisor) DeleteVM(ctx context.Context) error { resp, err := c.client.DeleteVMWithResponse(ctx) @@ -157,23 +135,6 @@ func (c *CloudHypervisor) Snapshot(ctx context.Context, destPath string) error { return nil } -// Restore loads a VM from snapshot. -func (c *CloudHypervisor) Restore(ctx context.Context, sourcePath string) error { - sourceURL := "file://" + sourcePath - restoreConfig := vmm.RestoreConfig{ - SourceUrl: sourceURL, - Prefault: ptr(false), - } - resp, err := c.client.PutVmRestoreWithResponse(ctx, restoreConfig) - if err != nil { - return fmt.Errorf("restore: %w", err) - } - if resp.StatusCode() != 204 { - return fmt.Errorf("restore failed with status %d", resp.StatusCode()) - } - return nil -} - // ResizeMemory changes the VM's memory allocation. 
func (c *CloudHypervisor) ResizeMemory(ctx context.Context, bytes int64) error { resizeConfig := vmm.VmResize{DesiredRam: &bytes} @@ -239,7 +200,3 @@ func (c *CloudHypervisor) ResizeMemoryAndWait(ctx context.Context, bytes int64, // Timeout reached, but resize was requested successfully return nil } - -func ptr[T any](v T) *T { - return &v -} diff --git a/lib/hypervisor/cloudhypervisor/process.go b/lib/hypervisor/cloudhypervisor/process.go index 705c91c2..230a9b16 100644 --- a/lib/hypervisor/cloudhypervisor/process.go +++ b/lib/hypervisor/cloudhypervisor/process.go @@ -13,45 +13,114 @@ func init() { hypervisor.RegisterSocketName(hypervisor.TypeCloudHypervisor, "ch.sock") } -// ProcessManager implements hypervisor.ProcessManager for Cloud Hypervisor. -type ProcessManager struct{} +// Starter implements hypervisor.VMStarter for Cloud Hypervisor. +type Starter struct{} -// NewProcessManager creates a new Cloud Hypervisor process manager. -func NewProcessManager() *ProcessManager { - return &ProcessManager{} +// NewStarter creates a new Cloud Hypervisor starter. +func NewStarter() *Starter { + return &Starter{} } -// Verify ProcessManager implements the interface -var _ hypervisor.ProcessManager = (*ProcessManager)(nil) +// Verify Starter implements the interface +var _ hypervisor.VMStarter = (*Starter)(nil) // SocketName returns the socket filename for Cloud Hypervisor. -func (p *ProcessManager) SocketName() string { +func (s *Starter) SocketName() string { return "ch.sock" } -// StartProcess launches a Cloud Hypervisor VMM process. -func (p *ProcessManager) StartProcess(ctx context.Context, paths *paths.Paths, version string, socketPath string) (int, error) { +// GetBinaryPath returns the path to the Cloud Hypervisor binary. 
+func (s *Starter) GetBinaryPath(p *paths.Paths, version string) (string, error) { chVersion := vmm.CHVersion(version) if !vmm.IsVersionSupported(chVersion) { - return 0, fmt.Errorf("unsupported cloud-hypervisor version: %s", version) + return "", fmt.Errorf("unsupported cloud-hypervisor version: %s", version) } - return vmm.StartProcess(ctx, paths, chVersion, socketPath) + return vmm.GetBinaryPath(p, chVersion) } -// StartProcessWithArgs launches a Cloud Hypervisor VMM process with extra arguments. -func (p *ProcessManager) StartProcessWithArgs(ctx context.Context, paths *paths.Paths, version string, socketPath string, extraArgs []string) (int, error) { +// StartVM launches Cloud Hypervisor, configures the VM, and boots it. +// Returns the process ID and a Hypervisor client for subsequent operations. +func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { + // Validate version chVersion := vmm.CHVersion(version) if !vmm.IsVersionSupported(chVersion) { - return 0, fmt.Errorf("unsupported cloud-hypervisor version: %s", version) + return 0, nil, fmt.Errorf("unsupported cloud-hypervisor version: %s", version) + } + + // 1. Start the Cloud Hypervisor process + pid, err := vmm.StartProcess(ctx, p, chVersion, socketPath) + if err != nil { + return 0, nil, fmt.Errorf("start process: %w", err) } - return vmm.StartProcessWithArgs(ctx, paths, chVersion, socketPath, extraArgs) + + // 2. Create the HTTP client + hv, err := New(socketPath) + if err != nil { + return 0, nil, fmt.Errorf("create client: %w", err) + } + + // 3. Configure the VM via HTTP API + vmConfig := ToVMConfig(config) + resp, err := hv.client.CreateVMWithResponse(ctx, vmConfig) + if err != nil { + return 0, nil, fmt.Errorf("create vm: %w", err) + } + if resp.StatusCode() != 204 { + return 0, nil, fmt.Errorf("create vm failed with status %d: %s", resp.StatusCode(), string(resp.Body)) + } + + // 4. 
Boot the VM via HTTP API + bootResp, err := hv.client.BootVMWithResponse(ctx) + if err != nil { + return 0, nil, fmt.Errorf("boot vm: %w", err) + } + if bootResp.StatusCode() != 204 { + return 0, nil, fmt.Errorf("boot vm failed with status %d: %s", bootResp.StatusCode(), string(bootResp.Body)) + } + + return pid, hv, nil } -// GetBinaryPath returns the path to the Cloud Hypervisor binary. -func (p *ProcessManager) GetBinaryPath(paths *paths.Paths, version string) (string, error) { +// RestoreVM starts Cloud Hypervisor and restores VM state from a snapshot. +// The VM is in paused state after restore; caller should call Resume() to continue execution. +func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { + // Validate version chVersion := vmm.CHVersion(version) if !vmm.IsVersionSupported(chVersion) { - return "", fmt.Errorf("unsupported cloud-hypervisor version: %s", version) + return 0, nil, fmt.Errorf("unsupported cloud-hypervisor version: %s", version) + } + + // 1. Start the Cloud Hypervisor process + pid, err := vmm.StartProcess(ctx, p, chVersion, socketPath) + if err != nil { + return 0, nil, fmt.Errorf("start process: %w", err) } - return vmm.GetBinaryPath(paths, chVersion) + + // 2. Create the HTTP client + hv, err := New(socketPath) + if err != nil { + return 0, nil, fmt.Errorf("create client: %w", err) + } + + // 3. 
Restore from snapshot via HTTP API + sourceURL := "file://" + snapshotPath + restoreConfig := vmm.RestoreConfig{ + SourceUrl: sourceURL, + Prefault: ptr(false), + } + resp, err := hv.client.PutVmRestoreWithResponse(ctx, restoreConfig) + if err != nil { + hv.Shutdown(ctx) // Cleanup on failure + return 0, nil, fmt.Errorf("restore: %w", err) + } + if resp.StatusCode() != 204 { + hv.Shutdown(ctx) // Cleanup on failure + return 0, nil, fmt.Errorf("restore failed with status %d: %s", resp.StatusCode(), string(resp.Body)) + } + + return pid, hv, nil +} + +func ptr[T any](v T) *T { + return &v } diff --git a/lib/hypervisor/cloudhypervisor/vsock.go b/lib/hypervisor/cloudhypervisor/vsock.go new file mode 100644 index 00000000..a29b04b2 --- /dev/null +++ b/lib/hypervisor/cloudhypervisor/vsock.go @@ -0,0 +1,118 @@ +package cloudhypervisor + +import ( + "bufio" + "context" + "fmt" + "log/slog" + "net" + "strings" + "time" + + "github.com/onkernel/hypeman/lib/hypervisor" +) + +const ( + // vsockDialTimeout is the timeout for connecting to the vsock Unix socket + vsockDialTimeout = 5 * time.Second + // vsockHandshakeTimeout is the timeout for the Cloud Hypervisor vsock handshake + vsockHandshakeTimeout = 5 * time.Second +) + +func init() { + hypervisor.RegisterVsockDialerFactory(hypervisor.TypeCloudHypervisor, NewVsockDialer) +} + +// VsockDialer implements hypervisor.VsockDialer for Cloud Hypervisor. +// Cloud Hypervisor exposes vsock through a Unix socket file with a text-based +// handshake protocol (CONNECT {port}\n / OK ...). +type VsockDialer struct { + socketPath string +} + +// NewVsockDialer creates a new VsockDialer for Cloud Hypervisor. +// The vsockSocket parameter is the path to the Unix socket file. +// The vsockCID parameter is unused for Cloud Hypervisor (it uses socket path instead). 
+func NewVsockDialer(vsockSocket string, vsockCID int64) hypervisor.VsockDialer { + return &VsockDialer{ + socketPath: vsockSocket, + } +} + +// Key returns a unique identifier for this dialer, used for connection pooling. +func (d *VsockDialer) Key() string { + return "ch:" + d.socketPath +} + +// DialVsock connects to the guest on the specified port. +// It connects to the Cloud Hypervisor Unix socket and performs the handshake protocol. +func (d *VsockDialer) DialVsock(ctx context.Context, port int) (net.Conn, error) { + slog.DebugContext(ctx, "connecting to vsock", "socket", d.socketPath, "port", port) + + // Use dial timeout, respecting context deadline if shorter + dialTimeout := vsockDialTimeout + if deadline, ok := ctx.Deadline(); ok { + if remaining := time.Until(deadline); remaining < dialTimeout { + dialTimeout = remaining + } + } + + // Connect to CH's Unix socket with timeout + dialer := net.Dialer{Timeout: dialTimeout} + conn, err := dialer.DialContext(ctx, "unix", d.socketPath) + if err != nil { + return nil, fmt.Errorf("dial vsock socket %s: %w", d.socketPath, err) + } + + slog.DebugContext(ctx, "connected to vsock socket, performing handshake", "port", port) + + // Set deadline for handshake + if err := conn.SetDeadline(time.Now().Add(vsockHandshakeTimeout)); err != nil { + conn.Close() + return nil, fmt.Errorf("set handshake deadline: %w", err) + } + + // Perform Cloud Hypervisor vsock handshake + handshakeCmd := fmt.Sprintf("CONNECT %d\n", port) + if _, err := conn.Write([]byte(handshakeCmd)); err != nil { + conn.Close() + return nil, fmt.Errorf("send vsock handshake: %w", err) + } + + // Read handshake response + reader := bufio.NewReader(conn) + response, err := reader.ReadString('\n') + if err != nil { + conn.Close() + return nil, fmt.Errorf("read vsock handshake response (is exec-agent running in guest?): %w", err) + } + + // Clear deadline after successful handshake + if err := conn.SetDeadline(time.Time{}); err != nil { + conn.Close() + 
return nil, fmt.Errorf("clear deadline: %w", err) + } + + response = strings.TrimSpace(response) + if !strings.HasPrefix(response, "OK ") { + conn.Close() + return nil, fmt.Errorf("vsock handshake failed: %s", response) + } + + slog.DebugContext(ctx, "vsock handshake successful", "response", response) + + // Return wrapped connection that uses the bufio.Reader + // This ensures any bytes buffered during handshake are not lost + return &bufferedConn{Conn: conn, reader: reader}, nil +} + +// bufferedConn wraps a net.Conn with a bufio.Reader to ensure any buffered +// data from the handshake is properly drained before reading from the connection +type bufferedConn struct { + net.Conn + reader *bufio.Reader +} + +func (c *bufferedConn) Read(p []byte) (int, error) { + return c.reader.Read(p) +} diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index e6ae7c35..31ea5e19 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -5,6 +5,8 @@ package hypervisor import ( "context" + "fmt" + "net" "time" "github.com/onkernel/hypeman/lib/paths" @@ -39,19 +41,34 @@ func SocketNameForType(t Type) string { return string(t) + ".sock" } -// Hypervisor defines the interface for VM management operations. -// All hypervisor implementations must implement this interface. -type Hypervisor interface { - // CreateVM configures the VM with the given configuration. - // The VM is not started yet after this call. - CreateVM(ctx context.Context, config VMConfig) error +// VMStarter handles the full VM startup sequence. +// Each hypervisor implements its own startup flow: +// - Cloud Hypervisor: starts process, configures via HTTP API, boots via HTTP API +// - QEMU: converts config to command-line args, starts process (VM runs immediately) +type VMStarter interface { + // SocketName returns the socket filename for this hypervisor. + // Uses short names to stay within Unix socket path length limits (SUN_LEN ~108 bytes). 
+ SocketName() string + + // GetBinaryPath returns the path to the hypervisor binary, extracting if needed. + GetBinaryPath(p *paths.Paths, version string) (string, error) + + // StartVM launches the hypervisor process and boots the VM. + // Returns the process ID and a Hypervisor client for subsequent operations. + StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config VMConfig) (pid int, hv Hypervisor, err error) - // BootVM starts the configured VM. - // Must be called after CreateVM. - BootVM(ctx context.Context) error + // RestoreVM starts the hypervisor and restores VM state from a snapshot. + // Each hypervisor implements its own restore flow: + // - Cloud Hypervisor: starts process, calls Restore API + // - QEMU: would start with -incoming or -loadvm flags (not yet implemented) + // Returns the process ID and a Hypervisor client. The VM is in paused state after restore. + RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (pid int, hv Hypervisor, err error) +} - // DeleteVM removes the VM configuration. - // The VMM process may still be running after this call. +// Hypervisor defines the interface for VM control operations. +// A Hypervisor client is returned by VMStarter.StartVM after the VM is running. +type Hypervisor interface { + // DeleteVM sends a graceful shutdown signal to the guest. DeleteVM(ctx context.Context) error // Shutdown stops the VMM process gracefully. @@ -72,10 +89,6 @@ type Hypervisor interface { // Check Capabilities().SupportsSnapshot before calling. Snapshot(ctx context.Context, destPath string) error - // Restore loads a VM from a snapshot at the given path. - // Check Capabilities().SupportsSnapshot before calling. - Restore(ctx context.Context, sourcePath string) error - // ResizeMemory changes the VM's memory allocation. // Check Capabilities().SupportsHotplugMemory before calling. 
ResizeMemory(ctx context.Context, bytes int64) error @@ -108,21 +121,38 @@ type Capabilities struct { SupportsGPUPassthrough bool } -// ProcessManager handles hypervisor process lifecycle. -// This is separate from the Hypervisor interface because process management -// happens before/after the VMM socket is available. -type ProcessManager interface { - // SocketName returns the socket filename for this hypervisor. - // Uses short names to stay within Unix socket path length limits (SUN_LEN ~108 bytes). - SocketName() string +// VsockDialer provides vsock connectivity to a guest VM. +// Each hypervisor implements its own connection method: +// - Cloud Hypervisor: Unix socket file + text handshake protocol +// - QEMU: Kernel AF_VSOCK with CID-based addressing +type VsockDialer interface { + // DialVsock connects to the guest on the specified port. + // Returns a net.Conn that can be used for bidirectional communication. + DialVsock(ctx context.Context, port int) (net.Conn, error) + + // Key returns a unique identifier for this dialer, used for connection pooling. + Key() string +} - // StartProcess launches the hypervisor process. - // Returns the process ID of the started hypervisor. - StartProcess(ctx context.Context, p *paths.Paths, version string, socketPath string) (pid int, err error) +// VsockDialerFactory creates VsockDialer instances for a hypervisor type. +type VsockDialerFactory func(vsockSocket string, vsockCID int64) VsockDialer - // StartProcessWithArgs launches the hypervisor process with extra arguments. - StartProcessWithArgs(ctx context.Context, p *paths.Paths, version string, socketPath string, extraArgs []string) (pid int, err error) +// vsockDialerFactories maps hypervisor types to their dialer factories. +// Registered by each hypervisor package's init() function. +var vsockDialerFactories = make(map[Type]VsockDialerFactory) - // GetBinaryPath returns the path to the hypervisor binary, extracting if needed. 
- GetBinaryPath(p *paths.Paths, version string) (string, error) +// RegisterVsockDialerFactory registers a VsockDialer factory for a hypervisor type. +// Called by each hypervisor implementation's init() function. +func RegisterVsockDialerFactory(t Type, factory VsockDialerFactory) { + vsockDialerFactories[t] = factory +} + +// NewVsockDialer creates a VsockDialer for the given hypervisor type. +// Returns an error if the hypervisor type doesn't have a registered factory. +func NewVsockDialer(hvType Type, vsockSocket string, vsockCID int64) (VsockDialer, error) { + factory, ok := vsockDialerFactories[hvType] + if !ok { + return nil, fmt.Errorf("no vsock dialer registered for hypervisor type: %s", hvType) + } + return factory(vsockSocket, vsockCID), nil } diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index 5fa1e9a3..a11ac622 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -20,49 +20,72 @@ func init() { hypervisor.RegisterSocketName(hypervisor.TypeQEMU, "qemu.sock") } -// ProcessManager implements hypervisor.ProcessManager for QEMU. -type ProcessManager struct{} +// Starter implements hypervisor.VMStarter for QEMU. +type Starter struct{} -// NewProcessManager creates a new QEMU process manager. -func NewProcessManager() *ProcessManager { - return &ProcessManager{} +// NewStarter creates a new QEMU starter. +func NewStarter() *Starter { + return &Starter{} } -// Verify ProcessManager implements the interface -var _ hypervisor.ProcessManager = (*ProcessManager)(nil) +// Verify Starter implements the interface +var _ hypervisor.VMStarter = (*Starter)(nil) // SocketName returns the socket filename for QEMU. -func (p *ProcessManager) SocketName() string { +func (s *Starter) SocketName() string { return "qemu.sock" } -// StartProcess launches a QEMU VMM process. 
-func (p *ProcessManager) StartProcess(ctx context.Context, paths *paths.Paths, version string, socketPath string) (int, error) { - return p.StartProcessWithArgs(ctx, paths, version, socketPath, nil) +// GetBinaryPath returns the path to the QEMU binary. +// QEMU is expected to be installed on the system. +func (s *Starter) GetBinaryPath(p *paths.Paths, version string) (string, error) { + binaryName, err := qemuBinaryName() + if err != nil { + return "", err + } + + candidates := []string{ + "/usr/bin/" + binaryName, + "/usr/local/bin/" + binaryName, + } + + for _, path := range candidates { + if _, err := os.Stat(path); err == nil { + return path, nil + } + } + + if path, err := exec.LookPath(binaryName); err == nil { + return path, nil + } + + return "", fmt.Errorf("%s not found; install with: %s", binaryName, qemuInstallHint()) } -// StartProcessWithArgs launches a QEMU VMM process with extra arguments. -func (p *ProcessManager) StartProcessWithArgs(ctx context.Context, paths *paths.Paths, version string, socketPath string, extraArgs []string) (int, error) { +// StartVM launches QEMU with the VM configuration and returns a Hypervisor client. +// QEMU receives all configuration via command-line arguments at process start. 
+func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { // Get binary path - binaryPath, err := p.GetBinaryPath(paths, version) + binaryPath, err := s.GetBinaryPath(p, version) if err != nil { - return 0, fmt.Errorf("get binary: %w", err) + return 0, nil, fmt.Errorf("get binary: %w", err) } // Check if socket is already in use if isSocketInUse(socketPath) { - return 0, fmt.Errorf("socket already in use, QEMU may be running at %s", socketPath) + return 0, nil, fmt.Errorf("socket already in use, QEMU may be running at %s", socketPath) } // Remove stale socket if exists os.Remove(socketPath) - // Build base command arguments for QMP socket + // Build command arguments: QMP socket + VM configuration args := []string{ "-chardev", fmt.Sprintf("socket,id=qmp,path=%s,server=on,wait=off", socketPath), "-mon", "chardev=qmp,mode=control", } - args = append(args, extraArgs...) + // Append VM configuration as command-line arguments + args = append(args, BuildArgs(config)...) // Create command cmd := exec.Command(binaryPath, args...) @@ -76,7 +99,7 @@ func (p *ProcessManager) StartProcessWithArgs(ctx context.Context, paths *paths. instanceDir := filepath.Dir(socketPath) logsDir := filepath.Join(instanceDir, "logs") if err := os.MkdirAll(logsDir, 0755); err != nil { - return 0, fmt.Errorf("create logs directory: %w", err) + return 0, nil, fmt.Errorf("create logs directory: %w", err) } vmmLogFile, err := os.OpenFile( @@ -85,7 +108,7 @@ func (p *ProcessManager) StartProcessWithArgs(ctx context.Context, paths *paths. 0644, ) if err != nil { - return 0, fmt.Errorf("create vmm log: %w", err) + return 0, nil, fmt.Errorf("create vmm log: %w", err) } defer vmmLogFile.Close() @@ -93,51 +116,33 @@ func (p *ProcessManager) StartProcessWithArgs(ctx context.Context, paths *paths. 
cmd.Stderr = vmmLogFile if err := cmd.Start(); err != nil { - return 0, fmt.Errorf("start qemu: %w", err) + return 0, nil, fmt.Errorf("start qemu: %w", err) } pid := cmd.Process.Pid // Wait for socket to be ready if err := waitForSocket(socketPath, 10*time.Second); err != nil { - // Read vmm.log to understand why socket wasn't created vmmLogPath := filepath.Join(logsDir, "vmm.log") if logData, readErr := os.ReadFile(vmmLogPath); readErr == nil && len(logData) > 0 { - return 0, fmt.Errorf("%w; vmm.log: %s", err, string(logData)) + return 0, nil, fmt.Errorf("%w; vmm.log: %s", err, string(logData)) } - return 0, err + return 0, nil, err } - return pid, nil -} - -// GetBinaryPath returns the path to the QEMU binary. -// QEMU is expected to be installed on the system. -func (p *ProcessManager) GetBinaryPath(paths *paths.Paths, version string) (string, error) { - // Determine binary name based on host architecture - binaryName, err := qemuBinaryName() + // Create QMP client + hv, err := New(socketPath) if err != nil { - return "", err - } - - // Look for system-installed QEMU - candidates := []string{ - "/usr/bin/" + binaryName, - "/usr/local/bin/" + binaryName, + return 0, nil, fmt.Errorf("create client: %w", err) } - for _, path := range candidates { - if _, err := os.Stat(path); err == nil { - return path, nil - } - } - - // Try PATH lookup - if path, err := exec.LookPath(binaryName); err == nil { - return path, nil - } + return pid, hv, nil +} - return "", fmt.Errorf("%s not found; install with: %s", binaryName, qemuInstallHint()) +// RestoreVM starts QEMU and restores VM state from a snapshot. +// Not yet implemented for QEMU. +func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { + return 0, nil, fmt.Errorf("restore not supported by QEMU implementation") } // qemuBinaryName returns the QEMU binary name for the host architecture. 
diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index 2ea35e56..f313ac37 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -37,24 +37,6 @@ func (q *QEMU) Capabilities() hypervisor.Capabilities { } } -// CreateVM configures the VM in QEMU. -// For QEMU, the VM is configured via command-line args when the process starts, -// so this is a no-op. The configuration is applied in StartProcess. -func (q *QEMU) CreateVM(ctx context.Context, config hypervisor.VMConfig) error { - // QEMU doesn't have a separate create step - configuration is done at process start - // This is a no-op for QEMU - return nil -} - -// BootVM starts the configured VM. -// For QEMU, the VM starts automatically when the process starts, -// so this is a no-op. -func (q *QEMU) BootVM(ctx context.Context) error { - // QEMU starts running immediately when the process starts - // This is a no-op for QEMU - return nil -} - // DeleteVM removes the VM configuration from QEMU. // This sends a graceful shutdown signal to the guest. func (q *QEMU) DeleteVM(ctx context.Context) error { @@ -117,12 +99,6 @@ func (q *QEMU) Snapshot(ctx context.Context, destPath string) error { return fmt.Errorf("snapshot not supported by QEMU implementation") } -// Restore loads a VM from snapshot. -// Not implemented in first pass. -func (q *QEMU) Restore(ctx context.Context, sourcePath string) error { - return fmt.Errorf("restore not supported by QEMU implementation") -} - // ResizeMemory changes the VM's memory allocation. // Not implemented in first pass. 
func (q *QEMU) ResizeMemory(ctx context.Context, bytes int64) error { diff --git a/lib/hypervisor/qemu/vsock.go b/lib/hypervisor/qemu/vsock.go new file mode 100644 index 00000000..58ac23aa --- /dev/null +++ b/lib/hypervisor/qemu/vsock.go @@ -0,0 +1,235 @@ +package qemu + +import ( + "context" + "fmt" + "log/slog" + "net" + "time" + + "golang.org/x/sys/unix" + + "github.com/onkernel/hypeman/lib/hypervisor" +) + +const ( + // vsockDialTimeout is the timeout for connecting via AF_VSOCK + vsockDialTimeout = 5 * time.Second +) + +func init() { + hypervisor.RegisterVsockDialerFactory(hypervisor.TypeQEMU, NewVsockDialer) +} + +// VsockDialer implements hypervisor.VsockDialer for QEMU. +// QEMU with vhost-vsock-pci uses the kernel's native AF_VSOCK socket family. +// Connections are made using the guest's CID (Context ID) and port number. +type VsockDialer struct { + cid uint32 +} + +// NewVsockDialer creates a new VsockDialer for QEMU. +// The vsockSocket parameter is unused for QEMU (it uses CID instead). +// The vsockCID is the guest's Context ID assigned via vhost-vsock-pci. +func NewVsockDialer(vsockSocket string, vsockCID int64) hypervisor.VsockDialer { + return &VsockDialer{ + cid: uint32(vsockCID), + } +} + +// Key returns a unique identifier for this dialer, used for connection pooling. +func (d *VsockDialer) Key() string { + return fmt.Sprintf("qemu:%d", d.cid) +} + +// DialVsock connects to the guest on the specified port using AF_VSOCK. +// This uses the kernel's vsock infrastructure with the guest's CID. 
+func (d *VsockDialer) DialVsock(ctx context.Context, port int) (net.Conn, error) { + slog.DebugContext(ctx, "connecting to vsock via AF_VSOCK", "cid", d.cid, "port", port) + + // Create AF_VSOCK socket + fd, err := unix.Socket(unix.AF_VSOCK, unix.SOCK_STREAM, 0) + if err != nil { + return nil, fmt.Errorf("create vsock socket: %w", err) + } + + // Set up the sockaddr for the guest + sockaddr := &unix.SockaddrVM{ + CID: d.cid, + Port: uint32(port), + } + + // Use context deadline or default timeout + dialTimeout := vsockDialTimeout + if deadline, ok := ctx.Deadline(); ok { + if remaining := time.Until(deadline); remaining < dialTimeout { + dialTimeout = remaining + } + } + + // Set socket to non-blocking for timeout support + if err := unix.SetNonblock(fd, true); err != nil { + unix.Close(fd) + return nil, fmt.Errorf("set non-blocking: %w", err) + } + + // Attempt to connect + err = unix.Connect(fd, sockaddr) + if err != nil { + if err != unix.EINPROGRESS { + unix.Close(fd) + return nil, fmt.Errorf("connect to vsock cid=%d port=%d: %w", d.cid, port, err) + } + + // Wait for connection to complete using poll + deadline := time.Now().Add(dialTimeout) + for { + remaining := time.Until(deadline) + if remaining <= 0 { + unix.Close(fd) + return nil, fmt.Errorf("connect to vsock cid=%d port=%d: timeout after %v", d.cid, port, dialTimeout) + } + + // Poll for write readiness (indicates connection complete) + pollFds := []unix.PollFd{{ + Fd: int32(fd), + Events: unix.POLLOUT, + }} + + timeoutMs := int(remaining.Milliseconds()) + if timeoutMs < 1 { + timeoutMs = 1 + } + + n, err := unix.Poll(pollFds, timeoutMs) + if err != nil { + if err == unix.EINTR { + continue // Interrupted, retry + } + unix.Close(fd) + return nil, fmt.Errorf("poll vsock: %w", err) + } + + if n > 0 { + // Check for connection errors + errno, err := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_ERROR) + if err != nil { + unix.Close(fd) + return nil, fmt.Errorf("getsockopt: %w", err) + } + if errno != 0 
{ + unix.Close(fd) + return nil, fmt.Errorf("connect to vsock cid=%d port=%d: %w", d.cid, port, unix.Errno(errno)) + } + break // Connection successful + } + } + } + + // Set back to blocking mode for normal I/O + if err := unix.SetNonblock(fd, false); err != nil { + unix.Close(fd) + return nil, fmt.Errorf("set blocking: %w", err) + } + + slog.DebugContext(ctx, "vsock connection established", "cid", d.cid, "port", port) + + // Wrap the file descriptor in a net.Conn + return newVsockConn(fd, d.cid, uint32(port)) +} + +// vsockConn wraps a vsock file descriptor as a net.Conn +type vsockConn struct { + fd int + localCID uint32 + localPort uint32 + remoteCID uint32 + remotePort uint32 +} + +func newVsockConn(fd int, remoteCID, remotePort uint32) (*vsockConn, error) { + return &vsockConn{ + fd: fd, + localCID: unix.VMADDR_CID_HOST, + localPort: 0, // ephemeral + remoteCID: remoteCID, + remotePort: remotePort, + }, nil +} + +func (c *vsockConn) Read(b []byte) (int, error) { + return unix.Read(c.fd, b) +} + +func (c *vsockConn) Write(b []byte) (int, error) { + return unix.Write(c.fd, b) +} + +func (c *vsockConn) Close() error { + return unix.Close(c.fd) +} + +func (c *vsockConn) LocalAddr() net.Addr { + return &vsockAddr{cid: c.localCID, port: c.localPort} +} + +func (c *vsockConn) RemoteAddr() net.Addr { + return &vsockAddr{cid: c.remoteCID, port: c.remotePort} +} + +func (c *vsockConn) SetDeadline(t time.Time) error { + if t.IsZero() { + // Clear deadlines + if err := c.SetReadDeadline(t); err != nil { + return err + } + return c.SetWriteDeadline(t) + } + timeout := time.Until(t) + if timeout < 0 { + timeout = 0 + } + tv := unix.NsecToTimeval(timeout.Nanoseconds()) + if err := unix.SetsockoptTimeval(c.fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv); err != nil { + return err + } + return unix.SetsockoptTimeval(c.fd, unix.SOL_SOCKET, unix.SO_SNDTIMEO, &tv) +} + +func (c *vsockConn) SetReadDeadline(t time.Time) error { + var tv unix.Timeval + if !t.IsZero() { + timeout := 
time.Until(t) + if timeout < 0 { + timeout = 0 + } + tv = unix.NsecToTimeval(timeout.Nanoseconds()) + } + return unix.SetsockoptTimeval(c.fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv) +} + +func (c *vsockConn) SetWriteDeadline(t time.Time) error { + var tv unix.Timeval + if !t.IsZero() { + timeout := time.Until(t) + if timeout < 0 { + timeout = 0 + } + tv = unix.NsecToTimeval(timeout.Nanoseconds()) + } + return unix.SetsockoptTimeval(c.fd, unix.SOL_SOCKET, unix.SO_SNDTIMEO, &tv) +} + +// vsockAddr implements net.Addr for vsock addresses +type vsockAddr struct { + cid uint32 + port uint32 +} + +func (a *vsockAddr) Network() string { + return "vsock" +} + +func (a *vsockAddr) String() string { + return fmt.Sprintf("%d:%d", a.cid, a.port) +} diff --git a/lib/instances/create.go b/lib/instances/create.go index ed28e785..a440a017 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -219,10 +219,10 @@ func (m *manager) createInstance( } } - pm, err := m.getProcessManager(hvType) + starter, err := m.getVMStarter(hvType) if err != nil { - log.ErrorContext(ctx, "failed to get process manager", "error", err) - return nil, fmt.Errorf("get process manager for %s: %w", hvType, err) + log.ErrorContext(ctx, "failed to get vm starter", "error", err) + return nil, fmt.Errorf("get vm starter for %s: %w", hvType, err) } // 10. 
Validate, resolve, and auto-bind devices (GPU passthrough) @@ -296,7 +296,7 @@ func (m *manager) createInstance( KernelVersion: string(kernelVer), HypervisorType: hvType, HypervisorVersion: string(vmm.V49_0), // Use latest - SocketPath: m.paths.InstanceSocket(id, pm.SocketName()), + SocketPath: m.paths.InstanceSocket(id, starter.SocketName()), DataDir: m.paths.InstanceDir(id), VsockCID: vsockCID, VsockSocket: vsockSocket, @@ -542,27 +542,10 @@ func (m *manager) startAndBootVM( ) error { log := logger.FromContext(ctx) - // Get process manager for this hypervisor type - pm, err := m.getProcessManager(stored.HypervisorType) + // Get VM starter for this hypervisor type + starter, err := m.getVMStarter(stored.HypervisorType) if err != nil { - return fmt.Errorf("get process manager: %w", err) - } - - // Start VMM process and capture PID - log.DebugContext(ctx, "starting VMM process", "instance_id", stored.Id, "hypervisor", stored.HypervisorType, "version", stored.HypervisorVersion) - pid, err := pm.StartProcess(ctx, m.paths, stored.HypervisorVersion, stored.SocketPath) - if err != nil { - return fmt.Errorf("start vmm: %w", err) - } - - // Store the PID for later cleanup - stored.HypervisorPID = &pid - log.DebugContext(ctx, "VMM process started", "instance_id", stored.Id, "pid", pid) - - // Create hypervisor client - hv, err := m.getHypervisor(stored.SocketPath, stored.HypervisorType) - if err != nil { - return fmt.Errorf("create hypervisor client: %w", err) + return fmt.Errorf("get vm starter: %w", err) } // Build VM configuration @@ -572,20 +555,16 @@ func (m *manager) startAndBootVM( return fmt.Errorf("build vm config: %w", err) } - // Create VM in hypervisor - log.DebugContext(ctx, "creating VM in hypervisor", "instance_id", stored.Id) - if err := hv.CreateVM(ctx, vmConfig); err != nil { - return fmt.Errorf("create vm: %w", err) + // Start VM (handles process start, configuration, and boot) + log.DebugContext(ctx, "starting VM", "instance_id", stored.Id, "hypervisor", 
stored.HypervisorType, "version", stored.HypervisorVersion) + pid, hv, err := starter.StartVM(ctx, m.paths, stored.HypervisorVersion, stored.SocketPath, vmConfig) + if err != nil { + return fmt.Errorf("start vm: %w", err) } - // Transition: Created → Running (boot VM) - log.DebugContext(ctx, "booting VM", "instance_id", stored.Id) - if err := hv.BootVM(ctx); err != nil { - // Try to cleanup - hv.DeleteVM(ctx) - hv.Shutdown(ctx) - return fmt.Errorf("boot vm: %w", err) - } + // Store the PID for later cleanup + stored.HypervisorPID = &pid + log.DebugContext(ctx, "VM started", "instance_id", stored.Id, "pid", pid) // Optional: Expand memory to max if hotplug configured if inst.HotplugSize > 0 && hv.Capabilities().SupportsHotplugMemory { diff --git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index d3dbfde9..3ef15c81 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/system" @@ -94,7 +95,7 @@ func TestExecConcurrent(t *testing.T) { require.NoError(t, err, "exec-agent should be ready") // Verify exec-agent works with a simple command first - _, code, err := execCommand(ctx, inst.VsockSocket, "echo", "ready") + _, code, err := execCommand(ctx, inst, "echo", "ready") require.NoError(t, err, "initial exec should work") require.Equal(t, 0, code, "initial exec should succeed") @@ -117,7 +118,7 @@ func TestExecConcurrent(t *testing.T) { for i := 1; i <= numIterations; i++ { // Write (no retry - must work first time) writeCmd := fmt.Sprintf("echo '%d-%d' > %s", workerID, i, filename) - output, code, err := execCommand(ctx, inst.VsockSocket, "/bin/sh", "-c", writeCmd) + output, code, err := execCommand(ctx, inst, "/bin/sh", "-c", writeCmd) if err != nil { errors <- fmt.Errorf("worker %d, iter %d: write error: %w", 
workerID, i, err) return @@ -128,7 +129,7 @@ func TestExecConcurrent(t *testing.T) { } // Read (no retry - must work first time) - output, code, err = execCommand(ctx, inst.VsockSocket, "cat", filename) + output, code, err = execCommand(ctx, inst, "cat", filename) if err != nil { errors <- fmt.Errorf("worker %d, iter %d: read error: %w", workerID, i, err) return @@ -180,7 +181,7 @@ func TestExecConcurrent(t *testing.T) { // Command that takes ~2 seconds and produces output cmd := fmt.Sprintf("sleep %d && echo 'stream-%d-done'", streamDuration, workerID) - output, code, err := execCommand(ctx, inst.VsockSocket, "/bin/sh", "-c", cmd) + output, code, err := execCommand(ctx, inst, "/bin/sh", "-c", cmd) if err != nil { streamErrors <- fmt.Errorf("stream worker %d: error: %w", workerID, err) return @@ -221,9 +222,12 @@ func TestExecConcurrent(t *testing.T) { t.Log("Phase 3: Testing exec with non-existent command...") // Test without TTY + dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + require.NoError(t, err) + start := time.Now() var stdout, stderr strings.Builder - _, err = exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(ctx, dialer, exec.ExecOptions{ Command: []string{"nonexistent_command_asdfasdf"}, Stdout: &stdout, Stderr: &stderr, @@ -240,7 +244,7 @@ func TestExecConcurrent(t *testing.T) { start = time.Now() stdout.Reset() stderr.Reset() - _, err = exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{ + _, err = exec.ExecIntoInstance(ctx, dialer, exec.ExecOptions{ Command: []string{"nonexistent_command_xyz123"}, Stdout: &stdout, Stderr: &stderr, diff --git a/lib/instances/manager.go b/lib/instances/manager.go index a11eb33e..915879d4 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -58,7 +58,7 @@ type manager struct { metrics *Metrics // Hypervisor support - processManagers map[hypervisor.Type]hypervisor.ProcessManager + vmStarters 
map[hypervisor.Type]hypervisor.VMStarter defaultHypervisor hypervisor.Type // Default hypervisor type when not specified in request } @@ -81,9 +81,9 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste limits: limits, instanceLocks: sync.Map{}, hostTopology: detectHostTopology(), // Detect and cache host topology - processManagers: map[hypervisor.Type]hypervisor.ProcessManager{ - hypervisor.TypeCloudHypervisor: cloudhypervisor.NewProcessManager(), - hypervisor.TypeQEMU: qemu.NewProcessManager(), + vmStarters: map[hypervisor.Type]hypervisor.VMStarter{ + hypervisor.TypeCloudHypervisor: cloudhypervisor.NewStarter(), + hypervisor.TypeQEMU: qemu.NewStarter(), }, defaultHypervisor: defaultHypervisor, } @@ -100,6 +100,7 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste } // getHypervisor creates a hypervisor client for the given socket and type. +// Used for connecting to already-running VMs (e.g., for state queries). func (m *manager) getHypervisor(socketPath string, hvType hypervisor.Type) (hypervisor.Hypervisor, error) { switch hvType { case hypervisor.TypeCloudHypervisor: @@ -111,13 +112,13 @@ func (m *manager) getHypervisor(socketPath string, hvType hypervisor.Type) (hype } } -// getProcessManager returns the process manager for the given hypervisor type. -func (m *manager) getProcessManager(hvType hypervisor.Type) (hypervisor.ProcessManager, error) { - pm, ok := m.processManagers[hvType] +// getVMStarter returns the VM starter for the given hypervisor type. 
+func (m *manager) getVMStarter(hvType hypervisor.Type) (hypervisor.VMStarter, error) { + starter, ok := m.vmStarters[hvType] if !ok { - return nil, fmt.Errorf("no process manager for hypervisor type: %s", hvType) + return nil, fmt.Errorf("no VM starter for hypervisor type: %s", hvType) } - return pm, nil + return starter, nil } // getInstanceLock returns or creates a lock for a specific instance diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 588ac871..3b4834d1 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -604,13 +604,18 @@ func TestBasicEndToEnd(t *testing.T) { var lastExitCode int var lastErr error + dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + if err != nil { + return "", -1, err + } + for attempt := 0; attempt < 5; attempt++ { if attempt > 0 { time.Sleep(200 * time.Millisecond) } var stdout, stderr bytes.Buffer - exit, err := exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{ + exit, err := exec.ExecIntoInstance(ctx, dialer, exec.ExecOptions{ Command: command, Stdout: &stdout, Stderr: &stderr, diff --git a/lib/instances/metrics.go b/lib/instances/metrics.go index 162d94fe..e07c5983 100644 --- a/lib/instances/metrics.go +++ b/lib/instances/metrics.go @@ -161,8 +161,8 @@ func (m *manager) recordStateTransition(ctx context.Context, fromState, toState return } attrs := []attribute.KeyValue{ - attribute.String("from", fromState), - attribute.String("to", toState), + attribute.String("from", fromState), + attribute.String("to", toState), } if hvType != "" { attrs = append(attrs, attribute.String("hypervisor", string(hvType))) diff --git a/lib/instances/network_test.go b/lib/instances/network_test.go index 419115e8..5cda9747 100644 --- a/lib/instances/network_test.go +++ b/lib/instances/network_test.go @@ -9,6 +9,7 @@ import ( "time" "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" 
"github.com/onkernel/hypeman/lib/images" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -115,7 +116,7 @@ func TestCreateInstanceWithNetwork(t *testing.T) { // Test initial internet connectivity via exec t.Log("Testing initial internet connectivity via exec...") - output, exitCode, err := execCommand(ctx, inst.VsockSocket, "curl", "-s", "--connect-timeout", "10", "https://public-ping-bucket-kernel.s3.us-east-1.amazonaws.com/index.html") + output, exitCode, err := execCommand(ctx, inst, "curl", "-s", "--connect-timeout", "10", "https://public-ping-bucket-kernel.s3.us-east-1.amazonaws.com/index.html") if err != nil || exitCode != 0 { t.Logf("curl failed: exitCode=%d err=%v output=%s", exitCode, err, output) } @@ -182,7 +183,7 @@ func TestCreateInstanceWithNetwork(t *testing.T) { var restoreOutput string var restoreExitCode int for i := 0; i < 10; i++ { - restoreOutput, restoreExitCode, err = execCommand(ctx, inst.VsockSocket, "curl", "-s", "https://public-ping-bucket-kernel.s3.us-east-1.amazonaws.com/index.html") + restoreOutput, restoreExitCode, err = execCommand(ctx, inst, "curl", "-s", "https://public-ping-bucket-kernel.s3.us-east-1.amazonaws.com/index.html") if err == nil && restoreExitCode == 0 { break } @@ -196,7 +197,7 @@ func TestCreateInstanceWithNetwork(t *testing.T) { // Verify the original nginx process is still running (proves restore worked, not reboot) t.Log("Verifying nginx master process is still running...") - psOutput, psExitCode, err := execCommand(ctx, inst.VsockSocket, "ps", "aux") + psOutput, psExitCode, err := execCommand(ctx, inst, "ps", "aux") require.NoError(t, err) require.Equal(t, 0, psExitCode) require.Contains(t, psOutput, "nginx: master process", "nginx master should still be running") @@ -223,10 +224,15 @@ func TestCreateInstanceWithNetwork(t *testing.T) { } // execCommand runs a command in the instance via vsock and returns stdout+stderr, exit code, and error -func execCommand(ctx context.Context, 
vsockSocket string, command ...string) (string, int, error) { +func execCommand(ctx context.Context, inst *Instance, command ...string) (string, int, error) { + dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + if err != nil { + return "", -1, err + } + var stdout, stderr bytes.Buffer - exit, err := exec.ExecIntoInstance(ctx, vsockSocket, exec.ExecOptions{ + exit, err := exec.ExecIntoInstance(ctx, dialer, exec.ExecOptions{ Command: command, Stdin: nil, Stdout: &stdout, diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go new file mode 100644 index 00000000..521b29fa --- /dev/null +++ b/lib/instances/qemu_test.go @@ -0,0 +1,538 @@ +package instances + +import ( + "bytes" + "context" + "fmt" + "io" + "net" + "net/http" + "os" + "path/filepath" + "strings" + "syscall" + "testing" + "time" + + "github.com/onkernel/hypeman/cmd/api/config" + "github.com/onkernel/hypeman/lib/devices" + "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" + "github.com/onkernel/hypeman/lib/hypervisor/qemu" + "github.com/onkernel/hypeman/lib/images" + "github.com/onkernel/hypeman/lib/ingress" + "github.com/onkernel/hypeman/lib/network" + "github.com/onkernel/hypeman/lib/paths" + "github.com/onkernel/hypeman/lib/system" + "github.com/onkernel/hypeman/lib/volumes" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// setupTestManagerForQEMU creates a manager configured to use QEMU as the default hypervisor +func setupTestManagerForQEMU(t *testing.T) (*manager, string) { + tmpDir := t.TempDir() + + cfg := &config.Config{ + DataDir: tmpDir, + BridgeName: "vmbr0", + SubnetCIDR: "10.100.0.0/16", + DNSServer: "1.1.1.1", + } + + p := paths.New(tmpDir) + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + systemManager := system.NewManager(p) + networkManager := network.NewManager(p, cfg, nil) + deviceManager := devices.NewManager(p) + 
volumeManager := volumes.NewManager(p, 0, nil) // 0 = unlimited storage + limits := ResourceLimits{ + MaxOverlaySize: 100 * 1024 * 1024 * 1024, // 100GB + MaxVcpusPerInstance: 0, // unlimited + MaxMemoryPerInstance: 0, // unlimited + MaxTotalVcpus: 0, // unlimited + MaxTotalMemory: 0, // unlimited + } + mgr := NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, hypervisor.TypeQEMU, nil, nil).(*manager) + + // Register cleanup to kill any orphaned QEMU processes + t.Cleanup(func() { + cleanupOrphanedQEMUProcesses(t, mgr) + }) + + return mgr, tmpDir +} + +// cleanupOrphanedQEMUProcesses kills any QEMU processes from metadata +func cleanupOrphanedQEMUProcesses(t *testing.T, mgr *manager) { + metaFiles, err := mgr.listMetadataFiles() + if err != nil { + return + } + + for _, metaFile := range metaFiles { + id := filepath.Base(filepath.Dir(metaFile)) + meta, err := mgr.loadMetadata(id) + if err != nil { + continue + } + + if meta.HypervisorPID != nil { + pid := *meta.HypervisorPID + if err := syscall.Kill(pid, 0); err == nil { + t.Logf("Cleaning up orphaned QEMU process: PID %d (instance %s)", pid, id) + syscall.Kill(pid, syscall.SIGKILL) + WaitForProcessExit(pid, 1*time.Second) + } + } + } +} + +// waitForQEMUReady polls QEMU status via QMP until it's running or times out +func waitForQEMUReady(ctx context.Context, socketPath string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + + for time.Now().Before(deadline) { + client, err := qemu.New(socketPath) + if err != nil { + time.Sleep(100 * time.Millisecond) + continue + } + + info, err := client.GetVMInfo(ctx) + if err != nil { + time.Sleep(100 * time.Millisecond) + continue + } + + if info.State == hypervisor.StateRunning { + return nil + } + + time.Sleep(100 * time.Millisecond) + } + + return fmt.Errorf("QEMU VM did not reach running state within %v", timeout) +} + +// collectQEMULogs gets the last N lines of logs (non-streaming) +func 
collectQEMULogs(ctx context.Context, mgr *manager, instanceID string, n int) (string, error) { + logChan, err := mgr.StreamInstanceLogs(ctx, instanceID, n, false, LogSourceApp) + if err != nil { + return "", err + } + + var lines []string + for line := range logChan { + lines = append(lines, line) + } + + return strings.Join(lines, "\n"), nil +} + +// qemuInstanceResolver is a simple resolver for ingress tests +type qemuInstanceResolver struct { + ip string + exists bool +} + +func (r *qemuInstanceResolver) ResolveInstanceIP(ctx context.Context, nameOrID string) (string, error) { + if r.ip == "" { + return "", fmt.Errorf("instance not found: %s", nameOrID) + } + return r.ip, nil +} + +func (r *qemuInstanceResolver) InstanceExists(ctx context.Context, nameOrID string) (bool, error) { + return r.exists, nil +} + +func (r *qemuInstanceResolver) ResolveInstance(ctx context.Context, nameOrID string) (string, string, error) { + if !r.exists { + return "", "", fmt.Errorf("instance not found: %s", nameOrID) + } + return nameOrID, nameOrID, nil +} + +// TestQEMUBasicEndToEnd tests the complete instance lifecycle with QEMU. +// This is the primary integration test for QEMU support. +// It tests: create, get, list, logs, network, ingress, volumes, exec, and delete. +// It does NOT test: snapshot/standby, hot memory resize (not supported by QEMU in first pass). 
+func TestQEMUBasicEndToEnd(t *testing.T) { + // Require KVM access + if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { + t.Fatal("/dev/kvm not available - ensure KVM is enabled and user is in 'kvm' group (sudo usermod -aG kvm $USER)") + } + + // Require QEMU to be installed + starter := qemu.NewStarter() + if _, err := starter.GetBinaryPath(nil, ""); err != nil { + t.Fatalf("QEMU not available: %v", err) + } + + manager, tmpDir := setupTestManagerForQEMU(t) + ctx := context.Background() + + // Get the image manager for image operations + imageManager, err := images.NewManager(paths.New(tmpDir), 1, nil) + require.NoError(t, err) + + // Pull nginx image + t.Log("Pulling nginx:alpine image...") + nginxImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ + Name: "docker.io/library/nginx:alpine", + }) + require.NoError(t, err) + + // Wait for image to be ready + t.Log("Waiting for image build to complete...") + imageName := nginxImage.Name + for i := 0; i < 60; i++ { + img, err := imageManager.GetImage(ctx, imageName) + if err == nil && img.Status == images.StatusReady { + nginxImage = img + break + } + if err == nil && img.Status == images.StatusFailed { + t.Fatalf("Image build failed: %s", *img.Error) + } + time.Sleep(1 * time.Second) + } + require.Equal(t, images.StatusReady, nginxImage.Status, "Image should be ready after 60 seconds") + t.Log("Nginx image ready") + + // Ensure system files + systemManager := system.NewManager(paths.New(tmpDir)) + t.Log("Ensuring system files (downloads kernel and builds initrd)...") + err = systemManager.EnsureSystemFiles(ctx) + require.NoError(t, err) + t.Log("System files ready") + + // Create a volume to attach + p := paths.New(tmpDir) + volumeManager := volumes.NewManager(p, 0, nil) + t.Log("Creating volume...") + vol, err := volumeManager.CreateVolume(ctx, volumes.CreateVolumeRequest{ + Name: "test-data", + SizeGb: 1, + }) + require.NoError(t, err) + require.NotNil(t, vol) + t.Logf("Volume created: %s", 
vol.Id) + + // Verify volume file exists and is not attached + assert.FileExists(t, p.VolumeData(vol.Id)) + assert.Empty(t, vol.Attachments, "Volume should not be attached yet") + + // Initialize network + networkManager := network.NewManager(p, &config.Config{ + DataDir: tmpDir, + BridgeName: "vmbr0", + SubnetCIDR: "10.100.0.0/16", + DNSServer: "1.1.1.1", + }, nil) + t.Log("Initializing network...") + err = networkManager.Initialize(ctx, nil) + require.NoError(t, err) + t.Log("Network initialized") + + // Create instance with QEMU hypervisor + req := CreateInstanceRequest{ + Name: "test-nginx-qemu", + Image: "docker.io/library/nginx:alpine", + Size: 2 * 1024 * 1024 * 1024, // 2GB + HotplugSize: 512 * 1024 * 1024, // 512MB (unused by QEMU, but part of the request) + OverlaySize: 10 * 1024 * 1024 * 1024, // 10GB + Vcpus: 1, + NetworkEnabled: true, + Hypervisor: hypervisor.TypeQEMU, // Explicitly use QEMU + Env: map[string]string{ + "TEST_VAR": "test_value", + }, + Volumes: []VolumeAttachment{ + { + VolumeID: vol.Id, + MountPath: "/mnt/data", + Readonly: false, + }, + }, + } + + t.Log("Creating QEMU instance...") + inst, err := manager.CreateInstance(ctx, req) + require.NoError(t, err) + require.NotNil(t, inst) + t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) + + // Verify instance fields + assert.NotEmpty(t, inst.Id) + assert.Equal(t, "test-nginx-qemu", inst.Name) + assert.Equal(t, "docker.io/library/nginx:alpine", inst.Image) + assert.Equal(t, StateRunning, inst.State) + assert.Equal(t, hypervisor.TypeQEMU, inst.HypervisorType) + assert.False(t, inst.HasSnapshot) + assert.NotEmpty(t, inst.KernelVersion) + + // Verify volume is attached to instance + assert.Len(t, inst.Volumes, 1, "Instance should have 1 volume attached") + assert.Equal(t, vol.Id, inst.Volumes[0].VolumeID) + assert.Equal(t, "/mnt/data", inst.Volumes[0].MountPath) + + // Verify volume shows as attached + vol, err = volumeManager.GetVolume(ctx, vol.Id) + 
require.NoError(t, err) + require.Len(t, vol.Attachments, 1, "Volume should be attached") + assert.Equal(t, inst.Id, vol.Attachments[0].InstanceID) + assert.Equal(t, "/mnt/data", vol.Attachments[0].MountPath) + + // Verify directories exist + assert.DirExists(t, p.InstanceDir(inst.Id)) + assert.FileExists(t, p.InstanceMetadata(inst.Id)) + assert.FileExists(t, p.InstanceOverlay(inst.Id)) + assert.FileExists(t, p.InstanceConfigDisk(inst.Id)) + + // Wait for VM to be fully running + err = waitForQEMUReady(ctx, inst.SocketPath, 10*time.Second) + require.NoError(t, err, "QEMU VM should reach running state") + + // Get instance + retrieved, err := manager.GetInstance(ctx, inst.Id) + require.NoError(t, err) + assert.Equal(t, inst.Id, retrieved.Id) + assert.Equal(t, StateRunning, retrieved.State) + + // List instances + instances, err := manager.ListInstances(ctx) + require.NoError(t, err) + assert.Len(t, instances, 1) + assert.Equal(t, inst.Id, instances[0].Id) + + // Poll for logs to contain nginx startup message + var logs string + foundNginxStartup := false + for i := 0; i < 50; i++ { + logs, err = collectQEMULogs(ctx, manager, inst.Id, 100) + require.NoError(t, err) + + if strings.Contains(logs, "start worker processes") { + foundNginxStartup = true + break + } + time.Sleep(100 * time.Millisecond) + } + + t.Logf("Instance logs (last 100 lines):\n%s", logs) + assert.True(t, foundNginxStartup, "Nginx should have started worker processes within 5 seconds") + + // Test ingress - route external traffic to nginx + t.Log("Testing ingress routing to nginx...") + + // Get random free ports + listener, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + ingressPort := listener.Addr().(*net.TCPAddr).Port + listener.Close() + + adminListener, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + adminPort := adminListener.Addr().(*net.TCPAddr).Port + adminListener.Close() + + t.Logf("Using random ports: ingress=%d, admin=%d", ingressPort, adminPort) 
+ + // Create ingress manager + ingressConfig := ingress.Config{ + ListenAddress: "127.0.0.1", + AdminAddress: "127.0.0.1", + AdminPort: adminPort, + DNSPort: 0, + StopOnShutdown: true, + } + + instanceIP := inst.IP + require.NotEmpty(t, instanceIP, "Instance should have an IP address") + t.Logf("Instance IP: %s", instanceIP) + + resolver := &qemuInstanceResolver{ + ip: instanceIP, + exists: true, + } + + ingressManager := ingress.NewManager(p, ingressConfig, resolver, nil) + + // Initialize ingress manager (starts Caddy) + t.Log("Starting Caddy...") + err = ingressManager.Initialize(ctx) + require.NoError(t, err, "Ingress manager should initialize successfully") + + t.Cleanup(func() { + t.Log("Shutting down Caddy...") + if err := ingressManager.Shutdown(context.Background()); err != nil { + t.Logf("Warning: failed to shutdown ingress manager: %v", err) + } + }) + + // Create an ingress rule + t.Log("Creating ingress rule...") + ingressReq := ingress.CreateIngressRequest{ + Name: "test-nginx-ingress", + Rules: []ingress.IngressRule{ + { + Match: ingress.IngressMatch{ + Hostname: "test.local", + Port: ingressPort, + }, + Target: ingress.IngressTarget{ + Instance: "test-nginx-qemu", + Port: 80, + }, + }, + }, + } + ing, err := ingressManager.Create(ctx, ingressReq) + require.NoError(t, err) + require.NotNil(t, ing) + t.Logf("Ingress created: %s", ing.ID) + + // Make HTTP request through Caddy to nginx + t.Log("Making HTTP request through Caddy to nginx...") + client := &http.Client{Timeout: 2 * time.Second} + var resp *http.Response + var lastErr error + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + httpReq, err := http.NewRequest("GET", fmt.Sprintf("http://127.0.0.1:%d/", ingressPort), nil) + require.NoError(t, err) + httpReq.Host = "test.local" + + resp, lastErr = client.Do(httpReq) + if lastErr == nil && resp.StatusCode == http.StatusOK { + break + } + if resp != nil { + resp.Body.Close() + resp = nil + } + time.Sleep(200 * 
time.Millisecond) + } + require.NoError(t, lastErr, "HTTP request through Caddy should succeed") + require.NotNil(t, resp, "HTTP response should not be nil") + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode, "Should get 200 OK from nginx") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + assert.Contains(t, string(body), "nginx", "Response should contain nginx welcome page") + t.Logf("Got response from nginx through Caddy: %d bytes", len(body)) + + err = ingressManager.Delete(ctx, ing.ID) + require.NoError(t, err) + t.Log("Ingress deleted") + + // Test volume is accessible from inside the guest via exec + t.Log("Testing volume from inside guest via exec...") + + runCmd := func(command ...string) (string, int, error) { + var lastOutput string + var lastExitCode int + var lastErr error + + dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + if err != nil { + return "", -1, err + } + + for attempt := 0; attempt < 5; attempt++ { + if attempt > 0 { + time.Sleep(200 * time.Millisecond) + } + + var stdout, stderr bytes.Buffer + exit, err := exec.ExecIntoInstance(ctx, dialer, exec.ExecOptions{ + Command: command, + Stdout: &stdout, + Stderr: &stderr, + TTY: false, + }) + + output := stdout.String() + if stderr.Len() > 0 { + output += stderr.String() + } + output = strings.TrimSpace(output) + + if err != nil { + lastErr = err + lastOutput = output + lastExitCode = -1 + continue + } + + lastOutput = output + lastExitCode = exit.Code + lastErr = nil + + if output != "" || exit.Code == 0 { + return output, exit.Code, nil + } + } + + return lastOutput, lastExitCode, lastErr + } + + // Test volume in a single exec call + testContent := "hello-from-qemu-volume-test" + script := fmt.Sprintf(` + set -e + echo "=== Volume directory ===" + ls -la /mnt/data + echo "=== Writing test file ===" + echo '%s' > /mnt/data/test.txt + echo "=== Reading test file ===" + cat /mnt/data/test.txt + echo "=== 
Volume mount info ===" + df -h /mnt/data + `, testContent) + + output, exitCode, err := runCmd("sh", "-c", script) + require.NoError(t, err, "Volume test script should execute") + require.Equal(t, 0, exitCode, "Volume test script should succeed") + + require.Contains(t, output, "lost+found", "Volume should be ext4-formatted") + require.Contains(t, output, testContent, "Should be able to read written content") + require.Contains(t, output, "/dev/vd", "Volume should be mounted from block device") + t.Logf("Volume test output:\n%s", output) + t.Log("Volume read/write test passed!") + + // Delete instance + t.Log("Deleting instance...") + err = manager.DeleteInstance(ctx, inst.Id) + require.NoError(t, err) + + // Verify cleanup + assert.NoDirExists(t, p.InstanceDir(inst.Id)) + + // Verify instance no longer exists + _, err = manager.GetInstance(ctx, inst.Id) + assert.ErrorIs(t, err, ErrNotFound) + + // Verify volume is detached but still exists + vol, err = volumeManager.GetVolume(ctx, vol.Id) + require.NoError(t, err) + assert.Empty(t, vol.Attachments, "Volume should be detached after instance deletion") + assert.FileExists(t, p.VolumeData(vol.Id), "Volume file should still exist") + + // Delete volume + t.Log("Deleting volume...") + err = volumeManager.DeleteVolume(ctx, vol.Id) + require.NoError(t, err) + + // Verify volume is gone + _, err = volumeManager.GetVolume(ctx, vol.Id) + assert.ErrorIs(t, err, volumes.ErrNotFound) + + t.Log("QEMU instance lifecycle test complete!") +} diff --git a/lib/instances/restore.go b/lib/instances/restore.go index b9ce872c..5f7af9b6 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -6,6 +6,7 @@ import ( "os" "time" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/logger" "go.opentelemetry.io/otel/trace" ) @@ -64,7 +65,8 @@ func (m *manager) restoreInstance( // 5. 
Transition: Standby → Paused (start hypervisor + restore) log.DebugContext(ctx, "restoring from snapshot", "instance_id", id, "snapshot_dir", snapshotDir) - if err := m.restoreFromSnapshot(ctx, stored, snapshotDir); err != nil { + pid, hv, err := m.restoreFromSnapshot(ctx, stored, snapshotDir) + if err != nil { log.ErrorContext(ctx, "failed to restore from snapshot", "instance_id", id, "error", err) // Cleanup network on failure if stored.NetworkEnabled { @@ -74,23 +76,15 @@ func (m *manager) restoreInstance( return nil, err } - // 6. Create hypervisor client for resumed VM - hv, err := m.getHypervisor(stored.SocketPath, stored.HypervisorType) - if err != nil { - log.ErrorContext(ctx, "failed to create hypervisor client", "instance_id", id, "error", err) - // Cleanup network on failure - if stored.NetworkEnabled { - netAlloc, _ := m.networkManager.GetAllocation(ctx, id) - m.networkManager.ReleaseAllocation(ctx, netAlloc) - } - return nil, fmt.Errorf("create hypervisor client: %w", err) - } + // Store the PID for later cleanup + stored.HypervisorPID = &pid - // 7. Transition: Paused → Running (resume) + // 6. 
Transition: Paused → Running (resume) log.DebugContext(ctx, "resuming VM", "instance_id", id) if err := hv.Resume(ctx); err != nil { log.ErrorContext(ctx, "failed to resume VM", "instance_id", id, "error", err) - // Cleanup network on failure + // Cleanup on failure + hv.Shutdown(ctx) if stored.NetworkEnabled { netAlloc, _ := m.networkManager.GetAllocation(ctx, id) m.networkManager.ReleaseAllocation(ctx, netAlloc) @@ -129,40 +123,22 @@ func (m *manager) restoreFromSnapshot( ctx context.Context, stored *StoredMetadata, snapshotDir string, -) error { +) (int, hypervisor.Hypervisor, error) { log := logger.FromContext(ctx) - // Get process manager for this hypervisor type - pm, err := m.getProcessManager(stored.HypervisorType) + // Get VM starter for this hypervisor type + starter, err := m.getVMStarter(stored.HypervisorType) if err != nil { - return fmt.Errorf("get process manager: %w", err) + return 0, nil, fmt.Errorf("get vm starter: %w", err) } - // Start hypervisor process and capture PID - log.DebugContext(ctx, "starting hypervisor process for restore", "instance_id", stored.Id, "hypervisor", stored.HypervisorType, "version", stored.HypervisorVersion) - pid, err := pm.StartProcess(ctx, m.paths, stored.HypervisorVersion, stored.SocketPath) + // Restore VM from snapshot (handles process start + restore) + log.DebugContext(ctx, "restoring VM from snapshot", "instance_id", stored.Id, "hypervisor", stored.HypervisorType, "version", stored.HypervisorVersion, "snapshot_dir", snapshotDir) + pid, hv, err := starter.RestoreVM(ctx, m.paths, stored.HypervisorVersion, stored.SocketPath, snapshotDir) if err != nil { - return fmt.Errorf("start hypervisor: %w", err) - } - - // Store the PID for later cleanup - stored.HypervisorPID = &pid - log.DebugContext(ctx, "hypervisor process started", "instance_id", stored.Id, "pid", pid) - - // Create hypervisor client - hv, err := m.getHypervisor(stored.SocketPath, stored.HypervisorType) - if err != nil { - return fmt.Errorf("create 
hypervisor client: %w", err) - } - - // Restore from snapshot - log.DebugContext(ctx, "invoking hypervisor restore API", "instance_id", stored.Id, "snapshot_dir", snapshotDir) - if err := hv.Restore(ctx, snapshotDir); err != nil { - log.ErrorContext(ctx, "restore API call failed", "instance_id", stored.Id, "error", err) - hv.Shutdown(ctx) // Cleanup - return fmt.Errorf("restore: %w", err) + return 0, nil, fmt.Errorf("restore vm: %w", err) } - log.DebugContext(ctx, "VM restored from snapshot successfully", "instance_id", stored.Id) - return nil + log.DebugContext(ctx, "VM restored from snapshot successfully", "instance_id", stored.Id, "pid", pid) + return pid, hv, nil } diff --git a/lib/instances/volumes_test.go b/lib/instances/volumes_test.go index 3237db3b..6be9e24f 100644 --- a/lib/instances/volumes_test.go +++ b/lib/instances/volumes_test.go @@ -19,13 +19,13 @@ import ( ) // execWithRetry runs a command with retries until exec-agent is ready -func execWithRetry(ctx context.Context, vsockSocket string, command []string) (string, int, error) { +func execWithRetry(ctx context.Context, inst *Instance, command []string) (string, int, error) { var output string var code int var err error for i := 0; i < 10; i++ { - output, code, err = execCommand(ctx, vsockSocket, command...) + output, code, err = execCommand(ctx, inst, command...) 
if err == nil { return output, code, nil } @@ -111,7 +111,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { // Write test file, sync, and verify in one command to ensure data persistence t.Log("Writing test file to volume...") - output, code, err := execWithRetry(ctx, writerInst.VsockSocket, []string{ + output, code, err := execWithRetry(ctx, writerInst, []string{ "/bin/sh", "-c", "echo 'Hello from writer' > /data/test.txt && sync && cat /data/test.txt", }) require.NoError(t, err) @@ -177,21 +177,21 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { // Verify data is readable from reader-1 t.Log("Verifying data from reader-1...") - output1, code, err := execWithRetry(ctx, reader1.VsockSocket, []string{"cat", "/data/test.txt"}) + output1, code, err := execWithRetry(ctx, reader1, []string{"cat", "/data/test.txt"}) require.NoError(t, err) require.Equal(t, 0, code) require.Contains(t, output1, "Hello from writer", "Reader 1 should see the file") // Verify data is readable from reader-2 (overlay mode) t.Log("Verifying data from reader-2 (overlay)...") - output2, code, err := execWithRetry(ctx, reader2.VsockSocket, []string{"cat", "/data/test.txt"}) + output2, code, err := execWithRetry(ctx, reader2, []string{"cat", "/data/test.txt"}) require.NoError(t, err) require.Equal(t, 0, code) assert.Contains(t, output2, "Hello from writer", "Reader 2 should see the file from base volume") // Verify overlay allows writes: append to the file and verify in one command t.Log("Verifying overlay allows writes (append to file)...") - output2, code, err = execWithRetry(ctx, reader2.VsockSocket, []string{ + output2, code, err = execWithRetry(ctx, reader2, []string{ "/bin/sh", "-c", "echo 'Appended by overlay' >> /data/test.txt && sync && cat /data/test.txt", }) require.NoError(t, err) @@ -201,7 +201,7 @@ func TestVolumeMultiAttachReadOnly(t *testing.T) { // Verify reader-1 does NOT see the appended data AND write fails (all in one command) t.Log("Verifying read-only enforcement 
and isolation on reader-1...") - output1, code, err = execWithRetry(ctx, reader1.VsockSocket, []string{ + output1, code, err = execWithRetry(ctx, reader1, []string{ "/bin/sh", "-c", "cat /data/test.txt && echo 'illegal' > /data/illegal.txt", }) require.NoError(t, err, "Exec should succeed even if write command fails") @@ -414,14 +414,14 @@ func TestVolumeFromArchive(t *testing.T) { t.Log("Verifying archive files are accessible...") // Check greeting.txt - output, code, err := execWithRetry(ctx, inst.VsockSocket, []string{"cat", "/archive/greeting.txt"}) + output, code, err := execWithRetry(ctx, inst, []string{"cat", "/archive/greeting.txt"}) require.NoError(t, err) require.Equal(t, 0, code, "cat greeting.txt should succeed") assert.Equal(t, "Hello from archive!", strings.TrimSpace(output)) t.Log("✓ greeting.txt verified") // Check data/config.json - output, code, err = execWithRetry(ctx, inst.VsockSocket, []string{"cat", "/archive/data/config.json"}) + output, code, err = execWithRetry(ctx, inst, []string{"cat", "/archive/data/config.json"}) require.NoError(t, err) require.Equal(t, 0, code, "cat config.json should succeed") assert.Contains(t, output, `"key": "value"`) @@ -429,14 +429,14 @@ func TestVolumeFromArchive(t *testing.T) { t.Log("✓ data/config.json verified") // Check deeply nested file - output, code, err = execWithRetry(ctx, inst.VsockSocket, []string{"cat", "/archive/data/nested/deep.txt"}) + output, code, err = execWithRetry(ctx, inst, []string{"cat", "/archive/data/nested/deep.txt"}) require.NoError(t, err) require.Equal(t, 0, code, "cat deep.txt should succeed") assert.Equal(t, "Deep nested file content", strings.TrimSpace(output)) t.Log("✓ data/nested/deep.txt verified") // List directory to confirm structure - output, code, err = execWithRetry(ctx, inst.VsockSocket, []string{"find", "/archive", "-type", "f"}) + output, code, err = execWithRetry(ctx, inst, []string{"find", "/archive", "-type", "f"}) require.NoError(t, err) require.Equal(t, 0, code, 
"find should succeed") assert.Contains(t, output, "/archive/greeting.txt") From 4abaf5ef0eeb4b03cb1f73de8be0ac2e77d0f237 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 10:44:45 -0500 Subject: [PATCH 08/14] QMP connection pool --- lib/hypervisor/qemu/pool.go | 57 +++++++++++++++++++++++++++++++++++++ lib/hypervisor/qemu/qemu.go | 40 +++++++++++++++++++++----- lib/instances/manager.go | 2 +- 3 files changed, 91 insertions(+), 8 deletions(-) create mode 100644 lib/hypervisor/qemu/pool.go diff --git a/lib/hypervisor/qemu/pool.go b/lib/hypervisor/qemu/pool.go new file mode 100644 index 00000000..f261ea3b --- /dev/null +++ b/lib/hypervisor/qemu/pool.go @@ -0,0 +1,57 @@ +package qemu + +import ( + "sync" +) + +// clientPool manages singleton QMP connections per socket path. +// QEMU's QMP socket only allows one connection at a time, so we must +// reuse existing connections rather than creating new ones. +var clientPool = struct { + sync.RWMutex + clients map[string]*QEMU +}{ + clients: make(map[string]*QEMU), +} + +// GetOrCreate returns an existing QEMU client for the socket path, +// or creates a new one if none exists. +func GetOrCreate(socketPath string) (*QEMU, error) { + // Try read lock first for existing connection + clientPool.RLock() + if client, ok := clientPool.clients[socketPath]; ok { + clientPool.RUnlock() + return client, nil + } + clientPool.RUnlock() + + // Need to create new connection - acquire write lock + clientPool.Lock() + defer clientPool.Unlock() + + // Double-check after acquiring write lock + if client, ok := clientPool.clients[socketPath]; ok { + return client, nil + } + + // Create new client + client, err := newClient(socketPath) + if err != nil { + return nil, err + } + + clientPool.clients[socketPath] = client + return client, nil +} + +// Remove closes and removes a client from the pool. +// Called automatically on errors to allow fresh reconnection. 
+func Remove(socketPath string) { + clientPool.Lock() + defer clientPool.Unlock() + + if client, ok := clientPool.clients[socketPath]; ok { + client.client.Close() + delete(clientPool.clients, socketPath) + } +} diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index f313ac37..10ed70ee 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -11,16 +11,23 @@ import ( // QEMU implements hypervisor.Hypervisor for QEMU VMM. type QEMU struct { - client *Client + client *Client + socketPath string // for self-removal from pool on error } -// New creates a new QEMU client for an existing QMP socket. +// New returns a QEMU client for the given socket path. +// Uses a connection pool to ensure only one connection per socket exists. func New(socketPath string) (*QEMU, error) { + return GetOrCreate(socketPath) +} + +// newClient creates a new QEMU client (internal, used by pool). +func newClient(socketPath string) (*QEMU, error) { client, err := NewClient(socketPath) if err != nil { return nil, fmt.Errorf("create qemu client: %w", err) } - return &QEMU{client: client}, nil + return &QEMU{client: client, socketPath: socketPath}, nil } // Verify QEMU implements the interface @@ -40,18 +47,29 @@ func (q *QEMU) Capabilities() hypervisor.Capabilities { // DeleteVM removes the VM configuration from QEMU. // This sends a graceful shutdown signal to the guest. func (q *QEMU) DeleteVM(ctx context.Context) error { - return q.client.SystemPowerdown() + if err := q.client.SystemPowerdown(); err != nil { + Remove(q.socketPath) + return err + } + return nil } // Shutdown stops the QEMU process. func (q *QEMU) Shutdown(ctx context.Context) error { - return q.client.Quit() + if err := q.client.Quit(); err != nil { + Remove(q.socketPath) + return err + } + // Connection is gone after quit, remove from pool + Remove(q.socketPath) + return nil } // GetVMInfo returns current VM state. 
func (q *QEMU) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) { status, err := q.client.Status() if err != nil { + Remove(q.socketPath) return nil, fmt.Errorf("query status: %w", err) } @@ -85,12 +103,20 @@ func (q *QEMU) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) { // Pause suspends VM execution. func (q *QEMU) Pause(ctx context.Context) error { - return q.client.Stop() + if err := q.client.Stop(); err != nil { + Remove(q.socketPath) + return err + } + return nil } // Resume continues VM execution. func (q *QEMU) Resume(ctx context.Context) error { - return q.client.Continue() + if err := q.client.Continue(); err != nil { + Remove(q.socketPath) + return err + } + return nil } // Snapshot creates a VM snapshot. diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 915879d4..e3d3d231 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -264,7 +264,7 @@ func (m *manager) RotateLogs(ctx context.Context, maxBytes int64, maxFiles int) m.paths.InstanceHypemanLog(inst.Id), } for _, logPath := range logPaths { - if err := rotateLogIfNeeded(logPath, maxBytes, maxFiles); err != nil { + if err := rotateLogIfNeeded(logPath, maxBytes, maxFiles); err != nil { lastErr = err // Continue with other logs, but track error } } From d223feec977bea50aede1c71e047b07d560c7304 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 11:07:00 -0500 Subject: [PATCH 09/14] Fix cleanup of vm --- lib/exec/client.go | 10 ++-- lib/hypervisor/qemu/pool.go | 4 +- lib/hypervisor/qemu/vsock.go | 12 ++++- lib/instances/delete.go | 17 +++++-- lib/network/derive.go | 93 +++++++++++------------------------- 5 files changed, 61 insertions(+), 75 deletions(-) diff --git a/lib/exec/client.go b/lib/exec/client.go index 32497fb8..eeacb348 100644 --- a/lib/exec/client.go +++ b/lib/exec/client.go @@ -67,15 +67,17 @@ func getOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc. 
return conn, nil } -// CloseConn closes and removes a connection from the pool (call when VM is deleted) +// CloseConn removes a connection from the pool (call when VM is deleted). +// We only remove from pool, not explicitly close - the connection will fail +// naturally when the VM dies, and grpc will clean up. Calling Close() on a +// connection with an active reader can cause panics in grpc internals. func CloseConn(dialerKey string) { connPool.Lock() defer connPool.Unlock() - if conn, ok := connPool.conns[dialerKey]; ok { - conn.Close() + if _, ok := connPool.conns[dialerKey]; ok { delete(connPool.conns, dialerKey) - slog.Debug("closed gRPC connection", "key", dialerKey) + slog.Debug("removed gRPC connection from pool", "key", dialerKey) } } diff --git a/lib/hypervisor/qemu/pool.go b/lib/hypervisor/qemu/pool.go index f261ea3b..398ce63e 100644 --- a/lib/hypervisor/qemu/pool.go +++ b/lib/hypervisor/qemu/pool.go @@ -46,12 +46,14 @@ func GetOrCreate(socketPath string) (*QEMU, error) { // Remove closes and removes a client from the pool. // Called automatically on errors to allow fresh reconnection. +// Close is done asynchronously to avoid blocking if the connection is in a bad state. 
func Remove(socketPath string) { clientPool.Lock() defer clientPool.Unlock() if client, ok := clientPool.clients[socketPath]; ok { - client.client.Close() delete(clientPool.clients, socketPath) + // Close asynchronously to avoid blocking on stuck connections + go client.client.Close() } } diff --git a/lib/hypervisor/qemu/vsock.go b/lib/hypervisor/qemu/vsock.go index 58ac23aa..5591ccb6 100644 --- a/lib/hypervisor/qemu/vsock.go +++ b/lib/hypervisor/qemu/vsock.go @@ -3,6 +3,7 @@ package qemu import ( "context" "fmt" + "io" "log/slog" "net" "time" @@ -158,7 +159,16 @@ func newVsockConn(fd int, remoteCID, remotePort uint32) (*vsockConn, error) { } func (c *vsockConn) Read(b []byte) (int, error) { - return unix.Read(c.fd, b) + n, err := unix.Read(c.fd, b) + // Ensure we never return negative n (violates io.Reader contract) + // This can happen when the vsock fd becomes invalid (VM died) + if n < 0 { + if err == nil { + err = io.EOF + } + return 0, err + } + return n, err } func (c *vsockConn) Write(b []byte) (int, error) { diff --git a/lib/instances/delete.go b/lib/instances/delete.go index d0ddb8cb..bb8437ba 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -7,6 +7,8 @@ import ( "syscall" "time" + "github.com/onkernel/hypeman/lib/exec" + "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" ) @@ -39,7 +41,12 @@ func (m *manager) deleteInstance( } } - // 3. If hypervisor might be running, force kill it + // 3. Close exec gRPC connection before killing hypervisor to prevent panic + if dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID); err == nil { + exec.CloseConn(dialer.Key()) + } + + // 4. 
If hypervisor might be running, force kill it // Also attempt kill for StateUnknown since we can't be sure if hypervisor is running if inst.State.RequiresVMM() || inst.State == StateUnknown { log.DebugContext(ctx, "stopping hypervisor", "instance_id", id, "state", inst.State) @@ -50,7 +57,7 @@ func (m *manager) deleteInstance( } } - // 4. Release network allocation + // 5. Release network allocation if inst.NetworkEnabled { log.DebugContext(ctx, "releasing network", "instance_id", id, "network", "default") if err := m.networkManager.ReleaseAllocation(ctx, networkAlloc); err != nil { @@ -59,7 +66,7 @@ func (m *manager) deleteInstance( } } - // 5. Detach and auto-unbind devices from VFIO + // 6. Detach and auto-unbind devices from VFIO if len(inst.Devices) > 0 && m.deviceManager != nil { for _, deviceID := range inst.Devices { log.DebugContext(ctx, "detaching device", "id", id, "device", deviceID) @@ -76,7 +83,7 @@ func (m *manager) deleteInstance( } } - // 5b. Detach volumes + // 6b. Detach volumes if len(inst.Volumes) > 0 { log.DebugContext(ctx, "detaching volumes", "instance_id", id, "count", len(inst.Volumes)) for _, volAttach := range inst.Volumes { @@ -87,7 +94,7 @@ func (m *manager) deleteInstance( } } - // 6. Delete all instance data + // 7. 
Delete all instance data log.DebugContext(ctx, "deleting instance data", "instance_id", id) if err := m.deleteInstanceData(id); err != nil { log.ErrorContext(ctx, "failed to delete instance data", "instance_id", id, "error", err) diff --git a/lib/network/derive.go b/lib/network/derive.go index f568c035..53f8b773 100644 --- a/lib/network/derive.go +++ b/lib/network/derive.go @@ -9,7 +9,6 @@ import ( "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/logger" - "github.com/onkernel/hypeman/lib/vmm" ) // instanceMetadata is the minimal metadata we need to derive allocations @@ -18,6 +17,8 @@ type instanceMetadata struct { Name string NetworkEnabled bool HypervisorType string + IP string // Assigned IP address + MAC string // Assigned MAC address } // deriveAllocation derives network allocation from CH or snapshot @@ -49,58 +50,38 @@ func (m *manager) deriveAllocation(ctx context.Context, instanceID string) (*All } netmask := fmt.Sprintf("%d.%d.%d.%d", ipNet.Mask[0], ipNet.Mask[1], ipNet.Mask[2], ipNet.Mask[3]) - // 4. Try to derive from running VM first - socketPath := m.paths.InstanceSocket(instanceID, hypervisor.SocketNameForType(hypervisor.Type(meta.HypervisorType))) - if fileExists(socketPath) { - client, err := vmm.NewVMM(socketPath) - if err == nil { - resp, err := client.GetVmInfoWithResponse(ctx) - if err == nil && resp.JSON200 != nil && resp.JSON200.Config.Net != nil && len(*resp.JSON200.Config.Net) > 0 { - nets := *resp.JSON200.Config.Net - net := nets[0] - if net.Ip != nil && net.Mac != nil && net.Tap != nil { - log.DebugContext(ctx, "derived allocation from running VM", "instance_id", instanceID) - return &Allocation{ - InstanceID: instanceID, - InstanceName: meta.Name, - Network: "default", - IP: *net.Ip, - MAC: *net.Mac, - TAPDevice: *net.Tap, - Gateway: defaultNet.Gateway, - Netmask: netmask, - State: "running", - }, nil - } + // 4. 
Use stored metadata to derive allocation (works for all hypervisors) + if meta.IP != "" && meta.MAC != "" { + tap := generateTAPName(instanceID) + + // Determine state based on socket existence and snapshot + socketPath := m.paths.InstanceSocket(instanceID, hypervisor.SocketNameForType(hypervisor.Type(meta.HypervisorType))) + state := "stopped" + if fileExists(socketPath) { + state = "running" + } else { + // Check for snapshot (standby state) + snapshotConfigJson := m.paths.InstanceSnapshotConfig(instanceID) + if fileExists(snapshotConfigJson) { + state = "standby" } } - } - // 5. Try to derive from snapshot - // Cloud Hypervisor creates config.json in the snapshot directory - snapshotConfigJson := m.paths.InstanceSnapshotConfig(instanceID) - if fileExists(snapshotConfigJson) { - vmConfig, err := m.parseVmJson(snapshotConfigJson) - if err == nil && vmConfig.Net != nil && len(*vmConfig.Net) > 0 { - nets := *vmConfig.Net - if nets[0].Ip != nil && nets[0].Mac != nil && nets[0].Tap != nil { - log.DebugContext(ctx, "derived allocation from snapshot", "instance_id", instanceID) - return &Allocation{ - InstanceID: instanceID, - InstanceName: meta.Name, - Network: "default", - IP: *nets[0].Ip, - MAC: *nets[0].Mac, - TAPDevice: *nets[0].Tap, - Gateway: defaultNet.Gateway, - Netmask: netmask, - State: "standby", - }, nil - } - } + log.DebugContext(ctx, "derived allocation from metadata", "instance_id", instanceID, "state", state) + return &Allocation{ + InstanceID: instanceID, + InstanceName: meta.Name, + Network: "default", + IP: meta.IP, + MAC: meta.MAC, + TAPDevice: tap, + Gateway: defaultNet.Gateway, + Netmask: netmask, + State: state, + }, nil } - // 6. No allocation (stopped or network not yet configured) + // 5. 
No allocation (network not yet configured) return nil, nil } @@ -164,22 +145,6 @@ func (m *manager) loadInstanceMetadata(instanceID string) (*instanceMetadata, er return &meta, nil } -// parseVmJson parses Cloud Hypervisor's config.json from snapshot -// Note: Despite the function name, this parses config.json (what CH actually creates) -func (m *manager) parseVmJson(path string) (*vmm.VmConfig, error) { - data, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("read config.json: %w", err) - } - - var vmConfig vmm.VmConfig - if err := json.Unmarshal(data, &vmConfig); err != nil { - return nil, fmt.Errorf("unmarshal config.json: %w", err) - } - - return &vmConfig, nil -} - // fileExists checks if a file exists func fileExists(path string) bool { _, err := os.Stat(path) From 661159af60b508d357432e4a5fa48e60a9fb9fef Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 11:20:48 -0500 Subject: [PATCH 10/14] Address review comments --- lib/hypervisor/cloudhypervisor/process.go | 20 ++++++++++++++++++-- lib/hypervisor/qemu/process.go | 9 +++++++++ lib/hypervisor/qemu/vsock.go | 11 ++++++++++- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/lib/hypervisor/cloudhypervisor/process.go b/lib/hypervisor/cloudhypervisor/process.go index 230a9b16..edfa82c0 100644 --- a/lib/hypervisor/cloudhypervisor/process.go +++ b/lib/hypervisor/cloudhypervisor/process.go @@ -3,10 +3,12 @@ package cloudhypervisor import ( "context" "fmt" + "syscall" "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/paths" "github.com/onkernel/hypeman/lib/vmm" + "gvisor.dev/gvisor/pkg/cleanup" ) func init() { @@ -53,6 +55,12 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s return 0, nil, fmt.Errorf("start process: %w", err) } + // Setup cleanup to kill the process if subsequent steps fail + cu := cleanup.Make(func() { + syscall.Kill(pid, syscall.SIGKILL) + }) + defer cu.Clean() + // 2. 
Create the HTTP client hv, err := New(socketPath) if err != nil { @@ -78,6 +86,8 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s return 0, nil, fmt.Errorf("boot vm failed with status %d: %s", bootResp.StatusCode(), string(bootResp.Body)) } + // Success - release cleanup to prevent killing the process + cu.Release() return pid, hv, nil } @@ -96,6 +106,12 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, return 0, nil, fmt.Errorf("start process: %w", err) } + // Setup cleanup to kill the process if subsequent steps fail + cu := cleanup.Make(func() { + syscall.Kill(pid, syscall.SIGKILL) + }) + defer cu.Clean() + // 2. Create the HTTP client hv, err := New(socketPath) if err != nil { @@ -110,14 +126,14 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, } resp, err := hv.client.PutVmRestoreWithResponse(ctx, restoreConfig) if err != nil { - hv.Shutdown(ctx) // Cleanup on failure return 0, nil, fmt.Errorf("restore: %w", err) } if resp.StatusCode() != 204 { - hv.Shutdown(ctx) // Cleanup on failure return 0, nil, fmt.Errorf("restore failed with status %d: %s", resp.StatusCode(), string(resp.Body)) } + // Success - release cleanup to prevent killing the process + cu.Release() return pid, hv, nil } diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index a11ac622..61f3496e 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -14,6 +14,7 @@ import ( "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/paths" + "gvisor.dev/gvisor/pkg/cleanup" ) func init() { @@ -121,6 +122,12 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s pid := cmd.Process.Pid + // Setup cleanup to kill the process if subsequent steps fail + cu := cleanup.Make(func() { + syscall.Kill(pid, syscall.SIGKILL) + }) + defer cu.Clean() + // Wait for socket to be ready if err := 
waitForSocket(socketPath, 10*time.Second); err != nil { vmmLogPath := filepath.Join(logsDir, "vmm.log") @@ -136,6 +143,8 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s return 0, nil, fmt.Errorf("create client: %w", err) } + // Success - release cleanup to prevent killing the process + cu.Release() return pid, hv, nil } diff --git a/lib/hypervisor/qemu/vsock.go b/lib/hypervisor/qemu/vsock.go index 5591ccb6..6ee7c54d 100644 --- a/lib/hypervisor/qemu/vsock.go +++ b/lib/hypervisor/qemu/vsock.go @@ -172,7 +172,16 @@ func (c *vsockConn) Read(b []byte) (int, error) { } func (c *vsockConn) Write(b []byte) (int, error) { - return unix.Write(c.fd, b) + n, err := unix.Write(c.fd, b) + // Ensure we never return negative n (violates io.Writer contract) + // This can happen when the vsock fd becomes invalid (VM died) + if n < 0 { + if err == nil { + err = io.ErrClosedPipe + } + return 0, err + } + return n, err } func (c *vsockConn) Close() error { From e531ec4acc714d1bd10399abaa066b0cd12e5074 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 11:47:12 -0500 Subject: [PATCH 11/14] Add version checking to hypervisor interface --- lib/hypervisor/cloudhypervisor/process.go | 6 ++ lib/hypervisor/hypervisor.go | 5 ++ lib/hypervisor/qemu/process.go | 25 ++++++ lib/hypervisor/qemu/process_test.go | 100 ++++++++++++++++++++++ lib/instances/create.go | 10 ++- 5 files changed, 144 insertions(+), 2 deletions(-) create mode 100644 lib/hypervisor/qemu/process_test.go diff --git a/lib/hypervisor/cloudhypervisor/process.go b/lib/hypervisor/cloudhypervisor/process.go index edfa82c0..bd48718b 100644 --- a/lib/hypervisor/cloudhypervisor/process.go +++ b/lib/hypervisor/cloudhypervisor/process.go @@ -40,6 +40,12 @@ func (s *Starter) GetBinaryPath(p *paths.Paths, version string) (string, error) return vmm.GetBinaryPath(p, chVersion) } +// GetVersion returns the latest supported Cloud Hypervisor version. 
+// Cloud Hypervisor binaries are embedded, so we return the latest known version. +func (s *Starter) GetVersion(p *paths.Paths) (string, error) { + return string(vmm.V49_0), nil +} + // StartVM launches Cloud Hypervisor, configures the VM, and boots it. // Returns the process ID and a Hypervisor client for subsequent operations. func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index 31ea5e19..a92f832e 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -53,6 +53,11 @@ type VMStarter interface { // GetBinaryPath returns the path to the hypervisor binary, extracting if needed. GetBinaryPath(p *paths.Paths, version string) (string, error) + // GetVersion returns the version of the hypervisor binary. + // For embedded binaries (Cloud Hypervisor), returns the latest supported version. + // For system binaries (QEMU), queries the installed binary for its version. + GetVersion(p *paths.Paths) (string, error) + // StartVM launches the hypervisor process and boots the VM. // Returns the process ID and a Hypervisor client for subsequent operations. StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config VMConfig) (pid int, hv Hypervisor, err error) diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index 61f3496e..a0ebfb90 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "path/filepath" + "regexp" "runtime" "syscall" "time" @@ -63,6 +64,30 @@ func (s *Starter) GetBinaryPath(p *paths.Paths, version string) (string, error) return "", fmt.Errorf("%s not found; install with: %s", binaryName, qemuInstallHint()) } +// GetVersion returns the version of the installed QEMU binary. 
+// Parses the output of "qemu-system-* --version" to extract the version string. +func (s *Starter) GetVersion(p *paths.Paths) (string, error) { + binaryPath, err := s.GetBinaryPath(p, "") + if err != nil { + return "", err + } + + cmd := exec.Command(binaryPath, "--version") + output, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("get qemu version: %w", err) + } + + // Parse "QEMU emulator version 8.2.0 (Debian ...)" -> "8.2.0" + re := regexp.MustCompile(`version (\d+\.\d+(?:\.\d+)?)`) + matches := re.FindStringSubmatch(string(output)) + if len(matches) >= 2 { + return matches[1], nil + } + + return "", fmt.Errorf("could not parse QEMU version from: %s", string(output)) +} + // StartVM launches QEMU with the VM configuration and returns a Hypervisor client. // QEMU receives all configuration via command-line arguments at process start. func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { diff --git a/lib/hypervisor/qemu/process_test.go b/lib/hypervisor/qemu/process_test.go new file mode 100644 index 00000000..ff8e93a3 --- /dev/null +++ b/lib/hypervisor/qemu/process_test.go @@ -0,0 +1,100 @@ +package qemu + +import ( + "os/exec" + "regexp" + "testing" + + "github.com/onkernel/hypeman/lib/paths" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestGetVersion_Integration is an integration test that verifies GetVersion +// works correctly with the actual QEMU binary installed on the system. 
+func TestGetVersion_Integration(t *testing.T) { + // Skip if QEMU is not installed + binaryName, err := qemuBinaryName() + if err != nil { + t.Skipf("Skipping test: %v", err) + } + + _, err = exec.LookPath(binaryName) + if err != nil { + t.Skipf("Skipping test: QEMU binary %s not found in PATH", binaryName) + } + + // Create starter and get version + starter := NewStarter() + tmpDir := t.TempDir() + p := paths.New(tmpDir) + + version, err := starter.GetVersion(p) + require.NoError(t, err, "GetVersion should not return an error") + + // Verify version is not empty + assert.NotEmpty(t, version, "Version should not be empty") + + // Verify version matches expected format (e.g., "8.2.0", "9.0", "7.2.1") + versionPattern := regexp.MustCompile(`^\d+\.\d+(\.\d+)?$`) + assert.Regexp(t, versionPattern, version, "Version should match pattern X.Y or X.Y.Z") + + t.Logf("Detected QEMU version: %s", version) +} + +// TestGetVersion_ParsesVersionCorrectly tests the version parsing logic +// with various version string formats. 
+func TestGetVersion_ParsesVersionCorrectly(t *testing.T) { + tests := []struct { + name string + output string + expected string + wantErr bool + }{ + { + name: "debian format", + output: "QEMU emulator version 8.2.0 (Debian 1:8.2.0+dfsg-1)", + expected: "8.2.0", + }, + { + name: "simple format", + output: "QEMU emulator version 9.0.0", + expected: "9.0.0", + }, + { + name: "two part version", + output: "QEMU emulator version 9.0", + expected: "9.0", + }, + { + name: "with git info", + output: "QEMU emulator version 7.2.1 (qemu-7.2.1-1.fc38)", + expected: "7.2.1", + }, + { + name: "invalid format", + output: "Some random output", + wantErr: true, + }, + { + name: "empty output", + output: "", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Use the same regex as in GetVersion + re := regexp.MustCompile(`version (\d+\.\d+(?:\.\d+)?)`) + matches := re.FindStringSubmatch(tt.output) + + if tt.wantErr { + assert.Less(t, len(matches), 2, "Should not match for invalid input") + } else { + require.GreaterOrEqual(t, len(matches), 2, "Should find version match") + assert.Equal(t, tt.expected, matches[1], "Parsed version should match expected") + } + }) + } +} diff --git a/lib/instances/create.go b/lib/instances/create.go index a440a017..3938fff8 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -15,7 +15,6 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/system" - "github.com/onkernel/hypeman/lib/vmm" "github.com/onkernel/hypeman/lib/volumes" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" @@ -225,6 +224,13 @@ func (m *manager) createInstance( return nil, fmt.Errorf("get vm starter for %s: %w", hvType, err) } + // Get hypervisor version + hvVersion, err := starter.GetVersion(m.paths) + if err != nil { + log.WarnContext(ctx, "failed to get hypervisor version", "hypervisor", hvType, "error", err) + 
hvVersion = "unknown" + } + // 10. Validate, resolve, and auto-bind devices (GPU passthrough) // Track devices we've marked as attached for cleanup on error. // The cleanup closure captures this slice by reference, so it will see @@ -295,7 +301,7 @@ func (m *manager) createInstance( StoppedAt: nil, KernelVersion: string(kernelVer), HypervisorType: hvType, - HypervisorVersion: string(vmm.V49_0), // Use latest + HypervisorVersion: hvVersion, SocketPath: m.paths.InstanceSocket(id, starter.SocketName()), DataDir: m.paths.InstanceDir(id), VsockCID: vsockCID, From 0366ca37e53958e6282c4e2c746da6aa077da5cd Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 15:03:10 -0500 Subject: [PATCH 12/14] Fix log naming --- lib/instances/exec_test.go | 2 +- lib/instances/network_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/instances/exec_test.go b/lib/instances/exec_test.go index f8679bd2..1efb471e 100644 --- a/lib/instances/exec_test.go +++ b/lib/instances/exec_test.go @@ -22,7 +22,7 @@ func waitForExecAgent(ctx context.Context, mgr *manager, instanceID string, time deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { logs, err := collectLogs(ctx, mgr, instanceID, 100) - if err == nil && strings.Contains(logs, "[exec-agent] listening on vsock port 2222") { + if err == nil && strings.Contains(logs, "[guest-agent] listening on vsock port 2222") { return nil } time.Sleep(500 * time.Millisecond) diff --git a/lib/instances/network_test.go b/lib/instances/network_test.go index 7a2f8ed6..0ad25494 100644 --- a/lib/instances/network_test.go +++ b/lib/instances/network_test.go @@ -110,7 +110,7 @@ func TestCreateInstanceWithNetwork(t *testing.T) { // Wait for exec agent to be ready t.Log("Waiting for exec agent...") - err = waitForLogMessage(ctx, manager, inst.Id, "[exec-agent] listening", 10*time.Second) + err = waitForLogMessage(ctx, manager, inst.Id, "[guest-agent] listening", 10*time.Second) require.NoError(t, err, "Exec 
agent should be listening") t.Log("Exec agent is ready") From 6c9b877f45006448f59a82dcc0f5ffca9cd9353c Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 15:16:47 -0500 Subject: [PATCH 13/14] fix indent --- lib/instances/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/instances/manager.go b/lib/instances/manager.go index e3d3d231..915879d4 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -264,7 +264,7 @@ func (m *manager) RotateLogs(ctx context.Context, maxBytes int64, maxFiles int) m.paths.InstanceHypemanLog(inst.Id), } for _, logPath := range logPaths { - if err := rotateLogIfNeeded(logPath, maxBytes, maxFiles); err != nil { + if err := rotateLogIfNeeded(logPath, maxBytes, maxFiles); err != nil { lastErr = err // Continue with other logs, but track error } } From 72c0d9d9e17aa420c0f7bad2f986939822c9e66a Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Tue, 23 Dec 2025 15:21:24 -0500 Subject: [PATCH 14/14] Add startup warning --- cmd/api/api/cp.go | 1 - cmd/api/main.go | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cmd/api/api/cp.go b/cmd/api/api/cp.go index dcd45f95..89b40a8b 100644 --- a/cmd/api/api/cp.go +++ b/cmd/api/api/cp.go @@ -419,4 +419,3 @@ func (s *ApiService) handleCopyFrom(ctx context.Context, ws *websocket.Conn, ins } return bytesReceived, nil } - diff --git a/cmd/api/main.go b/cmd/api/main.go index 5d003352..dfb556b1 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -23,6 +23,7 @@ import ( "github.com/onkernel/hypeman/cmd/api/api" "github.com/onkernel/hypeman/cmd/api/config" "github.com/onkernel/hypeman/lib/guest" + "github.com/onkernel/hypeman/lib/hypervisor/qemu" "github.com/onkernel/hypeman/lib/instances" mw "github.com/onkernel/hypeman/lib/middleware" "github.com/onkernel/hypeman/lib/oapi" @@ -125,6 +126,11 @@ func run() error { } logger.Info("KVM access verified") + // Check if QEMU is available (optional - only warn if not present) + if _, 
err := (&qemu.Starter{}).GetBinaryPath(nil, ""); err != nil { + logger.Warn("QEMU not available - QEMU hypervisor will not work", "error", err) + } + // Validate log rotation config var logMaxSize datasize.ByteSize if err := logMaxSize.UnmarshalText([]byte(app.Config.LogMaxSize)); err != nil {