/* libguestfs * Copyright (C) 2009-2025 Red Hat Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ /** * Implementation of the C backend. * * For more details see L. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cloexec.h" #include "guestfs.h" #include "guestfs-internal.h" #include "guestfs_protocol.h" #include "qemuopts.h" #include "ignore-value.h" /* Per-handle data. */ struct backend_direct_data { const char *qemu; /* Qemu binary name. */ pid_t pid; /* Qemu PID. */ pid_t recoverypid; /* Recovery process PID. */ char guestfsd_sock[UNIX_PATH_MAX]; /* Path to daemon socket. */ }; /* Helper that sends all output from a command to debug. 
*/ static void debug_lines (guestfs_h *g, void *retv, const char *buf, size_t len) { debug (g, "qemu: %s", buf); } static char * create_cow_overlay_direct (guestfs_h *g, void *datav, struct drive *drv) { char *overlay; CLEANUP_FREE char *backing_drive = NULL; struct guestfs_disk_create_argv optargs; backing_drive = guestfs_int_drive_source_qemu_param (g, &drv->src); if (!backing_drive) return NULL; overlay = guestfs_int_make_temp_path (g, "overlay", "qcow2"); if (!overlay) return NULL; optargs.bitmask = GUESTFS_DISK_CREATE_BACKINGFILE_BITMASK; optargs.backingfile = backing_drive; if (drv->src.format) { optargs.bitmask |= GUESTFS_DISK_CREATE_BACKINGFORMAT_BITMASK; optargs.backingformat = drv->src.format; } if (guestfs_disk_create_argv (g, overlay, "qcow2", -1, &optargs) == -1) { free (overlay); return NULL; } /* Caller sets g->overlay in the handle to this, and then manages * the memory. */ return overlay; } /* On Debian, /dev/kvm is mode 0660 and group kvm, so users need to * add themselves to the kvm group otherwise things are going to be * very slow (this is Debian bug 640328). Warn about this. */ static void debian_kvm_warning (guestfs_h *g) { #ifdef __linux__ uid_t euid = geteuid (); gid_t egid = getegid (); struct stat statbuf; gid_t kvm_group; CLEANUP_FREE gid_t *groups = NULL; int ngroups; size_t i; /* Doesn't apply if running as root. */ if (euid == 0) return; if (stat ("/dev/kvm", &statbuf) == -1) return; if ((statbuf.st_mode & 0777) != 0660) return; /* They might be running libguestfs as root or have chowned /dev/kvm, so: */ if (geteuid () == statbuf.st_uid) return; kvm_group = statbuf.st_gid; /* Is the current process a member of the KVM group? 
*/ if (egid == kvm_group) return; ngroups = getgroups (0, NULL); if (ngroups > 0) { groups = safe_malloc (g, ngroups * sizeof (gid_t)); if (getgroups (ngroups, groups) == -1) { warning (g, "getgroups: %m (ignored)"); return; } for (i = 0; i < (size_t) ngroups; ++i) { if (groups[i] == kvm_group) return; } } /* No, so emit the warning. Note that \n characters cannot appear * in warnings. */ warning (g, _("current user is not a member of the KVM group (group ID %d). " "This user cannot access /dev/kvm, so libguestfs may run very slowly. " "It is recommended that you 'chmod 0666 /dev/kvm' or add the current user " "to the KVM group (you might need to log out and log in again)."), (int) kvm_group); #endif /* __linux__ */ } /* Some macros which make using qemuopts a bit easier. */ #define flag(flag) \ do { \ if (qemuopts_add_flag (qopts, (flag)) == -1) goto qemuopts_error; \ } while (0) #define arg(flag, value) \ do { \ if (qemuopts_add_arg (qopts, (flag), (value)) == -1) goto qemuopts_error; \ } while (0) #define arg_format(flag, fs, ...) \ do { \ if (qemuopts_add_arg_format (qopts, (flag), (fs), ##__VA_ARGS__) == -1) \ goto qemuopts_error; \ } while (0) #define arg_noquote(flag, value) \ do { \ if (qemuopts_add_arg_noquote (qopts, (flag), (value)) == -1) \ goto qemuopts_error; \ } while (0) #define start_list(flag) \ if (qemuopts_start_arg_list (qopts, (flag)) == -1) goto qemuopts_error; \ do #define append_list(value) \ do { \ if (qemuopts_append_arg_list (qopts, (value)) == -1) \ goto qemuopts_error; \ } while (0) #define append_list_format(fs, ...) \ do { \ if (qemuopts_append_arg_list_format (qopts, (fs), ##__VA_ARGS__) == -1) \ goto qemuopts_error; \ } while (0) #define end_list() \ while (0); \ do { \ if (qemuopts_end_arg_list (qopts) == -1) goto qemuopts_error; \ } while (0) /** * Add the standard elements of the C<-drive> parameter. 
*/
/* Must be called between start_list ("-drive") and end_list () because
 * it only appends list elements to 'qopts'.
 *
 * 'i' is the drive index and is used to build the "id=hd<i>" element.
 * 'data' is not referenced here.
 *
 * Returns 0 on success, -1 on error.
 */
static int
add_drive_standard_params (guestfs_h *g, struct backend_direct_data *data,
                           struct qemuopts *qopts,
                           size_t i, struct drive *drv)
{
  if (!drv->overlay) {
    CLEANUP_FREE char *file = NULL;

    /* file= parameter. */
    file = guestfs_int_drive_source_qemu_param (g, &drv->src);
    /* The helper can fail (create_cow_overlay_direct treats NULL as an
     * error too).  Passing NULL to "%s" below would be undefined
     * behaviour, so bail out instead.  No extra error is reported here,
     * matching create_cow_overlay_direct.
     */
    if (file == NULL)
      return -1;
    append_list_format ("file=%s", file);

    if (drv->readonly)
      append_list ("snapshot=on");
    append_list_format ("cache=%s",
                        drv->cachemode ? drv->cachemode : "writeback");
    if (drv->src.format)
      append_list_format ("format=%s", drv->src.format);
    if (drv->copyonread)
      append_list ("copy-on-read=on");

    /* Discard mode. */
    switch (drv->discard) {
    case discard_disable:
      /* Since the default is always discard=ignore, don't specify it
       * on the command line.
       */
      break;
    case discard_enable:
      /* Explicitly requested discard must be possible, else fail. */
      if (!guestfs_int_discard_possible (g, drv))
        return -1;
      /*FALLTHROUGH*/
    case discard_besteffort:
      append_list ("discard=unmap");
    }
  }
  else {
    /* Writable qcow2 overlay on top of read-only drive.
     *
     * Add the file-specific locking option only for files, as
     * qemu won't accept options unknown to the block driver in
     * use.
     */
    if (drv->src.protocol == drive_protocol_file) {
      append_list_format ("file.file.filename=%s", drv->overlay);
      append_list ("file.driver=qcow2");
      append_list ("file.backing.file.locking=off");
    }
    else {
      /* Ancient qemu (esp. qemu 1.5 in RHEL 7) didn't understand the
       * file.file.filename= parameter, so use the safer old-style
       * form of parameters unless we actually want to specify the
       * locking flag above.
       */
      append_list_format ("file=%s", drv->overlay);
      append_list ("format=qcow2");
    }
    append_list ("cache=unsafe");
  }

  append_list_format ("id=hd%zu", i);

  return 0;

  /* This label is called implicitly from the qemuopts macros on error. */
 qemuopts_error:
  perrorf (g, "qemuopts");
  return -1;
}

/**
 * Add the physical_block_size and logical_block_size elements of the C<-device>
 * parameter.
*/
/* Appends nothing when the drive has no explicit block size.  Otherwise
 * the same value is used for both the physical and the logical size.
 * Must be called inside a start_list ("-device") block.
 *
 * Returns 0 on success, -1 on error.
 */
static int
add_device_blocksize_params (guestfs_h *g,
                             struct qemuopts *qopts, struct drive *drv)
{
  if (!drv->blocksize)
    return 0;

  append_list_format ("physical_block_size=%d", drv->blocksize);
  append_list_format ("logical_block_size=%d", drv->blocksize);
  return 0;

  /* Reached only through the qemuopts macros when an append fails. */
 qemuopts_error:
  perrorf (g, "qemuopts");
  return -1;
}

/* Emit the "-drive" / "-device" pair for a single drive.
 *
 * The -drive entry gets the standard parameters plus "if=none"; the
 * matching -device entry is a scsi-hd that refers to it by "hd<i>" and
 * carries the optional serial (disk label) and block size.
 *
 * Returns 0 on success, -1 on error.
 */
static int
add_drive (guestfs_h *g, struct backend_direct_data *data,
           struct qemuopts *qopts, size_t i, struct drive *drv)
{
  int rc;

  start_list ("-drive") {
    rc = add_drive_standard_params (g, data, qopts, i, drv);
    if (rc == -1)
      return -1;
    append_list ("if=none");
  } end_list ();

  start_list ("-device") {
    append_list ("scsi-hd");
    append_list_format ("drive=hd%zu", i);
    if (drv->disk_label != NULL)
      append_list_format ("serial=%s", drv->disk_label);
    rc = add_device_blocksize_params (g, qopts, drv);
    if (rc == -1)
      return -1;
  } end_list ();

  return 0;

  /* Reached only through the qemuopts macros when a call fails. */
 qemuopts_error:
  perrorf (g, "qemuopts");
  return -1;
}

/* Add every drive of the handle to the command line, stopping at the
 * first failure.
 *
 * Returns 0 on success, -1 on error.
 */
static int
add_drives (guestfs_h *g, struct backend_direct_data *data,
            struct qemuopts *qopts)
{
  size_t idx;
  struct drive *d;

  ITER_DRIVES (g, idx, d) {
    if (add_drive (g, data, qopts, idx, d) == -1)
      return -1;
  }

  return 0;
}

/**
 * Launch passt such that it daemonizes.
 *
 * On error, C<-1> is returned; C<passt_pid> and C<sockpath> are not modified.
 *
 * On success, C<0> is returned.  C<passt_pid> contains the PID of the passt
 * background process.  C<sockpath> contains the pathname of the unix domain
 * socket where passt will accept a single connection.
*/
static int
launch_passt (guestfs_h *g, long *passt_pid, char (*sockpath)[UNIX_PATH_MAX])
{
  int ret = -1;
  char local_sock[sizeof *sockpath];
  char *pidfile;
  struct command *cmd;
  int status;
  char *pid_text;
  char *end;
  long pid;
  int parsed_ok;

  if (guestfs_int_create_socketname (g, "passt.sock", &local_sock) == -1)
    return -1;

  pidfile = guestfs_int_make_pid_path (g, "passt");
  if (pidfile == NULL)
    return -1;

  cmd = guestfs_int_new_command (g);
  if (cmd == NULL)
    goto out_pidfile;

  /* Build the passt command line, in this exact order. */
  {
    const char *const passt_argv[] = {
      "passt",
      "--one-off",
      "--socket",   local_sock,
      "--pid",      pidfile,
      "--address",  NETWORK_ADDRESS,
      "--netmask",  NETWORK_PREFIX,
      "--mac-addr", NETWORK_GW_MAC,
      "--gateway",  NETWORK_GW_IP,
    };
    size_t k;

    for (k = 0; k < sizeof passt_argv / sizeof passt_argv[0]; ++k)
      guestfs_int_cmd_add_arg (cmd, passt_argv[k]);
  }

  status = guestfs_int_cmd_run (cmd);
  if (status == -1)
    /* guestfs_int_cmd_run() reports errors internally, so just bail here */
    goto out_cmd;

  if (WIFSIGNALED (status)) {
    error (g, _("passt was killed with signal %d"), WTERMSIG (status));
    goto out_cmd;
  }

  if (!WIFEXITED (status)) {
    error (g, _("internal error: unexpected exit status from passt (%d)"),
           status);
    goto out_cmd;
  }

  if (WEXITSTATUS (status) != 0) {
    error (g, _("passt exited with status %d"), WEXITSTATUS (status));
    goto out_cmd;
  }

  /* passt has now forked into the background, dropped privileges, and
   * written a PID file.  Because of "--one-off" it exits by itself once
   * our QEMU appliance disappears (forcibly or cleanly).  The PID is
   * still needed *temporarily*, so that passt can be killed if an error
   * occurs *before* the appliance is started.
   */
  if (guestfs_int_read_whole_file (g, pidfile, &pid_text, NULL) == -1)
    /* Already reported internally.  passt cannot be killed here because
     * its PID could not be obtained in the first place.
     */
    goto out_cmd;

  /* Accept a decimal number greater than 1, optionally followed by a
   * single newline.
   */
  errno = 0;
  pid = strtol (pid_text, &end, 10);
  parsed_ok = end != pid_text
    && (*end == '\0' || *end == '\n')
    && errno == 0
    && pid > 1;
  if (!parsed_ok) {
    /* Same thing, we can't kill passt just yet. */
    error (g, _("failed to parse passt PID from '%s'"), pidfile);
    goto out_pid_text;
  }

  /* Success: only now are the output parameters written. */
  *passt_pid = pid;
  ignore_value (strcpy (*sockpath, local_sock));
  ret = 0;

 out_pid_text:
  free (pid_text);

 out_cmd:
  guestfs_int_cmd_close (cmd);

 out_pidfile:
  free (pidfile);

  return ret;
}

/* Pick a default, arch-specific qemu.
*/ static const char * get_default_hv_direct (guestfs_h *g, void *datav) { if (host_cpu[0] == 'i' && strchr ("3456", host_cpu[1]) && host_cpu[2] == '8' && host_cpu[3] == '6' && host_cpu[4] == '\0') return "qemu-system-i386"; else if (STRPREFIX (host_cpu, "arm")) return "qemu-system-arm"; else if (STREQ (host_cpu, "powerpc64") || STREQ (host_cpu, "powerpc64le") || STREQ (host_cpu, "ppc64le")) return "qemu-system-ppc64"; else return "qemu-system-" host_cpu; } static int launch_direct (guestfs_h *g, void *datav, const char *arg) { struct backend_direct_data *data = datav; struct qemuopts *qopts = NULL; int daemon_accept_sock = -1, console_sock = -1; int r; long passt_pid = -1; int flags; int sv[2]; struct sockaddr_un addr; CLEANUP_FREE char *uefi_code = NULL, *uefi_vars = NULL; int uefi_flags; CLEANUP_FREE char *kernel = NULL, *initrd = NULL, *appliance = NULL; int has_appliance_drive; uint32_t size; CLEANUP_FREE void *buf = NULL; struct hv_param *hp; int has_kvm; int force_tcg; int force_kvm; const char *accel_val = "kvm:tcg"; const char *cpu_model; CLEANUP_FREE char *append = NULL; CLEANUP_FREE_STRING_LIST char **argv = NULL; CLEANUP_FREE_STRING_LIST char **env = NULL; if (!g->nr_drives) { error (g, _("you must call guestfs_add_drive before guestfs_launch")); return -1; } guestfs_int_launch_send_progress (g, 0); /* Locate and/or build the appliance. */ if (guestfs_int_build_appliance (g, &kernel, &initrd, &appliance) == -1) return -1; has_appliance_drive = appliance != NULL; guestfs_int_launch_send_progress (g, 3); data->qemu = g->hv ? : get_default_hv_direct (g, data); debug (g, "chosen qemu: %s", data->qemu); debug (g, "begin testing qemu features"); /* If debugging, print the qemu version. 
*/ if (g->verbose) { CLEANUP_CMD_CLOSE struct command *cmd = guestfs_int_new_command (g); guestfs_int_cmd_add_arg (cmd, data->qemu); guestfs_int_cmd_add_arg (cmd, "-version"); guestfs_int_cmd_set_stdout_callback (cmd, debug_lines, NULL, 0); guestfs_int_cmd_run (cmd); } /* Work out if KVM is supported or if the user wants to force TCG. */ if ((has_kvm = guestfs_int_platform_has_kvm (g, data->qemu)) == -1) goto cleanup0; debug (g, "qemu KVM: %s", has_kvm ? "enabled" : "disabled"); force_tcg = guestfs_int_get_backend_setting_bool (g, "force_tcg"); if (force_tcg == -1) return -1; else if (force_tcg) accel_val = "tcg"; force_kvm = guestfs_int_get_backend_setting_bool (g, "force_kvm"); if (force_kvm == -1) return -1; else if (force_kvm) accel_val = "kvm"; if (force_kvm && force_tcg) { error (g, "Both force_kvm and force_tcg backend settings supplied."); return -1; } if (!has_kvm) { if (!force_tcg) debian_kvm_warning (g); if (force_kvm) { error (g, "force_kvm supplied but kvm not available."); return -1; } } /* Using virtio-serial, we need to create a local Unix domain socket * for qemu to connect to. */ if (guestfs_int_create_socketname (g, "guestfsd.sock", &data->guestfsd_sock) == -1) goto cleanup0; daemon_accept_sock = socket (AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); if (daemon_accept_sock == -1) { perrorf (g, "socket"); goto cleanup0; } addr.sun_family = AF_UNIX; strncpy (addr.sun_path, data->guestfsd_sock, UNIX_PATH_MAX); addr.sun_path[UNIX_PATH_MAX-1] = '\0'; if (bind (daemon_accept_sock, (struct sockaddr *) &addr, sizeof addr) == -1) { perrorf (g, "bind"); goto cleanup0; } if (listen (daemon_accept_sock, 1) == -1) { perrorf (g, "listen"); goto cleanup0; } if (!g->direct_mode) { if (socketpair (AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0, sv) == -1) { perrorf (g, "socketpair"); goto cleanup0; } } debug (g, "finished testing qemu features"); /* Construct the qemu command line. 
We have to do this before * forking, because after fork we are not allowed to use * non-signal-safe functions such as malloc. */ qopts = qemuopts_create (); if (qopts == NULL) { qemuopts_error: perrorf (g, "qemuopts"); goto cleanup0; } if (qemuopts_set_binary (qopts, data->qemu) == -1) goto qemuopts_error; /* CVE-2011-4127 mitigation: Disable SCSI ioctls on virtio-blk * devices. */ arg ("-global", VIRTIO_DEVICE_NAME ("virtio-blk") ".scsi=off"); /* Disable qemu defaults and per-user configuration file so we get * an unconfigured qemu. */ flag ("-no-user-config"); flag ("-nodefaults"); /* This disables the host-side display (SDL, Gtk). */ arg ("-display", "none"); /* See guestfs.pod / gdb */ if (guestfs_int_get_backend_setting_bool (g, "gdb") > 0) { flag ("-S"); flag ("-s"); warning (g, "qemu debugging is enabled, connect gdb to tcp::1234 to begin"); } start_list ("-machine") { #ifdef MACHINE_TYPE append_list (MACHINE_TYPE); #endif #ifdef __aarch64__ if (has_kvm && !force_tcg) append_list ("gic-version=host"); #endif append_list_format ("accel=%s", accel_val); #if defined(__i386__) || defined(__x86_64__) /* Tell seabios to send debug messages to the serial port. * This used to be done by sgabios. */ if (g->verbose) append_list ("graphics=off"); #endif } end_list (); cpu_model = guestfs_int_get_cpu_model (has_kvm && !force_tcg); if (cpu_model) arg ("-cpu", cpu_model); if (g->smp > 1) arg_format ("-smp", "%d", g->smp); arg_format ("-m", "%d", g->memsize); /* Force exit instead of reboot on panic */ flag ("-no-reboot"); /* These are recommended settings, see RHBZ#1053847. */ arg ("-rtc", "driftfix=slew"); #if defined(__i386__) || defined(__x86_64__) arg ("-global", "kvm-pit.lost_tick_policy=discard"); #endif /* UEFI (firmware) if required. */ if (guestfs_int_get_uefi (g, NULL, NULL, &uefi_code, &uefi_vars, &uefi_flags) == -1) goto cleanup0; if (uefi_flags & UEFI_FLAG_SECURE_BOOT_REQUIRED) { /* Implementing this requires changes to the qemu command line. 
* See RHBZ#1367615 for details. As the guestfs_int_get_uefi * function is only implemented for aarch64, and UEFI secure boot * is some way off on aarch64 (2017/2018), we only need to worry * about this later. */ error (g, "internal error: direct backend " "does not implement UEFI secure boot, " "see comments in the code"); goto cleanup0; } if (uefi_code) { start_list ("-drive") { append_list ("if=pflash"); append_list ("format=raw"); append_list_format ("file=%s", uefi_code); append_list ("readonly"); } end_list (); if (uefi_vars) { start_list ("-drive") { append_list ("if=pflash"); append_list ("format=raw"); append_list_format ("file=%s", uefi_vars); } end_list (); } } /* Kernel and initrd. */ arg ("-kernel", kernel); arg ("-initrd", initrd); /* Add a good source of entropy, eg for cryptographic operations. */ start_list ("-object") { append_list ("rng-random"); append_list ("filename=/dev/urandom"); append_list ("id=rng0"); } end_list (); start_list ("-device") { append_list (VIRTIO_DEVICE_NAME ("virtio-rng")); append_list ("rng=rng0"); } end_list (); /* Create the virtio-scsi bus. */ start_list ("-device") { append_list (VIRTIO_DEVICE_NAME ("virtio-scsi")); append_list ("id=scsi"); } end_list (); /* Add drives (except for the appliance drive). */ if (add_drives (g, data, qopts) == -1) goto cleanup0; /* Add the ext2 appliance drive (after all the drives). */ if (has_appliance_drive) { start_list ("-drive") { append_list_format ("file=%s", appliance); append_list ("snapshot=on"); append_list ("id=appliance"); append_list ("cache=unsafe"); append_list ("if=none"); #ifndef APPLIANCE_FORMAT_AUTO append_list ("format=raw"); #endif } end_list (); start_list ("-device") { append_list ("scsi-hd"); append_list ("drive=appliance"); } end_list (); } /* Create the virtio serial bus. */ arg ("-device", VIRTIO_DEVICE_NAME ("virtio-serial")); /* Create the serial console. 
*/ #ifndef __s390x__ arg ("-serial", "stdio"); #else start_list ("-chardev") { append_list ("stdio"); append_list ("id=charconsole0"); } end_list (); start_list ("-device") { append_list ("sclpconsole"); append_list ("chardev=charconsole0"); } end_list (); #endif /* Set up virtio-serial for the communications channel. */ start_list ("-chardev") { append_list ("socket"); append_list_format ("path=%s", data->guestfsd_sock); append_list ("id=channel0"); } end_list (); start_list ("-device") { append_list ("virtserialport"); append_list ("chardev=channel0"); append_list ("name=org.libguestfs.channel.0"); } end_list (); /* Enable user networking. */ if (g->enable_network) { /* If passt is available, ask for passt rather than SLIRP (RHBZ#2184967) */ if (guestfs_int_passt_runnable (g)) { char passt_sock[UNIX_PATH_MAX]; if (launch_passt (g, &passt_pid, &passt_sock) == -1) goto cleanup0; start_list ("-netdev") { append_list ("stream"); append_list ("id=usernet"); append_list ("addr.type=unix"); append_list_format ("addr.path=%s", passt_sock); } end_list (); } else { start_list ("-netdev") { append_list ("user"); append_list ("id=usernet"); append_list ("net=" NETWORK_ADDRESS "/" NETWORK_PREFIX); } end_list (); } start_list ("-device") { append_list (VIRTIO_DEVICE_NAME ("virtio-net")); append_list ("netdev=usernet"); } end_list (); } flags = 0; if (!has_kvm || force_tcg) flags |= APPLIANCE_COMMAND_LINE_IS_TCG; append = guestfs_int_appliance_command_line (g, appliance, flags); arg ("-append", append); /* Note: custom command line parameters must come last so that * qemu -set parameters can modify previously added options. */ /* Add any qemu parameters. */ for (hp = g->hv_params; hp; hp = hp->next) { if (!hp->hv_value) flag (hp->hv_param); else arg_noquote (hp->hv_param, hp->hv_value); } /* Get the argv list from the command line. */ argv = qemuopts_to_argv (qopts); /* Create the environ for the child process. 
*/ env = guestfs_int_copy_environ (environ, "LC_ALL", "C", /* Prevents qemu opening /dev/dsp */ "QEMU_AUDIO_DRV", "none", NULL); if (env == NULL) goto cleanup0; r = fork (); if (r == -1) { perrorf (g, "fork"); if (!g->direct_mode) { close (sv[0]); close (sv[1]); } goto cleanup0; } if (r == 0) { /* Child (qemu). */ if (!g->direct_mode) { /* Set up stdin, stdout, stderr. */ close (0); close (1); close (sv[0]); /* We set the FD_CLOEXEC flag on the socket above, but now (in * the child) it's safe to unset this flag so qemu can use the * socket. */ set_cloexec_flag (sv[1], 0); /* Stdin. */ if (dup (sv[1]) == -1) { dup_failed: perror ("dup failed"); _exit (EXIT_FAILURE); } /* Stdout. */ if (dup (sv[1]) == -1) goto dup_failed; /* Particularly since qemu 0.15, qemu spews all sorts of debug * information on stderr. It is useful to both capture this and * not confuse casual users, so send stderr to the pipe as well. */ close (2); if (dup (sv[1]) == -1) goto dup_failed; close (sv[1]); /* Close any other file descriptors that we don't want to pass * to qemu. This prevents file descriptors which didn't have * O_CLOEXEC set properly from leaking into the subprocess. See * RHBZ#1123007. */ close_file_descriptors (fd > 2); } /* Unblock the SIGTERM signal since we will need to send that to * the subprocess (RHBZ#1460338). */ guestfs_int_unblock_sigterm (); /* Dump the command line (after setting up stderr above). */ if (g->verbose) qemuopts_to_channel (qopts, stderr); /* Put qemu in a new process group. */ if (g->pgroup) setpgid (0, 0); execvpe (data->qemu, argv, env); /* Run qemu. */ perror (data->qemu); _exit (EXIT_FAILURE); } /* Parent (library). */ data->pid = r; qemuopts_free (qopts); qopts = NULL; /* Fork the recovery process off which will kill qemu if the parent * process fails to do so (eg. if the parent segfaults). 
*/ data->recoverypid = -1; if (g->recovery_proc) { r = fork (); if (r == 0) { size_t i; struct sigaction sa; pid_t qemu_pid = data->pid; pid_t parent_pid = getppid (); /* Remove all signal handlers. See the justification here: * https://www.redhat.com/archives/libvir-list/2008-August/msg00303.html * We don't mask signal handlers yet, so this isn't completely * race-free, but better than not doing it at all. */ memset (&sa, 0, sizeof sa); sa.sa_handler = SIG_DFL; sa.sa_flags = 0; sigemptyset (&sa.sa_mask); for (i = 1; i < NSIG; ++i) sigaction (i, &sa, NULL); /* Close all other file descriptors. This ensures that we don't * hold open (eg) pipes from the parent process. */ close_file_descriptors (1); /* Unblock the SIGTERM signal since we will need to respond to * SIGTERM from the parent (RHBZ#1460338). */ guestfs_int_unblock_sigterm (); /* It would be nice to be able to put this in the same process * group as qemu (ie. setpgid (0, qemu_pid)). However this is * not possible because we don't have any guarantee here that * the qemu process has started yet. */ if (g->pgroup) setpgid (0, 0); /* Writing to argv is hideously complicated and error prone. See: * http://git.postgresql.org/gitweb/?p=postgresql.git;a=blob;f=src/backend/utils/misc/ps_status.c;hb=HEAD */ /* Loop around waiting for one or both of the other processes to * disappear. It's fair to say this is very hairy. The PIDs that * we are looking at might be reused by another process. We are * effectively polling. Is the cure worse than the disease? */ for (;;) { if (kill (qemu_pid, 0) == -1) /* qemu's gone away, we aren't needed */ _exit (EXIT_SUCCESS); if (kill (parent_pid, 0) == -1) { /* Parent's gone away, qemu still around, so kill qemu. */ kill (qemu_pid, 9); _exit (EXIT_SUCCESS); } sleep (2); } } /* Don't worry, if the fork failed, this will be -1. The recovery * process isn't essential. */ data->recoverypid = r; } if (!g->direct_mode) { /* Close the other end of the socketpair. 
*/ close (sv[1]); console_sock = sv[0]; /* stdin of child */ sv[0] = -1; } g->state = LAUNCHING; /* Wait for qemu to start and to connect back to us via * virtio-serial and send the GUESTFS_LAUNCH_FLAG message. */ g->conn = guestfs_int_new_conn_socket_listening (g, daemon_accept_sock, console_sock); if (!g->conn) goto cleanup1; /* g->conn now owns these sockets. */ daemon_accept_sock = console_sock = -1; r = g->conn->ops->accept_connection (g, g->conn); if (r == -1) goto cleanup1; if (r == 0) { guestfs_int_launch_failed_error (g); goto cleanup1; } /* NB: We reach here just because qemu has opened the socket. It * does not mean the daemon is up until we read the * GUESTFS_LAUNCH_FLAG below. Failures in qemu startup can still * happen even if we reach here, even early failures like not being * able to open a drive. */ r = guestfs_int_recv_from_daemon (g, &size, &buf); if (r == -1) { guestfs_int_launch_failed_error (g); goto cleanup1; } if (size != GUESTFS_LAUNCH_FLAG) { guestfs_int_launch_failed_error (g); goto cleanup1; } debug (g, "appliance is up"); /* From this point onward, even if we fail, QEMU terminating (forcefully or * gracefully) will cause passt to go away as well. Note that we can't * precisely tell whether QEMU managed to open the passt socket before QEMU * failed. Therefore, err on the side of killing passt needlessly, rather * than not killing it when needed -- that's why we re-set "passt_pid" to (-1) * only this late during QEMU startup verification. */ passt_pid = -1; /* This is possible in some really strange situations, such as * guestfsd starts up OK but then qemu immediately exits. Check for * it because the caller is probably expecting to be able to send * commands after this function returns. 
*/ if (g->state != READY) { error (g, _("qemu launched and contacted daemon, but state != READY")); goto cleanup1; } guestfs_int_launch_send_progress (g, 12); if (has_appliance_drive) guestfs_int_add_dummy_appliance_drive (g); return 0; cleanup1: if (!g->direct_mode && sv[0] >= 0) close (sv[0]); if (data->pid > 0) kill (data->pid, 9); if (data->recoverypid > 0) kill (data->recoverypid, 9); if (data->pid > 0) guestfs_int_waitpid_noerror (data->pid); if (data->recoverypid > 0) guestfs_int_waitpid_noerror (data->recoverypid); data->pid = 0; data->recoverypid = 0; memset (&g->launch_t, 0, sizeof g->launch_t); cleanup0: if (passt_pid != -1) kill (passt_pid, SIGTERM); if (qopts != NULL) qemuopts_free (qopts); if (daemon_accept_sock >= 0) close (daemon_accept_sock); if (console_sock >= 0) close (console_sock); if (g->conn) { g->conn->ops->free_connection (g, g->conn); g->conn = NULL; } g->state = CONFIG; return -1; } static int shutdown_direct (guestfs_h *g, void *datav, int check_for_errors) { struct backend_direct_data *data = datav; int ret = 0; int status; struct rusage rusage; /* Signal qemu to shutdown cleanly, and kill the recovery process. */ if (data->pid > 0) { debug (g, "sending SIGTERM to process %d", data->pid); kill (data->pid, SIGTERM); } if (data->recoverypid > 0) kill (data->recoverypid, 9); /* Wait for subprocess(es) to exit. */ if (g->recovery_proc /* RHBZ#998482 */ && data->pid > 0) { if (guestfs_int_wait4 (g, data->pid, &status, &rusage, "qemu") == -1) ret = -1; else if (!WIFEXITED (status) || WEXITSTATUS (status) != 0) { guestfs_int_external_command_failed (g, status, data->qemu, NULL); ret = -1; } else /* Print the actual memory usage of qemu, useful for seeing * if techniques like DAX are having any effect. 
*/ debug (g, "qemu maxrss %ldK", rusage.ru_maxrss); } if (data->recoverypid > 0) guestfs_int_waitpid_noerror (data->recoverypid); data->pid = data->recoverypid = 0; if (data->guestfsd_sock[0] != '\0') { unlink (data->guestfsd_sock); data->guestfsd_sock[0] = '\0'; } return ret; } static int get_pid_direct (guestfs_h *g, void *datav) { struct backend_direct_data *data = datav; if (data->pid > 0) return data->pid; else { error (g, "get_pid: no qemu subprocess"); return -1; } } /* Maximum number of disks. */ static int max_disks_direct (guestfs_h *g, void *datav) { return 255; } static struct backend_ops backend_direct_ops = { .data_size = sizeof (struct backend_direct_data), .create_cow_overlay = create_cow_overlay_direct, .get_default_hv = get_default_hv_direct, .launch = launch_direct, .shutdown = shutdown_direct, .get_pid = get_pid_direct, .max_disks = max_disks_direct, }; void guestfs_int_init_direct_backend (void) { guestfs_int_register_backend ("direct", &backend_direct_ops); }