Files
libguestfs/lib/launch-direct.c
Richard W.M. Jones dce5ce605d lib/launch-direct.c: Set lost_tick_policy=discard unconditionally
We already assume that the qemu version is much greater than 1.3.0, so we
do not need this qemu version test.
2025-09-29 16:49:03 +01:00

1139 lines
33 KiB
C

/* libguestfs
* Copyright (C) 2009-2025 Red Hat Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* Implementation of the C<direct> backend.
*
* For more details see L<guestfs(3)/BACKENDS>.
*/
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <signal.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <assert.h>
#include <string.h>
#include <libintl.h>
#include "cloexec.h"
#include "guestfs.h"
#include "guestfs-internal.h"
#include "guestfs_protocol.h"
#include "qemuopts.h"
#include "ignore-value.h"
/* Per-handle data for the direct backend.  The size of this struct is
 * given in backend_direct_ops.data_size, and a pointer to it is passed
 * to the backend callbacks as their "datav" argument.
 */
struct backend_direct_data {
pid_t pid; /* Qemu PID: set after fork in launch_direct, reset to 0 on failure/shutdown. */
pid_t recoverypid; /* Recovery process PID: -1 if it was not forked, 0 after cleanup. */
struct version qemu_version; /* qemu version (0 if unable to parse). */
struct qemu_data *qemu_data; /* qemu -help output etc.; NULL until launch_direct probes qemu. */
char guestfsd_sock[UNIX_PATH_MAX]; /* Path to daemon socket; unlinked and emptied at shutdown. */
};
/**
 * Create a temporary qcow2 overlay whose backing file is the source
 * of C<drv>.
 *
 * The backing format is only passed on when the drive has an explicit
 * source format.
 *
 * Returns the newly allocated overlay path (the caller takes ownership
 * and stores it in the handle), or C<NULL> on error.
 */
static char *
create_cow_overlay_direct (guestfs_h *g, void *datav, struct drive *drv)
{
  CLEANUP_FREE char *backing = NULL;
  struct guestfs_disk_create_argv create_args;
  char *overlay_path;

  backing = guestfs_int_drive_source_qemu_param (g, &drv->src);
  if (backing == NULL)
    return NULL;

  overlay_path = guestfs_int_make_temp_path (g, "overlay", "qcow2");
  if (overlay_path == NULL)
    return NULL;

  create_args.bitmask = GUESTFS_DISK_CREATE_BACKINGFILE_BITMASK;
  create_args.backingfile = backing;
  if (drv->src.format != NULL) {
    create_args.backingformat = drv->src.format;
    create_args.bitmask |= GUESTFS_DISK_CREATE_BACKINGFORMAT_BITMASK;
  }

  if (guestfs_disk_create_argv (g, overlay_path, "qcow2", -1,
                                &create_args) != -1)
    return overlay_path;        /* Ownership passes to the caller. */

  /* Creation failed: the path string is still ours to release. */
  free (overlay_path);
  return NULL;
}
/* On Debian, /dev/kvm is mode 0660 and group kvm, so users need to
 * add themselves to the kvm group otherwise things are going to be
 * very slow (this is Debian bug 640328).
 *
 * Emit a warning when /dev/kvm has mode 0660, is not owned by the
 * effective user, and its group is neither the effective group nor
 * one of the supplementary groups.  Does nothing for root, and
 * nothing at all on non-Linux builds.
 */
static void
debian_kvm_warning (guestfs_h *g)
{
#ifdef __linux__
  const uid_t my_uid = geteuid ();
  const gid_t my_gid = getegid ();
  struct stat kvm_stat;
  CLEANUP_FREE gid_t *supplementary = NULL;
  int nr_supplementary;
  int k;

  /* Root can always open the device. */
  if (my_uid == 0)
    return;

  /* Only the "0660, owned by someone else" layout is of interest. */
  if (stat ("/dev/kvm", &kvm_stat) == -1)
    return;
  if ((kvm_stat.st_mode & 0777) != 0660)
    return;
  if (my_uid == kvm_stat.st_uid)
    return;

  /* Primary group matches the device group? */
  if (my_gid == kvm_stat.st_gid)
    return;

  /* Otherwise look through the supplementary groups. */
  nr_supplementary = getgroups (0, NULL);
  if (nr_supplementary > 0) {
    supplementary = safe_malloc (g, nr_supplementary * sizeof (gid_t));
    if (getgroups (nr_supplementary, supplementary) == -1) {
      warning (g, "getgroups: %m (ignored)");
      return;
    }
    for (k = 0; k < nr_supplementary; ++k) {
      if (supplementary[k] == kvm_stat.st_gid)
        return;
    }
  }

  /* Not a member.  Note that \n characters cannot appear in warnings. */
  warning (g,
           _("current user is not a member of the KVM group (group ID %d). "
             "This user cannot access /dev/kvm, so libguestfs may run very slowly. "
             "It is recommended that you 'chmod 0666 /dev/kvm' or add the current user "
             "to the KVM group (you might need to log out and log in again)."),
           (int) kvm_stat.st_gid);
#endif /* __linux__ */
}
/* Some macros which make using qemuopts a bit easier.
 *
 * Every macro below operates on a variable named qopts, which must be
 * in scope where the macro is used, and jumps to a label named
 * qemuopts_error (which the enclosing function must define) when the
 * wrapped qemuopts_* call returns -1.
 */
/* Wraps qemuopts_add_flag: add an option that has no value. */
#define flag(flag) \
do { \
if (qemuopts_add_flag (qopts, (flag)) == -1) goto qemuopts_error; \
} while (0)
/* Wraps qemuopts_add_arg: add an option together with its value. */
#define arg(flag, value) \
do { \
if (qemuopts_add_arg (qopts, (flag), (value)) == -1) goto qemuopts_error; \
} while (0)
/* Wraps qemuopts_add_arg_format: as arg(), with the value given as a
 * format string plus arguments.
 */
#define arg_format(flag, fs, ...) \
do { \
if (qemuopts_add_arg_format (qopts, (flag), (fs), ##__VA_ARGS__) == -1) \
goto qemuopts_error; \
} while (0)
/* Wraps qemuopts_add_arg_noquote (presumably the value is passed
 * through without quoting -- confirm against the qemuopts API).
 */
#define arg_noquote(flag, value) \
do { \
if (qemuopts_add_arg_noquote (qopts, (flag), (value)) == -1) \
goto qemuopts_error; \
} while (0)
/* Wraps qemuopts_start_arg_list.  The expansion ends in a bare "do",
 * so it must be followed by a { ... } body and then end_list (),
 * which supplies the matching "while (0);".
 */
#define start_list(flag) \
if (qemuopts_start_arg_list (qopts, (flag)) == -1) goto qemuopts_error; \
do
/* Wraps qemuopts_append_arg_list: append one element to the open list. */
#define append_list(value) \
do { \
if (qemuopts_append_arg_list (qopts, (value)) == -1) \
goto qemuopts_error; \
} while (0)
/* Wraps qemuopts_append_arg_list_format: as append_list(), with the
 * element given as a format string plus arguments.
 */
#define append_list_format(fs, ...) \
do { \
if (qemuopts_append_arg_list_format (qopts, (fs), ##__VA_ARGS__) == -1) \
goto qemuopts_error; \
} while (0)
/* Closes the "do" opened by start_list and then wraps
 * qemuopts_end_arg_list.
 */
#define end_list() \
while (0); \
do { \
if (qemuopts_end_arg_list (qopts) == -1) goto qemuopts_error; \
} while (0)
/**
 * Add the standard elements of the C<-drive> parameter.
 *
 * This must be called between C<start_list> and C<end_list>, since it
 * only appends elements to the argument list that is currently open
 * in C<qopts>.
 *
 * C<i> is the drive index, used to build the C<id=hdN> element.
 * C<data> is not used by this function.
 *
 * Returns C<0> on success or C<-1> on error.
 */
static int
add_drive_standard_params (guestfs_h *g, struct backend_direct_data *data,
                           struct qemuopts *qopts,
                           size_t i, struct drive *drv)
{
  if (!drv->overlay) {
    CLEANUP_FREE char *file = NULL;

    /* file= parameter. */
    file = guestfs_int_drive_source_qemu_param (g, &drv->src);
    /* The helper can fail (create_cow_overlay_direct checks for this
     * too).  Passing NULL to a %s conversion would be undefined
     * behaviour, so stop here.  No extra message is added, matching
     * create_cow_overlay_direct; the helper is assumed to have
     * reported the error already -- TODO confirm.
     */
    if (file == NULL)
      return -1;
    append_list_format ("file=%s", file);

    if (drv->readonly)
      append_list ("snapshot=on");
    append_list_format ("cache=%s",
                        drv->cachemode ? drv->cachemode : "writeback");
    if (drv->src.format)
      append_list_format ("format=%s", drv->src.format);
    if (drv->copyonread)
      append_list ("copy-on-read=on");

    /* Discard mode. */
    switch (drv->discard) {
    case discard_disable:
      /* Since the default is always discard=ignore, don't specify it
       * on the command line.
       */
      break;
    case discard_enable:
      if (!guestfs_int_discard_possible (g, drv))
        return -1;
      /*FALLTHROUGH*/
    case discard_besteffort:
      append_list ("discard=unmap");
    }
  }
  else {
    /* Writable qcow2 overlay on top of read-only drive.
     *
     * Add the file-specific locking option only for files, as
     * qemu won't accept options unknown to the block driver in
     * use.
     */
    if (drv->src.protocol == drive_protocol_file) {
      append_list_format ("file.file.filename=%s", drv->overlay);
      append_list ("file.driver=qcow2");
      append_list ("file.backing.file.locking=off");
    }
    else {
      /* Ancient qemu (esp. qemu 1.5 in RHEL 7) didn't understand the
       * file.file.filename= parameter, so use the safer old-style
       * form of parameters unless we actually want to specify the
       * locking flag above.
       */
      append_list_format ("file=%s", drv->overlay);
      append_list ("format=qcow2");
    }
    append_list ("cache=unsafe");
  }

  append_list_format ("id=hd%zu", i);

  return 0;

  /* This label is called implicitly from the qemuopts macros on error. */
 qemuopts_error:
  perrorf (g, "qemuopts");
  return -1;
}
/**
 * Append C<physical_block_size> and C<logical_block_size> elements to
 * the C<-device> list currently open in C<qopts>.
 *
 * Both elements carry the same value, C<drv-E<gt>blocksize>; nothing
 * is appended when that field is zero.
 *
 * Returns C<0> on success or C<-1> on error.
 */
static int
add_device_blocksize_params (guestfs_h *g, struct qemuopts *qopts,
                             struct drive *drv)
{
  /* No block size set on the drive: nothing to append. */
  if (!drv->blocksize)
    return 0;

  append_list_format ("physical_block_size=%d", drv->blocksize);
  append_list_format ("logical_block_size=%d", drv->blocksize);
  return 0;

  /* Target of the qemuopts macros when a call fails. */
 qemuopts_error:
  perrorf (g, "qemuopts");
  return -1;
}
/**
 * Emit the C<-drive> / C<-device> pair for a single drive.
 *
 * The C<-drive> list gets the standard parameters plus C<if=none>;
 * the C<-device> list is a C<scsi-hd> device referring to that drive
 * as C<hdN> (N being C<i>), with an optional serial taken from the
 * disk label and optional block size settings.
 *
 * Returns C<0> on success or C<-1> on error.
 */
static int
add_drive (guestfs_h *g, struct backend_direct_data *data,
           struct qemuopts *qopts, size_t i, struct drive *drv)
{
  /* Block backend. */
  start_list ("-drive") {
    const int rc = add_drive_standard_params (g, data, qopts, i, drv);

    if (rc == -1)
      return -1;
    append_list ("if=none");
  } end_list ();

  /* Guest-visible disk that refers to the backend above. */
  start_list ("-device") {
    int rc;

    append_list ("scsi-hd");
    append_list_format ("drive=hd%zu", i);
    if (drv->disk_label != NULL) {
      append_list_format ("serial=%s", drv->disk_label);
    }
    rc = add_device_blocksize_params (g, qopts, drv);
    if (rc == -1)
      return -1;
  } end_list ();

  return 0;

  /* Target of the qemuopts macros when a call fails. */
 qemuopts_error:
  perrorf (g, "qemuopts");
  return -1;
}
/**
 * Emit command line parameters for every drive visited by
 * C<ITER_DRIVES>, stopping at the first failure.
 *
 * Returns C<0> on success or C<-1> on error.
 */
static int
add_drives (guestfs_h *g, struct backend_direct_data *data,
            struct qemuopts *qopts)
{
  struct drive *d;
  size_t idx;

  ITER_DRIVES (g, idx, d) {
    const int rc = add_drive (g, data, qopts, idx, d);

    if (rc == -1)
      return -1;
  }

  return 0;
}
/**
 * Launch passt such that it daemonizes.
 *
 * On error, C<-1> is returned; C<passt_pid> and C<sockpath> are not modified.
 *
 * On success, C<0> is returned.  C<passt_pid> contains the PID of the passt
 * background process.  C<sockpath> contains the pathname of the unix domain
 * socket where passt will accept a single connection.
 */
static int
launch_passt (guestfs_h *g, long *passt_pid, char (*sockpath)[UNIX_PATH_MAX])
{
  int ret = -1;
  char sock_tmp[sizeof *sockpath];
  char *pidfile;
  struct command *passt_cmd;
  int wstatus;
  int exit_code;
  char *pid_text;
  char *tail;
  long pid_val;
  bool parsed_ok;
  size_t k;

  if (guestfs_int_create_socketname (g, "passt.sock", &sock_tmp) == -1)
    return ret;

  pidfile = guestfs_int_make_pid_path (g, "passt");
  if (!pidfile)
    return ret;

  passt_cmd = guestfs_int_new_command (g);
  if (!passt_cmd)
    goto out_pidfile;

  {
    /* The complete passt command line, in order. */
    const char *const passt_args[] = {
      "passt",
      "--one-off",
      "--socket",   sock_tmp,
      "--pid",      pidfile,
      "--address",  NETWORK_ADDRESS,
      "--netmask",  NETWORK_PREFIX,
      "--mac-addr", NETWORK_GW_MAC,
      "--gateway",  NETWORK_GW_IP,
    };

    for (k = 0; k < sizeof passt_args / sizeof passt_args[0]; ++k)
      guestfs_int_cmd_add_arg (passt_cmd, passt_args[k]);
  }

  wstatus = guestfs_int_cmd_run (passt_cmd);
  if (wstatus == -1) {
    /* guestfs_int_cmd_run() reports errors internally, so just bail here */
    goto out_cmd;
  }

  if (WIFSIGNALED (wstatus)) {
    error (g, _("passt was killed with signal %d"), WTERMSIG (wstatus));
    goto out_cmd;
  }

  if (!WIFEXITED (wstatus)) {
    error (g, _("internal error: unexpected exit status from passt (%d)"),
           wstatus);
    goto out_cmd;
  }

  exit_code = WEXITSTATUS (wstatus);
  if (exit_code != 0) {
    error (g, _("passt exited with status %d"), exit_code);
    goto out_cmd;
  }

  /* passt has now forked into the background, dropped privileges, and
   * written its PID file.  Because of "--one-off" it exits by itself once
   * the QEMU appliance disappears (forcibly or cleanly), but the PID is
   * still needed *temporarily* so that passt can be killed if an error
   * occurs *before* the appliance is started.
   */
  if (guestfs_int_read_whole_file (g, pidfile, &pid_text, NULL) == -1) {
    /* Any error has been reported internally, so just bail.  passt cannot
     * be killed here because its PID could not be obtained at all.
     */
    goto out_cmd;
  }

  /* Accept a decimal number, optionally followed by one newline, that
   * is greater than 1.
   */
  errno = 0;
  pid_val = strtol (pid_text, &tail, 10);
  parsed_ok = tail != pid_text
              && (*tail == '\0' || *tail == '\n')
              && errno == 0
              && pid_val > 1;
  if (!parsed_ok) {
    /* Same thing, we can't kill passt just yet. */
    error (g, _("failed to parse passt PID from '%s'"), pidfile);
    goto out_pid_text;
  }

  /* Success: only now are the output parameters written. */
  *passt_pid = pid_val;
  ignore_value (strcpy (*sockpath, sock_tmp));
  ret = 0;

 out_pid_text:
  free (pid_text);

 out_cmd:
  guestfs_int_cmd_close (passt_cmd);

 out_pidfile:
  free (pidfile);

  return ret;
}
/**
 * Launch the appliance by running qemu directly (the C<direct> backend).
 *
 * The steps are: build the appliance, probe qemu, create the daemon
 * socket (and, unless in direct mode, a socketpair for the console),
 * construct the qemu command line, fork qemu and optionally a
 * recovery process, then wait for the daemon to connect back and send
 * C<GUESTFS_LAUNCH_FLAG>.
 *
 * C<datav> is the per-handle C<struct backend_direct_data>.  C<arg> is
 * not used.
 *
 * Returns C<0> on success or C<-1> on error.  On the error paths that
 * go through the cleanup labels, any qemu and recovery processes that
 * were started are killed, the passt process (if one was started and
 * might still be needed) is sent SIGTERM, every socket still owned by
 * this function is closed, and the handle state is reset to C<CONFIG>.
 */
static int
launch_direct (guestfs_h *g, void *datav, const char *arg)
{
  struct backend_direct_data *data = datav;
  struct qemuopts *qopts = NULL;
  int daemon_accept_sock = -1, console_sock = -1;
  int r;
  long passt_pid = -1;
  int flags;
  /* Console socketpair.  -1 marks an end that is not open, so that
   * cleanup0 can close exactly the ends this function still owns.
   */
  int sv[2] = { -1, -1 };
  struct sockaddr_un addr;
  CLEANUP_FREE char *uefi_code = NULL, *uefi_vars = NULL;
  int uefi_flags;
  CLEANUP_FREE char *kernel = NULL, *initrd = NULL, *appliance = NULL;
  int has_appliance_drive;
  uint32_t size;
  CLEANUP_FREE void *buf = NULL;
  struct hv_param *hp;
  bool has_kvm;
  int force_tcg;
  int force_kvm;
  const char *accel_val = "kvm:tcg";
  const char *cpu_model;
  CLEANUP_FREE char *append = NULL;
  CLEANUP_FREE_STRING_LIST char **argv = NULL;
  CLEANUP_FREE_STRING_LIST char **env = NULL;

  if (!g->nr_drives) {
    error (g, _("you must call guestfs_add_drive before guestfs_launch"));
    return -1;
  }

  guestfs_int_launch_send_progress (g, 0);

  /* Locate and/or build the appliance. */
  if (guestfs_int_build_appliance (g, &kernel, &initrd, &appliance) == -1)
    return -1;
  has_appliance_drive = appliance != NULL;

  guestfs_int_launch_send_progress (g, 3);

  debug (g, "begin testing qemu features");

  /* Get qemu help text and version. */
  if (data->qemu_data == NULL) {
    data->qemu_data = guestfs_int_test_qemu (g);
    if (data->qemu_data == NULL)
      goto cleanup0;
    data->qemu_version = guestfs_int_qemu_version (g, data->qemu_data);
    debug (g, "qemu version: %d.%d",
           data->qemu_version.v_major, data->qemu_version.v_minor);
  }

  /* Work out if KVM is supported or if the user wants to force TCG. */
  has_kvm = guestfs_int_platform_has_kvm (g, data->qemu_data);
  debug (g, "qemu KVM: %s", has_kvm ? "enabled" : "disabled");

  force_tcg = guestfs_int_get_backend_setting_bool (g, "force_tcg");
  if (force_tcg == -1)
    return -1;
  else if (force_tcg)
    accel_val = "tcg";

  force_kvm = guestfs_int_get_backend_setting_bool (g, "force_kvm");
  if (force_kvm == -1)
    return -1;
  else if (force_kvm)
    accel_val = "kvm";

  if (force_kvm && force_tcg) {
    error (g, "Both force_kvm and force_tcg backend settings supplied.");
    return -1;
  }
  if (!has_kvm) {
    if (!force_tcg)
      debian_kvm_warning (g);
    if (force_kvm) {
      error (g, "force_kvm supplied but kvm not available.");
      return -1;
    }
  }

  /* Using virtio-serial, we need to create a local Unix domain socket
   * for qemu to connect to.
   */
  if (guestfs_int_create_socketname (g, "guestfsd.sock",
                                     &data->guestfsd_sock) == -1)
    goto cleanup0;

  daemon_accept_sock = socket (AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
  if (daemon_accept_sock == -1) {
    perrorf (g, "socket");
    goto cleanup0;
  }

  addr.sun_family = AF_UNIX;
  strncpy (addr.sun_path, data->guestfsd_sock, UNIX_PATH_MAX);
  addr.sun_path[UNIX_PATH_MAX-1] = '\0';

  if (bind (daemon_accept_sock, (struct sockaddr *) &addr,
            sizeof addr) == -1) {
    perrorf (g, "bind");
    goto cleanup0;
  }

  if (listen (daemon_accept_sock, 1) == -1) {
    perrorf (g, "listen");
    goto cleanup0;
  }

  if (!g->direct_mode) {
    if (socketpair (AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0, sv) == -1) {
      perrorf (g, "socketpair");
      /* Do not rely on sv[] being untouched after a failed call. */
      sv[0] = sv[1] = -1;
      goto cleanup0;
    }
  }

  debug (g, "finished testing qemu features");

  /* Construct the qemu command line.  We have to do this before
   * forking, because after fork we are not allowed to use
   * non-signal-safe functions such as malloc.
   */
  qopts = qemuopts_create ();
  if (qopts == NULL) {
  qemuopts_error:
    perrorf (g, "qemuopts");
    goto cleanup0;
  }
  if (qemuopts_set_binary (qopts, g->hv) == -1) goto qemuopts_error;

  /* CVE-2011-4127 mitigation: Disable SCSI ioctls on virtio-blk
   * devices.
   */
  arg ("-global", VIRTIO_DEVICE_NAME ("virtio-blk") ".scsi=off");

  /* Disable qemu defaults and per-user configuration file so we get
   * an unconfigured qemu.
   */
  flag ("-no-user-config");
  flag ("-nodefaults");

  /* This disables the host-side display (SDL, Gtk). */
  arg ("-display", "none");

  /* See guestfs.pod / gdb */
  if (guestfs_int_get_backend_setting_bool (g, "gdb") > 0) {
    flag ("-S");
    flag ("-s");
    warning (g, "qemu debugging is enabled, connect gdb to tcp::1234 to begin");
  }

  start_list ("-machine") {
#ifdef MACHINE_TYPE
    append_list (MACHINE_TYPE);
#endif
#ifdef __aarch64__
    if (has_kvm && !force_tcg)
      append_list ("gic-version=host");
#endif
    append_list_format ("accel=%s", accel_val);
#if defined(__i386__) || defined(__x86_64__)
    /* Tell seabios to send debug messages to the serial port.
     * This used to be done by sgabios.
     */
    if (g->verbose)
      append_list ("graphics=off");
#endif
  } end_list ();

  cpu_model = guestfs_int_get_cpu_model (has_kvm && !force_tcg);
  if (cpu_model) {
#if defined(__x86_64__)
    /* Temporary workaround for RHBZ#2082806 */
    if (STREQ (cpu_model, "max")) {
      start_list ("-cpu") {
        append_list (cpu_model);
        append_list ("la57=off");
      } end_list ();
    }
    else
#endif
      arg ("-cpu", cpu_model);
  }

  if (g->smp > 1)
    arg_format ("-smp", "%d", g->smp);

  arg_format ("-m", "%d", g->memsize);

  /* Force exit instead of reboot on panic */
  flag ("-no-reboot");

  /* These are recommended settings, see RHBZ#1053847. */
  arg ("-rtc", "driftfix=slew");
#if defined(__i386__) || defined(__x86_64__)
  arg ("-global", "kvm-pit.lost_tick_policy=discard");
#endif

  /* UEFI (firmware) if required. */
  if (guestfs_int_get_uefi (g, NULL, NULL, &uefi_code, &uefi_vars,
                            &uefi_flags) == -1)
    goto cleanup0;
  if (uefi_flags & UEFI_FLAG_SECURE_BOOT_REQUIRED) {
    /* Implementing this requires changes to the qemu command line.
     * See RHBZ#1367615 for details.  As the guestfs_int_get_uefi
     * function is only implemented for aarch64, and UEFI secure boot
     * is some way off on aarch64 (2017/2018), we only need to worry
     * about this later.
     */
    error (g, "internal error: direct backend "
           "does not implement UEFI secure boot, "
           "see comments in the code");
    goto cleanup0;
  }
  if (uefi_code) {
    start_list ("-drive") {
      append_list ("if=pflash");
      append_list ("format=raw");
      append_list_format ("file=%s", uefi_code);
      append_list ("readonly");
    } end_list ();
    if (uefi_vars) {
      start_list ("-drive") {
        append_list ("if=pflash");
        append_list ("format=raw");
        append_list_format ("file=%s", uefi_vars);
      } end_list ();
    }
  }

  /* Kernel and initrd. */
  arg ("-kernel", kernel);
  arg ("-initrd", initrd);

  /* Add a random number generator (backend for virtio-rng).  This
   * isn't strictly necessary but means we won't need to hang around
   * when needing entropy.
   */
  if (guestfs_int_qemu_supports_device (g, data->qemu_data,
                                        VIRTIO_DEVICE_NAME ("virtio-rng"))) {
    start_list ("-object") {
      append_list ("rng-random");
      append_list ("filename=/dev/urandom");
      append_list ("id=rng0");
    } end_list ();
    start_list ("-device") {
      append_list (VIRTIO_DEVICE_NAME ("virtio-rng"));
      append_list ("rng=rng0");
    } end_list ();
  }

  /* Create the virtio-scsi bus. */
  start_list ("-device") {
    append_list (VIRTIO_DEVICE_NAME ("virtio-scsi"));
    append_list ("id=scsi");
  } end_list ();

  /* Add drives (except for the appliance drive). */
  if (add_drives (g, data, qopts) == -1)
    goto cleanup0;

  /* Add the ext2 appliance drive (after all the drives). */
  if (has_appliance_drive) {
    start_list ("-drive") {
      append_list_format ("file=%s", appliance);
      append_list ("snapshot=on");
      append_list ("id=appliance");
      append_list ("cache=unsafe");
      append_list ("if=none");
#ifndef APPLIANCE_FORMAT_AUTO
      append_list ("format=raw");
#endif
    } end_list ();
    start_list ("-device") {
      append_list ("scsi-hd");
      append_list ("drive=appliance");
    } end_list ();
  }

  /* Create the virtio serial bus. */
  arg ("-device", VIRTIO_DEVICE_NAME ("virtio-serial"));

  /* Create the serial console. */
#ifndef __s390x__
  arg ("-serial", "stdio");
#else
  start_list ("-chardev") {
    append_list ("stdio");
    append_list ("id=charconsole0");
  } end_list ();
  start_list ("-device") {
    append_list ("sclpconsole");
    append_list ("chardev=charconsole0");
  } end_list ();
#endif

  /* Set up virtio-serial for the communications channel. */
  start_list ("-chardev") {
    append_list ("socket");
    append_list_format ("path=%s", data->guestfsd_sock);
    append_list ("id=channel0");
  } end_list ();
  start_list ("-device") {
    append_list ("virtserialport");
    append_list ("chardev=channel0");
    append_list ("name=org.libguestfs.channel.0");
  } end_list ();

  /* Enable user networking. */
  if (g->enable_network) {
    /* If qemu is 7.2.0+ and "passt" is available, ask for passt rather
     * than SLIRP.  RHBZ#2184967.
     */
    if (guestfs_int_version_ge (&data->qemu_version, 7, 2, 0) &&
        guestfs_int_passt_runnable (g)) {
      char passt_sock[UNIX_PATH_MAX];

      if (launch_passt (g, &passt_pid, &passt_sock) == -1)
        goto cleanup0;

      start_list ("-netdev") {
        append_list ("stream");
        append_list ("id=usernet");
        append_list ("addr.type=unix");
        append_list_format ("addr.path=%s", passt_sock);
      } end_list ();
    }
    else {
      start_list ("-netdev") {
        append_list ("user");
        append_list ("id=usernet");
        append_list ("net=" NETWORK_ADDRESS "/" NETWORK_PREFIX);
      } end_list ();
    }
    start_list ("-device") {
      append_list (VIRTIO_DEVICE_NAME ("virtio-net"));
      append_list ("netdev=usernet");
    } end_list ();
  }

  flags = 0;
  if (!has_kvm || force_tcg)
    flags |= APPLIANCE_COMMAND_LINE_IS_TCG;
  append = guestfs_int_appliance_command_line (g, appliance, flags);
  arg ("-append", append);

  /* Note: custom command line parameters must come last so that
   * qemu -set parameters can modify previously added options.
   */

  /* Add any qemu parameters. */
  for (hp = g->hv_params; hp; hp = hp->next) {
    if (!hp->hv_value)
      flag (hp->hv_param);
    else
      arg_noquote (hp->hv_param, hp->hv_value);
  }

  /* Get the argv list from the command line.  Every other qemuopts
   * call in this function is checked; do the same here so that a NULL
   * argv is never handed to execve (presumably NULL signals an
   * allocation failure with errno set -- confirm against qemuopts).
   */
  argv = qemuopts_to_argv (qopts);
  if (argv == NULL)
    goto qemuopts_error;

  /* Create the environ for the child process. */
  env = guestfs_int_copy_environ (environ,
                                  "LC_ALL", "C",
                                  /* Prevents qemu opening /dev/dsp */
                                  "QEMU_AUDIO_DRV", "none",
                                  NULL);
  if (env == NULL)
    goto cleanup0;

  r = fork ();
  if (r == -1) {
    perrorf (g, "fork");
    /* cleanup0 closes both ends of the socketpair. */
    goto cleanup0;
  }

  if (r == 0) {                 /* Child (qemu). */
    if (!g->direct_mode) {
      /* Set up stdin, stdout, stderr. */
      close (0);
      close (1);
      close (sv[0]);

      /* We set the FD_CLOEXEC flag on the socket above, but now (in
       * the child) it's safe to unset this flag so qemu can use the
       * socket.
       */
      set_cloexec_flag (sv[1], 0);

      /* Stdin. */
      if (dup (sv[1]) == -1) {
      dup_failed:
        perror ("dup failed");
        _exit (EXIT_FAILURE);
      }
      /* Stdout. */
      if (dup (sv[1]) == -1)
        goto dup_failed;

      /* Particularly since qemu 0.15, qemu spews all sorts of debug
       * information on stderr.  It is useful to both capture this and
       * not confuse casual users, so send stderr to the pipe as well.
       */
      close (2);
      if (dup (sv[1]) == -1)
        goto dup_failed;

      close (sv[1]);

      /* Close any other file descriptors that we don't want to pass
       * to qemu.  This prevents file descriptors which didn't have
       * O_CLOEXEC set properly from leaking into the subprocess.  See
       * RHBZ#1123007.
       */
      close_file_descriptors (fd > 2);
    }

    /* Unblock the SIGTERM signal since we will need to send that to
     * the subprocess (RHBZ#1460338).
     */
    guestfs_int_unblock_sigterm ();

    /* Dump the command line (after setting up stderr above). */
    if (g->verbose)
      qemuopts_to_channel (qopts, stderr);

    /* Put qemu in a new process group. */
    if (g->pgroup)
      setpgid (0, 0);

    execve (g->hv, argv, env); /* Run qemu. */
    perror (g->hv);
    _exit (EXIT_FAILURE);
  }

  /* Parent (library). */
  data->pid = r;

  qemuopts_free (qopts);
  qopts = NULL;

  /* Fork the recovery process off which will kill qemu if the parent
   * process fails to do so (eg. if the parent segfaults).
   */
  data->recoverypid = -1;
  if (g->recovery_proc) {
    r = fork ();
    if (r == 0) {
      size_t i;
      struct sigaction sa;
      pid_t qemu_pid = data->pid;
      pid_t parent_pid = getppid ();

      /* Remove all signal handlers.  See the justification here:
       * https://www.redhat.com/archives/libvir-list/2008-August/msg00303.html
       * We don't mask signal handlers yet, so this isn't completely
       * race-free, but better than not doing it at all.
       */
      memset (&sa, 0, sizeof sa);
      sa.sa_handler = SIG_DFL;
      sa.sa_flags = 0;
      sigemptyset (&sa.sa_mask);
      for (i = 1; i < NSIG; ++i)
        sigaction (i, &sa, NULL);

      /* Close all other file descriptors.  This ensures that we don't
       * hold open (eg) pipes from the parent process.
       */
      close_file_descriptors (1);

      /* Unblock the SIGTERM signal since we will need to respond to
       * SIGTERM from the parent (RHBZ#1460338).
       */
      guestfs_int_unblock_sigterm ();

      /* It would be nice to be able to put this in the same process
       * group as qemu (ie. setpgid (0, qemu_pid)).  However this is
       * not possible because we don't have any guarantee here that
       * the qemu process has started yet.
       */
      if (g->pgroup)
        setpgid (0, 0);

      /* Writing to argv is hideously complicated and error prone.  See:
       * http://git.postgresql.org/gitweb/?p=postgresql.git;a=blob;f=src/backend/utils/misc/ps_status.c;hb=HEAD
       */

      /* Loop around waiting for one or both of the other processes to
       * disappear.  It's fair to say this is very hairy.  The PIDs that
       * we are looking at might be reused by another process.  We are
       * effectively polling.  Is the cure worse than the disease?
       */
      for (;;) {
        if (kill (qemu_pid, 0) == -1) /* qemu's gone away, we aren't needed */
          _exit (EXIT_SUCCESS);
        if (kill (parent_pid, 0) == -1) {
          /* Parent's gone away, qemu still around, so kill qemu. */
          kill (qemu_pid, 9);
          _exit (EXIT_SUCCESS);
        }
        sleep (2);
      }
    }

    /* Don't worry, if the fork failed, this will be -1.  The recovery
     * process isn't essential.
     */
    data->recoverypid = r;
  }

  if (!g->direct_mode) {
    /* Close the other end of the socketpair. */
    close (sv[1]);
    sv[1] = -1;

    /* Ownership of our end moves to console_sock. */
    console_sock = sv[0];       /* stdin of child */
    sv[0] = -1;
  }

  g->state = LAUNCHING;

  /* Wait for qemu to start and to connect back to us via
   * virtio-serial and send the GUESTFS_LAUNCH_FLAG message.
   */
  g->conn =
    guestfs_int_new_conn_socket_listening (g, daemon_accept_sock, console_sock);
  if (!g->conn)
    goto cleanup1;

  /* g->conn now owns these sockets. */
  daemon_accept_sock = console_sock = -1;

  r = g->conn->ops->accept_connection (g, g->conn);
  if (r == -1)
    goto cleanup1;
  if (r == 0) {
    guestfs_int_launch_failed_error (g);
    goto cleanup1;
  }

  /* NB: We reach here just because qemu has opened the socket.  It
   * does not mean the daemon is up until we read the
   * GUESTFS_LAUNCH_FLAG below.  Failures in qemu startup can still
   * happen even if we reach here, even early failures like not being
   * able to open a drive.
   */

  r = guestfs_int_recv_from_daemon (g, &size, &buf);

  if (r == -1) {
    guestfs_int_launch_failed_error (g);
    goto cleanup1;
  }

  if (size != GUESTFS_LAUNCH_FLAG) {
    guestfs_int_launch_failed_error (g);
    goto cleanup1;
  }

  debug (g, "appliance is up");

  /* From this point onward, even if we fail, QEMU terminating (forcefully or
   * gracefully) will cause passt to go away as well.  Note that we can't
   * precisely tell whether QEMU managed to open the passt socket before QEMU
   * failed.  Therefore, err on the side of killing passt needlessly, rather
   * than not killing it when needed -- that's why we re-set "passt_pid" to (-1)
   * only this late during QEMU startup verification.
   */
  passt_pid = -1;

  /* This is possible in some really strange situations, such as
   * guestfsd starts up OK but then qemu immediately exits.  Check for
   * it because the caller is probably expecting to be able to send
   * commands after this function returns.
   */
  if (g->state != READY) {
    error (g, _("qemu launched and contacted daemon, but state != READY"));
    goto cleanup1;
  }

  guestfs_int_launch_send_progress (g, 12);

  if (has_appliance_drive)
    guestfs_int_add_dummy_appliance_drive (g);

  return 0;

  /* Failure after qemu was forked: kill and reap the children, then
   * fall through to the common cleanup.
   */
 cleanup1:
  if (data->pid > 0) kill (data->pid, 9);
  if (data->recoverypid > 0) kill (data->recoverypid, 9);
  if (data->pid > 0) guestfs_int_waitpid_noerror (data->pid);
  if (data->recoverypid > 0) guestfs_int_waitpid_noerror (data->recoverypid);
  data->pid = 0;
  data->recoverypid = 0;
  memset (&g->launch_t, 0, sizeof g->launch_t);
  guestfs_int_free_qemu_data (data->qemu_data);
  data->qemu_data = NULL;

  /* Common cleanup: release everything this function still owns. */
 cleanup0:
  if (passt_pid != -1)
    kill (passt_pid, SIGTERM);
  if (qopts != NULL)
    qemuopts_free (qopts);
  /* Ends of the console socketpair that were neither handed to
   * console_sock nor closed after the fork.
   */
  if (sv[0] >= 0)
    close (sv[0]);
  if (sv[1] >= 0)
    close (sv[1]);
  if (daemon_accept_sock >= 0)
    close (daemon_accept_sock);
  if (console_sock >= 0)
    close (console_sock);
  if (g->conn) {
    g->conn->ops->free_connection (g, g->conn);
    g->conn = NULL;
  }
  g->state = CONFIG;
  return -1;
}
/**
 * Shut down the qemu subprocess and the recovery process, if any.
 *
 * qemu is sent SIGTERM and the recovery process is killed with
 * signal 9.  qemu is only waited for when the handle has
 * C<recovery_proc> set (RHBZ#998482); in that case an abnormal exit
 * makes this function fail.  The daemon socket is then unlinked and
 * the cached qemu data freed.
 *
 * C<check_for_errors> is not consulted by this implementation.
 *
 * Returns C<0> on success or C<-1> on error.
 */
static int
shutdown_direct (guestfs_h *g, void *datav, int check_for_errors)
{
  struct backend_direct_data *d = datav;
  struct rusage usage;
  int wstatus;
  int rc = 0;

  /* Signal qemu to shutdown cleanly, and kill the recovery process. */
  if (d->pid > 0) {
    debug (g, "sending SIGTERM to process %d", d->pid);
    kill (d->pid, SIGTERM);
  }
  if (d->recoverypid > 0)
    kill (d->recoverypid, 9);

  /* Wait for subprocess(es) to exit. */
  if (g->recovery_proc /* RHBZ#998482 */ && d->pid > 0) {
    if (guestfs_int_wait4 (g, d->pid, &wstatus, &usage, "qemu") == -1) {
      rc = -1;
    }
    else if (WIFEXITED (wstatus) && WEXITSTATUS (wstatus) == 0) {
      /* Print the actual memory usage of qemu, useful for seeing
       * if techniques like DAX are having any effect.
       */
      debug (g, "qemu maxrss %ldK", usage.ru_maxrss);
    }
    else {
      guestfs_int_external_command_failed (g, wstatus, g->hv, NULL);
      rc = -1;
    }
  }
  if (d->recoverypid > 0)
    guestfs_int_waitpid_noerror (d->recoverypid);

  d->recoverypid = 0;
  d->pid = 0;

  /* Remove the daemon socket if a path was recorded for it. */
  if (d->guestfsd_sock[0] != '\0') {
    unlink (d->guestfsd_sock);
    d->guestfsd_sock[0] = '\0';
  }

  guestfs_int_free_qemu_data (d->qemu_data);
  d->qemu_data = NULL;

  return rc;
}
/**
 * Return the PID of the qemu subprocess.
 *
 * If no positive PID is recorded in the per-handle data, an error is
 * raised on the handle and C<-1> is returned.
 */
static int
get_pid_direct (guestfs_h *g, void *datav)
{
  const struct backend_direct_data *d = datav;

  if (d->pid <= 0) {
    error (g, "get_pid: no qemu subprocess");
    return -1;
  }

  return d->pid;
}
/**
 * Maximum number of disks.
 *
 * This backend always reports the same fixed limit; neither argument
 * is consulted.
 */
static int
max_disks_direct (guestfs_h *g, void *datav)
{
  enum { DIRECT_MAX_DISKS = 255 };

  return DIRECT_MAX_DISKS;
}
/* Operations table for the "direct" backend, registered by
 * guestfs_int_init_direct_backend.
 */
static struct backend_ops backend_direct_ops = {
  /* Size of the per-handle private data. */
  .data_size = sizeof (struct backend_direct_data),

  /* Appliance lifecycle. */
  .launch = launch_direct,
  .shutdown = shutdown_direct,

  /* Queries. */
  .get_pid = get_pid_direct,
  .max_disks = max_disks_direct,

  /* Drive helpers. */
  .create_cow_overlay = create_cow_overlay_direct,
};
/**
 * Register the operations table of this file as the backend named
 * C<direct>.
 */
void
guestfs_int_init_direct_backend (void)
{
  /* Static storage, like the string literal it replaces. */
  static const char backend_name[] = "direct";

  guestfs_int_register_backend (backend_name, &backend_direct_ops);
}