From 7704df98b09f886a9cf1d42aa6aa9a776ad1be3c Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 8 Nov 2011 10:50:12 +0100
Subject: vvfat: Fix read-write mode

vvfat used to directly call into the qcow2 block driver instead of using the
block.c wrappers. With the coroutine conversion, this stopped working.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/vvfat.c | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/block/vvfat.c b/block/vvfat.c
index 8511fe523c..131680f6f4 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1254,15 +1254,15 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
 	   return -1;
 	if (s->qcow) {
 	    int n;
-	    if (s->qcow->drv->bdrv_is_allocated(s->qcow,
-			sector_num, nb_sectors-i, &n)) {
+            if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) {
 DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n));
-		if (s->qcow->drv->bdrv_read(s->qcow, sector_num, buf+i*0x200, n))
-		    return -1;
-		i += n - 1;
-		sector_num += n - 1;
-		continue;
-	    }
+                if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) {
+                    return -1;
+                }
+                i += n - 1;
+                sector_num += n - 1;
+                continue;
+            }
 DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
 	}
 	if(sector_num<s->faked_sectors) {
@@ -1516,7 +1516,7 @@ static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
 	return 0;
 
     for (i = 0; !was_modified && i < s->sectors_per_cluster; i++)
-	was_modified = s->qcow->drv->bdrv_is_allocated(s->qcow,
+	was_modified = bdrv_is_allocated(s->qcow,
 		cluster2sector(s, cluster_num) + i, 1, &dummy);
 
     return was_modified;
@@ -1665,16 +1665,16 @@ static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
 		int64_t offset = cluster2sector(s, cluster_num);
 
 		vvfat_close_current_file(s);
-		for (i = 0; i < s->sectors_per_cluster; i++)
-		    if (!s->qcow->drv->bdrv_is_allocated(s->qcow,
-				offset + i, 1, &dummy)) {
-			if (vvfat_read(s->bs,
-				    offset, s->cluster_buffer, 1))
-			    return -1;
-			if (s->qcow->drv->bdrv_write(s->qcow,
-				    offset, s->cluster_buffer, 1))
-			    return -2;
-		    }
+                for (i = 0; i < s->sectors_per_cluster; i++) {
+                    if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) {
+                        if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) {
+                            return -1;
+                        }
+                        if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) {
+                            return -2;
+                        }
+                    }
+                }
 	    }
 	}
 
@@ -2619,7 +2619,9 @@ static int do_commit(BDRVVVFATState* s)
 	return ret;
     }
 
-    s->qcow->drv->bdrv_make_empty(s->qcow);
+    if (s->qcow->drv->bdrv_make_empty) {
+        s->qcow->drv->bdrv_make_empty(s->qcow);
+    }
 
     memset(s->used_clusters, 0, sector2cluster(s, s->sector_count));
 
@@ -2714,7 +2716,7 @@ DLOG(checkpoint());
      * Use qcow backend. Commit later.
      */
 DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors));
-    ret = s->qcow->drv->bdrv_write(s->qcow, sector_num, buf, nb_sectors);
+    ret = bdrv_write(s->qcow, sector_num, buf, nb_sectors);
     if (ret < 0) {
 	fprintf(stderr, "Error writing to qcow backend\n");
 	return ret;
-- 
cgit v1.2.1


From 025ccaa7f9d2f54a79567599d3eb402100bed7a4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 7 Nov 2011 17:50:13 +0100
Subject: block: add eject request callback

Recent versions of udev always keep the tray locked so that the kernel
can observe "eject request" events (aka tray button presses) even on
discs that aren't mounted.  Add support for these events in the ATAPI
and SCSI cd drive device models.

To let management cope with the behavior of udev, an event should also
be added for "tray opened/closed".  This way, after issuing an "eject"
command, management can poll until the guests actually reacts to the
command.  They can then issue the "change" command after the tray has been
opened, or try with "eject -f" after a (configurable?) timeout.  However,
with this patch and the corresponding support in the device models,
at least it is possible to do a manual two-step eject+change sequence.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c    |  7 +++++++
 block.h    | 10 ++++++++++
 blockdev.c | 10 ++++++----
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/block.c b/block.c
index 9bb236c989..5c30c9dd68 100644
--- a/block.c
+++ b/block.c
@@ -816,6 +816,13 @@ bool bdrv_dev_has_removable_media(BlockDriverState *bs)
     return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
 }
 
+void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
+{
+    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
+        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
+    }
+}
+
 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
 {
     if (bs->dev_ops && bs->dev_ops->is_tray_open) {
diff --git a/block.h b/block.h
index 38cd748b60..051a25d8d6 100644
--- a/block.h
+++ b/block.h
@@ -38,6 +38,15 @@ typedef struct BlockDevOps {
      * Device models with removable media must implement this callback.
      */
     void (*change_media_cb)(void *opaque, bool load);
+    /*
+     * Runs when an eject request is issued from the monitor, the tray
+     * is closed, and the medium is locked.
+     * Device models that do not implement is_medium_locked will not need
+     * this callback.  Device models that can lock the medium or tray might
+     * want to implement the callback and unlock the tray when "force" is
+     * true, even if they do not support eject requests.
+     */
+    void (*eject_request_cb)(void *opaque, bool force);
     /*
      * Is the virtual tray open?
      * Device models implement this only when the device has a tray.
@@ -111,6 +120,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev);
 void *bdrv_get_attached_dev(BlockDriverState *bs);
 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                       void *opaque);
+void bdrv_dev_eject_request(BlockDriverState *bs, bool force);
 bool bdrv_dev_has_removable_media(BlockDriverState *bs);
 bool bdrv_dev_is_tray_open(BlockDriverState *bs);
 bool bdrv_dev_is_medium_locked(BlockDriverState *bs);
diff --git a/blockdev.c b/blockdev.c
index 0827bf7743..222818690d 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -635,10 +635,12 @@ static int eject_device(Monitor *mon, BlockDriverState *bs, int force)
         qerror_report(QERR_DEVICE_NOT_REMOVABLE, bdrv_get_device_name(bs));
         return -1;
     }
-    if (!force && !bdrv_dev_is_tray_open(bs)
-        && bdrv_dev_is_medium_locked(bs)) {
-        qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs));
-        return -1;
+    if (bdrv_dev_is_medium_locked(bs) && !bdrv_dev_is_tray_open(bs)) {
+        bdrv_dev_eject_request(bs, force);
+        if (!force) {
+            qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs));
+            return -1;
+        }
     }
     bdrv_close(bs);
     return 0;
-- 
cgit v1.2.1


From 2df0a3a3085adfde93505bc73c938310b0820c36 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 25 Oct 2011 12:53:39 +0200
Subject: atapi: implement eject requests

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/atapi.c | 11 ++++++++---
 hw/ide/core.c  | 13 +++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 90b6729692..1fed359ab1 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -516,9 +516,14 @@ static unsigned int event_status_media(IDEState *s,
 
     /* Event notification descriptor */
     event_code = MEC_NO_CHANGE;
-    if (media_status != MS_TRAY_OPEN && s->events.new_media) {
-        event_code = MEC_NEW_MEDIA;
-        s->events.new_media = false;
+    if (media_status != MS_TRAY_OPEN) {
+        if (s->events.new_media) {
+            event_code = MEC_NEW_MEDIA;
+            s->events.new_media = false;
+        } else if (s->events.eject_request) {
+            event_code = MEC_EJECT_REQUESTED;
+            s->events.eject_request = false;
+        }
     }
 
     buf[4] = event_code;
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 9a2fd30607..93a1a689c4 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -804,6 +804,18 @@ static void ide_cd_change_cb(void *opaque, bool load)
      */
     s->cdrom_changed = 1;
     s->events.new_media = true;
+    s->events.eject_request = false;
+    ide_set_irq(s->bus);
+}
+
+static void ide_cd_eject_request_cb(void *opaque, bool force)
+{
+    IDEState *s = opaque;
+
+    s->events.eject_request = true;
+    if (force) {
+        s->tray_locked = false;
+    }
     ide_set_irq(s->bus);
 }
 
@@ -1811,6 +1823,7 @@ static bool ide_cd_is_medium_locked(void *opaque)
 
 static const BlockDevOps ide_cd_block_ops = {
     .change_media_cb = ide_cd_change_cb,
+    .eject_request_cb = ide_cd_eject_request_cb,
     .is_tray_open = ide_cd_is_tray_open,
     .is_medium_locked = ide_cd_is_medium_locked,
 };
-- 
cgit v1.2.1


From 4480de19d9b62c20e8cae0bcb6c2d00a954615a3 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 25 Oct 2011 12:53:40 +0200
Subject: scsi-disk: implement eject requests

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-disk.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 1c04872af7..62f538f4f8 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -65,6 +65,7 @@ struct SCSIDiskState
     uint32_t removable;
     bool media_changed;
     bool media_event;
+    bool eject_request;
     QEMUBH *bh;
     char *version;
     char *serial;
@@ -671,9 +672,14 @@ static int scsi_event_status_media(SCSIDiskState *s, uint8_t *outbuf)
 
     /* Event notification descriptor */
     event_code = MEC_NO_CHANGE;
-    if (media_status != MS_TRAY_OPEN && s->media_event) {
-        event_code = MEC_NEW_MEDIA;
-        s->media_event = false;
+    if (media_status != MS_TRAY_OPEN) {
+        if (s->media_event) {
+            event_code = MEC_NEW_MEDIA;
+            s->media_event = false;
+        } else if (s->eject_request) {
+            event_code = MEC_EJECT_REQUESTED;
+            s->eject_request = false;
+        }
     }
 
     outbuf[0] = event_code;
@@ -1470,6 +1476,17 @@ static void scsi_cd_change_media_cb(void *opaque, bool load)
     s->tray_open = !load;
     s->qdev.unit_attention = SENSE_CODE(UNIT_ATTENTION_NO_MEDIUM);
     s->media_event = true;
+    s->eject_request = false;
+}
+
+static void scsi_cd_eject_request_cb(void *opaque, bool force)
+{
+    SCSIDiskState *s = opaque;
+
+    s->eject_request = true;
+    if (force) {
+        s->tray_locked = false;
+    }
 }
 
 static bool scsi_cd_is_tray_open(void *opaque)
@@ -1484,6 +1501,7 @@ static bool scsi_cd_is_medium_locked(void *opaque)
 
 static const BlockDevOps scsi_cd_block_ops = {
     .change_media_cb = scsi_cd_change_media_cb,
+    .eject_request_cb = scsi_cd_eject_request_cb,
     .is_tray_open = scsi_cd_is_tray_open,
     .is_medium_locked = scsi_cd_is_medium_locked,
 };
-- 
cgit v1.2.1


From 74624688b3c003d1ed2763953aaf59974965fa22 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 4 Nov 2011 15:51:18 +0100
Subject: nbd: treat EPIPE from NBD_DO_IT as success

This can be seen with "qemu-nbd -v -c", which returns 1 instead of 0
when you disconnect with "qemu-nbd -d".

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 nbd.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nbd.c b/nbd.c
index fb5e424abe..e6c931c5ce 100644
--- a/nbd.c
+++ b/nbd.c
@@ -425,6 +425,13 @@ int nbd_client(int fd)
     TRACE("Doing NBD loop");
 
     ret = ioctl(fd, NBD_DO_IT);
+    if (ret == -1 && errno == EPIPE) {
+        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
+         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
+         * that case.
+         */
+        ret = 0;
+    }
     serrno = errno;
 
     TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
-- 
cgit v1.2.1


From bb345110f091af3aac32038a45a8776fc9fb505d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 4 Nov 2011 15:51:19 +0100
Subject: qemu-nbd: trap SIGTERM

The client process right now uses SIGTERM to interrupt the server side.
This does not affect the exit status of "qemu-nbd -v -c" because the
server is a child process.  This will change when both sides will be
in the same process, and anyway cleaning up things nicely upon SIGTERM
is good practice.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-nbd.c | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index d8d3e15a84..d997bb06ca 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -36,6 +36,7 @@
 
 #define NBD_BUFFER_SIZE (1024*1024)
 
+static int sigterm_wfd;
 static int verbose;
 
 static void usage(const char *name)
@@ -163,6 +164,14 @@ static int find_partition(BlockDriverState *bs, int partition,
     return -1;
 }
 
+static void termsig_handler(int signum)
+{
+    static int sigterm_reported;
+    if (!sigterm_reported) {
+        sigterm_reported = (write(sigterm_wfd, "", 1) == 1);
+    }
+}
+
 static void show_parts(const char *device)
 {
     if (fork() == 0) {
@@ -231,6 +240,18 @@ int main(int argc, char **argv)
     int max_fd;
     int persistent = 0;
 
+    /* Set up a SIGTERM handler so that we exit with a nice status code.  */
+    struct sigaction sa_sigterm;
+    int sigterm_fd[2];
+    if (qemu_pipe(sigterm_fd) == -1) {
+        err(EXIT_FAILURE, "Error setting up communication pipe");
+    }
+
+    sigterm_wfd = sigterm_fd[1];
+    memset(&sa_sigterm, 0, sizeof(sa_sigterm));
+    sa_sigterm.sa_handler = termsig_handler;
+    sigaction(SIGTERM, &sa_sigterm, NULL);
+
     while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
         switch (ch) {
         case 's':
@@ -423,7 +444,6 @@ int main(int argc, char **argv)
             close(fd);
  out:
             kill(pid, SIGTERM);
-            unlink(socket);
 
             return ret;
         }
@@ -444,18 +464,22 @@ int main(int argc, char **argv)
     nb_fds++;
 
     data = qemu_blockalign(bs, NBD_BUFFER_SIZE);
-    if (data == NULL)
+    if (data == NULL) {
         errx(EXIT_FAILURE, "Cannot allocate data buffer");
+    }
 
     do {
-
         FD_ZERO(&fds);
+        FD_SET(sigterm_fd[0], &fds);
         for (i = 0; i < nb_fds; i++)
             FD_SET(sharing_fds[i], &fds);
 
-        ret = select(max_fd + 1, &fds, NULL, NULL, NULL);
-        if (ret == -1)
+        do {
+            ret = select(max_fd + 1, &fds, NULL, NULL, NULL);
+        } while (ret == -1 && errno == EINTR);
+        if (ret == -1 || FD_ISSET(sigterm_fd[0], &fds)) {
             break;
+        }
 
         if (FD_ISSET(sharing_fds[0], &fds))
             ret--;
-- 
cgit v1.2.1


From b32f6c28d5c753a8c667453f6300a2650bc52a47 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 4 Nov 2011 15:51:20 +0100
Subject: qemu-nbd: rename socket variable

It will be moved to a global variable by the next patch, and it
would conflict with the socket function.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-nbd.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index d997bb06ca..20b69c1507 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -202,8 +202,7 @@ int main(int argc, char **argv)
     socklen_t addr_len = sizeof(addr);
     off_t fd_size;
     char *device = NULL;
-    char *socket = NULL;
-    char sockpath[128];
+    char *sockpath = NULL;
     const char *sopt = "hVb:o:p:rsnP:c:dvk:e:t";
     struct option lopt[] = {
         { "help", 0, NULL, 'h' },
@@ -294,8 +293,8 @@ int main(int argc, char **argv)
                 errx(EXIT_FAILURE, "Invalid partition %d", partition);
             break;
         case 'k':
-            socket = optarg;
-            if (socket[0] != '/')
+            sockpath = optarg;
+            if (sockpath[0] != '/')
                 errx(EXIT_FAILURE, "socket path must be absolute\n");
             break;
         case 'd':
@@ -384,10 +383,9 @@ int main(int argc, char **argv)
             }
         }
 
-        if (socket == NULL) {
-            snprintf(sockpath, sizeof(sockpath), SOCKET_PATH,
-                     basename(device));
-            socket = sockpath;
+        if (sockpath == NULL) {
+            sockpath = g_malloc(128);
+            snprintf(sockpath, 128, SOCKET_PATH, basename(device));
         }
 
         pid = fork();
@@ -401,7 +399,7 @@ int main(int argc, char **argv)
             bdrv_close(bs);
 
             do {
-                sock = unix_socket_outgoing(socket);
+                sock = unix_socket_outgoing(sockpath);
                 if (sock == -1) {
                     if (errno != ENOENT && errno != ECONNREFUSED) {
                         ret = 1;
@@ -452,8 +450,8 @@ int main(int argc, char **argv)
 
     sharing_fds = g_malloc((shared + 1) * sizeof(int));
 
-    if (socket) {
-        sharing_fds[0] = unix_socket_incoming(socket);
+    if (sockpath) {
+        sharing_fds[0] = unix_socket_incoming(sockpath);
     } else {
         sharing_fds[0] = tcp_socket_incoming(bindto, port);
     }
@@ -515,8 +513,9 @@ int main(int argc, char **argv)
     close(sharing_fds[0]);
     bdrv_close(bs);
     g_free(sharing_fds);
-    if (socket)
-        unlink(socket);
+    if (sockpath) {
+        unlink(sockpath);
+    }
 
     return 0;
 }
-- 
cgit v1.2.1


From a517e88baa9bef2b5c54a11d62b2b2ab2a5c4ab7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 4 Nov 2011 15:51:21 +0100
Subject: qemu-nbd: move client to a thread

This avoids that qemu-nbd uses both forking and threads, which do
not behave well together.

qemu-nbd is already Unix only, and there is no qemu_thread_join,
so for now use pthreads.

Since the parent and child no longer have separate file descriptors,
we can open the NBD device before daemonizing, instead of checking
with access(2) and restricting the open to the client only.

Reported-by: Pierre Riteau <pierre.riteau@irisa.fr>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-nbd.c | 173 +++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 93 insertions(+), 80 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 20b69c1507..ffc2a8ae68 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -31,6 +31,7 @@
 #include <arpa/inet.h>
 #include <signal.h>
 #include <libgen.h>
+#include <pthread.h>
 
 #define SOCKET_PATH    "/var/lock/qemu-nbd-%s"
 
@@ -38,6 +39,9 @@
 
 static int sigterm_wfd;
 static int verbose;
+static char *device;
+static char *srcpath;
+static char *sockpath;
 
 static void usage(const char *name)
 {
@@ -172,21 +176,70 @@ static void termsig_handler(int signum)
     }
 }
 
-static void show_parts(const char *device)
+static void *show_parts(void *arg)
 {
-    if (fork() == 0) {
-        int nbd;
+    int nbd;
+
+    /* linux just needs an open() to trigger
+     * the partition table update
+     * but remember to load the module with max_part != 0 :
+     *     modprobe nbd max_part=63
+     */
+    nbd = open(device, O_RDWR);
+    if (nbd != -1) {
+        close(nbd);
+    }
+    return NULL;
+}
 
-        /* linux just needs an open() to trigger
-         * the partition table update
-         * but remember to load the module with max_part != 0 :
-         *     modprobe nbd max_part=63
-         */
-        nbd = open(device, O_RDWR);
-        if (nbd != -1)
-              close(nbd);
-        exit(0);
+static void *nbd_client_thread(void *arg)
+{
+    int fd = *(int *)arg;
+    off_t size;
+    size_t blocksize;
+    uint32_t nbdflags;
+    int sock;
+    int ret;
+    pthread_t show_parts_thread;
+
+    do {
+        sock = unix_socket_outgoing(sockpath);
+        if (sock == -1) {
+            if (errno != ENOENT && errno != ECONNREFUSED) {
+                goto out;
+            }
+            sleep(1);  /* wait parent */
+        }
+    } while (sock == -1);
+
+    ret = nbd_receive_negotiate(sock, NULL, &nbdflags,
+                                &size, &blocksize);
+    if (ret == -1) {
+        goto out;
+    }
+
+    ret = nbd_init(fd, sock, nbdflags, size, blocksize);
+    if (ret == -1) {
+        goto out;
+    }
+
+    /* update partition table */
+    pthread_create(&show_parts_thread, NULL, show_parts, NULL);
+
+    fprintf(stderr, "NBD device %s is now connected to %s\n",
+            device, srcpath);
+
+    ret = nbd_client(fd);
+    if (ret) {
+        goto out;
     }
+    close(fd);
+    kill(getpid(), SIGTERM);
+    return (void *) EXIT_SUCCESS;
+
+out:
+    kill(getpid(), SIGTERM);
+    return (void *) EXIT_FAILURE;
 }
 
 int main(int argc, char **argv)
@@ -201,8 +254,6 @@ int main(int argc, char **argv)
     struct sockaddr_in addr;
     socklen_t addr_len = sizeof(addr);
     off_t fd_size;
-    char *device = NULL;
-    char *sockpath = NULL;
     const char *sopt = "hVb:o:p:rsnP:c:dvk:e:t";
     struct option lopt[] = {
         { "help", 0, NULL, 'h' },
@@ -238,8 +289,11 @@ int main(int argc, char **argv)
     int nb_fds = 0;
     int max_fd;
     int persistent = 0;
+    pthread_t client_thread;
 
-    /* Set up a SIGTERM handler so that we exit with a nice status code.  */
+    /* The client thread uses SIGTERM to interrupt the server.  A signal
+     * handler ensures that "qemu-nbd -v -c" exits with a nice status code.
+     */
     struct sigaction sa_sigterm;
     int sigterm_fd[2];
     if (qemu_pipe(sigterm_fd) == -1) {
@@ -356,9 +410,10 @@ int main(int argc, char **argv)
 
     bs = bdrv_new("hda");
 
-    if ((ret = bdrv_open(bs, argv[optind], flags, NULL)) < 0) {
+    srcpath = argv[optind];
+    if ((ret = bdrv_open(bs, srcpath, flags, NULL)) < 0) {
         errno = -ret;
-        err(EXIT_FAILURE, "Failed to bdrv_open '%s'", argv[optind]);
+        err(EXIT_FAILURE, "Failed to bdrv_open '%s'", srcpath);
     }
 
     fd_size = bs->total_sectors * 512;
@@ -368,12 +423,14 @@ int main(int argc, char **argv)
         err(EXIT_FAILURE, "Could not find partition %d", partition);
 
     if (device) {
-        pid_t pid;
-        int sock;
+        int ret;
 
-        /* want to fail before daemonizing */
-        if (access(device, R_OK|W_OK) == -1) {
-            err(EXIT_FAILURE, "Could not access '%s'", device);
+        /* Open before spawning new threads.  In the future, we may
+         * drop privileges after opening.
+         */
+        fd = open(device, O_RDWR);
+        if (fd == -1) {
+            err(EXIT_FAILURE, "Failed to open %s", device);
         }
 
         if (!verbose) {
@@ -388,64 +445,14 @@ int main(int argc, char **argv)
             snprintf(sockpath, 128, SOCKET_PATH, basename(device));
         }
 
-        pid = fork();
-        if (pid < 0)
-            return 1;
-        if (pid != 0) {
-            off_t size;
-            size_t blocksize;
-
-            ret = 0;
-            bdrv_close(bs);
-
-            do {
-                sock = unix_socket_outgoing(sockpath);
-                if (sock == -1) {
-                    if (errno != ENOENT && errno != ECONNREFUSED) {
-                        ret = 1;
-                        goto out;
-                    }
-                    sleep(1);	/* wait children */
-                }
-            } while (sock == -1);
-
-            fd = open(device, O_RDWR);
-            if (fd == -1) {
-                ret = 1;
-                goto out;
-            }
-
-            ret = nbd_receive_negotiate(sock, NULL, &nbdflags,
-                                        &size, &blocksize);
-            if (ret == -1) {
-                ret = 1;
-                goto out;
-            }
-
-            ret = nbd_init(fd, sock, nbdflags, size, blocksize);
-            if (ret == -1) {
-                ret = 1;
-                goto out;
-            }
-
-            printf("NBD device %s is now connected to file %s\n",
-                    device, argv[optind]);
-
-	    /* update partition table */
-
-            show_parts(device);
-
-            ret = nbd_client(fd);
-            if (ret) {
-                ret = 1;
-            }
-            close(fd);
- out:
-            kill(pid, SIGTERM);
-
-            return ret;
+        ret = pthread_create(&client_thread, NULL, nbd_client_thread, &fd);
+        if (ret != 0) {
+            errx(EXIT_FAILURE, "Failed to create client thread: %s",
+                 strerror(ret));
         }
-        /* children */
+    } else {
+        /* Shut up GCC warnings.  */
+        memset(&client_thread, 0, sizeof(client_thread));
     }
 
     sharing_fds = g_malloc((shared + 1) * sizeof(int));
@@ -517,5 +524,11 @@ int main(int argc, char **argv)
         unlink(sockpath);
     }
 
-    return 0;
+    if (device) {
+        void *ret;
+        pthread_join(client_thread, &ret);
+        exit(ret != NULL);
+    } else {
+        exit(EXIT_SUCCESS);
+    }
 }
-- 
cgit v1.2.1


From c1f8fdc3620781c87a0bc519edb691af975c6356 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 4 Nov 2011 15:51:22 +0100
Subject: qemu-nbd: print error messages from the daemon through a pipe

In order to get nice error messages, keep the qemu-nbd process running
until before issuing NBD_DO_IT and connected to the daemon with a pipe.
This lets the qemu-nbd process relay error messages from the daemon and
exit with a nonzero status if appropriate.

Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-nbd.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index ffc2a8ae68..b330d8d10b 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -226,8 +226,13 @@ static void *nbd_client_thread(void *arg)
     /* update partition table */
     pthread_create(&show_parts_thread, NULL, show_parts, NULL);
 
-    fprintf(stderr, "NBD device %s is now connected to %s\n",
-            device, srcpath);
+    if (verbose) {
+        fprintf(stderr, "NBD device %s is now connected to %s\n",
+                device, srcpath);
+    } else {
+        /* Close stderr so that the qemu-nbd process exits.  */
+        dup2(STDOUT_FILENO, STDERR_FILENO);
+    }
 
     ret = nbd_client(fd);
     if (ret) {
@@ -406,6 +411,58 @@ int main(int argc, char **argv)
 	return 0;
     }
 
+    if (device && !verbose) {
+        int stderr_fd[2];
+        pid_t pid;
+        int ret;
+
+        if (qemu_pipe(stderr_fd) == -1) {
+            err(EXIT_FAILURE, "Error setting up communication pipe");
+        }
+
+        /* Now daemonize, but keep a communication channel open to
+         * print errors and exit with the proper status code.
+         */
+        pid = fork();
+        if (pid == 0) {
+            close(stderr_fd[0]);
+            ret = qemu_daemon(0, 0);
+
+            /* Temporarily redirect stderr to the parent's pipe...  */
+            dup2(stderr_fd[1], STDERR_FILENO);
+            if (ret == -1) {
+                err(EXIT_FAILURE, "Failed to daemonize");
+            }
+
+            /* ... close the descriptor we inherited and go on.  */
+            close(stderr_fd[1]);
+        } else {
+            bool errors = false;
+            char *buf;
+
+            /* In the parent.  Print error messages from the child until
+             * it closes the pipe.
+             */
+            close(stderr_fd[1]);
+            buf = g_malloc(1024);
+            while ((ret = read(stderr_fd[0], buf, 1024)) > 0) {
+                errors = true;
+                ret = qemu_write_full(STDERR_FILENO, buf, ret);
+                if (ret == -1) {
+                    exit(EXIT_FAILURE);
+                }
+            }
+            if (ret == -1) {
+                err(EXIT_FAILURE, "Cannot read from daemon");
+            }
+
+            /* Usually the daemon should not print any message.
+             * Exit with zero status in that case.
+             */
+            exit(errors);
+        }
+    }
+
     bdrv_init();
 
     bs = bdrv_new("hda");
@@ -433,13 +490,6 @@ int main(int argc, char **argv)
             err(EXIT_FAILURE, "Failed to open %s", device);
         }
 
-        if (!verbose) {
-            /* detach client and server */
-            if (qemu_daemon(0, 0) == -1) {
-                err(EXIT_FAILURE, "Failed to daemonize");
-            }
-        }
-
         if (sockpath == NULL) {
             sockpath = g_malloc(128);
             snprintf(sockpath, 128, SOCKET_PATH, basename(device));
-- 
cgit v1.2.1


From f1ef5555c24914f8ad4051a20571a5c62022d928 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 4 Nov 2011 15:51:23 +0100
Subject: qemu-nbd: fix socket creation race

Now that the client and server are in the same process, there is
no need to race on the creation of the socket.  We can open the
listening socket before starting the client thread.

This avoids that "qemu-nbd -v -c" prints this once before connecting
successfully to the socket:

    connect(unix:/var/lock/qemu-nbd-nbd0): No such file or directory

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-nbd.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index b330d8d10b..6f84ad01e5 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -205,10 +205,7 @@ static void *nbd_client_thread(void *arg)
     do {
         sock = unix_socket_outgoing(sockpath);
         if (sock == -1) {
-            if (errno != ENOENT && errno != ECONNREFUSED) {
-                goto out;
-            }
-            sleep(1);  /* wait parent */
+            goto out;
         }
     } while (sock == -1);
 
@@ -480,8 +477,6 @@ int main(int argc, char **argv)
         err(EXIT_FAILURE, "Could not find partition %d", partition);
 
     if (device) {
-        int ret;
-
         /* Open before spawning new threads.  In the future, we may
          * drop privileges after opening.
          */
@@ -494,15 +489,6 @@ int main(int argc, char **argv)
             sockpath = g_malloc(128);
             snprintf(sockpath, 128, SOCKET_PATH, basename(device));
         }
-
-        ret = pthread_create(&client_thread, NULL, nbd_client_thread, &fd);
-        if (ret != 0) {
-            errx(EXIT_FAILURE, "Failed to create client thread: %s",
-                 strerror(ret));
-        }
-    } else {
-        /* Shut up GCC warnings.  */
-        memset(&client_thread, 0, sizeof(client_thread));
     }
 
     sharing_fds = g_malloc((shared + 1) * sizeof(int));
@@ -515,6 +501,20 @@ int main(int argc, char **argv)
 
     if (sharing_fds[0] == -1)
         return 1;
+
+    if (device) {
+        int ret;
+
+        ret = pthread_create(&client_thread, NULL, nbd_client_thread, &fd);
+        if (ret != 0) {
+            errx(EXIT_FAILURE, "Failed to create client thread: %s",
+                 strerror(ret));
+        }
+    } else {
+        /* Shut up GCC warnings.  */
+        memset(&client_thread, 0, sizeof(client_thread));
+    }
+
     max_fd = sharing_fds[0];
     nb_fds++;
 
-- 
cgit v1.2.1


From 802ddc375acabf06c0cb5a2716ca3573bc9ca89f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 4 Nov 2011 15:51:24 +0100
Subject: qemu-nbd: open the block device after starting the client thread

This is cleaner, because we do not need to close the block device when
there is an error opening /dev/nbdX.  It was done this way only to
print errors before daemonizing.

At the same time, use atexit to ensure that the block device is closed
whenever we exit.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-nbd.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 6f84ad01e5..291cba2eaa 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -460,22 +460,6 @@ int main(int argc, char **argv)
         }
     }
 
-    bdrv_init();
-
-    bs = bdrv_new("hda");
-
-    srcpath = argv[optind];
-    if ((ret = bdrv_open(bs, srcpath, flags, NULL)) < 0) {
-        errno = -ret;
-        err(EXIT_FAILURE, "Failed to bdrv_open '%s'", srcpath);
-    }
-
-    fd_size = bs->total_sectors * 512;
-
-    if (partition != -1 &&
-        find_partition(bs, partition, &dev_offset, &fd_size))
-        err(EXIT_FAILURE, "Could not find partition %d", partition);
-
     if (device) {
         /* Open before spawning new threads.  In the future, we may
          * drop privileges after opening.
@@ -491,6 +475,23 @@ int main(int argc, char **argv)
         }
     }
 
+    bdrv_init();
+    atexit(bdrv_close_all);
+
+    bs = bdrv_new("hda");
+    srcpath = argv[optind];
+    if ((ret = bdrv_open(bs, srcpath, flags, NULL)) < 0) {
+        errno = -ret;
+        err(EXIT_FAILURE, "Failed to bdrv_open '%s'", argv[optind]);
+    }
+
+    fd_size = bs->total_sectors * 512;
+
+    if (partition != -1 &&
+        find_partition(bs, partition, &dev_offset, &fd_size)) {
+        err(EXIT_FAILURE, "Could not find partition %d", partition);
+    }
+
     sharing_fds = g_malloc((shared + 1) * sizeof(int));
 
     if (sockpath) {
@@ -568,7 +569,6 @@ int main(int argc, char **argv)
     qemu_vfree(data);
 
     close(sharing_fds[0]);
-    bdrv_close(bs);
     g_free(sharing_fds);
     if (sockpath) {
         unlink(sockpath);
-- 
cgit v1.2.1


From 78439f6af1caa3e8bdafc9fc2d62aeefa53ed63a Mon Sep 17 00:00:00 2001
From: Charles Arnold <carnold@suse.com>
Date: Wed, 9 Nov 2011 09:32:25 -0700
Subject: block: Fix vpc initialization of the Dynamic Disk Header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Data Offset field in the Dynamic Disk Header is an 8 byte field.
Although the specification (2006-10-11) gives an example of initializing
only the first 4 bytes, images generated by Microsoft on Windows initialize
all 8 bytes.

Failure to initialize all 8 bytes results in errors from utilities
like Citrix's vhd-util which checks specifically for the proper Data
Offset field initialization.

Signed-off-by: Charles Arnold <carnold@suse.com>
Reviewed-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vpc.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/vpc.c b/block/vpc.c
index 79be7d051b..54633b65f2 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -617,7 +617,11 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options)
 
     memcpy(dyndisk_header->magic, "cxsparse", 8);
 
-    dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFF);
+    /*
+     * Note: The spec is actually wrong here for data_offset, it says
+     * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
+     */
+    dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFFFFFFFFFFULL);
     dyndisk_header->table_offset = be64_to_cpu(3 * 512);
     dyndisk_header->version = be32_to_cpu(0x00010000);
     dyndisk_header->block_size = be32_to_cpu(block_size);
-- 
cgit v1.2.1


From 980bda8ba2322f665934913fbe6c4c202d61a9f9 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Wed, 9 Nov 2011 21:59:50 +0000
Subject: hw/pc.c: Fix use-while-uninitialized of fd_type[]

Fix a use-while-uninitialized of the fd_type[] array (introduced
in commit 34d4260e1, noticed by Coverity). This is more theoretical
than practical, since it's quite hard to get here with floppy==NULL
(the qdev_try_create() of the isa-fdc device has to fail).

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/pc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/pc.c b/hw/pc.c
index 3015671858..33778fe422 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -335,7 +335,7 @@ void pc_cmos_init(ram_addr_t ram_size, ram_addr_t above_4g_mem_size,
                   ISADevice *s)
 {
     int val, nb, nb_heads, max_track, last_sect, i;
-    FDriveType fd_type[2];
+    FDriveType fd_type[2] = { FDRIVE_DRV_NONE, FDRIVE_DRV_NONE };
     BlockDriverState *fd[MAX_FD];
     static pc_cmos_init_late_arg arg;
 
@@ -385,8 +385,6 @@ void pc_cmos_init(ram_addr_t ram_size, ram_addr_t above_4g_mem_size,
                 bdrv_get_floppy_geometry_hint(fd[i], &nb_heads, &max_track,
                                               &last_sect, FDRIVE_DRV_NONE,
                                               &fd_type[i]);
-            } else {
-                fd_type[i] = FDRIVE_DRV_NONE;
             }
         }
     }
-- 
cgit v1.2.1


From c68b89acd636ff545bc7cb92739c41999291ce3c Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 10 Nov 2011 17:25:44 +0100
Subject: block: Rename bdrv_co_flush to bdrv_co_flush_to_disk

There are two different types of flush that you can do: Flushing one level up
to the OS (i.e. writing data to the host page cache) or flushing it all the way
down to the disk. The existing functions flush to the disk, reflect this in the
function name.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c           |  4 ++--
 block/cow.c       | 22 ++++++++++++----------
 block/qcow.c      | 18 ++++++++++--------
 block/qcow2.c     |  6 +++---
 block/raw-win32.c | 15 +++++++++------
 block/raw.c       |  8 ++++----
 block/rbd.c       | 10 +++++-----
 block/vdi.c       |  2 +-
 block/vmdk.c      |  4 ++--
 block/vpc.c       |  8 +++++---
 block_int.h       |  7 ++++++-
 11 files changed, 59 insertions(+), 45 deletions(-)

diff --git a/block.c b/block.c
index 5c30c9dd68..6521eaa546 100644
--- a/block.c
+++ b/block.c
@@ -2793,8 +2793,8 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
         return 0;
     } else if (!bs->drv) {
         return 0;
-    } else if (bs->drv->bdrv_co_flush) {
-        return bs->drv->bdrv_co_flush(bs);
+    } else if (bs->drv->bdrv_co_flush_to_disk) {
+        return bs->drv->bdrv_co_flush_to_disk(bs);
     } else if (bs->drv->bdrv_aio_flush) {
         BlockDriverAIOCB *acb;
         CoroutineIOCompletion co = {
diff --git a/block/cow.c b/block/cow.c
index 707c0aad88..089d395c40 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -326,16 +326,18 @@ static QEMUOptionParameter cow_create_options[] = {
 };
 
 static BlockDriver bdrv_cow = {
-    .format_name	= "cow",
-    .instance_size	= sizeof(BDRVCowState),
-    .bdrv_probe		= cow_probe,
-    .bdrv_open		= cow_open,
-    .bdrv_read          = cow_co_read,
-    .bdrv_write         = cow_co_write,
-    .bdrv_close		= cow_close,
-    .bdrv_create	= cow_create,
-    .bdrv_co_flush      = cow_co_flush,
-    .bdrv_is_allocated	= cow_is_allocated,
+    .format_name    = "cow",
+    .instance_size  = sizeof(BDRVCowState),
+
+    .bdrv_probe     = cow_probe,
+    .bdrv_open      = cow_open,
+    .bdrv_close     = cow_close,
+    .bdrv_create    = cow_create,
+
+    .bdrv_read              = cow_co_read,
+    .bdrv_write             = cow_co_write,
+    .bdrv_co_flush_to_disk  = cow_co_flush,
+    .bdrv_is_allocated      = cow_is_allocated,
 
     .create_options = cow_create_options,
 };
diff --git a/block/qcow.c b/block/qcow.c
index 35e21eb6b3..adecee06c9 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -828,14 +828,16 @@ static BlockDriver bdrv_qcow = {
     .bdrv_open		= qcow_open,
     .bdrv_close		= qcow_close,
     .bdrv_create	= qcow_create,
-    .bdrv_is_allocated	= qcow_is_allocated,
-    .bdrv_set_key	= qcow_set_key,
-    .bdrv_make_empty	= qcow_make_empty,
-    .bdrv_co_readv      = qcow_co_readv,
-    .bdrv_co_writev     = qcow_co_writev,
-    .bdrv_co_flush      = qcow_co_flush,
-    .bdrv_write_compressed = qcow_write_compressed,
-    .bdrv_get_info	= qcow_get_info,
+
+    .bdrv_co_readv          = qcow_co_readv,
+    .bdrv_co_writev         = qcow_co_writev,
+    .bdrv_co_flush_to_disk  = qcow_co_flush,
+    .bdrv_is_allocated      = qcow_is_allocated,
+
+    .bdrv_set_key           = qcow_set_key,
+    .bdrv_make_empty        = qcow_make_empty,
+    .bdrv_write_compressed  = qcow_write_compressed,
+    .bdrv_get_info          = qcow_get_info,
 
     .create_options = qcow_create_options,
 };
diff --git a/block/qcow2.c b/block/qcow2.c
index ef057d31e0..f7f73fe376 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1243,9 +1243,9 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_set_key       = qcow2_set_key,
     .bdrv_make_empty    = qcow2_make_empty,
 
-    .bdrv_co_readv      = qcow2_co_readv,
-    .bdrv_co_writev     = qcow2_co_writev,
-    .bdrv_co_flush      = qcow2_co_flush,
+    .bdrv_co_readv          = qcow2_co_readv,
+    .bdrv_co_writev         = qcow2_co_writev,
+    .bdrv_co_flush_to_disk  = qcow2_co_flush,
 
     .bdrv_co_discard        = qcow2_co_discard,
     .bdrv_truncate          = qcow2_truncate,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index f5f73bcd64..e4b0b75b70 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -281,9 +281,11 @@ static BlockDriver bdrv_file = {
     .bdrv_file_open	= raw_open,
     .bdrv_close		= raw_close,
     .bdrv_create	= raw_create,
-    .bdrv_co_flush      = raw_flush,
-    .bdrv_read		= raw_read,
-    .bdrv_write		= raw_write,
+
+    .bdrv_read              = raw_read,
+    .bdrv_write             = raw_write,
+    .bdrv_co_flush_to_disk  = raw_flush,
+
     .bdrv_truncate	= raw_truncate,
     .bdrv_getlength	= raw_getlength,
     .bdrv_get_allocated_file_size
@@ -409,11 +411,12 @@ static BlockDriver bdrv_host_device = {
     .bdrv_probe_device	= hdev_probe_device,
     .bdrv_file_open	= hdev_open,
     .bdrv_close		= raw_close,
-    .bdrv_co_flush      = raw_flush,
     .bdrv_has_zero_init = hdev_has_zero_init,
 
-    .bdrv_read		= raw_read,
-    .bdrv_write	        = raw_write,
+    .bdrv_read              = raw_read,
+    .bdrv_write             = raw_write,
+    .bdrv_co_flush_to_disk  = raw_flush,
+
     .bdrv_getlength	= raw_getlength,
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
diff --git a/block/raw.c b/block/raw.c
index 33cc4716d3..6098070d65 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -111,10 +111,10 @@ static BlockDriver bdrv_raw = {
     .bdrv_open          = raw_open,
     .bdrv_close         = raw_close,
 
-    .bdrv_co_readv      = raw_co_readv,
-    .bdrv_co_writev     = raw_co_writev,
-    .bdrv_co_flush      = raw_co_flush,
-    .bdrv_co_discard    = raw_co_discard,
+    .bdrv_co_readv          = raw_co_readv,
+    .bdrv_co_writev         = raw_co_writev,
+    .bdrv_co_flush_to_disk  = raw_co_flush,
+    .bdrv_co_discard        = raw_co_discard,
 
     .bdrv_probe         = raw_probe,
     .bdrv_getlength     = raw_getlength,
diff --git a/block/rbd.c b/block/rbd.c
index c684e0cb0b..9088c52d24 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -851,18 +851,18 @@ static BlockDriver bdrv_rbd = {
     .bdrv_file_open     = qemu_rbd_open,
     .bdrv_close         = qemu_rbd_close,
     .bdrv_create        = qemu_rbd_create,
-    .bdrv_co_flush      = qemu_rbd_co_flush,
     .bdrv_get_info      = qemu_rbd_getinfo,
     .create_options     = qemu_rbd_create_options,
     .bdrv_getlength     = qemu_rbd_getlength,
     .bdrv_truncate      = qemu_rbd_truncate,
     .protocol_name      = "rbd",
 
-    .bdrv_aio_readv     = qemu_rbd_aio_readv,
-    .bdrv_aio_writev    = qemu_rbd_aio_writev,
+    .bdrv_aio_readv         = qemu_rbd_aio_readv,
+    .bdrv_aio_writev        = qemu_rbd_aio_writev,
+    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
 
-    .bdrv_snapshot_create = qemu_rbd_snap_create,
-    .bdrv_snapshot_list = qemu_rbd_snap_list,
+    .bdrv_snapshot_create   = qemu_rbd_snap_create,
+    .bdrv_snapshot_list     = qemu_rbd_snap_list,
 };
 
 static void bdrv_rbd_init(void)
diff --git a/block/vdi.c b/block/vdi.c
index 523a6409c5..684a4a87b6 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -980,7 +980,7 @@ static BlockDriver bdrv_vdi = {
     .bdrv_open = vdi_open,
     .bdrv_close = vdi_close,
     .bdrv_create = vdi_create,
-    .bdrv_co_flush = vdi_co_flush,
+    .bdrv_co_flush_to_disk = vdi_co_flush,
     .bdrv_is_allocated = vdi_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
diff --git a/block/vmdk.c b/block/vmdk.c
index 985006e203..e53a2f0920 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1581,8 +1581,8 @@ static BlockDriver bdrv_vmdk = {
     .bdrv_write     = vmdk_co_write,
     .bdrv_close     = vmdk_close,
     .bdrv_create    = vmdk_create,
-    .bdrv_co_flush  = vmdk_co_flush,
-    .bdrv_is_allocated  = vmdk_is_allocated,
+    .bdrv_co_flush_to_disk  = vmdk_co_flush,
+    .bdrv_is_allocated      = vmdk_is_allocated,
     .bdrv_get_allocated_file_size  = vmdk_get_allocated_file_size,
 
     .create_options = vmdk_create_options,
diff --git a/block/vpc.c b/block/vpc.c
index 54633b65f2..39a324705d 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -665,14 +665,16 @@ static QEMUOptionParameter vpc_create_options[] = {
 static BlockDriver bdrv_vpc = {
     .format_name    = "vpc",
     .instance_size  = sizeof(BDRVVPCState),
+
     .bdrv_probe     = vpc_probe,
     .bdrv_open      = vpc_open,
-    .bdrv_read      = vpc_co_read,
-    .bdrv_write     = vpc_co_write,
-    .bdrv_co_flush  = vpc_co_flush,
     .bdrv_close     = vpc_close,
     .bdrv_create    = vpc_create,
 
+    .bdrv_read              = vpc_co_read,
+    .bdrv_write             = vpc_co_write,
+    .bdrv_co_flush_to_disk  = vpc_co_flush,
+
     .create_options = vpc_create_options,
 };
 
diff --git a/block_int.h b/block_int.h
index f4547f6d93..5aadc1fcee 100644
--- a/block_int.h
+++ b/block_int.h
@@ -84,10 +84,15 @@ struct BlockDriver {
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
     int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
-    int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);
     int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors);
 
+    /*
+     * Flushes all data that was already written to the OS all the way down to
+     * the disk (for example raw-posix calls fsync()).
+     */
+    int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);
+
     int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
         int num_reqs);
     int (*bdrv_merge_requests)(BlockDriverState *bs, BlockRequest* a,
-- 
cgit v1.2.1


From eb489bb1eceea0d710cd5f5122d37213300ceef6 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 10 Nov 2011 18:10:11 +0100
Subject: block: Introduce bdrv_co_flush_to_os

qcow2 has a writeback metadata cache, so flushing a qcow2 image actually
consists of writing back that cache to the protocol and only then flushes the
protocol in order to get everything stable on disk.

This introduces a separate bdrv_co_flush_to_os to reflect the split.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c       | 13 ++++++++++++-
 block/qcow2.c | 10 ++++++++--
 block_int.h   |  7 +++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 6521eaa546..b1a462956f 100644
--- a/block.c
+++ b/block.c
@@ -2789,11 +2789,22 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
+    int ret;
+
     if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return 0;
     } else if (!bs->drv) {
         return 0;
-    } else if (bs->drv->bdrv_co_flush_to_disk) {
+    }
+
+    if (bs->drv->bdrv_co_flush_to_os) {
+        ret = bs->drv->bdrv_co_flush_to_os(bs);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    if (bs->drv->bdrv_co_flush_to_disk) {
         return bs->drv->bdrv_co_flush_to_disk(bs);
     } else if (bs->drv->bdrv_aio_flush) {
         BlockDriverAIOCB *acb;
diff --git a/block/qcow2.c b/block/qcow2.c
index f7f73fe376..5c784eee51 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1105,7 +1105,7 @@ fail:
     return ret;
 }
 
-static int qcow2_co_flush(BlockDriverState *bs)
+static int qcow2_co_flush_to_os(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
@@ -1124,6 +1124,11 @@ static int qcow2_co_flush(BlockDriverState *bs)
     }
     qemu_co_mutex_unlock(&s->lock);
 
+    return 0;
+}
+
+static int qcow2_co_flush_to_disk(BlockDriverState *bs)
+{
     return bdrv_co_flush(bs->file);
 }
 
@@ -1245,7 +1250,8 @@ static BlockDriver bdrv_qcow2 = {
 
     .bdrv_co_readv          = qcow2_co_readv,
     .bdrv_co_writev         = qcow2_co_writev,
-    .bdrv_co_flush_to_disk  = qcow2_co_flush,
+    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
+    .bdrv_co_flush_to_disk  = qcow2_co_flush_to_disk,
 
     .bdrv_co_discard        = qcow2_co_discard,
     .bdrv_truncate          = qcow2_truncate,
diff --git a/block_int.h b/block_int.h
index 5aadc1fcee..1ec4921cc6 100644
--- a/block_int.h
+++ b/block_int.h
@@ -93,6 +93,13 @@ struct BlockDriver {
      */
     int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);
 
+    /*
+     * Flushes all internal caches to the OS. The data may still sit in a
+     * writeback cache of the host OS, but it will survive a crash of the qemu
+     * process.
+     */
+    int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);
+
     int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
         int num_reqs);
     int (*bdrv_merge_requests)(BlockDriverState *bs, BlockRequest* a,
-- 
cgit v1.2.1


From ca716364f045da710ddef7fc6420bd6760e45b56 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 10 Nov 2011 18:13:59 +0100
Subject: block: Make cache=unsafe flush to the OS

cache=unsafe completely ignored bdrv_flush, because flushing the host disk
costs a lot of performance. However, this means that qcow2 images (and
potentially any other format) can lose data even after the guest has issued a
flush if the qemu process crashes/is killed. In case of a host crash, data loss
is certainly expected with cache=unsafe, but if just the qemu process dies this
is a bit too unsafe.

Now that we have two separate flush functions, we can choose to flush
everythign to the OS, but don't enforce that it's physically written to the
disk.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index b1a462956f..86910b046c 100644
--- a/block.c
+++ b/block.c
@@ -2791,12 +2791,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
 
-    if (bs->open_flags & BDRV_O_NO_FLUSH) {
-        return 0;
-    } else if (!bs->drv) {
+    if (!bs->drv) {
         return 0;
     }
 
+    /* Write back cached data to the OS even with cache=unsafe */
     if (bs->drv->bdrv_co_flush_to_os) {
         ret = bs->drv->bdrv_co_flush_to_os(bs);
         if (ret < 0) {
@@ -2804,6 +2803,11 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
         }
     }
 
+    /* But don't actually force it to the disk with cache=unsafe */
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+        return 0;
+    }
+
     if (bs->drv->bdrv_co_flush_to_disk) {
         return bs->drv->bdrv_co_flush_to_disk(bs);
     } else if (bs->drv->bdrv_aio_flush) {
-- 
cgit v1.2.1