From 1b9f1491f82de0c8f9b09d1a06ac33304449a634 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 19 Sep 2011 11:26:48 +0200 Subject: qcow2: Unlock during COW Unlocking during COW allows for more parallelism. One change it requires is that buffers are dynamically allocated instead of just using a per-image buffer. While touching the code, drop the synchronous qcow2_read() function and replace it by a bdrv_read() call. Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 104 +++++++++++++++++--------------------------------- 1 file changed, 35 insertions(+), 69 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index f4e049fa90..0e337075b3 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -289,89 +289,51 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, } } - -static int qcow2_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVQcowState *s = bs->opaque; - int ret, index_in_cluster, n, n1; - uint64_t cluster_offset; - struct iovec iov; - QEMUIOVector qiov; - - while (nb_sectors > 0) { - n = nb_sectors; - - ret = qcow2_get_cluster_offset(bs, sector_num << 9, &n, - &cluster_offset); - if (ret < 0) { - return ret; - } - - index_in_cluster = sector_num & (s->cluster_sectors - 1); - if (!cluster_offset) { - if (bs->backing_hd) { - /* read from the base image */ - iov.iov_base = buf; - iov.iov_len = n * 512; - qemu_iovec_init_external(&qiov, &iov, 1); - - n1 = qcow2_backing_read1(bs->backing_hd, &qiov, sector_num, n); - if (n1 > 0) { - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING); - ret = bdrv_read(bs->backing_hd, sector_num, buf, n1); - if (ret < 0) - return -1; - } - } else { - memset(buf, 0, 512 * n); - } - } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { - if (qcow2_decompress_cluster(bs, cluster_offset) < 0) - return -1; - memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n); - } else { - BLKDBG_EVENT(bs->file, BLKDBG_READ); - ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512); - if (ret != n * 512) - return -1; - if (s->crypt_method) { - qcow2_encrypt_sectors(s, sector_num, buf, buf, n, 0, - &s->aes_decrypt_key); - } - } - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - return 0; -} - static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, uint64_t cluster_offset, int n_start, int n_end) { BDRVQcowState *s = bs->opaque; int n, ret; + void *buf; + + /* + * If this is the last cluster and it is only partially used, we must only + * copy until the end of the image, or bdrv_check_request will fail for the + * bdrv_read/write calls below. 
+ */ + if (start_sect + n_end > bs->total_sectors) { + n_end = bs->total_sectors - start_sect; + } n = n_end - n_start; - if (n <= 0) + if (n <= 0) { return 0; + } + + buf = qemu_blockalign(bs, n * BDRV_SECTOR_SIZE); + BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); - ret = qcow2_read(bs, start_sect + n_start, s->cluster_data, n); - if (ret < 0) - return ret; + ret = bdrv_read(bs, start_sect + n_start, buf, n); + if (ret < 0) { + goto out; + } + if (s->crypt_method) { qcow2_encrypt_sectors(s, start_sect + n_start, - s->cluster_data, - s->cluster_data, n, 1, + buf, buf, n, 1, &s->aes_encrypt_key); } + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start, - s->cluster_data, n); - if (ret < 0) - return ret; - return 0; + ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start, buf, n); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(buf); + return ret; } @@ -620,7 +582,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9; if (m->n_start) { cow = true; + qemu_co_mutex_unlock(&s->lock); ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start); + qemu_co_mutex_lock(&s->lock); if (ret < 0) goto err; } @@ -628,8 +592,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) if (m->nb_available & (s->cluster_sectors - 1)) { uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1); cow = true; + qemu_co_mutex_unlock(&s->lock); ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9), m->nb_available - end, s->cluster_sectors); + qemu_co_mutex_lock(&s->lock); if (ret < 0) goto err; } -- cgit v1.2.1 From aef4acb6616ab7fb5c105660aa8a2cee4e250e75 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 30 Nov 2011 12:23:41 +0000 Subject: qcow2: avoid reentrant bdrv_read() in copy_sectors() A BlockDriverState should not issue requests on itself through the public block layer interface. Nested, or reentrant, requests are problematic because they do I/O throttling and request tracking twice. Features like block layer copy-on-read use request tracking to avoid race conditions between concurrent requests. The reentrant request will have to "wait" for its parent request to complete. But the parent is waiting for the reentrant request to make progress so we have reached deadlock. The solution is for block drivers to avoid the public block layer interfaces for reentrant requests. Instead they should call their own internal functions if they wish to perform reentrant requests. This is also a good opportunity to make copy_sectors() a true coroutine_fn. That means calling bdrv_co_writev() instead of bdrv_write(). Behavior is unchanged but we're being explicit that this executes in coroutine context. 
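As a side note for readers following the buffer handling: the diff below keeps the bounce buffer but wraps it in a one-element QEMUIOVector so it can be passed to the vectored coroutine interfaces. A minimal sketch of that bounce-buffer-plus-iovec shape, written against plain POSIX preadv() rather than QEMU's block layer (the file descriptor, 512-byte alignment and offsets here are illustrative assumptions only):

    #define _GNU_SOURCE
    #include <stdlib.h>
    #include <sys/uio.h>
    #include <unistd.h>

    /* Allocate an aligned bounce buffer, wrap it in a single iovec and issue
     * one vectored read at a byte offset.  This only mirrors the shape of the
     * qemu_blockalign()/qemu_iovec_init_external() setup in the diff, not the
     * block layer's semantics. */
    static ssize_t read_bounce(int fd, size_t bytes, off_t offset, void **out)
    {
        void *buf;
        struct iovec iov;
        ssize_t ret;

        if (posix_memalign(&buf, 512, bytes) != 0) {
            return -1;
        }
        iov.iov_base = buf;
        iov.iov_len = bytes;

        ret = preadv(fd, &iov, 1, offset);
        if (ret < 0) {
            free(buf);
            return -1;
        }
        *out = buf;    /* caller frees */
        return ret;
    }
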
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 0e337075b3..07a2e936fd 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -289,12 +289,15 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, } } -static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, - uint64_t cluster_offset, int n_start, int n_end) +static int coroutine_fn copy_sectors(BlockDriverState *bs, + uint64_t start_sect, + uint64_t cluster_offset, + int n_start, int n_end) { BDRVQcowState *s = bs->opaque; + QEMUIOVector qiov; + struct iovec iov; int n, ret; - void *buf; /* * If this is the last cluster and it is only partially used, we must only @@ -310,29 +313,37 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, return 0; } - buf = qemu_blockalign(bs, n * BDRV_SECTOR_SIZE); + iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_blockalign(bs, iov.iov_len); + + qemu_iovec_init_external(&qiov, &iov, 1); BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); - ret = bdrv_read(bs, start_sect + n_start, buf, n); + + /* Call .bdrv_co_readv() directly instead of using the public block-layer + * interface. This avoids double I/O throttling and request tracking, + * which can lead to deadlock when block layer copy-on-read is enabled. + */ + ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); if (ret < 0) { goto out; } if (s->crypt_method) { qcow2_encrypt_sectors(s, start_sect + n_start, - buf, buf, n, 1, + iov.iov_base, iov.iov_base, n, 1, &s->aes_encrypt_key); } BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start, buf, n); + ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); if (ret < 0) { goto out; } ret = 0; out: - qemu_vfree(buf); + qemu_vfree(iov.iov_base); return ret; } -- cgit v1.2.1 From 23e9a39e7db60a8cbe4ac1291d4f4feab08930ce Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Tue, 1 Nov 2011 16:04:32 +0800 Subject: qed: adjust the way to get nb_sectors This patch is only to refactor some lines of codes to get better and more robust codes. As you have seen, in qed_read_table_cb() it's nice to use qiov->size because that function doesn't obviously use a single struct iovec. In other two functions, if qiov use more than one struct iovec, the existing way will get wrong nb_sectors. To make the code more robust, it will be nicer to refactor the existing way as below. 
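For illustration only (plain C, not QED's actual data structures): qiov->size is the sum of all iovec lengths in the vector, which is exactly why it stays correct when the QEMUIOVector holds more than one element, while a single iov.iov_len does not.

    #include <stddef.h>
    #include <sys/uio.h>

    /* Total payload of a scatter/gather vector, the quantity that qiov->size
     * tracks.  Reading only iov[0].iov_len would undercount whenever niov > 1. */
    static size_t sg_total_bytes(const struct iovec *iov, int niov)
    {
        size_t total = 0;
        int i;

        for (i = 0; i < niov; i++) {
            total += iov[i].iov_len;
        }
        return total;
    }
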
Signed-off-by: Zhi Yong Wu Acked-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qed-table.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/qed-table.c b/block/qed-table.c index f31f9ff069..8ee844346c 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -29,7 +29,7 @@ static void qed_read_table_cb(void *opaque, int ret) { QEDReadTableCB *read_table_cb = opaque; QEDTable *table = read_table_cb->table; - int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t); + int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); int i; /* Handle I/O error */ @@ -65,7 +65,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, - read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, + qiov->size / BDRV_SECTOR_SIZE, qed_read_table_cb, read_table_cb); if (!aiocb) { qed_read_table_cb(read_table_cb, -EIO); @@ -160,7 +160,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, &write_table_cb->qiov, - write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE, + write_table_cb->qiov.size / BDRV_SECTOR_SIZE, qed_write_table_cb, write_table_cb); if (!aiocb) { qed_write_table_cb(write_table_cb, -EIO); -- cgit v1.2.1 From 4e5b184d63b5f031800c3ac215eebac126b1ab31 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 28 Oct 2011 18:03:58 +0200 Subject: xen_disk: remove dead code Xen_disk.c has support for using synchronous I/O instead of asynchronous, but it is compiled out by default. Remove it. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- hw/xen_disk.c | 86 ++--------------------------------------------------------- 1 file changed, 2 insertions(+), 84 deletions(-) diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 286bbac54a..192e81746f 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -49,7 +49,6 @@ static int syncwrite = 0; static int batch_maps = 0; static int max_requests = 32; -static int use_aio = 1; /* ------------------------------------------------------------- */ @@ -314,76 +313,6 @@ static int ioreq_map(struct ioreq *ioreq) return 0; } -static int ioreq_runio_qemu_sync(struct ioreq *ioreq) -{ - struct XenBlkDev *blkdev = ioreq->blkdev; - int i, rc; - off_t pos; - - if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) { - goto err_no_map; - } - if (ioreq->presync) { - bdrv_flush(blkdev->bs); - } - - switch (ioreq->req.operation) { - case BLKIF_OP_READ: - pos = ioreq->start; - for (i = 0; i < ioreq->v.niov; i++) { - rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE, - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len / BLOCK_SIZE); - if (rc != 0) { - xen_be_printf(&blkdev->xendev, 0, "rd I/O error (%p, len %zd)\n", - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len); - goto err; - } - pos += ioreq->v.iov[i].iov_len; - } - break; - case BLKIF_OP_WRITE: - case BLKIF_OP_WRITE_BARRIER: - if (!ioreq->req.nr_segments) { - break; - } - pos = ioreq->start; - for (i = 0; i < ioreq->v.niov; i++) { - rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE, - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len / BLOCK_SIZE); - if (rc != 0) { - xen_be_printf(&blkdev->xendev, 0, "wr I/O error (%p, len %zd)\n", - ioreq->v.iov[i].iov_base, - ioreq->v.iov[i].iov_len); - goto err; - } - pos += ioreq->v.iov[i].iov_len; - } - break; - default: - /* unknown operation (shouldn't happen -- parse catches this) */ - goto err; - } - - if (ioreq->postsync) 
{ - bdrv_flush(blkdev->bs); - } - ioreq->status = BLKIF_RSP_OKAY; - - ioreq_unmap(ioreq); - ioreq_finish(ioreq); - return 0; - -err: - ioreq_unmap(ioreq); -err_no_map: - ioreq_finish(ioreq); - ioreq->status = BLKIF_RSP_ERROR; - return -1; -} - static void qemu_aio_complete(void *opaque, int ret) { struct ioreq *ioreq = opaque; @@ -554,9 +483,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev) rp = blkdev->rings.common.sring->req_prod; xen_rmb(); /* Ensure we see queued requests up to 'rp'. */ - if (use_aio) { - blk_send_response_all(blkdev); - } + blk_send_response_all(blkdev); while (rc != rp) { /* pull request from ring */ if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) { @@ -579,16 +506,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev) continue; } - if (use_aio) { - /* run i/o in aio mode */ - ioreq_runio_qemu_aio(ioreq); - } else { - /* run i/o in sync mode */ - ioreq_runio_qemu_sync(ioreq); - } - } - if (!use_aio) { - blk_send_response_all(blkdev); + ioreq_runio_qemu_aio(ioreq); } if (blkdev->more_work && blkdev->requests_inflight < max_requests) { -- cgit v1.2.1 From 3535a9c6bef5234ab4df3f143763a0e89a1ad29c Mon Sep 17 00:00:00 2001 From: Li Zhi Hui Date: Tue, 8 Nov 2011 14:21:13 +0800 Subject: block: Use bdrv functions to replace file operation in cow.c Since common file operation functions lack of error detection, so change them to bdrv series functions. Signed-off-by: Li Zhi Hui Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/cow.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/block/cow.c b/block/cow.c index 089d395c40..9858d71f00 100644 --- a/block/cow.c +++ b/block/cow.c @@ -243,12 +243,12 @@ static void cow_close(BlockDriverState *bs) static int cow_create(const char *filename, QEMUOptionParameter *options) { - int fd, cow_fd; struct cow_header_v2 cow_header; struct stat st; int64_t image_sectors = 0; const char *image_filename = NULL; int ret; + BlockDriverState *cow_bs; /* Read out options */ while (options && options->name) { @@ -260,10 +260,16 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) options++; } - cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, - 0644); - if (cow_fd < 0) - return -errno; + ret = bdrv_create_file(filename, options); + if (ret < 0) { + return ret; + } + + ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR); + if (ret < 0) { + return ret; + } + memset(&cow_header, 0, sizeof(cow_header)); cow_header.magic = cpu_to_be32(COW_MAGIC); cow_header.version = cpu_to_be32(COW_VERSION); @@ -271,16 +277,9 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) /* Note: if no file, we put a dummy mtime */ cow_header.mtime = cpu_to_be32(0); - fd = open(image_filename, O_RDONLY | O_BINARY); - if (fd < 0) { - close(cow_fd); - goto mtime_fail; - } - if (fstat(fd, &st) != 0) { - close(fd); + if (stat(image_filename, &st) != 0) { goto mtime_fail; } - close(fd); cow_header.mtime = cpu_to_be32(st.st_mtime); mtime_fail: pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file), @@ -288,21 +287,20 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) } cow_header.sectorsize = cpu_to_be32(512); cow_header.size = cpu_to_be64(image_sectors * 512); - ret = qemu_write_full(cow_fd, &cow_header, sizeof(cow_header)); + ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header)); if (ret != sizeof(cow_header)) { - ret = -errno; goto exit; } /* resize to include at least all the 
bitmap */ - ret = ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)); + ret = bdrv_truncate(cow_bs, + sizeof(cow_header) + ((image_sectors + 7) >> 3)); if (ret) { - ret = -errno; goto exit; } exit: - close(cow_fd); + bdrv_delete(cow_bs); return ret; } -- cgit v1.2.1 From 0563e191516289c9d2f282a8c50f2eecef2fa773 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Thu, 3 Nov 2011 16:57:25 +0800 Subject: block: add the blockio limits command line support Signed-off-by: Zhi Yong Wu Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 39 +++++++++++++++++++++++++++++++++++++++ block.h | 4 ++++ block_int.h | 29 +++++++++++++++++++++++++++++ blockdev.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ qemu-config.c | 24 ++++++++++++++++++++++++ qemu-options.hx | 1 + 6 files changed, 141 insertions(+) diff --git a/block.c b/block.c index d0158877d6..8cb41c0b41 100644 --- a/block.c +++ b/block.c @@ -30,6 +30,7 @@ #include "qjson.h" #include "qemu-coroutine.h" #include "qmp-commands.h" +#include "qemu-timer.h" #ifdef CONFIG_BSD #include @@ -105,6 +106,36 @@ int is_windows_drive(const char *filename) } #endif +/* throttling disk I/O limits */ +static void bdrv_block_timer(void *opaque) +{ + BlockDriverState *bs = opaque; + + qemu_co_queue_next(&bs->throttled_reqs); +} + +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + qemu_co_queue_init(&bs->throttled_reqs); + bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); + bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; + bs->slice_start = qemu_get_clock_ns(vm_clock); + bs->slice_end = bs->slice_start + bs->slice_time; + memset(&bs->io_base, 0, sizeof(bs->io_base)); + bs->io_limits_enabled = true; +} + +bool bdrv_io_limits_enabled(BlockDriverState *bs) +{ + BlockIOLimit *io_limits = &bs->io_limits; + return io_limits->bps[BLOCK_IO_LIMIT_READ] + || io_limits->bps[BLOCK_IO_LIMIT_WRITE] + || io_limits->bps[BLOCK_IO_LIMIT_TOTAL] + || io_limits->iops[BLOCK_IO_LIMIT_READ] + || io_limits->iops[BLOCK_IO_LIMIT_WRITE] + || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]; +} + /* check if the path starts with ":" */ static int path_has_protocol(const char *path) { @@ -1526,6 +1557,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs, *psecs = bs->secs; } +/* throttling disk io limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits) +{ + bs->io_limits = *io_limits; + bs->io_limits_enabled = bdrv_io_limits_enabled(bs); +} + /* Recognize floppy formats */ typedef struct FDFormat { FDriveType drive; diff --git a/block.h b/block.h index a826059897..2d244088c3 100644 --- a/block.h +++ b/block.h @@ -98,6 +98,10 @@ void bdrv_info(Monitor *mon, QObject **ret_data); void bdrv_stats_print(Monitor *mon, const QObject *data); void bdrv_info_stats(Monitor *mon, QObject **ret_data); +/* disk I/O throttling */ +void bdrv_io_limits_enable(BlockDriverState *bs); +bool bdrv_io_limits_enabled(BlockDriverState *bs); + void bdrv_init(void); void bdrv_init_with_whitelist(void); BlockDriver *bdrv_find_protocol(const char *filename); diff --git a/block_int.h b/block_int.h index 77c0187c3d..97b1c2be54 100644 --- a/block_int.h +++ b/block_int.h @@ -34,6 +34,12 @@ #define BLOCK_FLAG_ENCRYPT 1 #define BLOCK_FLAG_COMPAT6 4 +#define BLOCK_IO_LIMIT_READ 0 +#define BLOCK_IO_LIMIT_WRITE 1 +#define BLOCK_IO_LIMIT_TOTAL 2 + +#define BLOCK_IO_SLICE_TIME 100000000 + #define BLOCK_OPT_SIZE "size" #define BLOCK_OPT_ENCRYPT "encryption" #define BLOCK_OPT_COMPAT6 "compat6" @@ -50,6 +56,16 @@ typedef struct AIOPool { BlockDriverAIOCB *free_aiocb; 
} AIOPool; +typedef struct BlockIOLimit { + int64_t bps[3]; + int64_t iops[3]; +} BlockIOLimit; + +typedef struct BlockIOBaseValue { + uint64_t bytes[2]; + uint64_t ios[2]; +} BlockIOBaseValue; + struct BlockDriver { const char *format_name; int instance_size; @@ -201,6 +217,16 @@ struct BlockDriverState { void *sync_aiocb; + /* the time for latest disk I/O */ + int64_t slice_time; + int64_t slice_start; + int64_t slice_end; + BlockIOLimit io_limits; + BlockIOBaseValue io_base; + CoQueue throttled_reqs; + QEMUTimer *block_timer; + bool io_limits_enabled; + /* I/O stats (display with "info blockstats"). */ uint64_t nr_bytes[BDRV_MAX_IOTYPE]; uint64_t nr_ops[BDRV_MAX_IOTYPE]; @@ -244,6 +270,9 @@ void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque); void qemu_aio_release(void *p); +void bdrv_set_io_limits(BlockDriverState *bs, + BlockIOLimit *io_limits); + #ifdef _WIN32 int is_windows_drive(const char *filename); #endif diff --git a/blockdev.c b/blockdev.c index 222818690d..bbe5f11be7 100644 --- a/blockdev.c +++ b/blockdev.c @@ -216,6 +216,26 @@ static int parse_block_error_action(const char *buf, int is_read) } } +static bool do_check_io_limits(BlockIOLimit *io_limits) +{ + bool bps_flag; + bool iops_flag; + + assert(io_limits); + + bps_flag = (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] != 0) + && ((io_limits->bps[BLOCK_IO_LIMIT_READ] != 0) + || (io_limits->bps[BLOCK_IO_LIMIT_WRITE] != 0)); + iops_flag = (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] != 0) + && ((io_limits->iops[BLOCK_IO_LIMIT_READ] != 0) + || (io_limits->iops[BLOCK_IO_LIMIT_WRITE] != 0)); + if (bps_flag || iops_flag) { + return false; + } + + return true; +} + DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) { const char *buf; @@ -235,6 +255,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) int on_read_error, on_write_error; const char *devaddr; DriveInfo *dinfo; + BlockIOLimit io_limits; int snapshot = 0; int ret; @@ -353,6 +374,26 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) } } + /* disk I/O throttling */ + io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = + qemu_opt_get_number(opts, "bps", 0); + io_limits.bps[BLOCK_IO_LIMIT_READ] = + qemu_opt_get_number(opts, "bps_rd", 0); + io_limits.bps[BLOCK_IO_LIMIT_WRITE] = + qemu_opt_get_number(opts, "bps_wr", 0); + io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = + qemu_opt_get_number(opts, "iops", 0); + io_limits.iops[BLOCK_IO_LIMIT_READ] = + qemu_opt_get_number(opts, "iops_rd", 0); + io_limits.iops[BLOCK_IO_LIMIT_WRITE] = + qemu_opt_get_number(opts, "iops_wr", 0); + + if (!do_check_io_limits(&io_limits)) { + error_report("bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr) " + "cannot be used at the same time"); + return NULL; + } + on_write_error = BLOCK_ERR_STOP_ENOSPC; if ((buf = qemu_opt_get(opts, "werror")) != NULL) { if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) { @@ -460,6 +501,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error); + /* disk I/O throttling */ + bdrv_set_io_limits(dinfo->bdrv, &io_limits); + switch(type) { case IF_IDE: case IF_SCSI: diff --git a/qemu-config.c b/qemu-config.c index 597d7e10b1..1aa080fbcb 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -85,6 +85,30 @@ static QemuOptsList qemu_drive_opts = { .name = "readonly", .type = QEMU_OPT_BOOL, .help = "open drive file as read-only", + },{ + .name = "iops", + .type = QEMU_OPT_NUMBER, + .help = "limit total I/O operations per second", + },{ + .name = 
"iops_rd", + .type = QEMU_OPT_NUMBER, + .help = "limit read operations per second", + },{ + .name = "iops_wr", + .type = QEMU_OPT_NUMBER, + .help = "limit write operations per second", + },{ + .name = "bps", + .type = QEMU_OPT_NUMBER, + .help = "limit total bytes per second", + },{ + .name = "bps_rd", + .type = QEMU_OPT_NUMBER, + .help = "limit read bytes per second", + },{ + .name = "bps_wr", + .type = QEMU_OPT_NUMBER, + .help = "limit write bytes per second", }, { /* end of list */ } }, diff --git a/qemu-options.hx b/qemu-options.hx index 681eaf198e..25a7be7948 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -136,6 +136,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" " [,readonly=on|off]\n" + " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n" " use 'file' as a drive image\n", QEMU_ARCH_ALL) STEXI @item -drive @var{option}[,@var{option}[,@var{option}[,...]]] -- cgit v1.2.1 From e9e6295b28b331762e67d466f77ba07a349edbbc Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Thu, 3 Nov 2011 16:57:26 +0800 Subject: CoQueue: introduce qemu_co_queue_wait_insert_head Signed-off-by: Zhi Yong Wu Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- qemu-coroutine-lock.c | 8 ++++++++ qemu-coroutine.h | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c index 6b58160058..9549c075ee 100644 --- a/qemu-coroutine-lock.c +++ b/qemu-coroutine-lock.c @@ -61,6 +61,14 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue) assert(qemu_in_coroutine()); } +void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue) +{ + Coroutine *self = qemu_coroutine_self(); + QTAILQ_INSERT_HEAD(&queue->entries, self, co_queue_next); + qemu_coroutine_yield(); + assert(qemu_in_coroutine()); +} + bool qemu_co_queue_next(CoQueue *queue) { Coroutine *next; diff --git a/qemu-coroutine.h b/qemu-coroutine.h index b8fc4f4332..8a2e5d2a10 100644 --- a/qemu-coroutine.h +++ b/qemu-coroutine.h @@ -117,6 +117,12 @@ void qemu_co_queue_init(CoQueue *queue); */ void coroutine_fn qemu_co_queue_wait(CoQueue *queue); +/** + * Adds the current coroutine to the head of the CoQueue and transfers control to the + * caller of the coroutine. + */ +void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue); + /** * Restarts the next coroutine in the CoQueue and removes it from the queue. 
* -- cgit v1.2.1 From 98f90dba5ee56f699b28509a6cc7a9a8a57636eb Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Tue, 8 Nov 2011 13:00:14 +0800 Subject: block: add I/O throttling algorithm Signed-off-by: Zhi Yong Wu Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ block.h | 1 + block_int.h | 1 + 3 files changed, 236 insertions(+) diff --git a/block.c b/block.c index 8cb41c0b41..42bd308639 100644 --- a/block.c +++ b/block.c @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait); +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait); + static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename) #endif /* throttling disk I/O limits */ +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + while (qemu_co_queue_next(&bs->throttled_reqs)); + + if (bs->block_timer) { + qemu_del_timer(bs->block_timer); + qemu_free_timer(bs->block_timer); + bs->block_timer = NULL; + } + + bs->slice_start = 0; + bs->slice_end = 0; + bs->slice_time = 0; + memset(&bs->io_base, 0, sizeof(bs->io_base)); +} + static void bdrv_block_timer(void *opaque) { BlockDriverState *bs = opaque; @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs) || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]; } +static void bdrv_io_limits_intercept(BlockDriverState *bs, + bool is_write, int nb_sectors) +{ + int64_t wait_time = -1; + + if (!qemu_co_queue_empty(&bs->throttled_reqs)) { + qemu_co_queue_wait(&bs->throttled_reqs); + } + + /* In fact, we hope to keep each request's timing, in FIFO mode. The next + * throttled requests will not be dequeued until the current request is + * allowed to be serviced. So if the current request still exceeds the + * limits, it will be inserted to the head. All requests followed it will + * be still in throttled_reqs queue. 
+ */ + + while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) { + qemu_mod_timer(bs->block_timer, + wait_time + qemu_get_clock_ns(vm_clock)); + qemu_co_queue_wait_insert_head(&bs->throttled_reqs); + } + + qemu_co_queue_next(&bs->throttled_reqs); +} + /* check if the path starts with ":" */ static int path_has_protocol(const char *path) { @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags, bdrv_dev_change_media_cb(bs, true); } + /* throttling disk I/O limits */ + if (bs->io_limits_enabled) { + bdrv_io_limits_enable(bs); + } + return 0; unlink_and_fail: @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs) bdrv_dev_change_media_cb(bs, false); } + + /*throttling disk I/O limits*/ + if (bs->io_limits_enabled) { + bdrv_io_limits_disable(bs); + } } void bdrv_close_all(void) @@ -1298,6 +1358,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, return -EIO; } + /* throttling disk read I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, false, nb_sectors); + } + return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); } @@ -1328,6 +1393,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return -EIO; } + /* throttling disk write I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, true, nb_sectors); + } + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); if (bs->dirty_bitmap) { @@ -2519,6 +2589,170 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb) acb->pool->cancel(acb); } +/* block I/O throttling */ +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, double elapsed_time, uint64_t *wait) +{ + uint64_t bps_limit = 0; + double bytes_limit, bytes_base, bytes_res; + double slice_time, wait_time; + + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.bps[is_write]) { + bps_limit = bs->io_limits.bps[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + bytes_limit = bps_limit * slice_time; + bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write]; + if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { + bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write]; + } + + /* bytes_base: the bytes of data which have been read/written; and + * it is obtained from the history statistic info. + * bytes_res: the remaining bytes of data which need to be read/written. + * (bytes_base + bytes_res) / bps_limit: used to calcuate + * the total time for completing reading/writting all data. + */ + bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; + + if (bytes_base + bytes_res <= bytes_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time; + + /* When the I/O rate at runtime exceeds the limits, + * bs->slice_end need to be extended in order that the current statistic + * info can be kept until the timer fire, so it is increased and tuned + * based on the result of experiment. 
+ */ + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, + double elapsed_time, uint64_t *wait) +{ + uint64_t iops_limit = 0; + double ios_limit, ios_base; + double slice_time, wait_time; + + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + } else if (bs->io_limits.iops[is_write]) { + iops_limit = bs->io_limits.iops[is_write]; + } else { + if (wait) { + *wait = 0; + } + + return false; + } + + slice_time = bs->slice_end - bs->slice_start; + slice_time /= (NANOSECONDS_PER_SECOND); + ios_limit = iops_limit * slice_time; + ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write]; + if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { + ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write]; + } + + if (ios_base + 1 <= ios_limit) { + if (wait) { + *wait = 0; + } + + return false; + } + + /* Calc approx time to dispatch */ + wait_time = (ios_base + 1) / iops_limit; + if (wait_time > elapsed_time) { + wait_time = wait_time - elapsed_time; + } else { + wait_time = 0; + } + + bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10; + bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME; + if (wait) { + *wait = wait_time * BLOCK_IO_SLICE_TIME * 10; + } + + return true; +} + +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, + bool is_write, int64_t *wait) +{ + int64_t now, max_wait; + uint64_t bps_wait = 0, iops_wait = 0; + double elapsed_time; + int bps_ret, iops_ret; + + now = qemu_get_clock_ns(vm_clock); + if ((bs->slice_start < now) + && (bs->slice_end > now)) { + bs->slice_end = now + bs->slice_time; + } else { + bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; + bs->slice_start = now; + bs->slice_end = now + bs->slice_time; + + bs->io_base.bytes[is_write] = bs->nr_bytes[is_write]; + bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write]; + + bs->io_base.ios[is_write] = bs->nr_ops[is_write]; + bs->io_base.ios[!is_write] = bs->nr_ops[!is_write]; + } + + elapsed_time = now - bs->slice_start; + elapsed_time /= (NANOSECONDS_PER_SECOND); + + bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, + is_write, elapsed_time, &bps_wait); + iops_ret = bdrv_exceed_iops_limits(bs, is_write, + elapsed_time, &iops_wait); + if (bps_ret || iops_ret) { + max_wait = bps_wait > iops_wait ? 
bps_wait : iops_wait; + if (wait) { + *wait = max_wait; + } + + now = qemu_get_clock_ns(vm_clock); + if (bs->slice_end < now + max_wait) { + bs->slice_end = now + max_wait; + } + + return true; + } + + if (wait) { + *wait = 0; + } + + return false; +} /**************************************************************/ /* async block device emulation */ diff --git a/block.h b/block.h index 2d244088c3..83e17cad96 100644 --- a/block.h +++ b/block.h @@ -100,6 +100,7 @@ void bdrv_info_stats(Monitor *mon, QObject **ret_data); /* disk I/O throttling */ void bdrv_io_limits_enable(BlockDriverState *bs); +void bdrv_io_limits_disable(BlockDriverState *bs); bool bdrv_io_limits_enabled(BlockDriverState *bs); void bdrv_init(void); diff --git a/block_int.h b/block_int.h index 97b1c2be54..e2799e4f6b 100644 --- a/block_int.h +++ b/block_int.h @@ -39,6 +39,7 @@ #define BLOCK_IO_LIMIT_TOTAL 2 #define BLOCK_IO_SLICE_TIME 100000000 +#define NANOSECONDS_PER_SECOND 1000000000.0 #define BLOCK_OPT_SIZE "size" #define BLOCK_OPT_ENCRYPT "encryption" -- cgit v1.2.1 From 727f005e6a5416ea903d0ccc2428cbdc663aa1d2 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Tue, 8 Nov 2011 13:00:31 +0800 Subject: hmp/qmp: add block_set_io_throttle Signed-off-by: Zhi Yong Wu Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 15 ++++++++++++++ blockdev.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ blockdev.h | 2 ++ hmp-commands.hx | 15 ++++++++++++++ hmp.c | 10 ++++++++++ qapi-schema.json | 16 ++++++++++++++- qerror.c | 4 ++++ qerror.h | 3 +++ qmp-commands.hx | 53 +++++++++++++++++++++++++++++++++++++++++++++++++- 9 files changed, 175 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index 42bd308639..5f979bddae 100644 --- a/block.c +++ b/block.c @@ -1978,6 +1978,21 @@ BlockInfoList *qmp_query_block(Error **errp) info->value->inserted->has_backing_file = true; info->value->inserted->backing_file = g_strdup(bs->backing_file); } + + if (bs->io_limits_enabled) { + info->value->inserted->bps = + bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; + info->value->inserted->bps_rd = + bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; + info->value->inserted->bps_wr = + bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; + info->value->inserted->iops = + bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; + info->value->inserted->iops_rd = + bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; + info->value->inserted->iops_wr = + bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; + } } /* XXX: waiting for the qapi to support GSList */ diff --git a/blockdev.c b/blockdev.c index bbe5f11be7..9068c5b1d0 100644 --- a/blockdev.c +++ b/blockdev.c @@ -759,6 +759,65 @@ int do_change_block(Monitor *mon, const char *device, return monitor_read_bdrv_key_start(mon, bs, NULL, NULL); } +/* throttling disk I/O limits */ +int do_block_set_io_throttle(Monitor *mon, + const QDict *qdict, QObject **ret_data) +{ + BlockIOLimit io_limits; + const char *devname = qdict_get_str(qdict, "device"); + BlockDriverState *bs; + + io_limits.bps[BLOCK_IO_LIMIT_TOTAL] + = qdict_get_try_int(qdict, "bps", -1); + io_limits.bps[BLOCK_IO_LIMIT_READ] + = qdict_get_try_int(qdict, "bps_rd", -1); + io_limits.bps[BLOCK_IO_LIMIT_WRITE] + = qdict_get_try_int(qdict, "bps_wr", -1); + io_limits.iops[BLOCK_IO_LIMIT_TOTAL] + = qdict_get_try_int(qdict, "iops", -1); + io_limits.iops[BLOCK_IO_LIMIT_READ] + = qdict_get_try_int(qdict, "iops_rd", -1); + io_limits.iops[BLOCK_IO_LIMIT_WRITE] + = qdict_get_try_int(qdict, "iops_wr", -1); + + bs = bdrv_find(devname); + if (!bs) { + qerror_report(QERR_DEVICE_NOT_FOUND, 
devname); + return -1; + } + + if ((io_limits.bps[BLOCK_IO_LIMIT_TOTAL] == -1) + || (io_limits.bps[BLOCK_IO_LIMIT_READ] == -1) + || (io_limits.bps[BLOCK_IO_LIMIT_WRITE] == -1) + || (io_limits.iops[BLOCK_IO_LIMIT_TOTAL] == -1) + || (io_limits.iops[BLOCK_IO_LIMIT_READ] == -1) + || (io_limits.iops[BLOCK_IO_LIMIT_WRITE] == -1)) { + qerror_report(QERR_MISSING_PARAMETER, + "bps/bps_rd/bps_wr/iops/iops_rd/iops_wr"); + return -1; + } + + if (!do_check_io_limits(&io_limits)) { + qerror_report(QERR_INVALID_PARAMETER_COMBINATION); + return -1; + } + + bs->io_limits = io_limits; + bs->slice_time = BLOCK_IO_SLICE_TIME; + + if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) { + bdrv_io_limits_enable(bs); + } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) { + bdrv_io_limits_disable(bs); + } else { + if (bs->block_timer) { + qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock)); + } + } + + return 0; +} + int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) { const char *id = qdict_get_str(qdict, "id"); diff --git a/blockdev.h b/blockdev.h index 3587786a64..1b48a75499 100644 --- a/blockdev.h +++ b/blockdev.h @@ -63,6 +63,8 @@ int do_block_set_passwd(Monitor *mon, const QDict *qdict, QObject **ret_data); int do_change_block(Monitor *mon, const char *device, const char *filename, const char *fmt); int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data); +int do_block_set_io_throttle(Monitor *mon, + const QDict *qdict, QObject **ret_data); int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data); int do_block_resize(Monitor *mon, const QDict *qdict, QObject **ret_data); diff --git a/hmp-commands.hx b/hmp-commands.hx index 089c1ac23d..f8d855e3be 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1206,6 +1206,21 @@ ETEXI .mhandler.cmd_new = do_block_set_passwd, }, +STEXI +@item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr} +@findex block_set_io_throttle +Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr} +ETEXI + + { + .name = "block_set_io_throttle", + .args_type = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l", + .params = "device bps bps_rd bps_wr iops iops_rd iops_wr", + .help = "change I/O throttle limits for a block drive", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_block_set_io_throttle, + }, + STEXI @item block_passwd @var{device} @var{password} @findex block_passwd diff --git a/hmp.c b/hmp.c index 443d3a7ff4..dfab7ad9bb 100644 --- a/hmp.c +++ b/hmp.c @@ -216,6 +216,16 @@ void hmp_info_block(Monitor *mon) info->value->inserted->ro, info->value->inserted->drv, info->value->inserted->encrypted); + + monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64 + " bps_wr=%" PRId64 " iops=%" PRId64 + " iops_rd=%" PRId64 " iops_wr=%" PRId64, + info->value->inserted->bps, + info->value->inserted->bps_rd, + info->value->inserted->bps_wr, + info->value->inserted->iops, + info->value->inserted->iops_rd, + info->value->inserted->iops_wr); } else { monitor_printf(mon, " [not inserted]"); } diff --git a/qapi-schema.json b/qapi-schema.json index cb1ba776df..fbbdbe0914 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -370,13 +370,27 @@ # # @encrypted: true if the backing device is encrypted # +# @bps: total throughput limit in bytes per second is specified +# +# @bps_rd: read throughput limit in bytes per second is specified +# +# @bps_wr: write throughput limit in 
bytes per second is specified +# +# @iops: total I/O operations per second is specified +# +# @iops_rd: read I/O operations per second is specified +# +# @iops_wr: write I/O operations per second is specified +# # Since: 0.14.0 # # Notes: This interface is only found in @BlockInfo. ## { 'type': 'BlockDeviceInfo', 'data': { 'file': 'str', 'ro': 'bool', 'drv': 'str', - '*backing_file': 'str', 'encrypted': 'bool' } } + '*backing_file': 'str', 'encrypted': 'bool', + 'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int', + 'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int'} } ## # @BlockDeviceIoStatus: diff --git a/qerror.c b/qerror.c index fdf62b9e27..d679d67e9e 100644 --- a/qerror.c +++ b/qerror.c @@ -246,6 +246,10 @@ static const QErrorStringTable qerror_table[] = { .error_fmt = QERR_QGA_COMMAND_FAILED, .desc = "Guest agent command failed, error was '%(message)'", }, + { + .error_fmt = QERR_INVALID_PARAMETER_COMBINATION, + .desc = "Invalid paramter combination", + }, {} }; diff --git a/qerror.h b/qerror.h index 2d3d43bed1..560d458aea 100644 --- a/qerror.h +++ b/qerror.h @@ -204,4 +204,7 @@ QError *qobject_to_qerror(const QObject *obj); #define QERR_QGA_COMMAND_FAILED \ "{ 'class': 'QgaCommandFailed', 'data': { 'message': %s } }" +#define QERR_INVALID_PARAMETER_COMBINATION \ + "{ 'class': 'InvalidParameterCombination', 'data': {} }" + #endif /* QERROR_H */ diff --git a/qmp-commands.hx b/qmp-commands.hx index 97975a5207..94da2a8ef2 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -848,6 +848,44 @@ Example: "password": "12345" } } <- { "return": {} } +EQMP + + { + .name = "block_set_io_throttle", + .args_type = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l", + .params = "device bps bps_rd bps_wr iops iops_rd iops_wr", + .help = "change I/O throttle limits for a block drive", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_block_set_io_throttle, + }, + +SQMP +block_set_io_throttle +------------ + +Change I/O throttle limits for a block drive. + +Arguments: + +- "device": device name (json-string) +- "bps": total throughput limit in bytes per second(json-int) +- "bps_rd": read throughput limit in bytes per second(json-int) +- "bps_wr": read throughput limit in bytes per second(json-int) +- "iops": total I/O operations per second(json-int) +- "iops_rd": read I/O operations per second(json-int) +- "iops_wr": write I/O operations per second(json-int) + +Example: + +-> { "execute": "block_set_io_throttle", "arguments": { "device": "virtio0", + "bps": "1000000", + "bps_rd": "0", + "bps_wr": "0", + "iops": "0", + "iops_rd": "0", + "iops_wr": "0" } } +<- { "return": {} } + EQMP { @@ -1152,6 +1190,13 @@ Each json-object contain the following: "tftp", "vdi", "vmdk", "vpc", "vvfat" - "backing_file": backing file name (json-string, optional) - "encrypted": true if encrypted, false otherwise (json-bool) + - "bps": limit total bytes per second (json-int) + - "bps_rd": limit read bytes per second (json-int) + - "bps_wr": limit write bytes per second (json-int) + - "iops": limit total I/O operations per second (json-int) + - "iops_rd": limit read operations per second (json-int) + - "iops_wr": limit write operations per second (json-int) + - "io-status": I/O operation status, only present if the device supports it and the VM is configured to stop on errors. 
It's always reset to "ok" when the "cont" command is issued (json_string, optional) @@ -1171,7 +1216,13 @@ Example: "ro":false, "drv":"qcow2", "encrypted":false, - "file":"disks/test.img" + "file":"disks/test.img", + "bps":1000000, + "bps_rd":0, + "bps_wr":0, + "iops":1000000, + "iops_rd":0, + "iops_wr":0, }, "type":"unknown" }, -- cgit v1.2.1 From a968168c580cb45a700e0b218c7f6871d91ee257 Mon Sep 17 00:00:00 2001 From: Dong Xu Wang Date: Thu, 10 Nov 2011 16:23:22 +0800 Subject: block: Add coroutine_fn marker to coroutine functions Looks better when reviewing these source files. Signed-off-by: Dong Xu Wang Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow.c | 4 ++-- block/qcow2.c | 8 ++++---- block/sheepdog.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 4814ed0ced..326ef878e6 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -433,7 +433,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) return 0; } -static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; @@ -531,7 +531,7 @@ fail: goto done; } -static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; diff --git a/block/qcow2.c b/block/qcow2.c index d7805ce943..a6a4f4772b 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -377,7 +377,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, return n1; } -static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; @@ -517,7 +517,7 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) } } -static int qcow2_co_writev(BlockDriverState *bs, +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov) @@ -1137,7 +1137,7 @@ fail: return ret; } -static int qcow2_co_flush_to_os(BlockDriverState *bs) +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; int ret; @@ -1159,7 +1159,7 @@ static int qcow2_co_flush_to_os(BlockDriverState *bs) return 0; } -static int qcow2_co_flush_to_disk(BlockDriverState *bs) +static coroutine_fn int qcow2_co_flush_to_disk(BlockDriverState *bs) { return bdrv_co_flush(bs->file); } diff --git a/block/sheepdog.c b/block/sheepdog.c index 62f1f3a0cf..aa9707f2ae 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1715,7 +1715,7 @@ out: return 1; } -static int sd_co_writev(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { SheepdogAIOCB *acb; @@ -1744,7 +1744,7 @@ static int sd_co_writev(BlockDriverState *bs, int64_t sector_num, return acb->ret; } -static int sd_co_readv(BlockDriverState *bs, int64_t sector_num, +static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { SheepdogAIOCB *acb; -- cgit v1.2.1 From 42deb29fed92577db57c54e844f852481600b756 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 11:43:28 +0100 Subject: qcow2: Return real error code in 
qcow2_read_snapshots Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 25 ++++++++++++++++++++----- block/qcow2.c | 5 +++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index bdc33ba94c..4134bbc253 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -68,6 +68,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) int i, id_str_size, name_size; int64_t offset; uint32_t extra_data_size; + int ret; if (!s->nb_snapshots) { s->snapshots = NULL; @@ -77,10 +78,15 @@ int qcow2_read_snapshots(BlockDriverState *bs) offset = s->snapshots_offset; s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot)); + for(i = 0; i < s->nb_snapshots; i++) { + /* Read statically sized part of the snapshot header */ offset = align_offset(offset, 8); - if (bdrv_pread(bs->file, offset, &h, sizeof(h)) != sizeof(h)) + ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { goto fail; + } + offset += sizeof(h); sn = s->snapshots + i; sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); @@ -94,25 +100,34 @@ int qcow2_read_snapshots(BlockDriverState *bs) id_str_size = be16_to_cpu(h.id_str_size); name_size = be16_to_cpu(h.name_size); + /* Skip extra data */ offset += extra_data_size; + /* Read snapshot ID */ sn->id_str = g_malloc(id_str_size + 1); - if (bdrv_pread(bs->file, offset, sn->id_str, id_str_size) != id_str_size) + ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { goto fail; + } offset += id_str_size; sn->id_str[id_str_size] = '\0'; + /* Read snapshot name */ sn->name = g_malloc(name_size + 1); - if (bdrv_pread(bs->file, offset, sn->name, name_size) != name_size) + ret = bdrv_pread(bs->file, offset, sn->name, name_size); + if (ret < 0) { goto fail; + } offset += name_size; sn->name[name_size] = '\0'; } + s->snapshots_size = offset - s->snapshots_offset; return 0; - fail: + +fail: qcow2_free_snapshots(bs); - return -1; + return ret; } /* add at the end of the file a new list of snapshots */ diff --git a/block/qcow2.c b/block/qcow2.c index a6a4f4772b..3f8a1281d7 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -273,8 +273,9 @@ static int qcow2_open(BlockDriverState *bs, int flags) } bs->backing_file[len] = '\0'; } - if (qcow2_read_snapshots(bs) < 0) { - ret = -EINVAL; + + ret = qcow2_read_snapshots(bs); + if (ret < 0) { goto fail; } -- cgit v1.2.1 From 07fd8779001c3e6ff3ac25416a31381ff8aca308 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 12:00:59 +0100 Subject: qcow2: Return real error code in qcow2_write_snapshots Doesn't immediately fix anything as the callers don't use the return value, but they will be fixed next. 
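A minimal sketch of the error-propagation shape this patch converts qcow2_write_snapshots() to, with a made-up helper standing in for the bdrv_pwrite() calls: keep the negative errno from the failing call and return it from the common exit label instead of a bare -1.

    /* write_one() is a stand-in that returns 0 on success or -errno on
     * failure; only the ret/goto-fail pattern matters here. */
    static int write_all(int (*write_one)(int idx), int count)
    {
        int ret;
        int i;

        for (i = 0; i < count; i++) {
            ret = write_one(i);
            if (ret < 0) {
                goto fail;
            }
        }

        return 0;

    fail:
        return ret;    /* callers see the real error code, not -1 */
    }
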
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 4134bbc253..e91a286333 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -140,6 +140,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) uint64_t data64; uint32_t data32; int64_t offset, snapshots_offset; + int ret; /* compute the size of the snapshots */ offset = 0; @@ -152,6 +153,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) } snapshots_size = offset; + /* Allocate space for the new snapshot list */ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); bdrv_flush(bs->file); offset = snapshots_offset; @@ -159,6 +161,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) return offset; } + /* Write all snapshots to the new list */ for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; memset(&h, 0, sizeof(h)); @@ -174,34 +177,59 @@ static int qcow2_write_snapshots(BlockDriverState *bs) h.id_str_size = cpu_to_be16(id_str_size); h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); - if (bdrv_pwrite_sync(bs->file, offset, &h, sizeof(h)) < 0) + + ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); + if (ret < 0) { goto fail; + } offset += sizeof(h); - if (bdrv_pwrite_sync(bs->file, offset, sn->id_str, id_str_size) < 0) + + ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); + if (ret < 0) { goto fail; + } offset += id_str_size; - if (bdrv_pwrite_sync(bs->file, offset, sn->name, name_size) < 0) + + ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); + if (ret < 0) { goto fail; + } offset += name_size; } - /* update the various header fields */ + /* + * Update the header to point to the new snapshot table. This requires the + * new table and its refcounts to be stable on disk. + * + * FIXME This should be done with a single write + */ + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + data64 = cpu_to_be64(snapshots_offset); - if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, snapshots_offset), - &data64, sizeof(data64)) < 0) + ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, snapshots_offset), + &data64, sizeof(data64)); + if (ret < 0) { goto fail; + } + data32 = cpu_to_be32(s->nb_snapshots); - if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), - &data32, sizeof(data32)) < 0) + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), + &data32, sizeof(data32)); + if (ret < 0) { goto fail; + } /* free the old snapshot table */ qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size); s->snapshots_offset = snapshots_offset; s->snapshots_size = snapshots_size; return 0; - fail: - return -1; + +fail: + return ret; } static void find_new_snapshot_id(BlockDriverState *bs, -- cgit v1.2.1 From d69969c4046a81af106e723ff1bc2740365e67c1 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 18 Nov 2011 18:27:00 +0100 Subject: qcow2: Update snapshot table information at once Failing in the middle wouldn't help with the integrity of the image, so doing everything in a single request seems better. 
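The single combined write only works because nb_snapshots and snapshots_offset are adjacent in the on-disk header, which the patch asserts with QEMU_BUILD_BUG_ON. A standalone sketch of the same compile-time guard, using a hypothetical header layout and C11 _Static_assert in place of the QEMU macro:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical header tail; only the two fields rewritten together matter. */
    struct __attribute__((packed)) header_tail {
        uint32_t nb_snapshots;
        uint64_t snapshots_offset;
    };

    /* If the fields were ever reordered or padded apart, one write covering
     * both values would corrupt the header, so fail the build instead. */
    _Static_assert(offsetof(struct header_tail, snapshots_offset) ==
                   offsetof(struct header_tail, nb_snapshots) + sizeof(uint32_t),
                   "header fields must be contiguous for a single write");
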
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index e91a286333..1976df7f00 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -137,8 +137,10 @@ static int qcow2_write_snapshots(BlockDriverState *bs) QCowSnapshot *sn; QCowSnapshotHeader h; int i, name_size, id_str_size, snapshots_size; - uint64_t data64; - uint32_t data32; + struct { + uint32_t nb_snapshots; + uint64_t snapshots_offset; + } QEMU_PACKED header_data; int64_t offset, snapshots_offset; int ret; @@ -200,24 +202,20 @@ static int qcow2_write_snapshots(BlockDriverState *bs) /* * Update the header to point to the new snapshot table. This requires the * new table and its refcounts to be stable on disk. - * - * FIXME This should be done with a single write */ ret = bdrv_flush(bs); if (ret < 0) { goto fail; } - data64 = cpu_to_be64(snapshots_offset); - ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, snapshots_offset), - &data64, sizeof(data64)); - if (ret < 0) { - goto fail; - } + QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != + offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + + header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); + header_data.snapshots_offset = cpu_to_be64(snapshots_offset); - data32 = cpu_to_be32(s->nb_snapshots); ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), - &data32, sizeof(data32)); + &header_data, sizeof(header_data)); if (ret < 0) { goto fail; } -- cgit v1.2.1 From 03343166f703d5c8f02b8519f8493c56e5541ae7 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 17:46:29 +0100 Subject: qcow2: Cleanups and memleak fix in qcow2_snapshot_create sn->id_str could be leaked before this. The rest of this patch changes comments, fixes coding style or removes checks that are unnecessary with g_malloc. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 1976df7f00..53d9233ba9 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -284,21 +284,20 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) memset(sn, 0, sizeof(*sn)); + /* Generate an ID if it wasn't passed */ if (sn_info->id_str[0] == '\0') { - /* compute a new id */ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); } - /* check that the ID is unique */ - if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) + /* Check that the ID is unique */ + if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { return -ENOENT; + } + /* Populate sn with passed data */ sn->id_str = g_strdup(sn_info->id_str); - if (!sn->id_str) - goto fail; sn->name = g_strdup(sn_info->name); - if (!sn->name) - goto fail; + sn->vm_state_size = sn_info->vm_state_size; sn->date_sec = sn_info->date_sec; sn->date_nsec = sn_info->date_nsec; @@ -308,7 +307,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (ret < 0) goto fail; - /* create the L1 table of the snapshot */ + /* Allocate the L1 table of the snapshot and copy the current one there. 
*/ l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); if (l1_table_offset < 0) { goto fail; @@ -318,12 +317,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) sn->l1_table_offset = l1_table_offset; sn->l1_size = s->l1_size; - if (s->l1_size != 0) { - l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); - } else { - l1_table = NULL; - } - + l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); for(i = 0; i < s->l1_size; i++) { l1_table[i] = cpu_to_be64(s->l1_table[i]); } @@ -350,7 +344,9 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) } #endif return 0; - fail: + +fail: + g_free(sn->id_str); g_free(sn->name); g_free(l1_table); return -1; -- cgit v1.2.1 From d1ea98d56dc2485b4637a1ca19feef786b3aee8f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 12:43:59 +0100 Subject: qcow2: Rework qcow2_snapshot_create error handling Increase refcounts only after allocating a new L1 table has succeeded in order to make leaks less likely. If writing the snapshot table fails, revert in-memory state to be consistent with that on disk. While at it, make it return the real error codes instead of -1. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 55 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 53d9233ba9..a8add9c857 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -277,7 +277,9 @@ static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) { BDRVQcowState *s = bs->opaque; - QCowSnapshot *snapshots1, sn1, *sn = &sn1; + QCowSnapshot *new_snapshot_list = NULL; + QCowSnapshot *old_snapshot_list = NULL; + QCowSnapshot sn1, *sn = &sn1; int i, ret; uint64_t *l1_table = NULL; int64_t l1_table_offset; @@ -303,16 +305,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) sn->date_nsec = sn_info->date_nsec; sn->vm_clock_nsec = sn_info->vm_clock_nsec; - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); - if (ret < 0) - goto fail; - /* Allocate the L1 table of the snapshot and copy the current one there. */ l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); if (l1_table_offset < 0) { + ret = l1_table_offset; goto fail; } - bdrv_flush(bs->file); sn->l1_table_offset = l1_table_offset; sn->l1_size = s->l1_size; @@ -321,22 +319,50 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) for(i = 0; i < s->l1_size; i++) { l1_table[i] = cpu_to_be64(s->l1_table[i]); } - if (bdrv_pwrite_sync(bs->file, sn->l1_table_offset, - l1_table, s->l1_size * sizeof(uint64_t)) < 0) + + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { goto fail; + } + g_free(l1_table); l1_table = NULL; - snapshots1 = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); + /* + * Increase the refcounts of all clusters and make sure everything is + * stable on disk before updating the snapshot table to contain a pointer + * to the new L1 table. 
+ */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + /* Append the new snapshot to the snapshot list */ + new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); if (s->snapshots) { - memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot)); - g_free(s->snapshots); + memcpy(new_snapshot_list, s->snapshots, + s->nb_snapshots * sizeof(QCowSnapshot)); + old_snapshot_list = s->snapshots; } - s->snapshots = snapshots1; + s->snapshots = new_snapshot_list; s->snapshots[s->nb_snapshots++] = *sn; - if (qcow2_write_snapshots(bs) < 0) + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + g_free(s->snapshots); + s->snapshots = old_snapshot_list; goto fail; + } + + g_free(old_snapshot_list); + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -349,7 +375,8 @@ fail: g_free(sn->id_str); g_free(sn->name); g_free(l1_table); - return -1; + + return ret; } /* copy the snapshot 'snapshot_name' into the current disk image */ -- cgit v1.2.1 From 589f284b7690bbb4ed37170590bae9527ed31b42 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 15:04:11 +0100 Subject: qcow2: Return real error in qcow2_snapshot_goto Besides fixing the return code, this adds some comments that make clear how the code works and that it potentially breaks images if we fail in the wrong place. Actually fixing this is left for the next patch. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 51 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index a8add9c857..1ee22eb28a 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -386,17 +386,32 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) QCowSnapshot *sn; int i, snapshot_index; int cur_l1_bytes, sn_l1_bytes; + int ret; + /* Search the snapshot */ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); - if (snapshot_index < 0) + if (snapshot_index < 0) { return -ENOENT; + } sn = &s->snapshots[snapshot_index]; - if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0) + /* Decrease refcount of clusters of current L1 table. + * FIXME This is too early! */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, + s->l1_size, -1); + if (ret < 0) { goto fail; + } - if (qcow2_grow_l1_table(bs, sn->l1_size, true) < 0) + /* + * Make sure that the current L1 table is big enough to contain the whole + * L1 table of the snapshot. If the snapshot L1 table is smaller, the + * current one must be padded with zeros. + */ + ret = qcow2_grow_l1_table(bs, sn->l1_size, true); + if (ret < 0) { goto fail; + } cur_l1_bytes = s->l1_size * sizeof(uint64_t); sn_l1_bytes = sn->l1_size * sizeof(uint64_t); @@ -405,19 +420,32 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) memset(s->l1_table + sn->l1_size, 0, cur_l1_bytes - sn_l1_bytes); } - /* copy the snapshot l1 table to the current l1 table */ - if (bdrv_pread(bs->file, sn->l1_table_offset, - s->l1_table, sn_l1_bytes) < 0) + /* + * Copy the snapshot L1 table to the current L1 table. + * + * Before overwriting the old current L1 table on disk, make sure to + * increase all refcounts for the clusters referenced by the new one. 
+ */ + ret = bdrv_pread(bs->file, sn->l1_table_offset, s->l1_table, sn_l1_bytes); + if (ret < 0) { goto fail; - if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, - s->l1_table, cur_l1_bytes) < 0) + } + + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, + cur_l1_bytes); + if (ret < 0) { goto fail; + } + for(i = 0;i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); } - if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0) + /* FIXME This is too late! */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { goto fail; + } #ifdef DEBUG_ALLOC { @@ -426,8 +454,9 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) } #endif return 0; - fail: - return -EIO; + +fail: + return ret; } int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) -- cgit v1.2.1 From 43a0cac4658bbee9c9e84554712a94daa092c1cd Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 15:20:45 +0100 Subject: qcow2: Fix order of refcount updates in qcow2_snapshot_goto The refcount updates must be moved so that in the worst case we can get cluster leaks, but refcounts may never be too low. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-refcount.c | 7 +++++- block/qcow2-snapshot.c | 61 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 9605367777..2db2ede3d1 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -700,6 +700,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, l2_table = NULL; l1_table = NULL; l1_size2 = l1_size * sizeof(uint64_t); + + /* WARNING: qcow2_snapshot_goto relies on this function not using the + * l1_table_offset when it is the current s->l1_table_offset! Be careful + * when changing this! */ if (l1_table_offset != s->l1_table_offset) { if (l1_size2 != 0) { l1_table = g_malloc0(align_offset(l1_size2, 512)); @@ -819,7 +823,8 @@ fail: qcow2_cache_set_writethrough(bs, s->refcount_block_cache, old_refcount_writethrough); - if (l1_modified) { + /* Update L1 only if it isn't deleted anyway (addend = -1) */ + if (addend >= 0 && l1_modified) { for(i = 0; i < l1_size; i++) cpu_to_be64s(&l1_table[i]); if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 1ee22eb28a..9fb3ff0253 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -387,6 +387,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) int i, snapshot_index; int cur_l1_bytes, sn_l1_bytes; int ret; + uint64_t *sn_l1_table = NULL; /* Search the snapshot */ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); @@ -395,14 +396,6 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) } sn = &s->snapshots[snapshot_index]; - /* Decrease refcount of clusters of current L1 table. - * FIXME This is too early! */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, - s->l1_size, -1); - if (ret < 0) { - goto fail; - } - /* * Make sure that the current L1 table is big enough to contain the whole * L1 table of the snapshot. 
If the snapshot L1 table is smaller, the @@ -416,33 +409,66 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) cur_l1_bytes = s->l1_size * sizeof(uint64_t); sn_l1_bytes = sn->l1_size * sizeof(uint64_t); - if (cur_l1_bytes > sn_l1_bytes) { - memset(s->l1_table + sn->l1_size, 0, cur_l1_bytes - sn_l1_bytes); - } - /* * Copy the snapshot L1 table to the current L1 table. * * Before overwriting the old current L1 table on disk, make sure to * increase all refcounts for the clusters referenced by the new one. + * Decrease the refcount referenced by the old one only when the L1 + * table is overwritten. */ - ret = bdrv_pread(bs->file, sn->l1_table_offset, s->l1_table, sn_l1_bytes); + sn_l1_table = g_malloc0(cur_l1_bytes); + + ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); + if (ret < 0) { + goto fail; + } + + ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, + sn->l1_size, 1); if (ret < 0) { goto fail; } - ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, cur_l1_bytes); if (ret < 0) { goto fail; } + /* + * Decrease refcount of clusters of current L1 table. + * + * At this point, the in-memory s->l1_table points to the old L1 table, + * whereas on disk we already have the new one. + * + * qcow2_update_snapshot_refcount special cases the current L1 table to use + * the in-memory data instead of really using the offset to load a new one, + * which is why this works. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, + s->l1_size, -1); + + /* + * Now update the in-memory L1 table to be in sync with the on-disk one. We + * need to do this even if updating refcounts failed. + */ for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); + s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); } - /* FIXME This is too late! */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + g_free(sn_l1_table); + sn_l1_table = NULL; + + /* + * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed + * when we decreased the refcount of the old snapshot. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); if (ret < 0) { goto fail; } @@ -456,6 +482,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) return 0; fail: + g_free(sn_l1_table); return ret; } -- cgit v1.2.1 From 9a4767809fe9ac184806bef38be2e2a84e451a65 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 17:22:10 +0100 Subject: qcow2: Fix order in qcow2_snapshot_delete First the snapshot must be deleted and only then the refcounts can be decreased. 
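For illustration, the resulting control flow boils down to the following outline (helper names are placeholders, not the actual qcow2 functions; error handling trimmed):

    /* 1. Unlink the snapshot from the on-disk snapshot table first.  If we
     *    crash or fail after this point, the snapshot's clusters are merely
     *    leaked; nothing that is still referenced gets freed. */
    remove_snapshot_from_table(bs, sn);

    /* 2. Only then drop the references the snapshot held and free its L1
     *    table.  With the old order, a failure after the refcount decrease
     *    could leave refcounts lower than the real number of users. */
    decrease_snapshot_refcounts(bs, sn);
    free_snapshot_l1_table(bs, sn);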
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 9fb3ff0253..e959ef263e 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -489,32 +489,50 @@ fail: int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) { BDRVQcowState *s = bs->opaque; - QCowSnapshot *sn; + QCowSnapshot sn; int snapshot_index, ret; + /* Search the snapshot */ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); - if (snapshot_index < 0) + if (snapshot_index < 0) { return -ENOENT; - sn = &s->snapshots[snapshot_index]; + } + sn = s->snapshots[snapshot_index]; - ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1); - if (ret < 0) + /* Remove it from the snapshot list */ + memmove(s->snapshots + snapshot_index, + s->snapshots + snapshot_index + 1, + (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); + s->nb_snapshots--; + ret = qcow2_write_snapshots(bs); + if (ret < 0) { return ret; - /* must update the copied flag on the current cluster offsets */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); - if (ret < 0) + } + + /* + * The snapshot is now unused, clean up. If we fail after this point, we + * won't recover but just leak clusters. + */ + g_free(sn.id_str); + g_free(sn.name); + + /* + * Now decrease the refcounts of clusters referenced by the snapshot and + * free the L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, + sn.l1_size, -1); + if (ret < 0) { return ret; - qcow2_free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t)); + } + qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t)); - g_free(sn->id_str); - g_free(sn->name); - memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn)); - s->nb_snapshots--; - ret = qcow2_write_snapshots(bs); + /* must update the copied flag on the current cluster offsets */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); if (ret < 0) { - /* XXX: restore snapshot if error ? */ return ret; } + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; -- cgit v1.2.1 From e3f652b33228e16e117a93fb919c4e1e4753f5a5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 16 Nov 2011 17:30:33 +0100 Subject: qcow2: Fix error path in qcow2_snapshot_load_tmp If the bdrv_read() of the snapshot's L1 table fails, return the right error code and make sure that the old L1 table is still loaded and we don't break the BlockDriverState completely. 
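The fix follows the usual prepare-then-commit pattern; roughly (simplified from the diff below):

    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));

    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
    if (ret < 0) {
        g_free(new_l1_table);
        return ret;        /* the old L1 table is still active and intact */
    }

    /* Commit: switch to the new table only after the read succeeded */
    g_free(s->l1_table);
    s->l1_table = new_l1_table;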
Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-snapshot.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index e959ef263e..c3112bf71a 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -573,32 +573,42 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) { - int i, snapshot_index, l1_size2; + int i, snapshot_index; BDRVQcowState *s = bs->opaque; QCowSnapshot *sn; + uint64_t *new_l1_table; + int new_l1_bytes; + int ret; + assert(bs->read_only); + + /* Search the snapshot */ snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); if (snapshot_index < 0) { return -ENOENT; } - sn = &s->snapshots[snapshot_index]; - s->l1_size = sn->l1_size; - l1_size2 = s->l1_size * sizeof(uint64_t); - if (s->l1_table != NULL) { - g_free(s->l1_table); - } - s->l1_table_offset = sn->l1_table_offset; - s->l1_table = g_malloc0(align_offset(l1_size2, 512)); + /* Allocate and read in the snapshot's L1 table */ + new_l1_bytes = s->l1_size * sizeof(uint64_t); + new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); - if (bdrv_pread(bs->file, sn->l1_table_offset, - s->l1_table, l1_size2) != l1_size2) { - return -1; + ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); + if (ret < 0) { + g_free(new_l1_table); + return ret; } + /* Switch the L1 table */ + g_free(s->l1_table); + + s->l1_size = sn->l1_size; + s->l1_table_offset = sn->l1_table_offset; + s->l1_table = new_l1_table; + for(i = 0;i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); } + return 0; } -- cgit v1.2.1 From 05c4af54c670f5143bd4ac5d79aa1ef53a9f31ca Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:18 +0000 Subject: block: use public bdrv_is_allocated() interface There is no need for bdrv_commit() to use the BlockDriver .bdrv_is_allocated() interface directly. Converting to the public interface gives us the freedom to drop .bdrv_is_allocated() entirely in favor of a new .bdrv_co_is_allocated() in the future. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block.c b/block.c index 5f979bddae..0d5bee0653 100644 --- a/block.c +++ b/block.c @@ -1013,7 +1013,7 @@ int bdrv_commit(BlockDriverState *bs) buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); for (sector = 0; sector < total_sectors; sector += n) { - if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { + if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { if (bdrv_read(bs, sector, buf, n) != 0) { ret = -EIO; -- cgit v1.2.1 From 376ae3f1cb5a08f53c9425bfaf90b3f70ab240f1 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:19 +0000 Subject: block: add .bdrv_co_is_allocated() This patch adds the .bdrv_co_is_allocated() interface which is identical to .bdrv_is_allocated() but runs in coroutine context. Running in coroutine context implies that other coroutines might be performing I/O at the same time. Therefore it must be safe to run while the following BlockDriver functions are in-flight: .bdrv_co_readv() .bdrv_co_writev() .bdrv_co_flush() .bdrv_co_is_allocated() The new .bdrv_co_is_allocated() interface is useful because it can be used when a VM is running, whereas .bdrv_is_allocated() is a synchronous interface that does not cope with parallel requests. 
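A driver wires up the new callback as sketched below (the driver name and its lookup function are hypothetical; only the callback signature matches the block_int.h addition in the diff):

    static int coroutine_fn mydrv_co_is_allocated(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
    {
        /* Runs in coroutine context, possibly concurrently with
         * .bdrv_co_readv()/.bdrv_co_writev(), so any shared driver state
         * must be protected, e.g. with the driver's CoMutex. */
        return mydrv_cluster_lookup(bs, sector_num, nb_sectors, pnum);
    }

    static BlockDriver bdrv_mydrv = {
        /* ... */
        .bdrv_co_is_allocated = mydrv_co_is_allocated,
    };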
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 37 +++++++++++++++++++++++++++++++++++++ block_int.h | 2 ++ 2 files changed, 39 insertions(+) diff --git a/block.c b/block.c index 0d5bee0653..611c429be9 100644 --- a/block.c +++ b/block.c @@ -1887,6 +1887,26 @@ int bdrv_has_zero_init(BlockDriverState *bs) return 1; } +typedef struct BdrvCoIsAllocatedData { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int *pnum; + int ret; + bool done; +} BdrvCoIsAllocatedData; + +/* Coroutine wrapper for bdrv_is_allocated() */ +static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) +{ + BdrvCoIsAllocatedData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bs->drv->bdrv_co_is_allocated(bs, data->sector_num, + data->nb_sectors, data->pnum); + data->done = true; +} + /* * Returns true iff the specified sector is present in the disk image. Drivers * not implementing the functionality are assumed to not support backing files, @@ -1902,6 +1922,23 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { int64_t n; + if (bs->drv->bdrv_co_is_allocated) { + Coroutine *co; + BdrvCoIsAllocatedData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + co = qemu_coroutine_create(bdrv_is_allocated_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } + return data.ret; + } if (!bs->drv->bdrv_is_allocated) { if (sector_num >= bs->total_sectors) { *pnum = 0; diff --git a/block_int.h b/block_int.h index e2799e4f6b..d5b5457d90 100644 --- a/block_int.h +++ b/block_int.h @@ -103,6 +103,8 @@ struct BlockDriver { int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs, int64_t sector_num, int nb_sectors); + int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum); /* * Invalidate any cached meta-data. -- cgit v1.2.1 From b7d5a5b8ae5425a4dee69b043b1f334237b3b06a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:20 +0000 Subject: qed: convert to .bdrv_co_is_allocated() The bdrv_qed_is_allocated() function is a synchronous wrapper around qed_find_cluster(), which performs the cluster lookup. In order to convert the synchronous function to a coroutine function we yield instead of using qemu_aio_wait(). Note that QED's cache is already safe for parallel requests so no locking is needed. 
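The conversion follows a common pattern for turning a callback-based operation into a coroutine_fn: record the current coroutine, yield, and let the completion callback re-enter it. Stripped down (the state struct and callback are simplified versions of the qed code in the diff below):

    typedef struct {
        Coroutine *co;
        bool done;
    } WaitState;

    static void completion_cb(void *opaque, int ret)
    {
        WaitState *s = opaque;
        s->done = true;
        if (s->co) {
            qemu_coroutine_enter(s->co, NULL);  /* wake the sleeping coroutine */
        }
    }

    /* ...in the coroutine_fn caller, after kicking off the operation: */
    while (!state.done) {
        state.co = qemu_coroutine_self();
        qemu_coroutine_yield();
    }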
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qed.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/block/qed.c b/block/qed.c index 7e22e77b9d..22e467202f 100644 --- a/block/qed.c +++ b/block/qed.c @@ -661,6 +661,7 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) } typedef struct { + Coroutine *co; int is_allocated; int *pnum; } QEDIsAllocatedCB; @@ -670,10 +671,14 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l QEDIsAllocatedCB *cb = opaque; *cb->pnum = len / BDRV_SECTOR_SIZE; cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } } -static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) { BDRVQEDState *s = bs->opaque; uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; @@ -686,8 +691,10 @@ static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num, qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + /* Now sleep if the callback wasn't invoked immediately */ while (cb.is_allocated == -1) { - qemu_aio_wait(); + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); } qed_unref_l2_cache_entry(request.l2_table); @@ -1485,7 +1492,7 @@ static BlockDriver bdrv_qed = { .bdrv_open = bdrv_qed_open, .bdrv_close = bdrv_qed_close, .bdrv_create = bdrv_qed_create, - .bdrv_is_allocated = bdrv_qed_is_allocated, + .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, .bdrv_make_empty = bdrv_qed_make_empty, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, -- cgit v1.2.1 From f8a2e5e3ca6146d4cc66a4750daf44a0cf043319 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:21 +0000 Subject: block: convert qcow2, qcow2, and vmdk to .bdrv_co_is_allocated() The qcow2, qcow, and vmdk block drivers are based on coroutines. They have a coroutine mutex which protects internal state. We can convert the .bdrv_is_allocated() function to .bdrv_co_is_allocated() by holding the mutex around the cluster lookup operation. 
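In each driver the change is mechanical: mark the function coroutine_fn and take the CoMutex around the metadata lookup. Schematically (the state type and lookup function stand in for the per-driver ones):

    static int coroutine_fn drv_co_is_allocated(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum)
    {
        BDRVSomeState *s = bs->opaque;   /* placeholder for the driver state */
        int ret;

        qemu_co_mutex_lock(&s->lock);
        ret = do_cluster_lookup(bs, sector_num, nb_sectors, pnum);
        qemu_co_mutex_unlock(&s->lock);

        return ret;
    }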
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow.c | 8 +++++--- block/qcow2.c | 13 ++++++++----- block/vmdk.c | 8 +++++--- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 326ef878e6..b16955d764 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -368,14 +368,16 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, return cluster_offset; } -static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; int index_in_cluster, n; uint64_t cluster_offset; + qemu_co_mutex_lock(&s->lock); cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + qemu_co_mutex_unlock(&s->lock); index_in_cluster = sector_num & (s->cluster_sectors - 1); n = s->cluster_sectors - index_in_cluster; if (n > nb_sectors) @@ -844,7 +846,7 @@ static BlockDriver bdrv_qcow = { .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, .bdrv_co_flush_to_disk = qcow_co_flush, - .bdrv_is_allocated = qcow_is_allocated, + .bdrv_co_is_allocated = qcow_co_is_allocated, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, diff --git a/block/qcow2.c b/block/qcow2.c index 3f8a1281d7..5ac9fb4828 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -344,16 +344,19 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) return 0; } -static int qcow2_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { + BDRVQcowState *s = bs->opaque; uint64_t cluster_offset; int ret; *pnum = nb_sectors; - /* FIXME We can get errors here, but the bdrv_is_allocated interface can't - * pass them on today */ + /* FIXME We can get errors here, but the bdrv_co_is_allocated interface + * can't pass them on today */ + qemu_co_mutex_lock(&s->lock); ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + qemu_co_mutex_unlock(&s->lock); if (ret < 0) { *pnum = 0; } @@ -1277,7 +1280,7 @@ static BlockDriver bdrv_qcow2 = { .bdrv_open = qcow2_open, .bdrv_close = qcow2_close, .bdrv_create = qcow2_create, - .bdrv_is_allocated = qcow2_is_allocated, + .bdrv_co_is_allocated = qcow2_co_is_allocated, .bdrv_set_key = qcow2_set_key, .bdrv_make_empty = qcow2_make_empty, diff --git a/block/vmdk.c b/block/vmdk.c index f5441591d7..5623ac10cd 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -861,8 +861,8 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { BDRVVmdkState *s = bs->opaque; int64_t index_in_cluster, n, ret; @@ -873,8 +873,10 @@ static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num, if (!extent) { return 0; } + qemu_co_mutex_lock(&s->lock); ret = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0, &offset); + qemu_co_mutex_unlock(&s->lock); /* get_cluster_offset returning 0 means success */ ret = !ret; @@ -1596,7 +1598,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_is_allocated = vmdk_is_allocated, + .bdrv_co_is_allocated = vmdk_co_is_allocated, 
.bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .create_options = vmdk_create_options, -- cgit v1.2.1 From 73f703ca8f6c9def3c353b67ead01afd10e440c7 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:22 +0000 Subject: vvfat: convert to .bdrv_co_is_allocated() It is trivial to switch from the synchronous .bdrv_is_allocated() interface to .bdrv_co_is_allocated() since vvfat_is_allocated() does not block. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/vvfat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/vvfat.c b/block/vvfat.c index a310ce8c3e..eeffc4a4a8 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -2758,7 +2758,7 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } -static int vvfat_is_allocated(BlockDriverState *bs, +static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int* n) { BDRVVVFATState* s = bs->opaque; @@ -2855,7 +2855,7 @@ static BlockDriver bdrv_vvfat = { .bdrv_read = vvfat_co_read, .bdrv_write = vvfat_co_write, .bdrv_close = vvfat_close, - .bdrv_is_allocated = vvfat_is_allocated, + .bdrv_co_is_allocated = vvfat_co_is_allocated, .protocol_name = "fat", }; -- cgit v1.2.1 From e850b35a1f41304c31b971bde95f0534f2afd791 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:23 +0000 Subject: vdi: convert to .bdrv_co_is_allocated() It is trivial to switch from the synchronous .bdrv_is_allocated() interface to .bdrv_co_is_allocated() since vdi_is_allocated() does not block. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/vdi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/vdi.c b/block/vdi.c index 02da6b44d0..e1d8cffc2d 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -472,8 +472,8 @@ static int vdi_open(BlockDriverState *bs, int flags) return -1; } -static int vdi_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) { /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */ BDRVVdiState *s = (BDRVVdiState *)bs->opaque; @@ -996,7 +996,7 @@ static BlockDriver bdrv_vdi = { .bdrv_close = vdi_close, .bdrv_create = vdi_create, .bdrv_co_flush_to_disk = vdi_co_flush, - .bdrv_is_allocated = vdi_is_allocated, + .bdrv_co_is_allocated = vdi_co_is_allocated, .bdrv_make_empty = vdi_make_empty, .bdrv_aio_readv = vdi_aio_readv, -- cgit v1.2.1 From 81145834d39897c6f153ac26a4077f90f269c5fc Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:24 +0000 Subject: cow: convert to .bdrv_co_is_allocated() The cow block driver does not keep internal state for cluster lookups. This means it is safe to perform cluster lookups in coroutine context without risk of race conditions that corrupt internal state. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/cow.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/cow.c b/block/cow.c index 9858d71f00..7ae887b670 100644 --- a/block/cow.c +++ b/block/cow.c @@ -132,8 +132,8 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) /* Return true if first block has been changed (ie. current version is * in COW file). Set the number of continuous blocks for which that * is true. 
*/ -static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *num_same) +static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *num_same) { int changed; @@ -178,7 +178,7 @@ static int cow_read(BlockDriverState *bs, int64_t sector_num, int ret, n; while (nb_sectors > 0) { - if (cow_is_allocated(bs, sector_num, nb_sectors, &n)) { + if (bdrv_is_allocated(bs, sector_num, nb_sectors, &n)) { ret = bdrv_pread(bs->file, s->cow_sectors_offset + sector_num * 512, buf, n * 512); @@ -335,7 +335,7 @@ static BlockDriver bdrv_cow = { .bdrv_read = cow_co_read, .bdrv_write = cow_co_write, .bdrv_co_flush_to_disk = cow_co_flush, - .bdrv_is_allocated = cow_is_allocated, + .bdrv_co_is_allocated = cow_co_is_allocated, .create_options = cow_create_options, }; -- cgit v1.2.1 From 6aebab140d1891813628f0148c6c4d66b5c1dd66 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:25 +0000 Subject: block: drop .bdrv_is_allocated() interface Now that all block drivers have been converted to .bdrv_co_is_allocated() we can drop .bdrv_is_allocated(). Note that the public bdrv_is_allocated() interface is still available but is in fact a synchronous wrapper around .bdrv_co_is_allocated(). Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 38 ++++++++++++++++++-------------------- block_int.h | 2 -- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/block.c b/block.c index 611c429be9..50b058d210 100644 --- a/block.c +++ b/block.c @@ -1921,25 +1921,8 @@ static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - int64_t n; - if (bs->drv->bdrv_co_is_allocated) { - Coroutine *co; - BdrvCoIsAllocatedData data = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - co = qemu_coroutine_create(bdrv_is_allocated_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - qemu_aio_wait(); - } - return data.ret; - } - if (!bs->drv->bdrv_is_allocated) { + if (!bs->drv->bdrv_co_is_allocated) { + int64_t n; if (sector_num >= bs->total_sectors) { *pnum = 0; return 0; @@ -1948,7 +1931,22 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, *pnum = (n < nb_sectors) ? 
(n) : (nb_sectors); return 1; } - return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum); + + Coroutine *co; + BdrvCoIsAllocatedData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + co = qemu_coroutine_create(bdrv_is_allocated_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } + return data.ret; } void bdrv_mon_event(const BlockDriverState *bdrv, diff --git a/block_int.h b/block_int.h index d5b5457d90..a0b46d71d1 100644 --- a/block_int.h +++ b/block_int.h @@ -80,8 +80,6 @@ struct BlockDriver { const uint8_t *buf, int nb_sectors); void (*bdrv_close)(BlockDriverState *bs); int (*bdrv_create)(const char *filename, QEMUOptionParameter *options); - int (*bdrv_is_allocated)(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum); int (*bdrv_set_key)(BlockDriverState *bs, const char *key); int (*bdrv_make_empty)(BlockDriverState *bs); /* aio */ -- cgit v1.2.1 From 060f51c9dee3c58e2748c773ef1f7142047a4a2f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 14 Nov 2011 12:44:26 +0000 Subject: block: add bdrv_co_is_allocated() interface This patch introduces the public bdrv_co_is_allocated() interface which can be used to query image allocation status while the VM is running. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 37 ++++++++++++++++++++++++------------- block.h | 2 ++ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/block.c b/block.c index 50b058d210..d82854a3f7 100644 --- a/block.c +++ b/block.c @@ -1896,17 +1896,6 @@ typedef struct BdrvCoIsAllocatedData { bool done; } BdrvCoIsAllocatedData; -/* Coroutine wrapper for bdrv_is_allocated() */ -static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) -{ - BdrvCoIsAllocatedData *data = opaque; - BlockDriverState *bs = data->bs; - - data->ret = bs->drv->bdrv_co_is_allocated(bs, data->sector_num, - data->nb_sectors, data->pnum); - data->done = true; -} - /* * Returns true iff the specified sector is present in the disk image. Drivers * not implementing the functionality are assumed to not support backing files, @@ -1918,8 +1907,8 @@ static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) * * 'nb_sectors' is the max value 'pnum' should be set to. */ -int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { if (!bs->drv->bdrv_co_is_allocated) { int64_t n; @@ -1932,6 +1921,28 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, return 1; } + return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum); +} + +/* Coroutine wrapper for bdrv_is_allocated() */ +static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) +{ + BdrvCoIsAllocatedData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_is_allocated(). + * + * See bdrv_co_is_allocated() for details. 
+ */ +int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) +{ Coroutine *co; BdrvCoIsAllocatedData data = { .bs = bs, diff --git a/block.h b/block.h index 83e17cad96..8482dbce75 100644 --- a/block.h +++ b/block.h @@ -143,6 +143,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); +int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum); int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); -- cgit v1.2.1 From 3951690a4a29e031492090131d001e5047938631 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2011 13:40:25 +0000 Subject: qemu-common: add QEMU_ALIGN_DOWN() and QEMU_ALIGN_UP() macros Add macros for aligning a number to a multiple, for example: QEMU_ALIGN_DOWN(500, 2000) = 0 QEMU_ALIGN_UP(500, 2000) = 2000 Since ALIGN_UP() is a common macro name use the QEMU_* namespace prefix. Hopefully this will protect us from included headers that leak something with a similar name. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- qemu-common.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/qemu-common.h b/qemu-common.h index 2ce47aa12d..44870fe523 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -341,6 +341,12 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) return res.ll; } +/* Round number down to multiple */ +#define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m)) + +/* Round number up to multiple */ +#define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m)) + #include "module.h" #endif -- cgit v1.2.1 From e8ee5e4c476d5b0654d8f1271a2b7c065acc486e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2011 13:40:26 +0000 Subject: coroutine: add qemu_co_queue_restart_all() It's common to wake up all waiting coroutines. Introduce the qemu_co_queue_restart_all() function to do this instead of looping over qemu_co_queue_next() in every caller. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2.c | 2 +- qemu-coroutine-lock.c | 15 ++++++++------- qemu-coroutine.h | 5 +++++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 5ac9fb4828..a2be7d7949 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -516,7 +516,7 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) /* Restart all dependent requests */ if (!qemu_co_queue_empty(&m->dependent_requests)) { qemu_co_mutex_unlock(&s->lock); - while(qemu_co_queue_next(&m->dependent_requests)); + qemu_co_queue_restart_all(&m->dependent_requests); qemu_co_mutex_lock(&s->lock); } } diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c index 9549c075ee..26ad76bf50 100644 --- a/qemu-coroutine-lock.c +++ b/qemu-coroutine-lock.c @@ -84,6 +84,13 @@ bool qemu_co_queue_next(CoQueue *queue) return (next != NULL); } +void qemu_co_queue_restart_all(CoQueue *queue) +{ + while (qemu_co_queue_next(queue)) { + /* Do nothing */ + } +} + bool qemu_co_queue_empty(CoQueue *queue) { return (QTAILQ_FIRST(&queue->entries) == NULL); @@ -144,13 +151,7 @@ void qemu_co_rwlock_unlock(CoRwlock *lock) assert(qemu_in_coroutine()); if (lock->writer) { lock->writer = false; - while (!qemu_co_queue_empty(&lock->queue)) { - /* - * Wakeup every body. 
This will include some - * writers too. - */ - qemu_co_queue_next(&lock->queue); - } + qemu_co_queue_restart_all(&lock->queue); } else { lock->reader--; assert(lock->reader >= 0); diff --git a/qemu-coroutine.h b/qemu-coroutine.h index 8a2e5d2a10..8a55fe125e 100644 --- a/qemu-coroutine.h +++ b/qemu-coroutine.h @@ -130,6 +130,11 @@ void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue); */ bool qemu_co_queue_next(CoQueue *queue); +/** + * Restarts all coroutines in the CoQueue and leaves the queue empty. + */ +void qemu_co_queue_restart_all(CoQueue *queue); + /** * Checks if the CoQueue is empty. */ -- cgit v1.2.1 From dbffbdcfff69431b622866ac5ea78df74fdc02d4 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2011 13:40:27 +0000 Subject: block: add request tracking The block layer does not know about pending requests. This information is necessary for copy-on-read since overlapping requests must be serialized to prevent races that corrupt the image. The BlockDriverState gets a new tracked_request list field which contains all pending requests. Each request is a BdrvTrackedRequest record with sector_num, nb_sectors, and is_write fields. Note that request tracking is always enabled but hopefully this extra work is so small that it doesn't justify adding an enable/disable flag. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- block_int.h | 4 ++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index d82854a3f7..16436670dc 100644 --- a/block.c +++ b/block.c @@ -1071,6 +1071,42 @@ void bdrv_commit_all(void) } } +struct BdrvTrackedRequest { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + bool is_write; + QLIST_ENTRY(BdrvTrackedRequest) list; +}; + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. 
+ */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + QLIST_REMOVE(req, list); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .is_write = is_write, + }; + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + /* * Return values: * 0 - success @@ -1350,6 +1386,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + int ret; if (!drv) { return -ENOMEDIUM; @@ -1363,7 +1401,10 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, bdrv_io_limits_intercept(bs, false, nb_sectors); } - return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + tracked_request_end(&req); + return ret; } int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, @@ -1381,6 +1422,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; int ret; if (!bs->drv) { @@ -1398,6 +1440,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bdrv_io_limits_intercept(bs, true, nb_sectors); } + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); if (bs->dirty_bitmap) { @@ -1408,6 +1452,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bs->wr_highest_sector = sector_num + nb_sectors - 1; } + tracked_request_end(&req); + return ret; } diff --git a/block_int.h b/block_int.h index a0b46d71d1..ea818e4e71 100644 --- a/block_int.h +++ b/block_int.h @@ -51,6 +51,8 @@ #define BLOCK_OPT_PREALLOC "preallocation" #define BLOCK_OPT_SUBFMT "subformat" +typedef struct BdrvTrackedRequest BdrvTrackedRequest; + typedef struct AIOPool { void (*cancel)(BlockDriverAIOCB *acb); int aiocb_size; @@ -255,6 +257,8 @@ struct BlockDriverState { int in_use; /* users other than guest access, eg. block migration */ QTAILQ_ENTRY(BlockDriverState) list; void *private; + + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; }; struct BlockDriverAIOCB { -- cgit v1.2.1 From 53fec9d3fd3d3a4ded4b92dfa843436bf71380f1 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 28 Nov 2011 16:08:47 +0000 Subject: block: add interface to toggle copy-on-read The bdrv_enable_copy_on_read()/bdrv_disable_copy_on_read() functions can be used to programmatically enable or disable copy-on-read for a block device. Later patches add the actual copy-on-read logic. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 22 ++++++++++++++++++++++ block.h | 4 ++++ block_int.h | 2 ++ 3 files changed, 28 insertions(+) diff --git a/block.c b/block.c index 16436670dc..9b3cb75646 100644 --- a/block.c +++ b/block.c @@ -538,6 +538,22 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. 
+ */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + /* * Common part for opening disk images and files */ @@ -559,6 +575,11 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename, bs->growable = 0; bs->buffer_alignment = 512; + assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ + if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) { + bdrv_enable_copy_on_read(bs); + } + pstrcpy(bs->filename, sizeof(bs->filename), filename); bs->backing_file[0] = '\0'; @@ -801,6 +822,7 @@ void bdrv_close(BlockDriverState *bs) #endif bs->opaque = NULL; bs->drv = NULL; + bs->copy_on_read = 0; if (bs->file != NULL) { bdrv_close(bs->file); diff --git a/block.h b/block.h index 8482dbce75..5f8a98a746 100644 --- a/block.h +++ b/block.h @@ -70,6 +70,7 @@ typedef struct BlockDevOps { #define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */ #define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */ #define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */ +#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH) @@ -312,6 +313,9 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); int64_t bdrv_get_dirty_count(BlockDriverState *bs); +void bdrv_enable_copy_on_read(BlockDriverState *bs); +void bdrv_disable_copy_on_read(BlockDriverState *bs); + void bdrv_set_in_use(BlockDriverState *bs, int in_use); int bdrv_in_use(BlockDriverState *bs); diff --git a/block_int.h b/block_int.h index ea818e4e71..311bd2a6fa 100644 --- a/block_int.h +++ b/block_int.h @@ -198,6 +198,8 @@ struct BlockDriverState { int encrypted; /* if true, the media is encrypted */ int valid_key; /* if true, a valid encryption key has been set */ int sg; /* if true, the device is a /dev/sg* */ + int copy_on_read; /* if true, copy read backing sectors into image + note this is a reference count */ BlockDriver *drv; /* NULL means no media */ void *opaque; -- cgit v1.2.1 From f4658285f99473367dbbc34ce6970ec4637c2388 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2011 13:40:29 +0000 Subject: block: wait for overlapping requests When copy-on-read is enabled it is necessary to wait for overlapping requests before issuing new requests. This prevents races between the copy-on-read and a write request. 
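Concretely, the race this prevents looks like the following informal timeline, assuming a guest write that overlaps an in-flight copy-on-read operation:

    /*
     * Without serialization:
     *
     *   CoR:    read cluster C from the backing file into a bounce buffer
     *   guest:  write new data to sectors inside C        <-- lands first
     *   CoR:    write the bounce buffer (stale backing data) into C
     *           -> the guest's fresh data is overwritten
     *
     * With overlap waiting, the guest write parks on the CoR request's
     * wait queue and only runs once the CoR read+write has completed.
     */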
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/block.c b/block.c index 9b3cb75646..1b3c809ed4 100644 --- a/block.c +++ b/block.c @@ -1099,6 +1099,7 @@ struct BdrvTrackedRequest { int nb_sectors; bool is_write; QLIST_ENTRY(BdrvTrackedRequest) list; + CoQueue wait_queue; /* coroutines blocked on this request */ }; /** @@ -1109,6 +1110,7 @@ struct BdrvTrackedRequest { static void tracked_request_end(BdrvTrackedRequest *req) { QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); } /** @@ -1126,9 +1128,34 @@ static void tracked_request_begin(BdrvTrackedRequest *req, .is_write = is_write, }; + qemu_co_queue_init(&req->wait_queue); + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); } +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t sector_num, int nb_sectors) { + return false; /* not yet implemented */ +} + +static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BdrvTrackedRequest *req; + bool retry; + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (tracked_request_overlaps(req, sector_num, nb_sectors)) { + qemu_co_queue_wait(&req->wait_queue); + retry = true; + break; + } + } + } while (retry); +} + /* * Return values: * 0 - success @@ -1423,6 +1450,10 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, bdrv_io_limits_intercept(bs, false, nb_sectors); } + if (bs->copy_on_read) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); tracked_request_end(&req); @@ -1462,6 +1493,10 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bdrv_io_limits_intercept(bs, true, nb_sectors); } + if (bs->copy_on_read) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); -- cgit v1.2.1 From d83947ac6d2a58fdaf128fbe1b22bc9639e79fe5 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 23 Nov 2011 11:47:56 +0000 Subject: block: request overlap detection Detect overlapping requests and remember to align to cluster boundaries if the image format uses them. This assumes that allocating I/O is performed in cluster granularity - which is true for qcow2, qed, etc. 
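A small worked example of the cluster rounding, using the QEMU_ALIGN_DOWN()/QEMU_ALIGN_UP() macros introduced earlier in this series (the cluster size of 64 KB, i.e. 128 sectors, and the request values are made up):

    int64_t c = 128;                                   /* sectors per cluster */
    int64_t sector_num = 130;
    int     nb_sectors = 1;

    int64_t cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);        /* 128 */
    int     cluster_nb_sectors = QEMU_ALIGN_UP(sector_num -
                                     cluster_sector_num + nb_sectors, c); /* 128 */

    /* The 1-sector request is treated as covering sectors 128..255, so any
     * other request touching that cluster counts as overlapping and
     * allocating writes to the same cluster are serialized. */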
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index 1b3c809ed4..ca0e8ece1e 100644 --- a/block.c +++ b/block.c @@ -1133,21 +1133,62 @@ static void tracked_request_begin(BdrvTrackedRequest *req, QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); } +/** + * Round a region to cluster boundaries + */ +static void round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + static bool tracked_request_overlaps(BdrvTrackedRequest *req, int64_t sector_num, int nb_sectors) { - return false; /* not yet implemented */ + /* aaaa bbbb */ + if (sector_num >= req->sector_num + req->nb_sectors) { + return false; + } + /* bbbb aaaa */ + if (req->sector_num >= sector_num + nb_sectors) { + return false; + } + return true; } static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { BdrvTrackedRequest *req; + int64_t cluster_sector_num; + int cluster_nb_sectors; bool retry; + /* If we touch the same cluster it counts as an overlap. This guarantees + * that allocating writes will be serialized and not race with each other + * for the same cluster. For example, in copy-on-read it ensures that the + * CoR read and write operations are atomic and guest writes cannot + * interleave between them. + */ + round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (tracked_request_overlaps(req, sector_num, nb_sectors)) { + if (tracked_request_overlaps(req, cluster_sector_num, + cluster_nb_sectors)) { qemu_co_queue_wait(&req->wait_queue); retry = true; break; -- cgit v1.2.1 From ab1859218a95977dc6881fcccce328d8146a6bdb Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2011 13:40:31 +0000 Subject: block: core copy-on-read logic Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ trace-events | 1 + 2 files changed, 73 insertions(+) diff --git a/block.c b/block.c index ca0e8ece1e..dd5d5ca4d2 100644 --- a/block.c +++ b/block.c @@ -1469,6 +1469,61 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } +static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. 
+ */ + round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + /* * Handle a read request in coroutine context */ @@ -1496,7 +1551,24 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + + if (bs->copy_on_read) { + int pnum; + + ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + +out: tracked_request_end(&req); return ret; } diff --git a/trace-events b/trace-events index 962caca598..518b76b137 100644 --- a/trace-events +++ b/trace-events @@ -69,6 +69,7 @@ bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d" bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p" +bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d" # hw/virtio-blk.c virtio_blk_req_complete(void *req, int status) "req %p status %d" -- cgit v1.2.1 From fb0490f69feb96b7e92457f176dc834ff0b00b09 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2011 13:40:32 +0000 Subject: block: add -drive copy-on-read=on|off This patch adds the -drive copy-on-read=on|off command-line option: copy-on-read=on|off copy-on-read is "on" or "off" and enables whether to copy read backing file sectors into the image file. Copy-on-read avoids accessing the same backing file sectors repeatedly and is useful when the backing file is over a slow network. By default copy-on-read is off. 
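For example (the image name is made up), a local overlay whose backing file lives on slow or remote storage can be started with:

    qemu -drive file=local-overlay.qcow2,if=virtio,copy-on-read=on

After the first access, repeatedly-read backing file sectors are then served from local-overlay.qcow2 instead of going over the network again.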
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- blockdev.c | 6 ++++++ hmp-commands.hx | 5 +++-- qemu-config.c | 4 ++++ qemu-options.hx | 9 ++++++++- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/blockdev.c b/blockdev.c index 9068c5b1d0..af4e239b93 100644 --- a/blockdev.c +++ b/blockdev.c @@ -257,6 +257,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) DriveInfo *dinfo; BlockIOLimit io_limits; int snapshot = 0; + bool copy_on_read; int ret; translation = BIOS_ATA_TRANSLATION_AUTO; @@ -273,6 +274,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) snapshot = qemu_opt_get_bool(opts, "snapshot", 0); ro = qemu_opt_get_bool(opts, "readonly", 0); + copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false); file = qemu_opt_get(opts, "file"); serial = qemu_opt_get(opts, "serial"); @@ -546,6 +548,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi) bdrv_flags |= (BDRV_O_SNAPSHOT|BDRV_O_CACHE_WB|BDRV_O_NO_FLUSH); } + if (copy_on_read) { + bdrv_flags |= BDRV_O_COPY_ON_READ; + } + if (media == MEDIA_CDROM) { /* CDROM is fine for any interface, don't check. */ ro = 1; diff --git a/hmp-commands.hx b/hmp-commands.hx index f8d855e3be..79a919526d 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -860,9 +860,10 @@ ETEXI .args_type = "pci_addr:s,opts:s", .params = "[[:]:]\n" "[file=file][,if=type][,bus=n]\n" - "[,unit=m][,media=d][index=i]\n" + "[,unit=m][,media=d][,index=i]\n" "[,cyls=c,heads=h,secs=s[,trans=t]]\n" - "[snapshot=on|off][,cache=on|off]", + "[,snapshot=on|off][,cache=on|off]\n" + "[,readonly=on|off][,copy-on-read=on|off]", .help = "add drive to PCI storage controller", .mhandler.cmd = drive_hot_add, }, diff --git a/qemu-config.c b/qemu-config.c index 1aa080fbcb..18f30204a1 100644 --- a/qemu-config.c +++ b/qemu-config.c @@ -109,6 +109,10 @@ static QemuOptsList qemu_drive_opts = { .name = "bps_wr", .type = QEMU_OPT_NUMBER, .help = "limit write bytes per second", + },{ + .name = "copy-on-read", + .type = QEMU_OPT_BOOL, + .help = "copy read data from backing file into image file", }, { /* end of list */ } }, diff --git a/qemu-options.hx b/qemu-options.hx index 25a7be7948..b3db10c0ad 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -135,7 +135,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, " [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n" " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" - " [,readonly=on|off]\n" + " [,readonly=on|off][,copy-on-read=on|off]\n" " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n" " use 'file' as a drive image\n", QEMU_ARCH_ALL) STEXI @@ -187,6 +187,9 @@ host disk is full; report the error to the guest otherwise). The default setting is @option{werror=enospc} and @option{rerror=report}. @item readonly Open drive @option{file} as read-only. Guest write attempts will fail. +@item copy-on-read=@var{copy-on-read} +@var{copy-on-read} is "on" or "off" and enables whether to copy read backing +file sectors into the image file. @end table By default, writethrough caching is used for all block device. This means that @@ -218,6 +221,10 @@ like your host losing power, the disk storage getting disconnected accidently, etc. you're image will most probably be rendered unusable. When using the @option{-snapshot} option, unsafe caching is always used. +Copy-on-read avoids accessing the same backing file sectors repeatedly and is +useful when the backing file is over a slow network. 
By default copy-on-read +is off. + Instead of @option{-cdrom} you can use: @example qemu -drive file=file,index=2,media=cdrom -- cgit v1.2.1 From e94d13873334b1880746feca59ce84c0eabc1021 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 23 Nov 2011 15:00:04 +0000 Subject: cow: use bdrv_co_is_allocated() Now that bdrv_co_is_allocated() is available we can use it instead of the synchronous bdrv_is_allocated() interface. This is a follow-up that Kevin Wolf pointed out after applying the series that introduces bdrv_co_is_allocated(). It is safe to make cow_read() a coroutine_fn because its only caller is a coroutine_fn. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/cow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/cow.c b/block/cow.c index 7ae887b670..586493c623 100644 --- a/block/cow.c +++ b/block/cow.c @@ -171,14 +171,14 @@ static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, return error; } -static int cow_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) { BDRVCowState *s = bs->opaque; int ret, n; while (nb_sectors > 0) { - if (bdrv_is_allocated(bs, sector_num, nb_sectors, &n)) { + if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) { ret = bdrv_pread(bs->file, s->cow_sectors_offset + sector_num * 512, buf, n * 512); -- cgit v1.2.1 From c57c465800a9d193f9b41c97dbbbb6cd4fb08ff6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 24 Nov 2011 06:15:28 -0500 Subject: dma-helpers: Add trace events Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- dma-helpers.c | 10 ++++++++++ trace-events | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/dma-helpers.c b/dma-helpers.c index bdcd38cd27..9d6b6fa1fd 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -9,6 +9,7 @@ #include "dma.h" #include "block_int.h" +#include "trace.h" void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint) { @@ -83,6 +84,8 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs) static void dma_complete(DMAAIOCB *dbs, int ret) { + trace_dma_complete(dbs, ret, dbs->common.cb); + dma_bdrv_unmap(dbs); if (dbs->common.cb) { dbs->common.cb(dbs->common.opaque, ret); @@ -106,6 +109,8 @@ static void dma_bdrv_cb(void *opaque, int ret) target_phys_addr_t cur_addr, cur_len; void *mem; + trace_dma_bdrv_cb(dbs, ret); + dbs->acb = NULL; dbs->sector_num += dbs->iov.size / 512; dma_bdrv_unmap(dbs); @@ -130,6 +135,7 @@ static void dma_bdrv_cb(void *opaque, int ret) } if (dbs->iov.size == 0) { + trace_dma_map_wait(dbs); cpu_register_map_client(dbs, continue_after_map_failure); return; } @@ -145,6 +151,8 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb) { DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common); + trace_dma_aio_cancel(dbs); + if (dbs->acb) { BlockDriverAIOCB *acb = dbs->acb; dbs->acb = NULL; @@ -168,6 +176,8 @@ BlockDriverAIOCB *dma_bdrv_io( { DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque); + trace_dma_bdrv_io(dbs, bs, sector_num, to_dev); + dbs->acb = NULL; dbs->bs = bs; dbs->sg = sg; diff --git a/trace-events b/trace-events index 518b76b137..bf1cf57032 100644 --- a/trace-events +++ b/trace-events @@ -632,3 +632,10 @@ win_helper_no_switch_pstate(uint32_t new_pstate_regs) "change_pstate: regs new=% win_helper_wrpil(uint32_t psrpil, uint32_t new_pil) "old=%x new=%x" win_helper_done(uint32_t tl) "tl=%d" win_helper_retry(uint32_t tl) "tl=%d" + +# dma-helpers.c +dma_bdrv_io(void *dbs, void *bs, 
int64_t sector_num, bool to_dev) "dbs=%p bs=%p sector_num=%" PRId64 " to_dev=%d" +dma_aio_cancel(void *dbs) "dbs=%p" +dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p" +dma_bdrv_cb(void *dbs, int ret) "dbs=%p ret=%d" +dma_map_wait(void *dbs) "dbs=%p" -- cgit v1.2.1 From bd9533e36ea3d6c964f65b8e407bad00dab59e5d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 29 Nov 2011 13:49:51 +0000 Subject: block: implement bdrv_co_is_allocated() boundary cases Cases beyond the end of the disk image are only implemented for block drivers that do not provide .bdrv_co_is_allocated(). It's worth making these cases generic so that block drivers that do implement .bdrv_co_is_allocated() also get them for free. Suggested-by: Mark Wu Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index dd5d5ca4d2..ecc5b442d4 100644 --- a/block.c +++ b/block.c @@ -2117,23 +2117,33 @@ typedef struct BdrvCoIsAllocatedData { * not implementing the functionality are assumed to not support backing files, * hence all their sectors are reported as allocated. * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * * 'pnum' is set to the number of sectors (including and immediately following * the specified sector) that are known to be in the same * allocated/unallocated state. * - * 'nb_sectors' is the max value 'pnum' should be set to. + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. */ int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { + int64_t n; + + if (sector_num >= bs->total_sectors) { + *pnum = 0; + return 0; + } + + n = bs->total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + if (!bs->drv->bdrv_co_is_allocated) { - int64_t n; - if (sector_num >= bs->total_sectors) { - *pnum = 0; - return 0; - } - n = bs->total_sectors - sector_num; - *pnum = (n < nb_sectors) ? (n) : (nb_sectors); + *pnum = nb_sectors; return 1; } -- cgit v1.2.1 From 5f8b6491f20732e0a31e64bbf75b62def579e044 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 30 Nov 2011 12:23:42 +0000 Subject: block: wait_for_overlapping_requests() deadlock detection Debugging a reentrant request deadlock was fun but in the future we need a quick and obvious way of detecting such bugs. Add an assert that checks we are not about to deadlock when waiting for another request. 
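The bug class this catches looks, in schematic form, like a block driver
re-entering the public read path on its own BlockDriverState (illustrative
code, not taken from any real driver):

  static int coroutine_fn buggy_co_readv(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *qiov)
  {
      /* BUG: re-enters bdrv_co_do_readv(), which begins a second tracked
       * request overlapping the one this coroutine already owns; waiting
       * for our own request can never make progress. */
      return bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
  }

With the new assert such a driver aborts at the
qemu_coroutine_self() != req->co check instead of hanging silently.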
Suggested-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block.c b/block.c index ecc5b442d4..50ce2be48e 100644 --- a/block.c +++ b/block.c @@ -1099,6 +1099,7 @@ struct BdrvTrackedRequest { int nb_sectors; bool is_write; QLIST_ENTRY(BdrvTrackedRequest) list; + Coroutine *co; /* owner, used for deadlock detection */ CoQueue wait_queue; /* coroutines blocked on this request */ }; @@ -1126,6 +1127,7 @@ static void tracked_request_begin(BdrvTrackedRequest *req, .sector_num = sector_num, .nb_sectors = nb_sectors, .is_write = is_write, + .co = qemu_coroutine_self(), }; qemu_co_queue_init(&req->wait_queue); @@ -1189,6 +1191,12 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, QLIST_FOREACH(req, &bs->tracked_requests, list) { if (tracked_request_overlaps(req, cluster_sector_num, cluster_nb_sectors)) { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + qemu_co_queue_wait(&req->wait_queue); retry = true; break; -- cgit v1.2.1 From 922453bca6a927bb527068ae8679d587cfa45dbc Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 30 Nov 2011 12:23:43 +0000 Subject: block: convert qemu_aio_flush() calls to bdrv_drain_all() Many places in QEMU call qemu_aio_flush() to complete all pending asynchronous I/O. Most of these places actually want to drain all block requests but there is no block layer API to do so. This patch introduces the bdrv_drain_all() API to wait for requests across all BlockDriverStates to complete. As a bonus we perform checks after qemu_aio_wait() to ensure that requests really have finished. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block-migration.c | 2 +- block.c | 19 +++++++++++++++++++ block.h | 1 + blockdev.c | 4 ++-- cpus.c | 2 +- hw/ide/macio.c | 5 +++-- hw/ide/pci.c | 2 +- hw/virtio-blk.c | 2 +- hw/xen_platform.c | 2 +- qemu-io.c | 4 ++-- savevm.c | 2 +- xen-mapcache.c | 2 +- 12 files changed, 34 insertions(+), 13 deletions(-) diff --git a/block-migration.c b/block-migration.c index 5f10486416..423c5a07b6 100644 --- a/block-migration.c +++ b/block-migration.c @@ -387,7 +387,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f, for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { if (bmds_aio_inflight(bmds, sector)) { - qemu_aio_flush(); + bdrv_drain_all(); } if (bdrv_get_dirty(bmds->bs, sector)) { diff --git a/block.c b/block.c index 50ce2be48e..aa9d14245b 100644 --- a/block.c +++ b/block.c @@ -846,6 +846,25 @@ void bdrv_close_all(void) } } +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + */ +void bdrv_drain_all(void) +{ + BlockDriverState *bs; + + qemu_aio_flush(); + + /* If requests are still pending there is a bug somewhere */ + QTAILQ_FOREACH(bs, &bdrv_states, list) { + assert(QLIST_EMPTY(&bs->tracked_requests)); + assert(qemu_co_queue_empty(&bs->throttled_reqs)); + } +} + /* make a BlockDriverState anonymous by removing from bdrv_state list. 
Also, NULL terminate the device_name to prevent double remove */ void bdrv_make_anon(BlockDriverState *bs) diff --git a/block.h b/block.h index 5f8a98a746..1790f9988c 100644 --- a/block.h +++ b/block.h @@ -214,6 +214,7 @@ int bdrv_flush(BlockDriverState *bs); int coroutine_fn bdrv_co_flush(BlockDriverState *bs); void bdrv_flush_all(void); void bdrv_close_all(void); +void bdrv_drain_all(void); int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); diff --git a/blockdev.c b/blockdev.c index af4e239b93..dbf0251a77 100644 --- a/blockdev.c +++ b/blockdev.c @@ -653,7 +653,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) goto out; } - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush(bs); bdrv_close(bs); @@ -840,7 +840,7 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) } /* quiesce block driver; prevent further io */ - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush(bs); bdrv_close(bs); diff --git a/cpus.c b/cpus.c index 82530c4a03..58e173c1d9 100644 --- a/cpus.c +++ b/cpus.c @@ -396,7 +396,7 @@ static void do_vm_stop(RunState state) pause_all_vcpus(); runstate_set(state); vm_state_notify(0, state); - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush_all(); monitor_protocol_event(QEVENT_STOP, NULL); } diff --git a/hw/ide/macio.c b/hw/ide/macio.c index 70b33422d2..c09d2e0a35 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -200,8 +200,9 @@ static void pmac_ide_flush(DBDMA_io *io) { MACIOIDEState *m = io->opaque; - if (m->aiocb) - qemu_aio_flush(); + if (m->aiocb) { + bdrv_drain_all(); + } } /* PowerMac IDE memory IO */ diff --git a/hw/ide/pci.c b/hw/ide/pci.c index 49b823df79..5078c0b565 100644 --- a/hw/ide/pci.c +++ b/hw/ide/pci.c @@ -309,7 +309,7 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val) * aio operation with preadv/pwritev. */ if (bm->bus->dma->aiocb) { - qemu_aio_flush(); + bdrv_drain_all(); assert(bm->bus->dma->aiocb == NULL); assert((bm->status & BM_STATUS_DMAING) == 0); } diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index d6d1f87cda..4b0d113ba8 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -474,7 +474,7 @@ static void virtio_blk_reset(VirtIODevice *vdev) * This should cancel pending requests, but can't do nicely until there * are per-device request lists. */ - qemu_aio_flush(); + bdrv_drain_all(); } /* coalesce internal state, copy to pci i/o region 0 diff --git a/hw/xen_platform.c b/hw/xen_platform.c index 5e792f56f6..e62eaef7d1 100644 --- a/hw/xen_platform.c +++ b/hw/xen_platform.c @@ -120,7 +120,7 @@ static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t v devices, and bit 2 the non-primary-master IDE devices. */ if (val & UNPLUG_ALL_IDE_DISKS) { DPRINTF("unplug disks\n"); - qemu_aio_flush(); + bdrv_drain_all(); bdrv_flush_all(); pci_unplug_disks(s->pci_dev.bus); } diff --git a/qemu-io.c b/qemu-io.c index de26422fcf..b35adbb00f 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -1853,9 +1853,9 @@ int main(int argc, char **argv) command_loop(); /* - * Make sure all outstanding requests get flushed the program exits. + * Make sure all outstanding requests complete before the program exits. */ - qemu_aio_flush(); + bdrv_drain_all(); if (bs) { bdrv_delete(bs); diff --git a/savevm.c b/savevm.c index f53cd4c154..0443554b39 100644 --- a/savevm.c +++ b/savevm.c @@ -2104,7 +2104,7 @@ int load_vmstate(const char *name) } /* Flush all IO requests so they don't interfere with the new state. 
*/ - qemu_aio_flush(); + bdrv_drain_all(); bs = NULL; while ((bs = bdrv_next(bs))) { diff --git a/xen-mapcache.c b/xen-mapcache.c index 7bcb86e4f8..9fecc64e97 100644 --- a/xen-mapcache.c +++ b/xen-mapcache.c @@ -351,7 +351,7 @@ void xen_invalidate_map_cache(void) MapCacheRev *reventry; /* Flush pending AIO before destroying the mapcache */ - qemu_aio_flush(); + bdrv_drain_all(); QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) { DPRINTF("There should be no locked mappings at this time, " -- cgit v1.2.1
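After this conversion the usual quiesce-before-touching-an-image sequence
looks like the sketch below; the helper name is made up, but the calls are
the APIs used in the hunks above:

  static void example_quiesce_and_close(BlockDriverState *bs)
  {
      /* Wait for every in-flight request on all BlockDriverStates; unlike a
       * bare qemu_aio_flush() this also asserts that tracked and throttled
       * requests really have drained. */
      bdrv_drain_all();

      /* Nothing is in flight any more, so flush to stable storage and close
       * the image. */
      bdrv_flush(bs);
      bdrv_close(bs);
  }

This mirrors the do_snapshot_blkdev() and do_drive_del() changes above.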