Merge remote-tracking branch 'kwolf/for-anthony' into staging

author: Anthony Liguori <aliguori@us.ibm.com> 2011-12-05 09:39:25 -0600
committer: Anthony Liguori <aliguori@us.ibm.com> 2011-12-05 09:39:25 -0600
commit: eb5d5beaebd102599a915f6c4813d445ddc9dc84 (patch)
tree: 12ce2331571a30c67bde0b8f4ddb55996dd0ba65
parent: f6480ca3f3423be5bee8b673ee6f5cc387659def (diff)
parent: 922453bca6a927bb527068ae8679d587cfa45dbc (diff)
download: qemu-eb5d5beaebd102599a915f6c4813d445ddc9dc84.tar.gz
40 files changed, 1329 insertions, 343 deletions
diff --git a/block-migration.c b/block-migration.c
index 5f10486416..423c5a07b6 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -387,7 +387,7 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
 
     for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
         if (bmds_aio_inflight(bmds, sector)) {
-            qemu_aio_flush();
+            bdrv_drain_all();
         }
         if (bdrv_get_dirty(bmds->bs, sector)) {
 
diff --git a/block.c b/block.c
index d0158877d6..aa9d14245b 100644
--- a/block.c
+++ b/block.c
@@ -30,6 +30,7 @@
 #include "qjson.h"
 #include "qemu-coroutine.h"
 #include "qmp-commands.h"
+#include "qemu-timer.h"
 
 #ifdef CONFIG_BSD
 #include <sys/types.h>
@@ -73,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                                bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -105,6 +113,79 @@ int is_windows_drive(const char *filename)
 }
 #endif
 
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+    bs->io_limits_enabled = false;
+
+    while (qemu_co_queue_next(&bs->throttled_reqs));
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
+
+    bs->slice_start = 0;
+    bs->slice_end   = 0;
+    bs->slice_time  = 0;
+    memset(&bs->io_base, 0, sizeof(bs->io_base));
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+
+    qemu_co_queue_next(&bs->throttled_reqs);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+    qemu_co_queue_init(&bs->throttled_reqs);
+    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+    bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
+    bs->slice_start = qemu_get_clock_ns(vm_clock);
+    bs->slice_end   = bs->slice_start + bs->slice_time;
+    memset(&bs->io_base, 0, sizeof(bs->io_base));
+    bs->io_limits_enabled = true;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+    BlockIOLimit *io_limits = &bs->io_limits;
+    return io_limits->bps[BLOCK_IO_LIMIT_READ]
+         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
+         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
+         || io_limits->iops[BLOCK_IO_LIMIT_READ]
+         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
+         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
+}
+
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+                                     bool is_write, int nb_sectors)
+{
+    int64_t wait_time = -1;
+
+    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+        qemu_co_queue_wait(&bs->throttled_reqs);
+    }
+
+    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
+     * throttled requests will not be dequeued until the current request is
+     * allowed to be serviced. So if the current request still exceeds the
+     * limits, it will be inserted to the head. All requests followed it will
+     * be still in throttled_reqs queue.
+     */
+
+    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
+        qemu_mod_timer(bs->block_timer,
+                       wait_time + qemu_get_clock_ns(vm_clock));
+        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+    }
+
+    qemu_co_queue_next(&bs->throttled_reqs);
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
@@ -457,6 +538,22 @@ int bdrv_parse_cache_flags(const char *mode, int *flags)
     return 0;
 }
 
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+    bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+    assert(bs->copy_on_read > 0);
+    bs->copy_on_read--;
+}
+
 /*
  * Common part for opening disk images and files
  */
@@ -478,6 +575,11 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename,
     bs->growable = 0;
     bs->buffer_alignment = 512;
 
+    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
+    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
+        bdrv_enable_copy_on_read(bs);
+    }
+
     pstrcpy(bs->filename, sizeof(bs->filename), filename);
     bs->backing_file[0] = '\0';
 
@@ -687,6 +789,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
         bdrv_dev_change_media_cb(bs, true);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -715,6 +822,7 @@ void bdrv_close(BlockDriverState *bs)
 #endif
         bs->opaque = NULL;
         bs->drv = NULL;
+        bs->copy_on_read = 0;
 
         if (bs->file != NULL) {
             bdrv_close(bs->file);
@@ -722,6 +830,11 @@ void bdrv_close(BlockDriverState *bs)
 
         bdrv_dev_change_media_cb(bs, false);
     }
+
+    /*throttling disk I/O limits*/
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_disable(bs);
+    }
 }
 
 void bdrv_close_all(void)
@@ -733,6 +846,25 @@ void bdrv_close_all(void)
     }
 }
 
+/*
+ * Wait for pending requests to complete across all BlockDriverStates
+ *
+ * This function does not flush data to disk, use bdrv_flush_all() for that
+ * after calling this function.
+ */
+void bdrv_drain_all(void)
+{
+    BlockDriverState *bs;
+
+    qemu_aio_flush();
+
+    /* If requests are still pending there is a bug somewhere */
+    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+        assert(QLIST_EMPTY(&bs->tracked_requests));
+        assert(qemu_co_queue_empty(&bs->throttled_reqs));
+    }
+}
+
 /* make a BlockDriverState anonymous by removing from bdrv_state list.
    Also, NULL terminate the device_name to prevent double remove */
 void bdrv_make_anon(BlockDriverState *bs)
@@ -922,7 +1054,7 @@ int bdrv_commit(BlockDriverState *bs)
     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
 
     for (sector = 0; sector < total_sectors; sector += n) {
-        if (drv->bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
+        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
 
             if (bdrv_read(bs, sector, buf, n) != 0) {
                 ret = -EIO;
@@ -980,6 +1112,118 @@ void bdrv_commit_all(void)
     }
 }
 
+struct BdrvTrackedRequest {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    bool is_write;
+    QLIST_ENTRY(BdrvTrackedRequest) list;
+    Coroutine *co; /* owner, used for deadlock detection */
+    CoQueue wait_queue; /* coroutines blocked on this request */
+};
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+    QLIST_REMOVE(req, list);
+    qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+                                  BlockDriverState *bs,
+                                  int64_t sector_num,
+                                  int nb_sectors, bool is_write)
+{
+    *req = (BdrvTrackedRequest){
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .is_write = is_write,
+        .co = qemu_coroutine_self(),
+    };
+
+    qemu_co_queue_init(&req->wait_queue);
+
+    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
+/**
+ * Round a region to cluster boundaries
+ */
+static void round_to_clusters(BlockDriverState *bs,
+                              int64_t sector_num, int nb_sectors,
+                              int64_t *cluster_sector_num,
+                              int *cluster_nb_sectors)
+{
+    BlockDriverInfo bdi;
+
+    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+        *cluster_sector_num = sector_num;
+        *cluster_nb_sectors = nb_sectors;
+    } else {
+        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
+        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+                                            nb_sectors, c);
+    }
+}
+
+static bool tracked_request_overlaps(BdrvTrackedRequest *req,
+                                     int64_t sector_num, int nb_sectors) {
+    /*        aaaa   bbbb */
+    if (sector_num >= req->sector_num + req->nb_sectors) {
+        return false;
+    }
+    /* bbbb   aaaa        */
+    if (req->sector_num >= sector_num + nb_sectors) {
+        return false;
+    }
+    return true;
+}
+
+static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors)
+{
+    BdrvTrackedRequest *req;
+    int64_t cluster_sector_num;
+    int cluster_nb_sectors;
+    bool retry;
+
+    /* If we touch the same cluster it counts as an overlap.  This guarantees
+     * that allocating writes will be serialized and not race with each other
+     * for the same cluster.  For example, in copy-on-read it ensures that the
+     * CoR read and write operations are atomic and guest writes cannot
+     * interleave between them.
+     */
+    round_to_clusters(bs, sector_num, nb_sectors,
+                      &cluster_sector_num, &cluster_nb_sectors);
+
+    do {
+        retry = false;
+        QLIST_FOREACH(req, &bs->tracked_requests, list) {
+            if (tracked_request_overlaps(req, cluster_sector_num,
+                                         cluster_nb_sectors)) {
+                /* Hitting this means there was a reentrant request, for
+                 * example, a block driver issuing nested requests.  This must
+                 * never happen since it means deadlock.
+                 */
+                assert(qemu_coroutine_self() != req->co);
+
+                qemu_co_queue_wait(&req->wait_queue);
+                retry = true;
+                break;
+            }
+        }
+    } while (retry);
+}
+
 /*
  * Return values:
  * 0        - success
@@ -1252,6 +1496,61 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     return 0;
 }
 
+static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    /* Perform I/O through a temporary buffer so that users who scribble over
+     * their read buffer while the operation is in progress do not end up
+     * modifying the image file.  This is critical for zero-copy guest I/O
+     * where anything might happen inside guest memory.
+     */
+    void *bounce_buffer;
+
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    int64_t cluster_sector_num;
+    int cluster_nb_sectors;
+    size_t skip_bytes;
+    int ret;
+
+    /* Cover entire cluster so no additional backing file I/O is required when
+     * allocating cluster in the image file.
+     */
+    round_to_clusters(bs, sector_num, nb_sectors,
+                      &cluster_sector_num, &cluster_nb_sectors);
+
+    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
+                                cluster_sector_num, cluster_nb_sectors);
+
+    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+                                 &bounce_qiov);
+    if (ret < 0) {
+        goto err;
+    }
+
+    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+                                  &bounce_qiov);
+    if (ret < 0) {
+        /* It might be okay to ignore write errors for guest requests.  If this
+         * is a deliberate copy-on-read then we don't want to ignore the error.
+         * Simply report it in all cases.
+         */
+        goto err;
+    }
+
+    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
+                           nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+    qemu_vfree(bounce_buffer);
+    return ret;
+}
+
 /*
  * Handle a read request in coroutine context
  */
@@ -1259,6 +1558,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
+    int ret;
 
     if (!drv) {
         return -ENOMEDIUM;
@@ -1267,7 +1568,36 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         return -EIO;
     }
 
-    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, false, nb_sectors);
+    }
+
+    if (bs->copy_on_read) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    }
+
+    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+
+    if (bs->copy_on_read) {
+        int pnum;
+
+        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
+        if (ret < 0) {
+            goto out;
+        }
+
+        if (!ret || pnum != nb_sectors) {
+            ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+            goto out;
+        }
+    }
+
+    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+
+out:
+    tracked_request_end(&req);
+    return ret;
 }
 
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
@@ -1285,6 +1615,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
     int ret;
 
     if (!bs->drv) {
@@ -1297,6 +1628,17 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         return -EIO;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, true, nb_sectors);
+    }
+
+    if (bs->copy_on_read) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    }
+
+    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+
     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 
     if (bs->dirty_bitmap) {
@@ -1307,6 +1649,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bs->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 
+    tracked_request_end(&req);
+
     return ret;
 }
 
@@ -1526,6 +1870,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
     *psecs = bs->secs;
 }
 
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+                        BlockIOLimit *io_limits)
+{
+    bs->io_limits = *io_limits;
+    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
+}
+
 /* Recognize floppy formats */
 typedef struct FDFormat {
     FDriveType drive;
@@ -1778,31 +2130,87 @@ int bdrv_has_zero_init(BlockDriverState *bs)
     return 1;
 }
 
+typedef struct BdrvCoIsAllocatedData {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    int *pnum;
+    int ret;
+    bool done;
+} BdrvCoIsAllocatedData;
+
 /*
  * Returns true iff the specified sector is present in the disk image. Drivers
  * not implementing the functionality are assumed to not support backing files,
  * hence all their sectors are reported as allocated.
  *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
  * 'pnum' is set to the number of sectors (including and immediately following
  * the specified sector) that are known to be in the same
  * allocated/unallocated state.
  *
- * 'nb_sectors' is the max value 'pnum' should be set to.
+ * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
  */
-int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-	int *pnum)
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, int *pnum)
 {
     int64_t n;
-    if (!bs->drv->bdrv_is_allocated) {
-        if (sector_num >= bs->total_sectors) {
-            *pnum = 0;
-            return 0;
-        }
-        n = bs->total_sectors - sector_num;
-        *pnum = (n < nb_sectors) ? (n) : (nb_sectors);
+
+    if (sector_num >= bs->total_sectors) {
+        *pnum = 0;
+        return 0;
+    }
+
+    n = bs->total_sectors - sector_num;
+    if (n < nb_sectors) {
+        nb_sectors = n;
+    }
+
+    if (!bs->drv->bdrv_co_is_allocated) {
+        *pnum = nb_sectors;
         return 1;
     }
-    return bs->drv->bdrv_is_allocated(bs, sector_num, nb_sectors, pnum);
+
+    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
+}
+
+/* Coroutine wrapper for bdrv_is_allocated() */
+static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
+{
+    BdrvCoIsAllocatedData *data = opaque;
+    BlockDriverState *bs = data->bs;
+
+    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
+                                     data->pnum);
+    data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_is_allocated().
+ *
+ * See bdrv_co_is_allocated() for details.
+ */
+int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                      int *pnum)
+{
+    Coroutine *co;
+    BdrvCoIsAllocatedData data = {
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .pnum = pnum,
+        .done = false,
+    };
+
+    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
+    qemu_coroutine_enter(co, &data);
+    while (!data.done) {
+        qemu_aio_wait();
+    }
+    return data.ret;
 }
 
 void bdrv_mon_event(const BlockDriverState *bdrv,
@@ -1869,6 +2277,21 @@ BlockInfoList *qmp_query_block(Error **errp)
                 info->value->inserted->has_backing_file = true;
                 info->value->inserted->backing_file = g_strdup(bs->backing_file);
             }
+
+            if (bs->io_limits_enabled) {
+                info->value->inserted->bps =
+                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+                info->value->inserted->bps_rd =
+                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
+                info->value->inserted->bps_wr =
+                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
+                info->value->inserted->iops =
+                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+                info->value->inserted->iops_rd =
+                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
+                info->value->inserted->iops_wr =
+                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
+            }
         }
 
         /* XXX: waiting for the qapi to support GSList */
@@ -2480,6 +2903,170 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->pool->cancel(acb);
 }
 
+/* block I/O throttling */
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait)
+{
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_base, bytes_res;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    bytes_limit = bps_limit * slice_time;
+    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
+    }
+
+    /* bytes_base: the bytes of data which have been read/written; and
+     *             it is obtained from the history statistic info.
+     * bytes_res: the remaining bytes of data which need to be read/written.
+     * (bytes_base + bytes_res) / bps_limit: used to calcuate
+     *             the total time for completing reading/writting all data.
+     */
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_base + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
+
+    /* When the I/O rate at runtime exceeds the limits,
+     * bs->slice_end need to be extended in order that the current statistic
+     * info can be kept until the timer fire, so it is increased and tuned
+     * based on the result of experiment.
+     */
+    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait)
+{
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_base;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    ios_limit  = iops_limit * slice_time;
+    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
+    }
+
+    if (ios_base + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_base + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, int64_t *wait)
+{
+    int64_t  now, max_wait;
+    uint64_t bps_wait = 0, iops_wait = 0;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    now = qemu_get_clock_ns(vm_clock);
+    if ((bs->slice_start < now)
+        && (bs->slice_end > now)) {
+        bs->slice_end = now + bs->slice_time;
+    } else {
+        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
+        bs->slice_start = now;
+        bs->slice_end   = now + bs->slice_time;
+
+        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
+        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
+
+        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
+        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
+    }
+
+    elapsed_time  = now - bs->slice_start;
+    elapsed_time  /= (NANOSECONDS_PER_SECOND);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                      elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        now = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end < now + max_wait) {
+            bs->slice_end = now + max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
index a826059897..1790f9988c 100644
--- a/block.h
+++ b/block.h
@@ -70,6 +70,7 @@ typedef struct BlockDevOps {
 #define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
 #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
 #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
+#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
 
@@ -98,6 +99,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
 void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
+/* disk I/O throttling */
+void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
+bool bdrv_io_limits_enabled(BlockDriverState *bs);
+
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
 BlockDriver *bdrv_find_protocol(const char *filename);
@@ -138,6 +144,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, int *pnum);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
@@ -206,6 +214,7 @@ int bdrv_flush(BlockDriverState *bs);
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
 void bdrv_flush_all(void);
 void bdrv_close_all(void);
+void bdrv_drain_all(void);
 
 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
 int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
@@ -305,6 +314,9 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                       int nr_sectors);
 int64_t bdrv_get_dirty_count(BlockDriverState *bs);
 
+void bdrv_enable_copy_on_read(BlockDriverState *bs);
+void bdrv_disable_copy_on_read(BlockDriverState *bs);
+
 void bdrv_set_in_use(BlockDriverState *bs, int in_use);
 int bdrv_in_use(BlockDriverState *bs);
 
diff --git a/block/cow.c b/block/cow.c
index 3448296190..3c527358c6 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -132,8 +132,8 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
 /* Return true if first block has been changed (ie. current version is
  * in COW file).  Set the number of continuous blocks for which that
  * is true. */
-static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
-        int nb_sectors, int *num_same)
+static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *num_same)
 {
     int changed;
 
@@ -171,14 +171,14 @@ static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
     return error;
 }
 
-static int cow_read(BlockDriverState *bs, int64_t sector_num,
-                    uint8_t *buf, int nb_sectors)
+static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
+                                 uint8_t *buf, int nb_sectors)
 {
     BDRVCowState *s = bs->opaque;
     int ret, n;
 
     while (nb_sectors > 0) {
-        if (cow_is_allocated(bs, sector_num, nb_sectors, &n)) {
+        if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) {
             ret = bdrv_pread(bs->file,
                         s->cow_sectors_offset + sector_num * 512,
                         buf, n * 512);
@@ -243,12 +243,12 @@ static void cow_close(BlockDriverState *bs)
 
 static int cow_create(const char *filename, QEMUOptionParameter *options)
 {
-    int fd, cow_fd;
     struct cow_header_v2 cow_header;
     struct stat st;
     int64_t image_sectors = 0;
     const char *image_filename = NULL;
     int ret;
+    BlockDriverState *cow_bs;
 
     /* Read out options */
     while (options && options->name) {
@@ -260,10 +260,16 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
         options++;
     }
 
-    cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
-              0644);
-    if (cow_fd < 0)
-        return -errno;
+    ret = bdrv_create_file(filename, options);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_file_open(&cow_bs, filename, BDRV_O_RDWR);
+    if (ret < 0) {
+        return ret;
+    }
+
     memset(&cow_header, 0, sizeof(cow_header));
     cow_header.magic = cpu_to_be32(COW_MAGIC);
     cow_header.version = cpu_to_be32(COW_VERSION);
@@ -271,16 +277,9 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
         /* Note: if no file, we put a dummy mtime */
         cow_header.mtime = cpu_to_be32(0);
 
-        fd = open(image_filename, O_RDONLY | O_BINARY);
-        if (fd < 0) {
-            close(cow_fd);
-            goto mtime_fail;
-        }
-        if (fstat(fd, &st) != 0) {
-            close(fd);
+        if (stat(image_filename, &st) != 0) {
             goto mtime_fail;
         }
-        close(fd);
         cow_header.mtime = cpu_to_be32(st.st_mtime);
     mtime_fail:
         pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
@@ -288,21 +287,20 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
     }
     cow_header.sectorsize = cpu_to_be32(512);
     cow_header.size = cpu_to_be64(image_sectors * 512);
-    ret = qemu_write_full(cow_fd, &cow_header, sizeof(cow_header));
+    ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header));
     if (ret != sizeof(cow_header)) {
-        ret = -errno;
         goto exit;
     }
 
     /* resize to include at least all the bitmap */
-    ret = ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3));
+    ret = bdrv_truncate(cow_bs,
+        sizeof(cow_header) + ((image_sectors + 7) >> 3));
     if (ret) {
-        ret = -errno;
         goto exit;
     }
 
 exit:
-    close(cow_fd);
+    bdrv_delete(cow_bs);
     return ret;
 }
 
@@ -337,7 +335,7 @@ static BlockDriver bdrv_cow = {
     .bdrv_read              = cow_co_read,
     .bdrv_write             = cow_co_write,
     .bdrv_co_flush_to_disk  = cow_co_flush,
-    .bdrv_is_allocated      = cow_is_allocated,
+    .bdrv_co_is_allocated   = cow_co_is_allocated,
 
     .create_options = cow_create_options,
 };
diff --git a/block/qcow.c b/block/qcow.c
index 4814ed0ced..b16955d764 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -368,14 +368,16 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
     return cluster_offset;
 }
 
-static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int *pnum)
+static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster, n;
     uint64_t cluster_offset;
 
+    qemu_co_mutex_lock(&s->lock);
     cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+    qemu_co_mutex_unlock(&s->lock);
     index_in_cluster = sector_num & (s->cluster_sectors - 1);
     n = s->cluster_sectors - index_in_cluster;
     if (n > nb_sectors)
@@ -433,7 +435,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
     return 0;
 }
 
-static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
                          int nb_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
@@ -531,7 +533,7 @@ fail:
     goto done;
 }
 
-static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
@@ -844,7 +846,7 @@ static BlockDriver bdrv_qcow = {
     .bdrv_co_readv          = qcow_co_readv,
     .bdrv_co_writev         = qcow_co_writev,
     .bdrv_co_flush_to_disk  = qcow_co_flush,
-    .bdrv_is_allocated      = qcow_is_allocated,
+    .bdrv_co_is_allocated   = qcow_co_is_allocated,
 
     .bdrv_set_key           = qcow_set_key,
     .bdrv_make_empty        = qcow_make_empty,
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index f4e049fa90..07a2e936fd 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -289,89 +289,62 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
     }
 }
 
-
-static int qcow2_read(BlockDriverState *bs, int64_t sector_num,
-                      uint8_t *buf, int nb_sectors)
+static int coroutine_fn copy_sectors(BlockDriverState *bs,
+                                     uint64_t start_sect,
+                                     uint64_t cluster_offset,
+                                     int n_start, int n_end)
 {
     BDRVQcowState *s = bs->opaque;
-    int ret, index_in_cluster, n, n1;
-    uint64_t cluster_offset;
-    struct iovec iov;
     QEMUIOVector qiov;
+    struct iovec iov;
+    int n, ret;
 
-    while (nb_sectors > 0) {
-        n = nb_sectors;
-
-        ret = qcow2_get_cluster_offset(bs, sector_num << 9, &n,
-            &cluster_offset);
-        if (ret < 0) {
-            return ret;
-        }
-
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        if (!cluster_offset) {
-            if (bs->backing_hd) {
-                /* read from the base image */
-                iov.iov_base = buf;
-                iov.iov_len = n * 512;
-                qemu_iovec_init_external(&qiov, &iov, 1);
-
-                n1 = qcow2_backing_read1(bs->backing_hd, &qiov, sector_num, n);
-                if (n1 > 0) {
-                    BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING);
-                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
-                    if (ret < 0)
-                        return -1;
-                }
-            } else {
-                memset(buf, 0, 512 * n);
-            }
-        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
-            if (qcow2_decompress_cluster(bs, cluster_offset) < 0)
-                return -1;
-            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
-        } else {
-            BLKDBG_EVENT(bs->file, BLKDBG_READ);
-            ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512);
-            if (ret != n * 512)
-                return -1;
-            if (s->crypt_method) {
-                qcow2_encrypt_sectors(s, sector_num, buf, buf, n, 0,
-                                &s->aes_decrypt_key);
-            }
-        }
-        nb_sectors -= n;
-        sector_num += n;
-        buf += n * 512;
+    /*
+     * If this is the last cluster and it is only partially used, we must only
+     * copy until the end of the image, or bdrv_check_request will fail for the
+     * bdrv_read/write calls below.
+     */
+    if (start_sect + n_end > bs->total_sectors) {
+        n_end = bs->total_sectors - start_sect;
     }
-    return 0;
-}
-
-static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
-                        uint64_t cluster_offset, int n_start, int n_end)
-{
-    BDRVQcowState *s = bs->opaque;
-    int n, ret;
 
     n = n_end - n_start;
-    if (n <= 0)
+    if (n <= 0) {
         return 0;
+    }
+
+    iov.iov_len = n * BDRV_SECTOR_SIZE;
+    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
     BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
-    ret = qcow2_read(bs, start_sect + n_start, s->cluster_data, n);
-    if (ret < 0)
-        return ret;
+
+    /* Call .bdrv_co_readv() directly instead of using the public block-layer
+     * interface.  This avoids double I/O throttling and request tracking,
+     * which can lead to deadlock when block layer copy-on-read is enabled.
+     */
+    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
+    if (ret < 0) {
+        goto out;
+    }
+
     if (s->crypt_method) {
         qcow2_encrypt_sectors(s, start_sect + n_start,
-                        s->cluster_data,
-                        s->cluster_data, n, 1,
+                        iov.iov_base, iov.iov_base, n, 1,
                         &s->aes_encrypt_key);
     }
+
     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-    ret = bdrv_write(bs->file, (cluster_offset >> 9) + n_start,
-        s->cluster_data, n);
-    if (ret < 0)
-        return ret;
-    return 0;
+    ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = 0;
+out:
+    qemu_vfree(iov.iov_base);
+    return ret;
 }
 
 
@@ -620,7 +593,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
     if (m->n_start) {
         cow = true;
+        qemu_co_mutex_unlock(&s->lock);
         ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
+        qemu_co_mutex_lock(&s->lock);
         if (ret < 0)
             goto err;
     }
@@ -628,8 +603,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     if (m->nb_available & (s->cluster_sectors - 1)) {
         uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
         cow = true;
+        qemu_co_mutex_unlock(&s->lock);
         ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
                 m->nb_available - end, s->cluster_sectors);
+        qemu_co_mutex_lock(&s->lock);
         if (ret < 0)
             goto err;
     }
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 9605367777..2db2ede3d1 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -700,6 +700,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     l2_table = NULL;
     l1_table = NULL;
     l1_size2 = l1_size * sizeof(uint64_t);
+
+    /* WARNING: qcow2_snapshot_goto relies on this function not using the
+     * l1_table_offset when it is the current s->l1_table_offset! Be careful
+     * when changing this! */
     if (l1_table_offset != s->l1_table_offset) {
         if (l1_size2 != 0) {
             l1_table = g_malloc0(align_offset(l1_size2, 512));
@@ -819,7 +823,8 @@ fail:
     qcow2_cache_set_writethrough(bs, s->refcount_block_cache,
         old_refcount_writethrough);
 
-    if (l1_modified) {
+    /* Update L1 only if it isn't deleted anyway (addend = -1) */
+    if (addend >= 0 && l1_modified) {
         for(i = 0; i < l1_size; i++)
             cpu_to_be64s(&l1_table[i]);
         if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table,
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index bdc33ba94c..c3112bf71a 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -68,6 +68,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
     int i, id_str_size, name_size;
     int64_t offset;
     uint32_t extra_data_size;
+    int ret;
 
     if (!s->nb_snapshots) {
         s->snapshots = NULL;
@@ -77,10 +78,15 @@ int qcow2_read_snapshots(BlockDriverState *bs)
 
     offset = s->snapshots_offset;
     s->snapshots = g_malloc0(s->nb_snapshots * sizeof(QCowSnapshot));
+
     for(i = 0; i < s->nb_snapshots; i++) {
+        /* Read statically sized part of the snapshot header */
         offset = align_offset(offset, 8);
-        if (bdrv_pread(bs->file, offset, &h, sizeof(h)) != sizeof(h))
+        ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
+        if (ret < 0) {
             goto fail;
+        }
+
         offset += sizeof(h);
         sn = s->snapshots + i;
         sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
@@ -94,25 +100,34 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         id_str_size = be16_to_cpu(h.id_str_size);
         name_size = be16_to_cpu(h.name_size);
 
+        /* Skip extra data */
         offset += extra_data_size;
 
+        /* Read snapshot ID */
         sn->id_str = g_malloc(id_str_size + 1);
-        if (bdrv_pread(bs->file, offset, sn->id_str, id_str_size) != id_str_size)
+        ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
+        if (ret < 0) {
             goto fail;
+        }
         offset += id_str_size;
         sn->id_str[id_str_size] = '\0';
 
+        /* Read snapshot name */
         sn->name = g_malloc(name_size + 1);
-        if (bdrv_pread(bs->file, offset, sn->name, name_size) != name_size)
+        ret = bdrv_pread(bs->file, offset, sn->name, name_size);
+        if (ret < 0) {
             goto fail;
+        }
         offset += name_size;
         sn->name[name_size] = '\0';
     }
+
     s->snapshots_size = offset - s->snapshots_offset;
     return 0;
- fail:
+
+fail:
     qcow2_free_snapshots(bs);
-    return -1;
+    return ret;
 }
 
 /* add at the end of the file a new list of snapshots */
@@ -122,9 +137,12 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     QCowSnapshot *sn;
     QCowSnapshotHeader h;
     int i, name_size, id_str_size, snapshots_size;
-    uint64_t data64;
-    uint32_t data32;
+    struct {
+        uint32_t nb_snapshots;
+        uint64_t snapshots_offset;
+    } QEMU_PACKED header_data;
     int64_t offset, snapshots_offset;
+    int ret;
 
     /* compute the size of the snapshots */
     offset = 0;
@@ -137,6 +155,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     }
     snapshots_size = offset;
 
+    /* Allocate space for the new snapshot list */
     snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size);
     bdrv_flush(bs->file);
     offset = snapshots_offset;
@@ -144,6 +163,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         return offset;
     }
 
+    /* Write all snapshots to the new list */
     for(i = 0; i < s->nb_snapshots; i++) {
         sn = s->snapshots + i;
         memset(&h, 0, sizeof(h));
@@ -159,34 +179,55 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         h.id_str_size = cpu_to_be16(id_str_size);
         h.name_size = cpu_to_be16(name_size);
         offset = align_offset(offset, 8);
-        if (bdrv_pwrite_sync(bs->file, offset, &h, sizeof(h)) < 0)
+
+        ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
+        if (ret < 0) {
             goto fail;
+        }
         offset += sizeof(h);
-        if (bdrv_pwrite_sync(bs->file, offset, sn->id_str, id_str_size) < 0)
+
+        ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
+        if (ret < 0) {
             goto fail;
+        }
         offset += id_str_size;
-        if (bdrv_pwrite_sync(bs->file, offset, sn->name, name_size) < 0)
+
+        ret = bdrv_pwrite(bs->file, offset, sn->name, name_size);
+        if (ret < 0) {
             goto fail;
+        }
         offset += name_size;
     }
 
-    /* update the various header fields */
-    data64 = cpu_to_be64(snapshots_offset);
-    if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, snapshots_offset),
-                    &data64, sizeof(data64)) < 0)
+    /*
+     * Update the header to point to the new snapshot table. This requires the
+     * new table and its refcounts to be stable on disk.
+     */
+    ret = bdrv_flush(bs);
+    if (ret < 0) {
         goto fail;
-    data32 = cpu_to_be32(s->nb_snapshots);
-    if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
-                    &data32, sizeof(data32)) < 0)
+    }
+
+    QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
+        offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+
+    header_data.nb_snapshots        = cpu_to_be32(s->nb_snapshots);
+    header_data.snapshots_offset    = cpu_to_be64(snapshots_offset);
+
+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+                           &header_data, sizeof(header_data));
+    if (ret < 0) {
         goto fail;
+    }
 
     /* free the old snapshot table */
     qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size);
     s->snapshots_offset = snapshots_offset;
     s->snapshots_size = snapshots_size;
     return 0;
- fail:
-    return -1;
+
+fail:
+    return ret;
 }
 
 static void find_new_snapshot_id(BlockDriverState *bs,
@@ -236,72 +277,92 @@ static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
 int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 {
     BDRVQcowState *s = bs->opaque;
-    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
+    QCowSnapshot *new_snapshot_list = NULL;
+    QCowSnapshot *old_snapshot_list = NULL;
+    QCowSnapshot sn1, *sn = &sn1;
     int i, ret;
     uint64_t *l1_table = NULL;
     int64_t l1_table_offset;
 
     memset(sn, 0, sizeof(*sn));
 
+    /* Generate an ID if it wasn't passed */
     if (sn_info->id_str[0] == '\0') {
-        /* compute a new id */
         find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
     }
 
-    /* check that the ID is unique */
-    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
+    /* Check that the ID is unique */
+    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) {
         return -ENOENT;
+    }
 
+    /* Populate sn with passed data */
     sn->id_str = g_strdup(sn_info->id_str);
-    if (!sn->id_str)
-        goto fail;
     sn->name = g_strdup(sn_info->name);
-    if (!sn->name)
-        goto fail;
+
     sn->vm_state_size = sn_info->vm_state_size;
     sn->date_sec = sn_info->date_sec;
     sn->date_nsec = sn_info->date_nsec;
     sn->vm_clock_nsec = sn_info->vm_clock_nsec;
 
-    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
-    if (ret < 0)
-        goto fail;
-
-    /* create the L1 table of the snapshot */
+    /* Allocate the L1 table of the snapshot and copy the current one there. */
     l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
     if (l1_table_offset < 0) {
+        ret = l1_table_offset;
         goto fail;
     }
-    bdrv_flush(bs->file);
 
     sn->l1_table_offset = l1_table_offset;
     sn->l1_size = s->l1_size;
 
-    if (s->l1_size != 0) {
-        l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
-    } else {
-        l1_table = NULL;
-    }
-
+    l1_table = g_malloc(s->l1_size * sizeof(uint64_t));
     for(i = 0; i < s->l1_size; i++) {
         l1_table[i] = cpu_to_be64(s->l1_table[i]);
     }
-    if (bdrv_pwrite_sync(bs->file, sn->l1_table_offset,
-                    l1_table, s->l1_size * sizeof(uint64_t)) < 0)
+
+    ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table,
+                      s->l1_size * sizeof(uint64_t));
+    if (ret < 0) {
         goto fail;
+    }
+
     g_free(l1_table);
     l1_table = NULL;
 
-    snapshots1 = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+    /*
+     * Increase the refcounts of all clusters and make sure everything is
+     * stable on disk before updating the snapshot table to contain a pointer
+     * to the new L1 table.
+     */
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    ret = bdrv_flush(bs);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Append the new snapshot to the snapshot list */
+    new_snapshot_list = g_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
     if (s->snapshots) {
-        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
-        g_free(s->snapshots);
+        memcpy(new_snapshot_list, s->snapshots,
+               s->nb_snapshots * sizeof(QCowSnapshot));
+        old_snapshot_list = s->snapshots;
     }
-    s->snapshots = snapshots1;
+    s->snapshots = new_snapshot_list;
     s->snapshots[s->nb_snapshots++] = *sn;
 
-    if (qcow2_write_snapshots(bs) < 0)
+    ret = qcow2_write_snapshots(bs);
+    if (ret < 0) {
+        g_free(s->snapshots);
+        s->snapshots = old_snapshot_list;
         goto fail;
+    }
+
+    g_free(old_snapshot_list);
+
 #ifdef DEBUG_ALLOC
     {
       BdrvCheckResult result = {0};
@@ -309,10 +370,13 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     }
 #endif
     return 0;
- fail:
+
+fail:
+    g_free(sn->id_str);
     g_free(sn->name);
     g_free(l1_table);
-    return -1;
+
+    return ret;
 }
 
 /* copy the snapshot 'snapshot_name' into the current disk image */
@@ -322,38 +386,92 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
     QCowSnapshot *sn;
     int i, snapshot_index;
     int cur_l1_bytes, sn_l1_bytes;
+    int ret;
+    uint64_t *sn_l1_table = NULL;
 
+    /* Search the snapshot */
     snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
-    if (snapshot_index < 0)
+    if (snapshot_index < 0) {
         return -ENOENT;
+    }
     sn = &s->snapshots[snapshot_index];
 
-    if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
-        goto fail;
-
-    if (qcow2_grow_l1_table(bs, sn->l1_size, true) < 0)
+    /*
+     * Make sure that the current L1 table is big enough to contain the whole
+     * L1 table of the snapshot. If the snapshot L1 table is smaller, the
+     * current one must be padded with zeros.
+     */
+    ret = qcow2_grow_l1_table(bs, sn->l1_size, true);
+    if (ret < 0) {
         goto fail;
+    }
 
     cur_l1_bytes = s->l1_size * sizeof(uint64_t);
     sn_l1_bytes = sn->l1_size * sizeof(uint64_t);
 
-    if (cur_l1_bytes > sn_l1_bytes) {
-        memset(s->l1_table + sn->l1_size, 0, cur_l1_bytes - sn_l1_bytes);
+    /*
+     * Copy the snapshot L1 table to the current L1 table.
+     *
+     * Before overwriting the old current L1 table on disk, make sure to
+     * increase all refcounts for the clusters referenced by the new one.
+     * Decrease the refcount referenced by the old one only when the L1
+     * table is overwritten.
+     */
+    sn_l1_table = g_malloc0(cur_l1_bytes);
+
+    ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes);
+    if (ret < 0) {
+        goto fail;
     }
 
-    /* copy the snapshot l1 table to the current l1 table */
-    if (bdrv_pread(bs->file, sn->l1_table_offset,
-                   s->l1_table, sn_l1_bytes) < 0)
+    ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset,
+                                         sn->l1_size, 1);
+    if (ret < 0) {
         goto fail;
-    if (bdrv_pwrite_sync(bs->file, s->l1_table_offset,
-                    s->l1_table, cur_l1_bytes) < 0)
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table,
+                           cur_l1_bytes);
+    if (ret < 0) {
         goto fail;
+    }
+
+    /*
+     * Decrease refcount of clusters of current L1 table.
+     *
+     * At this point, the in-memory s->l1_table points to the old L1 table,
+     * whereas on disk we already have the new one.
+     *
+     * qcow2_update_snapshot_refcount special cases the current L1 table to use
+     * the in-memory data instead of really using the offset to load a new one,
+     * which is why this works.
+     */
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+                                         s->l1_size, -1);
+
+    /*
+     * Now update the in-memory L1 table to be in sync with the on-disk one. We
+     * need to do this even if updating refcounts failed.
+     */
     for(i = 0;i < s->l1_size; i++) {
-        be64_to_cpus(&s->l1_table[i]);
+        s->l1_table[i] = be64_to_cpu(sn_l1_table[i]);
     }
 
-    if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
+    if (ret < 0) {
         goto fail;
+    }
+
+    g_free(sn_l1_table);
+    sn_l1_table = NULL;
+
+    /*
+     * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed
+     * when we decreased the refcount of the old snapshot.
+     */
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+    if (ret < 0) {
+        goto fail;
+    }
 
 #ifdef DEBUG_ALLOC
     {
@@ -362,39 +480,59 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
     }
 #endif
     return 0;
- fail:
-    return -EIO;
+
+fail:
+    g_free(sn_l1_table);
+    return ret;
 }
 
 int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
 {
     BDRVQcowState *s = bs->opaque;
-    QCowSnapshot *sn;
+    QCowSnapshot sn;
     int snapshot_index, ret;
 
+    /* Search the snapshot */
     snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
-    if (snapshot_index < 0)
+    if (snapshot_index < 0) {
         return -ENOENT;
-    sn = &s->snapshots[snapshot_index];
+    }
+    sn = s->snapshots[snapshot_index];
+
+    /* Remove it from the snapshot list */
+    memmove(s->snapshots + snapshot_index,
+            s->snapshots + snapshot_index + 1,
+            (s->nb_snapshots - snapshot_index - 1) * sizeof(sn));
+    s->nb_snapshots--;
+    ret = qcow2_write_snapshots(bs);
+    if (ret < 0) {
+        return ret;
+    }
 
-    ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1);
-    if (ret < 0)
+    /*
+     * The snapshot is now unused, clean up. If we fail after this point, we
+     * won't recover but just leak clusters.
+     */
+    g_free(sn.id_str);
+    g_free(sn.name);
+
+    /*
+     * Now decrease the refcounts of clusters referenced by the snapshot and
+     * free the L1 table.
+     */
+    ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset,
+                                         sn.l1_size, -1);
+    if (ret < 0) {
         return ret;
+    }
+    qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t));
+
     /* must update the copied flag on the current cluster offsets */
     ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
-    if (ret < 0)
-        return ret;
-    qcow2_free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t));
-
-    g_free(sn->id_str);
-    g_free(sn->name);
-    memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn));
-    s->nb_snapshots--;
-    ret = qcow2_write_snapshots(bs);
     if (ret < 0) {
-        /* XXX: restore snapshot if error ? */
         return ret;
     }
+
 #ifdef DEBUG_ALLOC
     {
         BdrvCheckResult result = {0};
@@ -435,32 +573,42 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 
 int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name)
 {
-    int i, snapshot_index, l1_size2;
+    int i, snapshot_index;
     BDRVQcowState *s = bs->opaque;
     QCowSnapshot *sn;
+    uint64_t *new_l1_table;
+    int new_l1_bytes;
+    int ret;
+
+    assert(bs->read_only);
 
+    /* Search the snapshot */
     snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name);
     if (snapshot_index < 0) {
         return -ENOENT;
     }
-
     sn = &s->snapshots[snapshot_index];
-    s->l1_size = sn->l1_size;
-    l1_size2 = s->l1_size * sizeof(uint64_t);
-    if (s->l1_table != NULL) {
-        g_free(s->l1_table);
-    }
 
-    s->l1_table_offset = sn->l1_table_offset;
-    s->l1_table = g_malloc0(align_offset(l1_size2, 512));
+    /* Allocate and read in the snapshot's L1 table */
+    new_l1_bytes = s->l1_size * sizeof(uint64_t);
+    new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512));
 
-    if (bdrv_pread(bs->file, sn->l1_table_offset,
-                   s->l1_table, l1_size2) != l1_size2) {
-        return -1;
+    ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes);
+    if (ret < 0) {
+        g_free(new_l1_table);
+        return ret;
     }
 
+    /* Switch the L1 table */
+    g_free(s->l1_table);
+
+    s->l1_size = sn->l1_size;
+    s->l1_table_offset = sn->l1_table_offset;
+    s->l1_table = new_l1_table;
+
     for(i = 0;i < s->l1_size; i++) {
         be64_to_cpus(&s->l1_table[i]);
     }
+
     return 0;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index 9e1b1eb2ed..37cd4424d4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -273,8 +273,9 @@ static int qcow2_open(BlockDriverState *bs, int flags)
         }
         bs->backing_file[len] = '\0';
     }
-    if (qcow2_read_snapshots(bs) < 0) {
-        ret = -EINVAL;
+
+    ret = qcow2_read_snapshots(bs);
+    if (ret < 0) {
         goto fail;
     }
 
@@ -343,16 +344,19 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
     return 0;
 }
 
-static int qcow2_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                              int nb_sectors, int *pnum)
+static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
+    BDRVQcowState *s = bs->opaque;
     uint64_t cluster_offset;
     int ret;
 
     *pnum = nb_sectors;
-    /* FIXME We can get errors here, but the bdrv_is_allocated interface can't
-     * pass them on today */
+    /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
+     * can't pass them on today */
+    qemu_co_mutex_lock(&s->lock);
     ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
+    qemu_co_mutex_unlock(&s->lock);
     if (ret < 0) {
         *pnum = 0;
     }
@@ -377,7 +381,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
     return n1;
 }
 
-static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
                           int remaining_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
@@ -512,12 +516,12 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
     /* Restart all dependent requests */
     if (!qemu_co_queue_empty(&m->dependent_requests)) {
         qemu_co_mutex_unlock(&s->lock);
-        while(qemu_co_queue_next(&m->dependent_requests));
+        qemu_co_queue_restart_all(&m->dependent_requests);
         qemu_co_mutex_lock(&s->lock);
     }
 }
 
-static int qcow2_co_writev(BlockDriverState *bs,
+static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                            int64_t sector_num,
                            int remaining_sectors,
                            QEMUIOVector *qiov)
@@ -1137,7 +1141,7 @@ fail:
     return ret;
 }
 
-static int qcow2_co_flush_to_os(BlockDriverState *bs)
+static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
@@ -1159,7 +1163,7 @@ static int qcow2_co_flush_to_os(BlockDriverState *bs)
     return 0;
 }
 
-static int qcow2_co_flush_to_disk(BlockDriverState *bs)
+static coroutine_fn int qcow2_co_flush_to_disk(BlockDriverState *bs)
 {
     return bdrv_co_flush(bs->file);
 }
@@ -1276,7 +1280,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_open          = qcow2_open,
     .bdrv_close         = qcow2_close,
     .bdrv_create        = qcow2_create,
-    .bdrv_is_allocated  = qcow2_is_allocated,
+    .bdrv_co_is_allocated = qcow2_co_is_allocated,
     .bdrv_set_key       = qcow2_set_key,
     .bdrv_make_empty    = qcow2_make_empty,
 
diff --git a/block/qed-table.c b/block/qed-table.c
index f31f9ff069..8ee844346c 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -29,7 +29,7 @@ static void qed_read_table_cb(void *opaque, int ret)
 {
     QEDReadTableCB *read_table_cb = opaque;
     QEDTable *table = read_table_cb->table;
-    int noffsets = read_table_cb->iov.iov_len / sizeof(uint64_t);
+    int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
     int i;
 
     /* Handle I/O error */
@@ -65,7 +65,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
 
     qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
     aiocb = bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
-                           read_table_cb->iov.iov_len / BDRV_SECTOR_SIZE,
+                           qiov->size / BDRV_SECTOR_SIZE,
                            qed_read_table_cb, read_table_cb);
     if (!aiocb) {
         qed_read_table_cb(read_table_cb, -EIO);
@@ -160,7 +160,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
 
     aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
                             &write_table_cb->qiov,
-                            write_table_cb->iov.iov_len / BDRV_SECTOR_SIZE,
+                            write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
                             qed_write_table_cb, write_table_cb);
     if (!aiocb) {
         qed_write_table_cb(write_table_cb, -EIO);
diff --git a/block/qed.c b/block/qed.c
index 7e22e77b9d..22e467202f 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -661,6 +661,7 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
 }
 
 typedef struct {
+    Coroutine *co;
     int is_allocated;
     int *pnum;
 } QEDIsAllocatedCB;
@@ -670,10 +671,14 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
     QEDIsAllocatedCB *cb = opaque;
     *cb->pnum = len / BDRV_SECTOR_SIZE;
     cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
+    if (cb->co) {
+        qemu_coroutine_enter(cb->co, NULL);
+    }
 }
 
-static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                                  int nb_sectors, int *pnum)
+static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
+                                                 int64_t sector_num,
+                                                 int nb_sectors, int *pnum)
 {
     BDRVQEDState *s = bs->opaque;
     uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
@@ -686,8 +691,10 @@ static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
 
     qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
 
+    /* Now sleep if the callback wasn't invoked immediately */
     while (cb.is_allocated == -1) {
-        qemu_aio_wait();
+        cb.co = qemu_coroutine_self();
+        qemu_coroutine_yield();
     }
 
     qed_unref_l2_cache_entry(request.l2_table);
@@ -1485,7 +1492,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_open                = bdrv_qed_open,
     .bdrv_close               = bdrv_qed_close,
     .bdrv_create              = bdrv_qed_create,
-    .bdrv_is_allocated        = bdrv_qed_is_allocated,
+    .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated,
     .bdrv_make_empty          = bdrv_qed_make_empty,
     .bdrv_aio_readv           = bdrv_qed_aio_readv,
     .bdrv_aio_writev          = bdrv_qed_aio_writev,
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 62f1f3a0cf..aa9707f2ae 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1715,7 +1715,7 @@ out:
     return 1;
 }
 
-static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors, QEMUIOVector *qiov)
 {
     SheepdogAIOCB *acb;
@@ -1744,7 +1744,7 @@ static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
     return acb->ret;
 }
 
-static int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
+static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
 {
     SheepdogAIOCB *acb;
diff --git a/block/vdi.c b/block/vdi.c
index 02da6b44d0..e1d8cffc2d 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -472,8 +472,8 @@ static int vdi_open(BlockDriverState *bs, int flags)
     return -1;
 }
 
-static int vdi_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int *pnum)
+static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
     /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */
     BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
@@ -996,7 +996,7 @@ static BlockDriver bdrv_vdi = {
     .bdrv_close = vdi_close,
     .bdrv_create = vdi_create,
     .bdrv_co_flush_to_disk = vdi_co_flush,
-    .bdrv_is_allocated = vdi_is_allocated,
+    .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
     .bdrv_aio_readv = vdi_aio_readv,
diff --git a/block/vmdk.c b/block/vmdk.c
index f5441591d7..5623ac10cd 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -861,8 +861,8 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
     return NULL;
 }
 
-static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int *pnum)
+static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
 {
     BDRVVmdkState *s = bs->opaque;
     int64_t index_in_cluster, n, ret;
@@ -873,8 +873,10 @@ static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
     if (!extent) {
         return 0;
     }
+    qemu_co_mutex_lock(&s->lock);
     ret = get_cluster_offset(bs, extent, NULL,
                             sector_num * 512, 0, &offset);
+    qemu_co_mutex_unlock(&s->lock);
     /* get_cluster_offset returning 0 means success */
     ret = !ret;
 
@@ -1596,7 +1598,7 @@ static BlockDriver bdrv_vmdk = {
     .bdrv_close     = vmdk_close,
     .bdrv_create    = vmdk_create,
     .bdrv_co_flush_to_disk  = vmdk_co_flush,
-    .bdrv_is_allocated      = vmdk_is_allocated,
+    .bdrv_co_is_allocated   = vmdk_co_is_allocated,
     .bdrv_get_allocated_file_size  = vmdk_get_allocated_file_size,
 
     .create_options = vmdk_create_options,
diff --git a/block/vvfat.c b/block/vvfat.c
index a310ce8c3e..eeffc4a4a8 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2758,7 +2758,7 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
-static int vvfat_is_allocated(BlockDriverState *bs,
+static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs,
 	int64_t sector_num, int nb_sectors, int* n)
 {
     BDRVVVFATState* s = bs->opaque;
@@ -2855,7 +2855,7 @@ static BlockDriver bdrv_vvfat = {
     .bdrv_read          = vvfat_co_read,
     .bdrv_write         = vvfat_co_write,
     .bdrv_close		= vvfat_close,
-    .bdrv_is_allocated	= vvfat_is_allocated,
+    .bdrv_co_is_allocated = vvfat_co_is_allocated,
     .protocol_name	= "fat",
 };
 
diff --git a/block_int.h b/block_int.h
index 77c0187c3d..311bd2a6fa 100644
--- a/block_int.h
+++ b/block_int.h
@@ -34,6 +34,13 @@
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
 
+#define BLOCK_IO_LIMIT_READ     0
+#define BLOCK_IO_LIMIT_WRITE    1
+#define BLOCK_IO_LIMIT_TOTAL    2
+
+#define BLOCK_IO_SLICE_TIME     100000000
+#define NANOSECONDS_PER_SECOND  1000000000.0
+
 #define BLOCK_OPT_SIZE          "size"
 #define BLOCK_OPT_ENCRYPT       "encryption"
 #define BLOCK_OPT_COMPAT6       "compat6"
@@ -44,12 +51,24 @@
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
 
+typedef struct BdrvTrackedRequest BdrvTrackedRequest;
+
 typedef struct AIOPool {
     void (*cancel)(BlockDriverAIOCB *acb);
     int aiocb_size;
     BlockDriverAIOCB *free_aiocb;
 } AIOPool;
 
+typedef struct BlockIOLimit {
+    int64_t bps[3];
+    int64_t iops[3];
+} BlockIOLimit;
+
+typedef struct BlockIOBaseValue {
+    uint64_t bytes[2];
+    uint64_t ios[2];
+} BlockIOBaseValue;
+
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -63,8 +82,6 @@ struct BlockDriver {
                       const uint8_t *buf, int nb_sectors);
     void (*bdrv_close)(BlockDriverState *bs);
     int (*bdrv_create)(const char *filename, QEMUOptionParameter *options);
-    int (*bdrv_is_allocated)(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int *pnum);
     int (*bdrv_set_key)(BlockDriverState *bs, const char *key);
     int (*bdrv_make_empty)(BlockDriverState *bs);
     /* aio */
@@ -86,6 +103,8 @@ struct BlockDriver {
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
     int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors);
+    int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum);
 
     /*
      * Invalidate any cached meta-data.
@@ -179,6 +198,8 @@ struct BlockDriverState {
     int encrypted; /* if true, the media is encrypted */
     int valid_key; /* if true, a valid encryption key has been set */
     int sg;        /* if true, the device is a /dev/sg* */
+    int copy_on_read; /* if true, copy read backing sectors into image
+                         note this is a reference count */
 
     BlockDriver *drv; /* NULL means no media */
     void *opaque;
@@ -201,6 +222,16 @@ struct BlockDriverState {
 
     void *sync_aiocb;
 
+    /* the time for latest disk I/O */
+    int64_t slice_time;
+    int64_t slice_start;
+    int64_t slice_end;
+    BlockIOLimit io_limits;
+    BlockIOBaseValue  io_base;
+    CoQueue      throttled_reqs;
+    QEMUTimer    *block_timer;
+    bool         io_limits_enabled;
+
     /* I/O stats (display with "info blockstats"). */
     uint64_t nr_bytes[BDRV_MAX_IOTYPE];
     uint64_t nr_ops[BDRV_MAX_IOTYPE];
@@ -228,6 +259,8 @@ struct BlockDriverState {
     int in_use; /* users other than guest access, eg. block migration */
     QTAILQ_ENTRY(BlockDriverState) list;
     void *private;
+
+    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 };
 
 struct BlockDriverAIOCB {
@@ -244,6 +277,9 @@ void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                    BlockDriverCompletionFunc *cb, void *opaque);
 void qemu_aio_release(void *p);
 
+void bdrv_set_io_limits(BlockDriverState *bs,
+                        BlockIOLimit *io_limits);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif
diff --git a/blockdev.c b/blockdev.c
index 222818690d..dbf0251a77 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -216,6 +216,26 @@ static int parse_block_error_action(const char *buf, int is_read)
     }
 }
 
+static bool do_check_io_limits(BlockIOLimit *io_limits)
+{
+    bool bps_flag;
+    bool iops_flag;
+
+    assert(io_limits);
+
+    bps_flag  = (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] != 0)
+                 && ((io_limits->bps[BLOCK_IO_LIMIT_READ] != 0)
+                 || (io_limits->bps[BLOCK_IO_LIMIT_WRITE] != 0));
+    iops_flag = (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] != 0)
+                 && ((io_limits->iops[BLOCK_IO_LIMIT_READ] != 0)
+                 || (io_limits->iops[BLOCK_IO_LIMIT_WRITE] != 0));
+    if (bps_flag || iops_flag) {
+        return false;
+    }
+
+    return true;
+}
+
 DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 {
     const char *buf;
@@ -235,7 +255,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     int on_read_error, on_write_error;
     const char *devaddr;
     DriveInfo *dinfo;
+    BlockIOLimit io_limits;
     int snapshot = 0;
+    bool copy_on_read;
     int ret;
 
     translation = BIOS_ATA_TRANSLATION_AUTO;
@@ -252,6 +274,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 
     snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
     ro = qemu_opt_get_bool(opts, "readonly", 0);
+    copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false);
 
     file = qemu_opt_get(opts, "file");
     serial = qemu_opt_get(opts, "serial");
@@ -353,6 +376,26 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         }
     }
 
+    /* disk I/O throttling */
+    io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  =
+                           qemu_opt_get_number(opts, "bps", 0);
+    io_limits.bps[BLOCK_IO_LIMIT_READ]   =
+                           qemu_opt_get_number(opts, "bps_rd", 0);
+    io_limits.bps[BLOCK_IO_LIMIT_WRITE]  =
+                           qemu_opt_get_number(opts, "bps_wr", 0);
+    io_limits.iops[BLOCK_IO_LIMIT_TOTAL] =
+                           qemu_opt_get_number(opts, "iops", 0);
+    io_limits.iops[BLOCK_IO_LIMIT_READ]  =
+                           qemu_opt_get_number(opts, "iops_rd", 0);
+    io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
+                           qemu_opt_get_number(opts, "iops_wr", 0);
+
+    if (!do_check_io_limits(&io_limits)) {
+        error_report("bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr) "
+                     "cannot be used at the same time");
+        return NULL;
+    }
+
     on_write_error = BLOCK_ERR_STOP_ENOSPC;
     if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
         if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
@@ -460,6 +503,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 
     bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
 
+    /* disk I/O throttling */
+    bdrv_set_io_limits(dinfo->bdrv, &io_limits);
+
     switch(type) {
     case IF_IDE:
     case IF_SCSI:
@@ -502,6 +548,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         bdrv_flags |= (BDRV_O_SNAPSHOT|BDRV_O_CACHE_WB|BDRV_O_NO_FLUSH);
     }
 
+    if (copy_on_read) {
+        bdrv_flags |= BDRV_O_COPY_ON_READ;
+    }
+
     if (media == MEDIA_CDROM) {
         /* CDROM is fine for any interface, don't check.  */
         ro = 1;
@@ -603,7 +653,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
         goto out;
     }
 
-    qemu_aio_flush();
+    bdrv_drain_all();
     bdrv_flush(bs);
 
     bdrv_close(bs);
@@ -715,6 +765,65 @@ int do_change_block(Monitor *mon, const char *device,
     return monitor_read_bdrv_key_start(mon, bs, NULL, NULL);
 }
 
+/* throttling disk I/O limits */
+int do_block_set_io_throttle(Monitor *mon,
+                       const QDict *qdict, QObject **ret_data)
+{
+    BlockIOLimit io_limits;
+    const char *devname = qdict_get_str(qdict, "device");
+    BlockDriverState *bs;
+
+    io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+                        = qdict_get_try_int(qdict, "bps", -1);
+    io_limits.bps[BLOCK_IO_LIMIT_READ]
+                        = qdict_get_try_int(qdict, "bps_rd", -1);
+    io_limits.bps[BLOCK_IO_LIMIT_WRITE]
+                        = qdict_get_try_int(qdict, "bps_wr", -1);
+    io_limits.iops[BLOCK_IO_LIMIT_TOTAL]
+                        = qdict_get_try_int(qdict, "iops", -1);
+    io_limits.iops[BLOCK_IO_LIMIT_READ]
+                        = qdict_get_try_int(qdict, "iops_rd", -1);
+    io_limits.iops[BLOCK_IO_LIMIT_WRITE]
+                        = qdict_get_try_int(qdict, "iops_wr", -1);
+
+    bs = bdrv_find(devname);
+    if (!bs) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, devname);
+        return -1;
+    }
+
+    if ((io_limits.bps[BLOCK_IO_LIMIT_TOTAL] == -1)
+        || (io_limits.bps[BLOCK_IO_LIMIT_READ] == -1)
+        || (io_limits.bps[BLOCK_IO_LIMIT_WRITE] == -1)
+        || (io_limits.iops[BLOCK_IO_LIMIT_TOTAL] == -1)
+        || (io_limits.iops[BLOCK_IO_LIMIT_READ] == -1)
+        || (io_limits.iops[BLOCK_IO_LIMIT_WRITE] == -1)) {
+        qerror_report(QERR_MISSING_PARAMETER,
+                      "bps/bps_rd/bps_wr/iops/iops_rd/iops_wr");
+        return -1;
+    }
+
+    if (!do_check_io_limits(&io_limits)) {
+        qerror_report(QERR_INVALID_PARAMETER_COMBINATION);
+        return -1;
+    }
+
+    bs->io_limits = io_limits;
+    bs->slice_time = BLOCK_IO_SLICE_TIME;
+
+    if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_enable(bs);
+    } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_disable(bs);
+    } else {
+        if (bs->block_timer) {
+            qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock));
+        }
+    }
+
+    return 0;
+}
+
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 {
     const char *id = qdict_get_str(qdict, "id");
@@ -731,7 +840,7 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
     }
 
     /* quiesce block driver; prevent further io */
-    qemu_aio_flush();
+    bdrv_drain_all();
     bdrv_flush(bs);
     bdrv_close(bs);
 
diff --git a/blockdev.h b/blockdev.h
index 3587786a64..1b48a75499 100644
--- a/blockdev.h
+++ b/blockdev.h
@@ -63,6 +63,8 @@ int do_block_set_passwd(Monitor *mon, const QDict *qdict, QObject **ret_data);
 int do_change_block(Monitor *mon, const char *device,
                     const char *filename, const char *fmt);
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_block_set_io_throttle(Monitor *mon,
+                             const QDict *qdict, QObject **ret_data);
 int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data);
 int do_block_resize(Monitor *mon, const QDict *qdict, QObject **ret_data);
 
diff --git a/cpus.c b/cpus.c
index 9d2a2cb474..ca46ec6523 100644
--- a/cpus.c
+++ b/cpus.c
@@ -396,7 +396,7 @@ static void do_vm_stop(RunState state)
         pause_all_vcpus();
         runstate_set(state);
         vm_state_notify(0, state);
-        qemu_aio_flush();
+        bdrv_drain_all();
         bdrv_flush_all();
         monitor_protocol_event(QEVENT_STOP, NULL);
     }
diff --git a/dma-helpers.c b/dma-helpers.c
index bdcd38cd27..9d6b6fa1fd 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -9,6 +9,7 @@
 
 #include "dma.h"
 #include "block_int.h"
+#include "trace.h"
 
 void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint)
 {
@@ -83,6 +84,8 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
 
 static void dma_complete(DMAAIOCB *dbs, int ret)
 {
+    trace_dma_complete(dbs, ret, dbs->common.cb);
+
     dma_bdrv_unmap(dbs);
     if (dbs->common.cb) {
         dbs->common.cb(dbs->common.opaque, ret);
@@ -106,6 +109,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
     target_phys_addr_t cur_addr, cur_len;
     void *mem;
 
+    trace_dma_bdrv_cb(dbs, ret);
+
     dbs->acb = NULL;
     dbs->sector_num += dbs->iov.size / 512;
     dma_bdrv_unmap(dbs);
@@ -130,6 +135,7 @@ static void dma_bdrv_cb(void *opaque, int ret)
     }
 
     if (dbs->iov.size == 0) {
+        trace_dma_map_wait(dbs);
         cpu_register_map_client(dbs, continue_after_map_failure);
         return;
     }
@@ -145,6 +151,8 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb)
 {
     DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
 
+    trace_dma_aio_cancel(dbs);
+
     if (dbs->acb) {
         BlockDriverAIOCB *acb = dbs->acb;
         dbs->acb = NULL;
@@ -168,6 +176,8 @@ BlockDriverAIOCB *dma_bdrv_io(
 {
     DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
 
+    trace_dma_bdrv_io(dbs, bs, sector_num, to_dev);
+
     dbs->acb = NULL;
     dbs->bs = bs;
     dbs->sg = sg;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 089c1ac23d..79a919526d 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -860,9 +860,10 @@ ETEXI
         .args_type  = "pci_addr:s,opts:s",
         .params     = "[[<domain>:]<bus>:]<slot>\n"
                       "[file=file][,if=type][,bus=n]\n"
-                      "[,unit=m][,media=d][index=i]\n"
+                      "[,unit=m][,media=d][,index=i]\n"
                       "[,cyls=c,heads=h,secs=s[,trans=t]]\n"
-                      "[snapshot=on|off][,cache=on|off]",
+                      "[,snapshot=on|off][,cache=on|off]\n"
+                      "[,readonly=on|off][,copy-on-read=on|off]",
         .help       = "add drive to PCI storage controller",
         .mhandler.cmd = drive_hot_add,
     },
@@ -1207,6 +1208,21 @@ ETEXI
     },
 
 STEXI
+@item block_set_io_throttle @var{device} @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
+@findex block_set_io_throttle
+Change I/O throttle limits for a block drive to @var{bps} @var{bps_rd} @var{bps_wr} @var{iops} @var{iops_rd} @var{iops_wr}
+ETEXI
+
+    {
+        .name       = "block_set_io_throttle",
+        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l",
+        .params     = "device bps bps_rd bps_wr iops iops_rd iops_wr",
+        .help       = "change I/O throttle limits for a block drive",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_block_set_io_throttle,
+    },
+
+STEXI
 @item block_passwd @var{device} @var{password}
 @findex block_passwd
 Set the encrypted device @var{device} password to @var{password}
diff --git a/hmp.c b/hmp.c
index 443d3a7ff4..dfab7ad9bb 100644
--- a/hmp.c
+++ b/hmp.c
@@ -216,6 +216,16 @@ void hmp_info_block(Monitor *mon)
                            info->value->inserted->ro,
                            info->value->inserted->drv,
                            info->value->inserted->encrypted);
+
+            monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
+                            " bps_wr=%" PRId64 " iops=%" PRId64
+                            " iops_rd=%" PRId64 " iops_wr=%" PRId64,
+                            info->value->inserted->bps,
+                            info->value->inserted->bps_rd,
+                            info->value->inserted->bps_wr,
+                            info->value->inserted->iops,
+                            info->value->inserted->iops_rd,
+                            info->value->inserted->iops_wr);
         } else {
             monitor_printf(mon, " [not inserted]");
         }
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index 70b33422d2..c09d2e0a35 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -200,8 +200,9 @@ static void pmac_ide_flush(DBDMA_io *io)
 {
     MACIOIDEState *m = io->opaque;
 
-    if (m->aiocb)
-        qemu_aio_flush();
+    if (m->aiocb) {
+        bdrv_drain_all();
+    }
 }
 
 /* PowerMac IDE memory IO */
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index 49b823df79..5078c0b565 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -309,7 +309,7 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
              * aio operation with preadv/pwritev.
              */
             if (bm->bus->dma->aiocb) {
-                qemu_aio_flush();
+                bdrv_drain_all();
                 assert(bm->bus->dma->aiocb == NULL);
                 assert((bm->status & BM_STATUS_DMAING) == 0);
             }
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index d6d1f87cda..4b0d113ba8 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -474,7 +474,7 @@ static void virtio_blk_reset(VirtIODevice *vdev)
      * This should cancel pending requests, but can't do nicely until there
      * are per-device request lists.
      */
-    qemu_aio_flush();
+    bdrv_drain_all();
 }
 
 /* coalesce internal state, copy to pci i/o region 0
diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index 286bbac54a..192e81746f 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -49,7 +49,6 @@ static int syncwrite    = 0;
 static int batch_maps   = 0;
 
 static int max_requests = 32;
-static int use_aio      = 1;
 
 /* ------------------------------------------------------------- */
 
@@ -314,76 +313,6 @@ static int ioreq_map(struct ioreq *ioreq)
     return 0;
 }
 
-static int ioreq_runio_qemu_sync(struct ioreq *ioreq)
-{
-    struct XenBlkDev *blkdev = ioreq->blkdev;
-    int i, rc;
-    off_t pos;
-
-    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
-        goto err_no_map;
-    }
-    if (ioreq->presync) {
-        bdrv_flush(blkdev->bs);
-    }
-
-    switch (ioreq->req.operation) {
-    case BLKIF_OP_READ:
-        pos = ioreq->start;
-        for (i = 0; i < ioreq->v.niov; i++) {
-            rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE,
-                           ioreq->v.iov[i].iov_base,
-                           ioreq->v.iov[i].iov_len / BLOCK_SIZE);
-            if (rc != 0) {
-                xen_be_printf(&blkdev->xendev, 0, "rd I/O error (%p, len %zd)\n",
-                              ioreq->v.iov[i].iov_base,
-                              ioreq->v.iov[i].iov_len);
-                goto err;
-            }
-            pos += ioreq->v.iov[i].iov_len;
-        }
-        break;
-    case BLKIF_OP_WRITE:
-    case BLKIF_OP_WRITE_BARRIER:
-        if (!ioreq->req.nr_segments) {
-            break;
-        }
-        pos = ioreq->start;
-        for (i = 0; i < ioreq->v.niov; i++) {
-            rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE,
-                            ioreq->v.iov[i].iov_base,
-                            ioreq->v.iov[i].iov_len / BLOCK_SIZE);
-            if (rc != 0) {
-                xen_be_printf(&blkdev->xendev, 0, "wr I/O error (%p, len %zd)\n",
-                              ioreq->v.iov[i].iov_base,
-                              ioreq->v.iov[i].iov_len);
-                goto err;
-            }
-            pos += ioreq->v.iov[i].iov_len;
-        }
-        break;
-    default:
-        /* unknown operation (shouldn't happen -- parse catches this) */
-        goto err;
-    }
-
-    if (ioreq->postsync) {
-        bdrv_flush(blkdev->bs);
-    }
-    ioreq->status = BLKIF_RSP_OKAY;
-
-    ioreq_unmap(ioreq);
-    ioreq_finish(ioreq);
-    return 0;
-
-err:
-    ioreq_unmap(ioreq);
-err_no_map:
-    ioreq_finish(ioreq);
-    ioreq->status = BLKIF_RSP_ERROR;
-    return -1;
-}
-
 static void qemu_aio_complete(void *opaque, int ret)
 {
     struct ioreq *ioreq = opaque;
@@ -554,9 +483,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
     rp = blkdev->rings.common.sring->req_prod;
     xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
 
-    if (use_aio) {
-        blk_send_response_all(blkdev);
-    }
+    blk_send_response_all(blkdev);
     while (rc != rp) {
         /* pull request from ring */
         if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
@@ -579,16 +506,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
             continue;
         }
 
-        if (use_aio) {
-            /* run i/o in aio mode */
-            ioreq_runio_qemu_aio(ioreq);
-        } else {
-            /* run i/o in sync mode */
-            ioreq_runio_qemu_sync(ioreq);
-        }
-    }
-    if (!use_aio) {
-        blk_send_response_all(blkdev);
+        ioreq_runio_qemu_aio(ioreq);
     }
 
     if (blkdev->more_work && blkdev->requests_inflight < max_requests) {
diff --git a/hw/xen_platform.c b/hw/xen_platform.c
index 5e792f56f6..e62eaef7d1 100644
--- a/hw/xen_platform.c
+++ b/hw/xen_platform.c
@@ -120,7 +120,7 @@ static void platform_fixed_ioport_writew(void *opaque, uint32_t addr, uint32_t v
            devices, and bit 2 the non-primary-master IDE devices. */
         if (val & UNPLUG_ALL_IDE_DISKS) {
             DPRINTF("unplug disks\n");
-            qemu_aio_flush();
+            bdrv_drain_all();
             bdrv_flush_all();
             pci_unplug_disks(s->pci_dev.bus);
         }
diff --git a/qapi-schema.json b/qapi-schema.json
index cb1ba776df..fbbdbe0914 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -370,13 +370,27 @@
 #
 # @encrypted: true if the backing device is encrypted
 #
+# @bps: total throughput limit in bytes per second is specified
+#
+# @bps_rd: read throughput limit in bytes per second is specified
+#
+# @bps_wr: write throughput limit in bytes per second is specified
+#
+# @iops: total I/O operations per second is specified
+#
+# @iops_rd: read I/O operations per second is specified
+#
+# @iops_wr: write I/O operations per second is specified
+#
 # Since: 0.14.0
 #
 # Notes: This interface is only found in @BlockInfo.
 ##
 { 'type': 'BlockDeviceInfo',
   'data': { 'file': 'str', 'ro': 'bool', 'drv': 'str',
-            '*backing_file': 'str', 'encrypted': 'bool' } }
+            '*backing_file': 'str', 'encrypted': 'bool',
+            'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int',
+            'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int'} }
 
 ##
 # @BlockDeviceIoStatus:
diff --git a/qemu-common.h b/qemu-common.h
index 2ce47aa12d..44870fe523 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -341,6 +341,12 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return res.ll;
 }
 
+/* Round number down to multiple */
+#define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m))
+
 #include "module.h"
 
 #endif
diff --git a/qemu-config.c b/qemu-config.c
index 597d7e10b1..18f30204a1 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -85,6 +85,34 @@ static QemuOptsList qemu_drive_opts = {
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
             .help = "open drive file as read-only",
+        },{
+            .name = "iops",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit total I/O operations per second",
+        },{
+            .name = "iops_rd",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit read operations per second",
+        },{
+            .name = "iops_wr",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit write operations per second",
+        },{
+            .name = "bps",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit total bytes per second",
+        },{
+            .name = "bps_rd",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit read bytes per second",
+        },{
+            .name = "bps_wr",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit write bytes per second",
+        },{
+            .name = "copy-on-read",
+            .type = QEMU_OPT_BOOL,
+            .help = "copy read data from backing file into image file",
         },
         { /* end of list */ }
     },
diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
index 6b58160058..26ad76bf50 100644
--- a/qemu-coroutine-lock.c
+++ b/qemu-coroutine-lock.c
@@ -61,6 +61,14 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
     assert(qemu_in_coroutine());
 }
 
+void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue)
+{
+    Coroutine *self = qemu_coroutine_self();
+    QTAILQ_INSERT_HEAD(&queue->entries, self, co_queue_next);
+    qemu_coroutine_yield();
+    assert(qemu_in_coroutine());
+}
+
 bool qemu_co_queue_next(CoQueue *queue)
 {
     Coroutine *next;
@@ -76,6 +84,13 @@ bool qemu_co_queue_next(CoQueue *queue)
     return (next != NULL);
 }
 
+void qemu_co_queue_restart_all(CoQueue *queue)
+{
+    while (qemu_co_queue_next(queue)) {
+        /* Do nothing */
+    }
+}
+
 bool qemu_co_queue_empty(CoQueue *queue)
 {
     return (QTAILQ_FIRST(&queue->entries) == NULL);
@@ -136,13 +151,7 @@ void qemu_co_rwlock_unlock(CoRwlock *lock)
     assert(qemu_in_coroutine());
     if (lock->writer) {
         lock->writer = false;
-        while (!qemu_co_queue_empty(&lock->queue)) {
-            /*
-             * Wakeup every body. This will include some
-             * writers too.
-             */
-            qemu_co_queue_next(&lock->queue);
-        }
+        qemu_co_queue_restart_all(&lock->queue);
     } else {
         lock->reader--;
         assert(lock->reader >= 0);
diff --git a/qemu-coroutine.h b/qemu-coroutine.h
index b8fc4f4332..8a55fe125e 100644
--- a/qemu-coroutine.h
+++ b/qemu-coroutine.h
@@ -118,6 +118,12 @@ void qemu_co_queue_init(CoQueue *queue);
 void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 
 /**
+ * Adds the current coroutine to the head of the CoQueue and transfers control to the
+ * caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue);
+
+/**
  * Restarts the next coroutine in the CoQueue and removes it from the queue.
  *
  * Returns true if a coroutine was restarted, false if the queue is empty.
@@ -125,6 +131,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
 bool qemu_co_queue_next(CoQueue *queue);
 
 /**
+ * Restarts all coroutines in the CoQueue and leaves the queue empty.
+ */
+void qemu_co_queue_restart_all(CoQueue *queue);
+
+/**
  * Checks if the CoQueue is empty.
  */
 bool qemu_co_queue_empty(CoQueue *queue);
diff --git a/qemu-io.c b/qemu-io.c
index f70de0b4a6..622debc6e3 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -1853,9 +1853,9 @@ int main(int argc, char **argv)
     command_loop();
 
     /*
-     * Make sure all outstanding requests get flushed the program exits.
+     * Make sure all outstanding requests complete before the program exits.
      */
-    qemu_aio_flush();
+    bdrv_drain_all();
 
     if (bs) {
         bdrv_delete(bs);
diff --git a/qemu-options.hx b/qemu-options.hx
index 681eaf198e..b3db10c0ad 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -135,7 +135,8 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
     "       [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
     "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
-    "       [,readonly=on|off]\n"
+    "       [,readonly=on|off][,copy-on-read=on|off]\n"
+    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
     "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
 STEXI
 @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
@@ -186,6 +187,9 @@ host disk is full; report the error to the guest otherwise).
 The default setting is @option{werror=enospc} and @option{rerror=report}.
 @item readonly
 Open drive @option{file} as read-only. Guest write attempts will fail.
+@item copy-on-read=@var{copy-on-read}
+@var{copy-on-read} is "on" or "off" and enables whether to copy read backing
+file sectors into the image file.
 @end table
 
 By default, writethrough caching is used for all block device.  This means that
@@ -217,6 +221,10 @@ like your host losing power, the disk storage getting disconnected accidently,
 etc. you're image will most probably be rendered unusable.   When using
 the @option{-snapshot} option, unsafe caching is always used.
 
+Copy-on-read avoids accessing the same backing file sectors repeatedly and is
+useful when the backing file is over a slow network.  By default copy-on-read
+is off.
+
 Instead of @option{-cdrom} you can use:
 @example
 qemu -drive file=file,index=2,media=cdrom
diff --git a/qerror.c b/qerror.c
index 935d30f62a..656efc2cf0 100644
--- a/qerror.c
+++ b/qerror.c
@@ -251,6 +251,10 @@ static const QErrorStringTable qerror_table[] = {
         .error_fmt = QERR_QGA_COMMAND_FAILED,
         .desc      = "Guest agent command failed, error was '%(message)'",
     },
+    {
+        .error_fmt = QERR_INVALID_PARAMETER_COMBINATION,
+        .desc      = "Invalid paramter combination",
+    },
     {}
 };
 
diff --git a/qerror.h b/qerror.h
index 6414cd9d5b..161d654b75 100644
--- a/qerror.h
+++ b/qerror.h
@@ -207,4 +207,7 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_QGA_COMMAND_FAILED \
     "{ 'class': 'QgaCommandFailed', 'data': { 'message': %s } }"
 
+#define QERR_INVALID_PARAMETER_COMBINATION \
+    "{ 'class': 'InvalidParameterCombination', 'data': {} }"
+
 #endif /* QERROR_H */
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 97975a5207..94da2a8ef2 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -851,6 +851,44 @@ Example:
 EQMP
 
     {
+        .name       = "block_set_io_throttle",
+        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l",
+        .params     = "device bps bps_rd bps_wr iops iops_rd iops_wr",
+        .help       = "change I/O throttle limits for a block drive",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_block_set_io_throttle,
+    },
+
+SQMP
+block_set_io_throttle
+------------
+
+Change I/O throttle limits for a block drive.
+
+Arguments:
+
+- "device": device name (json-string)
+- "bps":  total throughput limit in bytes per second(json-int)
+- "bps_rd":  read throughput limit in bytes per second(json-int)
+- "bps_wr":  read throughput limit in bytes per second(json-int)
+- "iops":  total I/O operations per second(json-int)
+- "iops_rd":  read I/O operations per second(json-int)
+- "iops_wr":  write I/O operations per second(json-int)
+
+Example:
+
+-> { "execute": "block_set_io_throttle", "arguments": { "device": "virtio0",
+                                               "bps": "1000000",
+                                               "bps_rd": "0",
+                                               "bps_wr": "0",
+                                               "iops": "0",
+                                               "iops_rd": "0",
+                                               "iops_wr": "0" } }
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "set_password",
         .args_type  = "protocol:s,password:s,connected:s?",
         .params     = "protocol password action-if-connected",
@@ -1152,6 +1190,13 @@ Each json-object contain the following:
                                 "tftp", "vdi", "vmdk", "vpc", "vvfat"
          - "backing_file": backing file name (json-string, optional)
          - "encrypted": true if encrypted, false otherwise (json-bool)
+         - "bps": limit total bytes per second (json-int)
+         - "bps_rd": limit read bytes per second (json-int)
+         - "bps_wr": limit write bytes per second (json-int)
+         - "iops": limit total I/O operations per second (json-int)
+         - "iops_rd": limit read operations per second (json-int)
+         - "iops_wr": limit write operations per second (json-int)
+
 - "io-status": I/O operation status, only present if the device supports it
                and the VM is configured to stop on errors. It's always reset
                to "ok" when the "cont" command is issued (json_string, optional)
@@ -1171,7 +1216,13 @@ Example:
                "ro":false,
                "drv":"qcow2",
                "encrypted":false,
-               "file":"disks/test.img"
+               "file":"disks/test.img",
+               "bps":1000000,
+               "bps_rd":0,
+               "bps_wr":0,
+               "iops":1000000,
+               "iops_rd":0,
+               "iops_wr":0,
             },
             "type":"unknown"
          },
diff --git a/savevm.c b/savevm.c
index a827ca2b3e..1a21ab441f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -2104,7 +2104,7 @@ int load_vmstate(const char *name)
     }
 
     /* Flush all IO requests so they don't interfere with the new state.  */
-    qemu_aio_flush();
+    bdrv_drain_all();
 
     bs = NULL;
     while ((bs = bdrv_next(bs))) {
diff --git a/trace-events b/trace-events
index 962caca598..bf1cf57032 100644
--- a/trace-events
+++ b/trace-events
@@ -69,6 +69,7 @@ bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
+bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
 
 # hw/virtio-blk.c
 virtio_blk_req_complete(void *req, int status) "req %p status %d"
@@ -631,3 +632,10 @@ win_helper_no_switch_pstate(uint32_t new_pstate_regs) "change_pstate: regs new=%
 win_helper_wrpil(uint32_t psrpil, uint32_t new_pil) "old=%x new=%x"
 win_helper_done(uint32_t tl) "tl=%d"
 win_helper_retry(uint32_t tl) "tl=%d"
+
+# dma-helpers.c
+dma_bdrv_io(void *dbs, void *bs, int64_t sector_num, bool to_dev) "dbs=%p bs=%p sector_num=%" PRId64 " to_dev=%d"
+dma_aio_cancel(void *dbs) "dbs=%p"
+dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p"
+dma_bdrv_cb(void *dbs, int ret) "dbs=%p ret=%d"
+dma_map_wait(void *dbs) "dbs=%p"
diff --git a/xen-mapcache.c b/xen-mapcache.c
index 7bcb86e4f8..9fecc64e97 100644
--- a/xen-mapcache.c
+++ b/xen-mapcache.c
@@ -351,7 +351,7 @@ void xen_invalidate_map_cache(void)
     MapCacheRev *reventry;
 
     /* Flush pending AIO before destroying the mapcache */
-    qemu_aio_flush();
+    bdrv_drain_all();
 
     QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
         DPRINTF("There should be no locked mappings at this time, "
author	Anthony Liguori <aliguori@us.ibm.com>	2011-12-05 09:39:25 -0600
committer	Anthony Liguori <aliguori@us.ibm.com>	2011-12-05 09:39:25 -0600
commit	eb5d5beaebd102599a915f6c4813d445ddc9dc84 (patch)
tree	12ce2331571a30c67bde0b8f4ddb55996dd0ba65
parent	f6480ca3f3423be5bee8b673ee6f5cc387659def (diff)
parent	922453bca6a927bb527068ae8679d587cfa45dbc (diff)
download	qemu-eb5d5beaebd102599a915f6c4813d445ddc9dc84.tar.gz