Diffstat (limited to 'block')
-rw-r--r--  block/Makefile.objs     |    5
-rw-r--r--  block/backup.c          |  155
-rw-r--r--  block/blkdebug.c        |    6
-rw-r--r--  block/block-backend.c   |   11
-rw-r--r--  block/io.c              | 2540
-rw-r--r--  block/iscsi.c           |   64
-rw-r--r--  block/mirror.c          |   53
-rw-r--r--  block/null.c            |   66
-rw-r--r--  block/qapi.c            |   46
-rw-r--r--  block/qcow.c            |    8
-rw-r--r--  block/qcow2-refcount.c  |    2
-rw-r--r--  block/qcow2-snapshot.c  |    6
-rw-r--r--  block/qcow2.c           |   14
-rw-r--r--  block/qed.c             |    6
-rw-r--r--  block/quorum.c          |    5
-rw-r--r--  block/rbd.c             |    2
-rw-r--r--  block/sheepdog.c        |    3
-rw-r--r--  block/snapshot.c        |   12
-rw-r--r--  block/vdi.c             |    6
-rw-r--r--  block/vhdx.c            |   10
-rw-r--r--  block/vmdk.c            |   10
-rw-r--r--  block/vpc.c             |    6
-rw-r--r--  block/vvfat.c           |    7
23 files changed, 2891 insertions, 152 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index db2933e469..0d8c2a4ab6 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,4 +1,4 @@
-block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
+block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o
block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
@@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o
block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
-block-obj-y += null.o mirror.o
+block-obj-y += null.o mirror.o io.o
block-obj-y += nbd.o nbd-client.o sheepdog.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
@@ -37,6 +37,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS)
ssh.o-cflags := $(LIBSSH2_CFLAGS)
ssh.o-libs := $(LIBSSH2_LIBS)
archipelago.o-libs := $(ARCHIPELAGO_LIBS)
+block-obj-m += dmg.o
dmg.o-libs := $(BZIP2_LIBS)
qcow.o-libs := -lz
linux-aio.o-libs := -laio
diff --git a/block/backup.c b/block/backup.c
index 1c535b1ab9..d3f648ddd7 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -37,6 +37,8 @@ typedef struct CowRequest {
typedef struct BackupBlockJob {
BlockJob common;
BlockDriverState *target;
+ /* bitmap for sync=dirty-bitmap */
+ BdrvDirtyBitmap *sync_bitmap;
MirrorSyncMode sync_mode;
RateLimit limit;
BlockdevOnError on_source_error;
@@ -242,6 +244,91 @@ static void backup_complete(BlockJob *job, void *opaque)
g_free(data);
}
+static bool coroutine_fn yield_and_check(BackupBlockJob *job)
+{
+ if (block_job_is_cancelled(&job->common)) {
+ return true;
+ }
+
+ /* we need to yield so that bdrv_drain_all() returns.
+ * (without this, the VM does not reboot)
+ */
+ if (job->common.speed) {
+ uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
+ job->sectors_read);
+ job->sectors_read = 0;
+ block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
+ } else {
+ block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
+ }
+
+ if (block_job_is_cancelled(&job->common)) {
+ return true;
+ }
+
+ return false;
+}
+
+static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
+{
+ bool error_is_read;
+ int ret = 0;
+ int clusters_per_iter;
+ uint32_t granularity;
+ int64_t sector;
+ int64_t cluster;
+ int64_t end;
+ int64_t last_cluster = -1;
+ BlockDriverState *bs = job->common.bs;
+ HBitmapIter hbi;
+
+ granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
+ clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
+ bdrv_dirty_iter_init(job->sync_bitmap, &hbi);
+
+ /* Find the next dirty sector(s) */
+ while ((sector = hbitmap_iter_next(&hbi)) != -1) {
+ cluster = sector / BACKUP_SECTORS_PER_CLUSTER;
+
+ /* Fake progress updates for any clusters we skipped */
+ if (cluster != last_cluster + 1) {
+ job->common.offset += ((cluster - last_cluster - 1) *
+ BACKUP_CLUSTER_SIZE);
+ }
+
+ for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
+ do {
+ if (yield_and_check(job)) {
+ return ret;
+ }
+ ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
+ BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+ if ((ret < 0) &&
+ backup_error_action(job, error_is_read, -ret) ==
+ BLOCK_ERROR_ACTION_REPORT) {
+ return ret;
+ }
+ } while (ret < 0);
+ }
+
+ /* If the bitmap granularity is smaller than the backup granularity,
+ * we need to advance the iterator pointer to the next cluster. */
+ if (granularity < BACKUP_CLUSTER_SIZE) {
+ bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
+ }
+
+ last_cluster = cluster - 1;
+ }
+
+ /* Play some final catchup with the progress meter */
+ end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
+ if (last_cluster + 1 < end) {
+ job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
+ }
+
+ return ret;
+}
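
A worked example helps here, assuming the 64 KiB BACKUP_CLUSTER_SIZE defined elsewhere in this file (the constant itself is not shown in this hunk):

/* clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1)
 *
 *   granularity = 64 KiB -> MAX(65536 / 65536, 1)   = 1
 *   granularity = 1 MiB  -> MAX(1048576 / 65536, 1) = 16
 *   granularity = 4 KiB  -> MAX(4096 / 65536, 1)    = 1 (integer
 *     division yields 0), and bdrv_set_dirty_iter() then skips the
 *     iterator past the cluster that was just copied.
 */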
+
static void coroutine_fn backup_run(void *opaque)
{
BackupBlockJob *job = opaque;
@@ -259,8 +346,7 @@ static void coroutine_fn backup_run(void *opaque)
qemu_co_rwlock_init(&job->flush_rwlock);
start = 0;
- end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
- BACKUP_SECTORS_PER_CLUSTER);
+ end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
job->bitmap = hbitmap_alloc(end, 0);
@@ -278,28 +364,13 @@ static void coroutine_fn backup_run(void *opaque)
qemu_coroutine_yield();
job->common.busy = true;
}
+ } else if (job->sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) {
+ ret = backup_run_incremental(job);
} else {
/* Both FULL and TOP sync modes require copying. */
for (; start < end; start++) {
bool error_is_read;
-
- if (block_job_is_cancelled(&job->common)) {
- break;
- }
-
- /* we need to yield so that qemu_aio_flush() returns.
- * (without, VM does not reboot)
- */
- if (job->common.speed) {
- uint64_t delay_ns = ratelimit_calculate_delay(
- &job->limit, job->sectors_read);
- job->sectors_read = 0;
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
- } else {
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
- }
-
- if (block_job_is_cancelled(&job->common)) {
+ if (yield_and_check(job)) {
break;
}
@@ -357,6 +428,18 @@ static void coroutine_fn backup_run(void *opaque)
qemu_co_rwlock_wrlock(&job->flush_rwlock);
qemu_co_rwlock_unlock(&job->flush_rwlock);
+ if (job->sync_bitmap) {
+ BdrvDirtyBitmap *bm;
+ if (ret < 0) {
+ /* Merge the successor back into the parent, delete nothing. */
+ bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
+ assert(bm);
+ } else {
+ /* Everything is fine, delete this bitmap and install the backup. */
+ bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
+ assert(bm);
+ }
+ }
hbitmap_free(job->bitmap);
bdrv_iostatus_disable(target);
@@ -369,6 +452,7 @@ static void coroutine_fn backup_run(void *opaque)
void backup_start(BlockDriverState *bs, BlockDriverState *target,
int64_t speed, MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockCompletionFunc *cb, void *opaque,
@@ -412,17 +496,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
return;
}
+ if (sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) {
+ if (!sync_bitmap) {
+ error_setg(errp, "must provide a valid bitmap name for "
+ "\"dirty-bitmap\" sync mode");
+ return;
+ }
+
+ /* Create a new bitmap, and freeze/disable this one. */
+ if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
+ return;
+ }
+ } else if (sync_bitmap) {
+ error_setg(errp,
+ "a sync_bitmap was provided to backup_run, "
+ "but received an incompatible sync_mode (%s)",
+ MirrorSyncMode_lookup[sync_mode]);
+ return;
+ }
+
len = bdrv_getlength(bs);
if (len < 0) {
error_setg_errno(errp, -len, "unable to get length for '%s'",
bdrv_get_device_name(bs));
- return;
+ goto error;
}
BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
cb, opaque, errp);
if (!job) {
- return;
+ goto error;
}
bdrv_op_block_all(target, job->common.blocker);
@@ -431,7 +534,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
job->on_target_error = on_target_error;
job->target = target;
job->sync_mode = sync_mode;
+ job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP ?
+ sync_bitmap : NULL;
job->common.len = len;
job->common.co = qemu_coroutine_create(backup_run);
qemu_coroutine_enter(job->common.co, job);
+ return;
+
+ error:
+ if (sync_bitmap) {
+ bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
+ }
}
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 63611e0a33..3c30edba73 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -721,6 +721,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs)
return bdrv_getlength(bs->file);
}
+static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
+{
+ return bdrv_truncate(bs->file, offset);
+}
+
static void blkdebug_refresh_filename(BlockDriverState *bs)
{
QDict *opts;
@@ -779,6 +784,7 @@ static BlockDriver bdrv_blkdebug = {
.bdrv_file_open = blkdebug_open,
.bdrv_close = blkdebug_close,
.bdrv_getlength = blkdebug_getlength,
+ .bdrv_truncate = blkdebug_truncate,
.bdrv_refresh_filename = blkdebug_refresh_filename,
.bdrv_aio_readv = blkdebug_aio_readv,
diff --git a/block/block-backend.c b/block/block-backend.c
index 48b6e4c05c..93e46f376a 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf,
return bdrv_write(blk->bs, sector_num, buf, nb_sectors);
}
+int blk_write_zeroes(BlockBackend *blk, int64_t sector_num,
+ int nb_sectors, BdrvRequestFlags flags)
+{
+ int ret = blk_check_request(blk, sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags);
+}
+
static void error_callback_bh(void *opaque)
{
struct BlockBackendAIOCB *acb = opaque;
diff --git a/block/io.c b/block/io.c
new file mode 100644
index 0000000000..1ce62c4fbc
--- /dev/null
+++ b/block/io.c
@@ -0,0 +1,2540 @@
+/*
+ * Block layer I/O functions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "trace.h"
+#include "sysemu/qtest.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+
+#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
+static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+
+/* throttling disk I/O limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+ ThrottleConfig *cfg)
+{
+ int i;
+
+ throttle_config(&bs->throttle_state, cfg);
+
+ for (i = 0; i < 2; i++) {
+ qemu_co_enter_next(&bs->throttled_reqs[i]);
+ }
+}
+
+/* this function drains all the throttled I/Os */
+static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
+{
+ bool drained = false;
+ bool enabled = bs->io_limits_enabled;
+ int i;
+
+ bs->io_limits_enabled = false;
+
+ for (i = 0; i < 2; i++) {
+ while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
+ drained = true;
+ }
+ }
+
+ bs->io_limits_enabled = enabled;
+
+ return drained;
+}
+
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+
+ bdrv_start_throttled_reqs(bs);
+
+ throttle_destroy(&bs->throttle_state);
+}
+
+static void bdrv_throttle_read_timer_cb(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ qemu_co_enter_next(&bs->throttled_reqs[0]);
+}
+
+static void bdrv_throttle_write_timer_cb(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ qemu_co_enter_next(&bs->throttled_reqs[1]);
+}
+
+/* should be called before bdrv_set_io_limits if a limit is set */
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+ int clock_type = QEMU_CLOCK_REALTIME;
+
+ if (qtest_enabled()) {
+ /* For testing block IO throttling only */
+ clock_type = QEMU_CLOCK_VIRTUAL;
+ }
+ assert(!bs->io_limits_enabled);
+ throttle_init(&bs->throttle_state,
+ bdrv_get_aio_context(bs),
+ clock_type,
+ bdrv_throttle_read_timer_cb,
+ bdrv_throttle_write_timer_cb,
+ bs);
+ bs->io_limits_enabled = true;
+}
+
+/* This function makes an I/O request wait if throttling requires it
+ *
+ * @bytes: the size of the I/O in bytes
+ * @is_write: whether the I/O is a write
+ */
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+ unsigned int bytes,
+ bool is_write)
+{
+ /* does this I/O have to wait? */
+ bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
+
+ /* if it must wait, or any request of this type is already queued, queue this I/O */
+ if (must_wait ||
+ !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
+ qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+ }
+
+ /* the IO will be executed, do the accounting */
+ throttle_account(&bs->throttle_state, is_write, bytes);
+
+
+ /* if the next request must wait -> do nothing */
+ if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+ return;
+ }
+
+ /* else queue next request for execution */
+ qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+}
+
+void bdrv_setup_io_funcs(BlockDriver *bdrv)
+{
+ /* Block drivers without coroutine functions need emulation */
+ if (!bdrv->bdrv_co_readv) {
+ bdrv->bdrv_co_readv = bdrv_co_readv_em;
+ bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+ /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
+ * the block driver lacks aio we need to emulate that too.
+ */
+ if (!bdrv->bdrv_aio_readv) {
+ /* add AIO emulation layer */
+ bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+ bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+ }
+ }
+}
+
+void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+ BlockDriver *drv = bs->drv;
+ Error *local_err = NULL;
+
+ memset(&bs->bl, 0, sizeof(bs->bl));
+
+ if (!drv) {
+ return;
+ }
+
+ /* Take some limits from the children as a default */
+ if (bs->file) {
+ bdrv_refresh_limits(bs->file, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
+ bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
+ bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+ } else {
+ bs->bl.opt_mem_alignment = 512;
+ }
+
+ if (bs->backing_hd) {
+ bdrv_refresh_limits(bs->backing_hd, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length =
+ MAX(bs->bl.opt_transfer_length,
+ bs->backing_hd->bl.opt_transfer_length);
+ bs->bl.max_transfer_length =
+ MIN_NON_ZERO(bs->bl.max_transfer_length,
+ bs->backing_hd->bl.max_transfer_length);
+ bs->bl.opt_mem_alignment =
+ MAX(bs->bl.opt_mem_alignment,
+ bs->backing_hd->bl.opt_mem_alignment);
+ }
+
+ /* Then let the driver override it */
+ if (drv->bdrv_refresh_limits) {
+ drv->bdrv_refresh_limits(bs, errp);
+ }
+}
+
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+ bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+ assert(bs->copy_on_read > 0);
+ bs->copy_on_read--;
+}
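
A minimal caller sketch of the reference-count contract above (hypothetical users A and B, not part of this commit):

bdrv_enable_copy_on_read(bs);    /* user A: copy_on_read == 1 */
bdrv_enable_copy_on_read(bs);    /* user B: copy_on_read == 2 */
bdrv_disable_copy_on_read(bs);   /* A done: feature stays enabled */
bdrv_disable_copy_on_read(bs);   /* B done: copy_on_read == 0 again */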
+
+/* Check if any requests are in-flight (including throttled requests) */
+static bool bdrv_requests_pending(BlockDriverState *bs)
+{
+ if (!QLIST_EMPTY(&bs->tracked_requests)) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
+ return true;
+ }
+ if (bs->file && bdrv_requests_pending(bs->file)) {
+ return true;
+ }
+ if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
+ return true;
+ }
+ return false;
+}
+
+static bool bdrv_drain_one(BlockDriverState *bs)
+{
+ bool bs_busy;
+
+ bdrv_flush_io_queue(bs);
+ bdrv_start_throttled_reqs(bs);
+ bs_busy = bdrv_requests_pending(bs);
+ bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
+ return bs_busy;
+}
+
+/*
+ * Wait for pending requests to complete on a single BlockDriverState subtree
+ *
+ * See the warning in bdrv_drain_all(). This function can only be called if
+ * you are sure nothing can generate I/O because you have op blockers
+ * installed.
+ *
+ * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
+ * AioContext.
+ */
+void bdrv_drain(BlockDriverState *bs)
+{
+ while (bdrv_drain_one(bs)) {
+ /* Keep iterating */
+ }
+}
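
A sketch of the calling convention stated above; it assumes the caller has already installed op blockers, and uses the same aio_context_acquire()/release() pairing that bdrv_drain_all() below performs internally:

AioContext *ctx = bdrv_get_aio_context(bs);

aio_context_acquire(ctx);
bdrv_drain(bs);              /* returns once no requests are pending */
aio_context_release(ctx);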
+
+/*
+ * Wait for pending requests to complete across all BlockDriverStates
+ *
+ * This function does not flush data to disk, use bdrv_flush_all() for that
+ * after calling this function.
+ *
+ * Note that completion of an asynchronous I/O operation can trigger any
+ * number of other I/O operations on other devices---for example a coroutine
+ * can be arbitrarily complex and a constant flow of I/O can come until the
+ * coroutine is complete. Because of this, it is not possible to have a
+ * function to drain a single device's I/O queue.
+ */
+void bdrv_drain_all(void)
+{
+ /* Always run first iteration so any pending completion BHs run */
+ bool busy = true;
+ BlockDriverState *bs = NULL;
+
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ if (bs->job) {
+ block_job_pause(bs->job);
+ }
+ aio_context_release(aio_context);
+ }
+
+ while (busy) {
+ busy = false;
+ bs = NULL;
+
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ busy |= bdrv_drain_one(bs);
+ aio_context_release(aio_context);
+ }
+ }
+
+ bs = NULL;
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ aio_context_acquire(aio_context);
+ if (bs->job) {
+ block_job_resume(bs->job);
+ }
+ aio_context_release(aio_context);
+ }
+}
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+ if (req->serialising) {
+ req->bs->serialising_in_flight--;
+ }
+
+ QLIST_REMOVE(req, list);
+ qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+ BlockDriverState *bs,
+ int64_t offset,
+ unsigned int bytes, bool is_write)
+{
+ *req = (BdrvTrackedRequest){
+ .bs = bs,
+ .offset = offset,
+ .bytes = bytes,
+ .is_write = is_write,
+ .co = qemu_coroutine_self(),
+ .serialising = false,
+ .overlap_offset = offset,
+ .overlap_bytes = bytes,
+ };
+
+ qemu_co_queue_init(&req->wait_queue);
+
+ QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
+static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
+{
+ int64_t overlap_offset = req->offset & ~(align - 1);
+ unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
+ - overlap_offset;
+
+ if (!req->serialising) {
+ req->bs->serialising_in_flight++;
+ req->serialising = true;
+ }
+
+ req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
+ req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
+}
+
+/**
+ * Round a region to cluster boundaries
+ */
+void bdrv_round_to_clusters(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ int64_t *cluster_sector_num,
+ int *cluster_nb_sectors)
+{
+ BlockDriverInfo bdi;
+
+ if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+ *cluster_sector_num = sector_num;
+ *cluster_nb_sectors = nb_sectors;
+ } else {
+ int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
+ *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+ *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+ nb_sectors, c);
+ }
+}
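
For illustration, assuming a 64 KiB cluster size (c = 65536 / 512 = 128 sectors), a request for sectors [100, 300) rounds out as follows:

/* *cluster_sector_num = QEMU_ALIGN_DOWN(100, 128)         = 0
 * *cluster_nb_sectors = QEMU_ALIGN_UP(100 - 0 + 200, 128) = 384
 *
 * i.e. the region grows to cover clusters 0..2 ([0, 384)) completely.
 */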
+
+static int bdrv_get_cluster_size(BlockDriverState *bs)
+{
+ BlockDriverInfo bdi;
+ int ret;
+
+ ret = bdrv_get_info(bs, &bdi);
+ if (ret < 0 || bdi.cluster_size == 0) {
+ return bs->request_alignment;
+ } else {
+ return bdi.cluster_size;
+ }
+}
+
+static bool tracked_request_overlaps(BdrvTrackedRequest *req,
+ int64_t offset, unsigned int bytes)
+{
+ /* aaaa bbbb */
+ if (offset >= req->overlap_offset + req->overlap_bytes) {
+ return false;
+ }
+ /* bbbb aaaa */
+ if (req->overlap_offset >= offset + bytes) {
+ return false;
+ }
+ return true;
+}
+
+static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
+{
+ BlockDriverState *bs = self->bs;
+ BdrvTrackedRequest *req;
+ bool retry;
+ bool waited = false;
+
+ if (!bs->serialising_in_flight) {
+ return false;
+ }
+
+ do {
+ retry = false;
+ QLIST_FOREACH(req, &bs->tracked_requests, list) {
+ if (req == self || (!req->serialising && !self->serialising)) {
+ continue;
+ }
+ if (tracked_request_overlaps(req, self->overlap_offset,
+ self->overlap_bytes))
+ {
+ /* Hitting this means there was a reentrant request, for
+ * example, a block driver issuing nested requests. This must
+ * never happen since it means deadlock.
+ */
+ assert(qemu_coroutine_self() != req->co);
+
+ /* If the request is already (indirectly) waiting for us, or
+ * will wait for us as soon as it wakes up, then just go on
+ * (instead of producing a deadlock in the former case). */
+ if (!req->waiting_for) {
+ self->waiting_for = req;
+ qemu_co_queue_wait(&req->wait_queue);
+ self->waiting_for = NULL;
+ retry = true;
+ waited = true;
+ break;
+ }
+ }
+ }
+ } while (retry);
+
+ return waited;
+}
+
+static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
+ size_t size)
+{
+ if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
+ return -EIO;
+ }
+
+ if (!bdrv_is_inserted(bs)) {
+ return -ENOMEDIUM;
+ }
+
+ if (offset < 0) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EIO;
+ }
+
+ return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
+ nb_sectors * BDRV_SECTOR_SIZE);
+}
+
+typedef struct RwCo {
+ BlockDriverState *bs;
+ int64_t offset;
+ QEMUIOVector *qiov;
+ bool is_write;
+ int ret;
+ BdrvRequestFlags flags;
+} RwCo;
+
+static void coroutine_fn bdrv_rw_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ if (!rwco->is_write) {
+ rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
+ rwco->qiov->size, rwco->qiov,
+ rwco->flags);
+ } else {
+ rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
+ rwco->qiov->size, rwco->qiov,
+ rwco->flags);
+ }
+}
+
+/*
+ * Process a vectored synchronous request using coroutines
+ */
+static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
+ QEMUIOVector *qiov, bool is_write,
+ BdrvRequestFlags flags)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .offset = offset,
+ .qiov = qiov,
+ .is_write = is_write,
+ .ret = NOT_DONE,
+ .flags = flags,
+ };
+
+ /**
+ * In sync call context, when the vcpu is blocked, this throttling timer
+ * will not fire; so the I/O throttling function has to be disabled here
+ * if it has been enabled.
+ */
+ if (bs->io_limits_enabled) {
+ fprintf(stderr, "Disabling I/O throttling on '%s' due "
+ "to synchronous I/O.\n", bdrv_get_device_name(bs));
+ bdrv_io_limits_disable(bs);
+ }
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_rw_co_entry(&rwco);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_rw_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ aio_poll(aio_context, true);
+ }
+ }
+ return rwco.ret;
+}
+
+/*
+ * Process a synchronous request using coroutines
+ */
+static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
+ int nb_sectors, bool is_write, BdrvRequestFlags flags)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+ };
+
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
+ &qiov, is_write, flags);
+}
+
+/* return < 0 if error. See bdrv_write() for the return codes */
+int bdrv_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0);
+}
+
+/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
+int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ bool enabled;
+ int ret;
+
+ enabled = bs->io_limits_enabled;
+ bs->io_limits_enabled = false;
+ ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+ bs->io_limits_enabled = enabled;
+ return ret;
+}
+
+/* Return < 0 if error. Important errors are:
+ -EIO generic I/O error (may happen for all errors)
+ -ENOMEDIUM No media inserted.
+ -EINVAL Invalid sector number or nb_sectors
+ -EACCES Trying to write a read-only device
+*/
+int bdrv_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
+}
+
+int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, BdrvRequestFlags flags)
+{
+ return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
+ BDRV_REQ_ZERO_WRITE | flags);
+}
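
A minimal sketch of the synchronous helpers above (error handling shortened; bs is assumed valid):

uint8_t buf[BDRV_SECTOR_SIZE];
int ret;

ret = bdrv_read(bs, 0, buf, 1);          /* read sector 0 */
if (ret < 0) {
    return ret;     /* -ENOMEDIUM, -EIO, ... as documented above */
}
ret = bdrv_write_zeroes(bs, 8, 8, BDRV_REQ_MAY_UNMAP);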
+
+/*
+ * Completely zero out a block device with the help of bdrv_write_zeroes.
+ * The operation is sped up by checking the block status and only writing
+ * zeroes to the device if they currently do not return zeroes. Optional
+ * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
+ *
+ * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
+ */
+int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
+{
+ int64_t target_sectors, ret, nb_sectors, sector_num = 0;
+ int n;
+
+ target_sectors = bdrv_nb_sectors(bs);
+ if (target_sectors < 0) {
+ return target_sectors;
+ }
+
+ for (;;) {
+ nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
+ if (nb_sectors <= 0) {
+ return 0;
+ }
+ ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
+ if (ret < 0) {
+ error_report("error getting block status at sector %" PRId64 ": %s",
+ sector_num, strerror(-ret));
+ return ret;
+ }
+ if (ret & BDRV_BLOCK_ZERO) {
+ sector_num += n;
+ continue;
+ }
+ ret = bdrv_write_zeroes(bs, sector_num, n, flags);
+ if (ret < 0) {
+ error_report("error writing zeroes at sector %" PRId64 ": %s",
+ sector_num, strerror(-ret));
+ return ret;
+ }
+ sector_num += n;
+ }
+}
+
+int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = bytes,
+ };
+ int ret;
+
+ if (bytes < 0) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return bytes;
+}
+
+int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
+{
+ int ret;
+
+ ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return qiov->size;
+}
+
+int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
+ const void *buf, int bytes)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = bytes,
+ };
+
+ if (bytes < 0) {
+ return -EINVAL;
+ }
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_pwritev(bs, offset, &qiov);
+}
+
+/*
+ * Writes to the file and ensures that no writes are reordered across this
+ * request (acts as a barrier)
+ *
+ * Returns 0 on success, -errno in error cases.
+ */
+int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
+ const void *buf, int count)
+{
+ int ret;
+
+ ret = bdrv_pwrite(bs, offset, buf, count);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* No flush needed for cache modes that already do it */
+ if (bs->enable_write_cache) {
+ bdrv_flush(bs);
+ }
+
+ return 0;
+}
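
An illustrative use of the barrier semantics above: a format driver making an on-disk header durable before writing data that depends on it. The header structure and its fields are hypothetical:

struct MyHeader hdr = { .magic = MY_MAGIC, .l1_offset = new_l1 };  /* hypothetical */
int ret = bdrv_pwrite_sync(bs->file, 0, &hdr, sizeof(hdr));
if (ret < 0) {
    return ret;     /* header may not be durable; do not proceed */
}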
+
+static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ /* Perform I/O through a temporary buffer so that users who scribble over
+ * their read buffer while the operation is in progress do not end up
+ * modifying the image file. This is critical for zero-copy guest I/O
+ * where anything might happen inside guest memory.
+ */
+ void *bounce_buffer;
+
+ BlockDriver *drv = bs->drv;
+ struct iovec iov;
+ QEMUIOVector bounce_qiov;
+ int64_t cluster_sector_num;
+ int cluster_nb_sectors;
+ size_t skip_bytes;
+ int ret;
+
+ /* Cover the entire cluster so no additional backing file I/O is required
+ * when allocating a cluster in the image file.
+ */
+ bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+ &cluster_sector_num, &cluster_nb_sectors);
+
+ trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
+ cluster_sector_num, cluster_nb_sectors);
+
+ iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+ iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
+ if (bounce_buffer == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+ ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ if (ret < 0) {
+ goto err;
+ }
+
+ if (drv->bdrv_co_write_zeroes &&
+ buffer_is_zero(bounce_buffer, iov.iov_len)) {
+ ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
+ cluster_nb_sectors, 0);
+ } else {
+ /* This does not change the data on the disk, so it is not necessary
+ * to flush even in cache=writethrough mode.
+ */
+ ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+ &bounce_qiov);
+ }
+
+ if (ret < 0) {
+ /* It might be okay to ignore write errors for guest requests. If this
+ * is a deliberate copy-on-read then we don't want to ignore the error.
+ * Simply report it in all cases.
+ */
+ goto err;
+ }
+
+ skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+ qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
+ nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+ qemu_vfree(bounce_buffer);
+ return ret;
+}
+
+/*
+ * Forwards an already correctly aligned request to the BlockDriver. This
+ * handles copy on read and zeroing after EOF; any other features must be
+ * implemented by the caller.
+ */
+static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
+ BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+ int64_t align, QEMUIOVector *qiov, int flags)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+
+ int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+ unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+ assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert(!qiov || bytes == qiov->size);
+
+ /* Handle Copy on Read and associated serialisation */
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ /* If we touch the same cluster it counts as an overlap. This
+ * guarantees that allocating writes will be serialized and not race
+ * with each other for the same cluster. For example, in copy-on-read
+ * it ensures that the CoR read and write operations are atomic and
+ * guest writes cannot interleave between them. */
+ mark_request_serialising(req, bdrv_get_cluster_size(bs));
+ }
+
+ wait_serialising_requests(req);
+
+ if (flags & BDRV_REQ_COPY_ON_READ) {
+ int pnum;
+
+ ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
+ if (ret < 0) {
+ goto out;
+ }
+
+ if (!ret || pnum != nb_sectors) {
+ ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+ goto out;
+ }
+ }
+
+ /* Forward the request to the BlockDriver */
+ if (!bs->zero_beyond_eof) {
+ ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+ } else {
+ /* Read zeros after EOF */
+ int64_t total_sectors, max_nb_sectors;
+
+ total_sectors = bdrv_nb_sectors(bs);
+ if (total_sectors < 0) {
+ ret = total_sectors;
+ goto out;
+ }
+
+ max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num),
+ align >> BDRV_SECTOR_BITS);
+ if (nb_sectors < max_nb_sectors) {
+ ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+ } else if (max_nb_sectors > 0) {
+ QEMUIOVector local_qiov;
+
+ qemu_iovec_init(&local_qiov, qiov->niov);
+ qemu_iovec_concat(&local_qiov, qiov, 0,
+ max_nb_sectors * BDRV_SECTOR_SIZE);
+
+ ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors,
+ &local_qiov);
+
+ qemu_iovec_destroy(&local_qiov);
+ } else {
+ ret = 0;
+ }
+
+ /* Reading beyond end of file is supposed to produce zeroes */
+ if (ret == 0 && total_sectors < sector_num + nb_sectors) {
+ uint64_t offset = MAX(0, total_sectors - sector_num);
+ uint64_t bytes = (sector_num + nb_sectors - offset) *
+ BDRV_SECTOR_SIZE;
+ qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes);
+ }
+ }
+
+out:
+ return ret;
+}
+
+static inline uint64_t bdrv_get_align(BlockDriverState *bs)
+{
+ /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+ return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+}
+
+static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
+ int64_t offset, size_t bytes)
+{
+ int64_t align = bdrv_get_align(bs);
+ return !(offset & (align - 1) || (bytes & (align - 1)));
+}
+
+/*
+ * Handle a read request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ BdrvTrackedRequest req;
+
+ uint64_t align = bdrv_get_align(bs);
+ uint8_t *head_buf = NULL;
+ uint8_t *tail_buf = NULL;
+ QEMUIOVector local_qiov;
+ bool use_local_qiov = false;
+ int ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+
+ ret = bdrv_check_byte_request(bs, offset, bytes);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bs->copy_on_read) {
+ flags |= BDRV_REQ_COPY_ON_READ;
+ }
+
+ /* throttling disk I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, bytes, false);
+ }
+
+ /* Align read if necessary by padding qiov */
+ if (offset & (align - 1)) {
+ head_buf = qemu_blockalign(bs, align);
+ qemu_iovec_init(&local_qiov, qiov->niov + 2);
+ qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+
+ bytes += offset & (align - 1);
+ offset = offset & ~(align - 1);
+ }
+
+ if ((offset + bytes) & (align - 1)) {
+ if (!use_local_qiov) {
+ qemu_iovec_init(&local_qiov, qiov->niov + 1);
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+ }
+ tail_buf = qemu_blockalign(bs, align);
+ qemu_iovec_add(&local_qiov, tail_buf,
+ align - ((offset + bytes) & (align - 1)));
+
+ bytes = ROUND_UP(bytes, align);
+ }
+
+ tracked_request_begin(&req, bs, offset, bytes, false);
+ ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
+ use_local_qiov ? &local_qiov : qiov,
+ flags);
+ tracked_request_end(&req);
+
+ if (use_local_qiov) {
+ qemu_iovec_destroy(&local_qiov);
+ qemu_vfree(head_buf);
+ qemu_vfree(tail_buf);
+ }
+
+ return ret;
+}
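
A worked example of the padding above, assuming align = 512: an unaligned read of 100 bytes at offset 1000 becomes

/* head pad : bytes [512, 1000)   (offset & (align - 1) = 488)
 * payload  : bytes [1000, 1100)
 * tail pad : bytes [1100, 1536)  (added len = 512 - 76 = 436)
 *
 * so bdrv_aligned_preadv() sees one aligned request for [512, 1536).
 */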
+
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EINVAL;
+ }
+
+ return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+ BDRV_REQ_COPY_ON_READ);
+}
+
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
+
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{
+ BlockDriver *drv = bs->drv;
+ QEMUIOVector qiov;
+ struct iovec iov = {0};
+ int ret = 0;
+
+ int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
+ BDRV_REQUEST_MAX_SECTORS);
+
+ while (nb_sectors > 0 && !ret) {
+ int num = nb_sectors;
+
+ /* Align request. Block drivers can expect the "bulk" of the request
+ * to be aligned.
+ */
+ if (bs->bl.write_zeroes_alignment
+ && num > bs->bl.write_zeroes_alignment) {
+ if (sector_num % bs->bl.write_zeroes_alignment != 0) {
+ /* Make a small request up to the first aligned sector. */
+ num = bs->bl.write_zeroes_alignment;
+ num -= sector_num % bs->bl.write_zeroes_alignment;
+ } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
+ /* Shorten the request to the last aligned sector. num cannot
+ * underflow because num > bs->bl.write_zeroes_alignment.
+ */
+ num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
+ }
+ }
+
+ /* limit request size */
+ if (num > max_write_zeroes) {
+ num = max_write_zeroes;
+ }
+
+ ret = -ENOTSUP;
+ /* First try the efficient write zeroes operation */
+ if (drv->bdrv_co_write_zeroes) {
+ ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
+ }
+
+ if (ret == -ENOTSUP) {
+ /* Fall back to bounce buffer if write zeroes is unsupported */
+ int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
+ MAX_WRITE_ZEROES_BOUNCE_BUFFER);
+ num = MIN(num, max_xfer_len);
+ iov.iov_len = num * BDRV_SECTOR_SIZE;
+ if (iov.iov_base == NULL) {
+ iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
+ if (iov.iov_base == NULL) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
+ }
+ qemu_iovec_init_external(&qiov, &iov, 1);
+
+ ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
+
+ /* Keep the bounce buffer around if it is big enough for all
+ * future requests.
+ */
+ if (num < max_xfer_len) {
+ qemu_vfree(iov.iov_base);
+ iov.iov_base = NULL;
+ }
+ }
+
+ sector_num += num;
+ nb_sectors -= num;
+ }
+
+fail:
+ qemu_vfree(iov.iov_base);
+ return ret;
+}
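
To make the alignment logic above concrete, assume write_zeroes_alignment = 8: a request for sectors [5, 30) is issued as three pieces:

/* iteration 1: num = 8 - (5 % 8) = 3     -> zeroes [5, 8)   (head)
 * iteration 2: num = 22 - (30 % 8) = 16  -> zeroes [8, 24)  (aligned bulk)
 * iteration 3: num = 6 (< alignment)     -> zeroes [24, 30) (tail)
 */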
+
+/*
+ * Forwards an already correctly aligned write request to the BlockDriver.
+ */
+static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
+ BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+ QEMUIOVector *qiov, int flags)
+{
+ BlockDriver *drv = bs->drv;
+ bool waited;
+ int ret;
+
+ int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+ unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+ assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+ assert(!qiov || bytes == qiov->size);
+
+ waited = wait_serialising_requests(req);
+ assert(!waited || !req->serialising);
+ assert(req->overlap_offset <= offset);
+ assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
+
+ ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
+
+ if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
+ !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes &&
+ qemu_iovec_is_zero(qiov)) {
+ flags |= BDRV_REQ_ZERO_WRITE;
+ if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
+ flags |= BDRV_REQ_MAY_UNMAP;
+ }
+ }
+
+ if (ret < 0) {
+ /* Do nothing, write notifier decided to fail this request */
+ } else if (flags & BDRV_REQ_ZERO_WRITE) {
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
+ ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
+ } else {
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
+
+ if (ret == 0 && !bs->enable_write_cache) {
+ ret = bdrv_co_flush(bs);
+ }
+
+ bdrv_set_dirty(bs, sector_num, nb_sectors);
+
+ block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+
+ if (ret >= 0) {
+ bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
+ }
+
+ return ret;
+}
+
+/*
+ * Handle a write request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ BdrvTrackedRequest req;
+ uint64_t align = bdrv_get_align(bs);
+ uint8_t *head_buf = NULL;
+ uint8_t *tail_buf = NULL;
+ QEMUIOVector local_qiov;
+ bool use_local_qiov = false;
+ int ret;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+ if (bs->read_only) {
+ return -EACCES;
+ }
+
+ ret = bdrv_check_byte_request(bs, offset, bytes);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* throttling disk I/O */
+ if (bs->io_limits_enabled) {
+ bdrv_io_limits_intercept(bs, bytes, true);
+ }
+
+ /*
+ * Align write if necessary by performing a read-modify-write cycle.
+ * Pad qiov with the read parts and be sure to have a tracked request not
+ * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
+ */
+ tracked_request_begin(&req, bs, offset, bytes, true);
+
+ if (offset & (align - 1)) {
+ QEMUIOVector head_qiov;
+ struct iovec head_iov;
+
+ mark_request_serialising(&req, align);
+ wait_serialising_requests(&req);
+
+ head_buf = qemu_blockalign(bs, align);
+ head_iov = (struct iovec) {
+ .iov_base = head_buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&head_qiov, &head_iov, 1);
+
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+ ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
+ align, &head_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+
+ qemu_iovec_init(&local_qiov, qiov->niov + 2);
+ qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+
+ bytes += offset & (align - 1);
+ offset = offset & ~(align - 1);
+ }
+
+ if ((offset + bytes) & (align - 1)) {
+ QEMUIOVector tail_qiov;
+ struct iovec tail_iov;
+ size_t tail_bytes;
+ bool waited;
+
+ mark_request_serialising(&req, align);
+ waited = wait_serialising_requests(&req);
+ assert(!waited || !use_local_qiov);
+
+ tail_buf = qemu_blockalign(bs, align);
+ tail_iov = (struct iovec) {
+ .iov_base = tail_buf,
+ .iov_len = align,
+ };
+ qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
+
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+ ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
+ align, &tail_qiov, 0);
+ if (ret < 0) {
+ goto fail;
+ }
+ BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+ if (!use_local_qiov) {
+ qemu_iovec_init(&local_qiov, qiov->niov + 1);
+ qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+ use_local_qiov = true;
+ }
+
+ tail_bytes = (offset + bytes) & (align - 1);
+ qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
+
+ bytes = ROUND_UP(bytes, align);
+ }
+
+ if (use_local_qiov) {
+ /* Local buffer may have non-zero data. */
+ flags &= ~BDRV_REQ_ZERO_WRITE;
+ }
+ ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
+ use_local_qiov ? &local_qiov : qiov,
+ flags);
+
+fail:
+ tracked_request_end(&req);
+
+ if (use_local_qiov) {
+ qemu_iovec_destroy(&local_qiov);
+ }
+ qemu_vfree(head_buf);
+ qemu_vfree(tail_buf);
+
+ return ret;
+}
+
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return -EINVAL;
+ }
+
+ return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+ return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BdrvRequestFlags flags)
+{
+ int ret;
+
+ trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
+
+ if (!(bs->open_flags & BDRV_O_UNMAP)) {
+ flags &= ~BDRV_REQ_MAY_UNMAP;
+ }
+ if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
+ nb_sectors << BDRV_SECTOR_BITS)) {
+ ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+ BDRV_REQ_ZERO_WRITE | flags);
+ } else {
+ uint8_t *buf;
+ QEMUIOVector local_qiov;
+ size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
+
+ buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
+ memset(buf, 0, bytes);
+ qemu_iovec_init(&local_qiov, 1);
+ qemu_iovec_add(&local_qiov, buf, bytes);
+
+ ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
+ BDRV_REQ_ZERO_WRITE | flags);
+ qemu_vfree(buf);
+ }
+ return ret;
+}
+
+int bdrv_flush_all(void)
+{
+ BlockDriverState *bs = NULL;
+ int result = 0;
+
+ while ((bs = bdrv_next(bs))) {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+ int ret;
+
+ aio_context_acquire(aio_context);
+ ret = bdrv_flush(bs);
+ if (ret < 0 && !result) {
+ result = ret;
+ }
+ aio_context_release(aio_context);
+ }
+
+ return result;
+}
+
+typedef struct BdrvCoGetBlockStatusData {
+ BlockDriverState *bs;
+ BlockDriverState *base;
+ int64_t sector_num;
+ int nb_sectors;
+ int *pnum;
+ int64_t ret;
+ bool done;
+} BdrvCoGetBlockStatusData;
+
+/*
+ * Returns the allocation status of the specified sectors.
+ * Drivers not implementing the functionality are assumed to not support
+ * backing files, hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t total_sectors;
+ int64_t n;
+ int64_t ret, ret2;
+
+ total_sectors = bdrv_nb_sectors(bs);
+ if (total_sectors < 0) {
+ return total_sectors;
+ }
+
+ if (sector_num >= total_sectors) {
+ *pnum = 0;
+ return 0;
+ }
+
+ n = total_sectors - sector_num;
+ if (n < nb_sectors) {
+ nb_sectors = n;
+ }
+
+ if (!bs->drv->bdrv_co_get_block_status) {
+ *pnum = nb_sectors;
+ ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
+ if (bs->drv->protocol_name) {
+ ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
+ }
+ return ret;
+ }
+
+ ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
+ if (ret < 0) {
+ *pnum = 0;
+ return ret;
+ }
+
+ if (ret & BDRV_BLOCK_RAW) {
+ assert(ret & BDRV_BLOCK_OFFSET_VALID);
+ return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ *pnum, pnum);
+ }
+
+ if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
+ ret |= BDRV_BLOCK_ALLOCATED;
+ }
+
+ if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
+ if (bdrv_unallocated_blocks_are_zero(bs)) {
+ ret |= BDRV_BLOCK_ZERO;
+ } else if (bs->backing_hd) {
+ BlockDriverState *bs2 = bs->backing_hd;
+ int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
+ if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
+ ret |= BDRV_BLOCK_ZERO;
+ }
+ }
+ }
+
+ if (bs->file &&
+ (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
+ (ret & BDRV_BLOCK_OFFSET_VALID)) {
+ int file_pnum;
+
+ ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+ *pnum, &file_pnum);
+ if (ret2 >= 0) {
+ /* Ignore errors. This is just providing extra information, it
+ * is useful but not necessary.
+ */
+ if (!file_pnum) {
+ /* !file_pnum indicates an offset at or beyond the EOF; it is
+ * perfectly valid for the format block driver to point to such
+ * offsets, so catch it and mark everything as zero */
+ ret |= BDRV_BLOCK_ZERO;
+ } else {
+ /* Limit request to the range reported by the protocol driver */
+ *pnum = file_pnum;
+ ret |= (ret2 & BDRV_BLOCK_ZERO);
+ }
+ }
+ }
+
+ return ret;
+}
+
+/* Coroutine wrapper for bdrv_get_block_status() */
+static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
+{
+ BdrvCoGetBlockStatusData *data = opaque;
+ BlockDriverState *bs = data->bs;
+
+ data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
+ data->pnum);
+ data->done = true;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_get_block_status().
+ *
+ * See bdrv_co_get_block_status() for details.
+ */
+int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ Coroutine *co;
+ BdrvCoGetBlockStatusData data = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .pnum = pnum,
+ .done = false,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_get_block_status_co_entry(&data);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
+ qemu_coroutine_enter(co, &data);
+ while (!data.done) {
+ aio_poll(aio_context, true);
+ }
+ }
+ return data.ret;
+}
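
A minimal sketch of walking an image with the synchronous wrapper above, in the same spirit as bdrv_make_zero() earlier in this file (bs assumed valid):

int64_t sector = 0, total = bdrv_nb_sectors(bs);

while (sector < total) {
    int n;
    int64_t ret = bdrv_get_block_status(bs, sector,
                                        MIN(total - sector,
                                            BDRV_REQUEST_MAX_SECTORS), &n);
    if (ret < 0) {
        break;
    }
    if (ret & BDRV_BLOCK_ZERO) {
        /* sectors [sector, sector + n) read as zeroes */
    }
    sector += n;
}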
+
+int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
+ if (ret < 0) {
+ return ret;
+ }
+ return !!(ret & BDRV_BLOCK_ALLOCATED);
+}
+
+/*
+ * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
+ *
+ * Return true if the given sector is allocated in any image between
+ * BASE and TOP (inclusive). BASE can be NULL to check if the given
+ * sector is allocated in any image of the chain. Return false otherwise.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ */
+int bdrv_is_allocated_above(BlockDriverState *top,
+ BlockDriverState *base,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ BlockDriverState *intermediate;
+ int ret, n = nb_sectors;
+
+ intermediate = top;
+ while (intermediate && intermediate != base) {
+ int pnum_inter;
+ ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
+ &pnum_inter);
+ if (ret < 0) {
+ return ret;
+ } else if (ret) {
+ *pnum = pnum_inter;
+ return 1;
+ }
+
+ /*
+ * [sector_num, nb_sectors] is unallocated on top, but the
+ * intermediate layer might have
+ *
+ * [sector_num+x, nb_sectors] allocated.
+ */
+ if (n > pnum_inter &&
+ (intermediate == top ||
+ sector_num + pnum_inter < intermediate->total_sectors)) {
+ n = pnum_inter;
+ }
+
+ intermediate = intermediate->backing_hd;
+ }
+
+ *pnum = n;
+ return 0;
+}
+
+int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BlockDriver *drv = bs->drv;
+ int ret;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+ if (!drv->bdrv_write_compressed) {
+ return -ENOTSUP;
+ }
+ ret = bdrv_check_request(bs, sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ }
+
+ assert(QLIST_EMPTY(&bs->dirty_bitmaps));
+
+ return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
+}
+
+int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+ int64_t pos, int size)
+{
+ QEMUIOVector qiov;
+ struct iovec iov = {
+ .iov_base = (void *) buf,
+ .iov_len = size,
+ };
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ return bdrv_writev_vmstate(bs, &qiov, pos);
+}
+
+int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv) {
+ return -ENOMEDIUM;
+ } else if (drv->bdrv_save_vmstate) {
+ return drv->bdrv_save_vmstate(bs, qiov, pos);
+ } else if (bs->file) {
+ return bdrv_writev_vmstate(bs->file, qiov, pos);
+ }
+
+ return -ENOTSUP;
+}
+
+int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size)
+{
+ BlockDriver *drv = bs->drv;
+ if (!drv)
+ return -ENOMEDIUM;
+ if (drv->bdrv_load_vmstate)
+ return drv->bdrv_load_vmstate(bs, buf, pos, size);
+ if (bs->file)
+ return bdrv_load_vmstate(bs->file, buf, pos, size);
+ return -ENOTSUP;
+}
+
+/**************************************************************/
+/* async I/Os */
+
+BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
+ cb, opaque, false);
+}
+
+BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0,
+ cb, opaque, true);
+}
+
+BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque);
+
+ return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors,
+ BDRV_REQ_ZERO_WRITE | flags,
+ cb, opaque, true);
+}
+
+
+typedef struct MultiwriteCB {
+ int error;
+ int num_requests;
+ int num_callbacks;
+ struct {
+ BlockCompletionFunc *cb;
+ void *opaque;
+ QEMUIOVector *free_qiov;
+ } callbacks[];
+} MultiwriteCB;
+
+static void multiwrite_user_cb(MultiwriteCB *mcb)
+{
+ int i;
+
+ for (i = 0; i < mcb->num_callbacks; i++) {
+ mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+ if (mcb->callbacks[i].free_qiov) {
+ qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+ }
+ g_free(mcb->callbacks[i].free_qiov);
+ }
+}
+
+static void multiwrite_cb(void *opaque, int ret)
+{
+ MultiwriteCB *mcb = opaque;
+
+ trace_multiwrite_cb(mcb, ret);
+
+ if (ret < 0 && !mcb->error) {
+ mcb->error = ret;
+ }
+
+ mcb->num_requests--;
+ if (mcb->num_requests == 0) {
+ multiwrite_user_cb(mcb);
+ g_free(mcb);
+ }
+}
+
+static int multiwrite_req_compare(const void *a, const void *b)
+{
+ const BlockRequest *req1 = a, *req2 = b;
+
+ /*
+ * Note that we can't simply subtract req2->sector from req1->sector
+ * here as that could overflow the return value.
+ */
+ if (req1->sector > req2->sector) {
+ return 1;
+ } else if (req1->sector < req2->sector) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Takes a bunch of requests and tries to merge them. Returns the number of
+ * requests that remain after merging.
+ */
+static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
+ int num_reqs, MultiwriteCB *mcb)
+{
+ int i, outidx;
+
+ // Sort requests by start sector
+ qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
+
+ // Check if adjacent requests touch the same clusters. If so, combine them,
+ // filling up gaps with zero sectors.
+ outidx = 0;
+ for (i = 1; i < num_reqs; i++) {
+ int merge = 0;
+ int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
+
+ // Handle exactly sequential writes and overlapping writes.
+ if (reqs[i].sector <= oldreq_last) {
+ merge = 1;
+ }
+
+ if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+ merge = 0;
+ }
+
+ if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
+ reqs[i].nb_sectors > bs->bl.max_transfer_length) {
+ merge = 0;
+ }
+
+ if (merge) {
+ size_t size;
+ QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
+ qemu_iovec_init(qiov,
+ reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
+
+ // Add the first request to the merged one. If the requests are
+ // overlapping, drop the last sectors of the first request.
+ size = (reqs[i].sector - reqs[outidx].sector) << 9;
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
+
+ // We shouldn't need to add any zeros between the two requests
+ assert(reqs[i].sector <= oldreq_last);
+
+ // Add the second request
+ qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
+
+ // Add tail of first request, if necessary
+ if (qiov->size < reqs[outidx].qiov->size) {
+ qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
+ reqs[outidx].qiov->size - qiov->size);
+ }
+
+ reqs[outidx].nb_sectors = qiov->size >> 9;
+ reqs[outidx].qiov = qiov;
+
+ mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
+ } else {
+ outidx++;
+ reqs[outidx].sector = reqs[i].sector;
+ reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+ reqs[outidx].qiov = reqs[i].qiov;
+ }
+ }
+
+ block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
+ return outidx + 1;
+}
+
+/*
+ * Submit multiple AIO write requests at once.
+ *
+ * On success, the function returns 0 and all requests in the reqs array have
+ * been submitted. In the error case, this function returns -1 and any number
+ * of the requests may have been submitted: the callback will be called for
+ * some of the requests and not for others. The caller must check the error
+ * field of each BlockRequest to know which callbacks to wait for (if
+ * error != 0, no callback will be called for that request).
+ *
+ * The implementation may modify the contents of the reqs array, e.g. to merge
+ * requests. However, the fields opaque and error are left unmodified as they
+ * are used to signal failure for a single request to the caller.
+ */
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+{
+ MultiwriteCB *mcb;
+ int i;
+
+ /* don't submit writes if we don't have a medium */
+ if (bs->drv == NULL) {
+ for (i = 0; i < num_reqs; i++) {
+ reqs[i].error = -ENOMEDIUM;
+ }
+ return -1;
+ }
+
+ if (num_reqs == 0) {
+ return 0;
+ }
+
+ // Create MultiwriteCB structure
+ mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+ mcb->num_requests = 0;
+ mcb->num_callbacks = num_reqs;
+
+ for (i = 0; i < num_reqs; i++) {
+ mcb->callbacks[i].cb = reqs[i].cb;
+ mcb->callbacks[i].opaque = reqs[i].opaque;
+ }
+
+ // Check for mergable requests
+ num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+
+ trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
+
+ /* Run the aio requests. */
+ mcb->num_requests = num_reqs;
+ for (i = 0; i < num_reqs; i++) {
+ bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
+ reqs[i].nb_sectors, reqs[i].flags,
+ multiwrite_cb, mcb,
+ true);
+ }
+
+ return 0;
+}
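[Editor's note: as an example of the merge step, a write covering sectors [0, 8) followed by an overlapping write covering [4, 12) is collapsed into one request for [0, 12): the first four sectors come from the first qiov, the rest from the second. A hedged sketch of batching two writes through this entry point, assuming two prepared QEMUIOVectors; submit_pair, write positions and the shared callback are illustrative:

    static void submit_pair(BlockDriverState *bs,
                            QEMUIOVector *q0, QEMUIOVector *q1,
                            BlockCompletionFunc *cb, void *opaque)
    {
        BlockRequest reqs[2] = {
            { .sector = 0,  .nb_sectors = q0->size >> 9, .qiov = q0,
              .cb = cb, .opaque = opaque },
            { .sector = 16, .nb_sectors = q1->size >> 9, .qiov = q1,
              .cb = cb, .opaque = opaque },
        };

        if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
            /* per the contract above: entries with error != 0 got no
             * callback and must be failed by the caller */
        }
    }
]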
+
+void bdrv_aio_cancel(BlockAIOCB *acb)
+{
+ qemu_aio_ref(acb);
+ bdrv_aio_cancel_async(acb);
+ while (acb->refcnt > 1) {
+ if (acb->aiocb_info->get_aio_context) {
+ aio_poll(acb->aiocb_info->get_aio_context(acb), true);
+ } else if (acb->bs) {
+ aio_poll(bdrv_get_aio_context(acb->bs), true);
+ } else {
+ abort();
+ }
+ }
+ qemu_aio_unref(acb);
+}
+
+/* Async version of aio cancel. The caller is not blocked if the acb implements
+ * cancel_async; otherwise we do nothing and let the request complete normally.
+ * In either case the completion callback must be called. */
+void bdrv_aio_cancel_async(BlockAIOCB *acb)
+{
+ if (acb->aiocb_info->cancel_async) {
+ acb->aiocb_info->cancel_async(acb);
+ }
+}
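[Editor's note: the two variants differ only in blocking behaviour. A sketch, reusing the hypothetical read_done/status pair from the earlier example:

    BlockAIOCB *acb;

    acb = bdrv_aio_readv(bs, 0, &qiov, 1, read_done, &status);
    /* synchronous: polls the AioContext until read_done has run */
    bdrv_aio_cancel(acb);

    /* asynchronous: request cancellation, the completion still arrives */
    acb = bdrv_aio_readv(bs, 0, &qiov, 1, read_done, &status);
    bdrv_aio_cancel_async(acb);
]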
+
+/**************************************************************/
+/* async block device emulation */
+
+typedef struct BlockAIOCBSync {
+ BlockAIOCB common;
+ QEMUBH *bh;
+ int ret;
+ /* vector translation state */
+ QEMUIOVector *qiov;
+ uint8_t *bounce;
+ int is_write;
+} BlockAIOCBSync;
+
+static const AIOCBInfo bdrv_em_aiocb_info = {
+ .aiocb_size = sizeof(BlockAIOCBSync),
+};
+
+static void bdrv_aio_bh_cb(void *opaque)
+{
+ BlockAIOCBSync *acb = opaque;
+
+ if (!acb->is_write && acb->ret >= 0) {
+ qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+ }
+ qemu_vfree(acb->bounce);
+ acb->common.cb(acb->common.opaque, acb->ret);
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+ qemu_aio_unref(acb);
+}
+
+static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ int is_write)
+
+{
+ BlockAIOCBSync *acb;
+
+ acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque);
+ acb->is_write = is_write;
+ acb->qiov = qiov;
+ acb->bounce = qemu_try_blockalign(bs, qiov->size);
+ acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb);
+
+ if (acb->bounce == NULL) {
+ acb->ret = -ENOMEM;
+ } else if (is_write) {
+ qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
+ acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
+ } else {
+ acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
+ }
+
+ qemu_bh_schedule(acb->bh);
+
+ return &acb->common;
+}
+
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+
+typedef struct BlockAIOCBCoroutine {
+ BlockAIOCB common;
+ BlockRequest req;
+ bool is_write;
+ bool need_bh;
+ bool *done;
+ QEMUBH* bh;
+} BlockAIOCBCoroutine;
+
+static const AIOCBInfo bdrv_em_co_aiocb_info = {
+ .aiocb_size = sizeof(BlockAIOCBCoroutine),
+};
+
+static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
+{
+ if (!acb->need_bh) {
+ acb->common.cb(acb->common.opaque, acb->req.error);
+ qemu_aio_unref(acb);
+ }
+}
+
+static void bdrv_co_em_bh(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+
+ assert(!acb->need_bh);
+ qemu_bh_delete(acb->bh);
+ bdrv_co_complete(acb);
+}
+
+static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
+{
+ acb->need_bh = false;
+ if (acb->req.error != -EINPROGRESS) {
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
+ qemu_bh_schedule(acb->bh);
+ }
+}
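[Editor's note: need_bh deserves a word. If the coroutine finishes before bdrv_co_aio_rw_vector returns, invoking the completion callback immediately would hand the caller a completed acb before it has even stored the returned handle, so delivery is deferred to a bottom half. The same pattern in generic, self-contained form; Completion, struct op and complete are illustrative, not QEMU types:

    #include <stdbool.h>

    typedef void Completion(void *opaque, int ret);

    struct op {
        Completion *cb;
        void *opaque;
        int ret;
        bool need_bh;   /* true until the submitter holds the handle */
    };

    /* runs when the work finishes, possibly before submit() returned */
    static void complete(struct op *op)
    {
        if (!op->need_bh) {
            op->cb(op->opaque, op->ret);
        }
        /* otherwise a scheduled bottom half delivers the callback later,
         * after the submitter has cleared need_bh */
    }
]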
+
+/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
+static void coroutine_fn bdrv_co_do_rw(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ if (!acb->is_write) {
+ acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ } else {
+ acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
+ acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ }
+
+ bdrv_co_complete(acb);
+}
+
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write)
+{
+ Coroutine *co;
+ BlockAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ acb->req.qiov = qiov;
+ acb->req.flags = flags;
+ acb->is_write = is_write;
+
+ co = qemu_coroutine_create(bdrv_co_do_rw);
+ qemu_coroutine_enter(co, acb);
+
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_flush(bs);
+ bdrv_co_complete(acb);
+}
+
+BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ trace_bdrv_aio_flush(bs, opaque);
+
+ Coroutine *co;
+ BlockAIOCBCoroutine *acb;
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+
+ co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
+}
+
+static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+{
+ BlockAIOCBCoroutine *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+ bdrv_co_complete(acb);
+}
+
+BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ Coroutine *co;
+ BlockAIOCBCoroutine *acb;
+
+ trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+
+ acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
+ acb->need_bh = true;
+ acb->req.error = -EINPROGRESS;
+ acb->req.sector = sector_num;
+ acb->req.nb_sectors = nb_sectors;
+ co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
+ qemu_coroutine_enter(co, acb);
+
+ bdrv_co_maybe_schedule_bh(acb);
+ return &acb->common;
+}
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ BlockAIOCB *acb;
+
+ acb = g_slice_alloc(aiocb_info->aiocb_size);
+ acb->aiocb_info = aiocb_info;
+ acb->bs = bs;
+ acb->cb = cb;
+ acb->opaque = opaque;
+ acb->refcnt = 1;
+ return acb;
+}
+
+void qemu_aio_ref(void *p)
+{
+ BlockAIOCB *acb = p;
+ acb->refcnt++;
+}
+
+void qemu_aio_unref(void *p)
+{
+ BlockAIOCB *acb = p;
+ assert(acb->refcnt > 0);
+ if (--acb->refcnt == 0) {
+ g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+ }
+}
+
+/**************************************************************/
+/* Coroutine block device emulation */
+
+typedef struct CoroutineIOCompletion {
+ Coroutine *coroutine;
+ int ret;
+} CoroutineIOCompletion;
+
+static void bdrv_co_io_em_complete(void *opaque, int ret)
+{
+ CoroutineIOCompletion *co = opaque;
+
+ co->ret = ret;
+ qemu_coroutine_enter(co->coroutine, NULL);
+}
+
+static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *iov,
+ bool is_write)
+{
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+ BlockAIOCB *acb;
+
+ if (is_write) {
+ acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ } else {
+ acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
+ bdrv_co_io_em_complete, &co);
+ }
+
+ trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
+ if (!acb) {
+ return -EIO;
+ }
+ qemu_coroutine_yield();
+
+ return co.ret;
+}
+
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
+}
+
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov)
+{
+ return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
+}
+
+static void coroutine_fn bdrv_flush_co_entry(void *opaque)
+{
+ RwCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_flush(rwco->bs);
+}
+
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
+{
+ int ret;
+
+ if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+ return 0;
+ }
+
+ /* Write back cached data to the OS even with cache=unsafe */
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
+ if (bs->drv->bdrv_co_flush_to_os) {
+ ret = bs->drv->bdrv_co_flush_to_os(bs);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ /* But don't actually force it to the disk with cache=unsafe */
+ if (bs->open_flags & BDRV_O_NO_FLUSH) {
+ goto flush_parent;
+ }
+
+ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
+ if (bs->drv->bdrv_co_flush_to_disk) {
+ ret = bs->drv->bdrv_co_flush_to_disk(bs);
+ } else if (bs->drv->bdrv_aio_flush) {
+ BlockAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ ret = -EIO;
+ } else {
+ qemu_coroutine_yield();
+ ret = co.ret;
+ }
+ } else {
+ /*
+ * Some block drivers always operate in either writethrough or unsafe
+ * mode and therefore don't support bdrv_flush. Usually qemu doesn't
+ * know how the server works (because the behaviour is hardcoded or
+ * depends on server-side configuration), so we can't ensure that
+ * everything is safe on disk. Returning an error doesn't work because
+ * that would break guests even if the server operates in writethrough
+ * mode.
+ *
+ * Let's hope the user knows what they're doing.
+ */
+ ret = 0;
+ }
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
+ * in the case of cache=unsafe, so there are no useless flushes.
+ */
+flush_parent:
+ return bdrv_co_flush(bs->file);
+}
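[Editor's note: from a driver author's perspective this splits flushing into two hooks: bdrv_co_flush_to_os writes back internal caches (always honoured, even with cache=unsafe), while bdrv_co_flush_to_disk forces data to stable media (skipped under BDRV_O_NO_FLUSH). A hedged fragment of the driver side for a hypothetical format driver "fooimg"; a real BlockDriver needs many more fields:

    static int coroutine_fn foo_co_flush_to_os(BlockDriverState *bs)
    {
        /* write back dirty format metadata to bs->file;
         * no hardware flush happens at this layer */
        return 0;
    }

    static BlockDriver bdrv_foo = {
        .format_name         = "fooimg",
        .bdrv_co_flush_to_os = foo_co_flush_to_os,
        /* .bdrv_co_flush_to_disk omitted: the recursive call into
         * bs->file lets the protocol layer flush the host cache */
    };
]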
+
+int bdrv_flush(BlockDriverState *bs)
+{
+ Coroutine *co;
+ RwCo rwco = {
+ .bs = bs,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_flush_co_entry(&rwco);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_flush_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ aio_poll(aio_context, true);
+ }
+ }
+
+ return rwco.ret;
+}
+
+typedef struct DiscardCo {
+ BlockDriverState *bs;
+ int64_t sector_num;
+ int nb_sectors;
+ int ret;
+} DiscardCo;
+static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+{
+ DiscardCo *rwco = opaque;
+
+ rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+}
+
+int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
+{
+ int max_discard, ret;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+
+ ret = bdrv_check_request(bs, sector_num, nb_sectors);
+ if (ret < 0) {
+ return ret;
+ } else if (bs->read_only) {
+ return -EROFS;
+ }
+
+ bdrv_reset_dirty(bs, sector_num, nb_sectors);
+
+ /* Do nothing if disabled. */
+ if (!(bs->open_flags & BDRV_O_UNMAP)) {
+ return 0;
+ }
+
+ if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+ return 0;
+ }
+
+ max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
+ while (nb_sectors > 0) {
+ int ret;
+ int num = nb_sectors;
+
+ /* align request */
+ if (bs->bl.discard_alignment &&
+ num >= bs->bl.discard_alignment &&
+ sector_num % bs->bl.discard_alignment) {
+ if (num > bs->bl.discard_alignment) {
+ num = bs->bl.discard_alignment;
+ }
+ num -= sector_num % bs->bl.discard_alignment;
+ }
+
+ /* limit request size */
+ if (num > max_discard) {
+ num = max_discard;
+ }
+
+ if (bs->drv->bdrv_co_discard) {
+ ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+ } else {
+ BlockAIOCB *acb;
+ CoroutineIOCompletion co = {
+ .coroutine = qemu_coroutine_self(),
+ };
+
+ acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
+ bdrv_co_io_em_complete, &co);
+ if (acb == NULL) {
+ return -EIO;
+ } else {
+ qemu_coroutine_yield();
+ ret = co.ret;
+ }
+ }
+ if (ret && ret != -ENOTSUP) {
+ return ret;
+ }
+
+ sector_num += num;
+ nb_sectors -= num;
+ }
+ return 0;
+}
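[Editor's note: the alignment step is easiest to see with numbers. With a hypothetical discard_alignment of 8 sectors, a request for sectors [10, 30) is split so that the first chunk ends on an alignment boundary. A standalone demonstration of just that arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t sector_num = 10;
        int nb_sectors = 20, align = 8;

        while (nb_sectors > 0) {
            int num = nb_sectors;

            if (align && num >= align && sector_num % align) {
                if (num > align) {
                    num = align;
                }
                num -= sector_num % align;  /* stop at the boundary */
            }
            printf("discard [%lld, %lld)\n", (long long)sector_num,
                   (long long)(sector_num + num));
            sector_num += num;
            nb_sectors -= num;
        }
        return 0;   /* prints [10, 16) then [16, 30) */
    }
]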
+
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+ Coroutine *co;
+ DiscardCo rwco = {
+ .bs = bs,
+ .sector_num = sector_num,
+ .nb_sectors = nb_sectors,
+ .ret = NOT_DONE,
+ };
+
+ if (qemu_in_coroutine()) {
+ /* Fast-path if already in coroutine context */
+ bdrv_discard_co_entry(&rwco);
+ } else {
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ co = qemu_coroutine_create(bdrv_discard_co_entry);
+ qemu_coroutine_enter(co, &rwco);
+ while (rwco.ret == NOT_DONE) {
+ aio_poll(aio_context, true);
+ }
+ }
+
+ return rwco.ret;
+}
+
+/* needed for generic scsi interface */
+
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_ioctl) {
+ return drv->bdrv_ioctl(bs, req, buf);
+ }
+ return -ENOTSUP;
+}
+
+BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (drv && drv->bdrv_aio_ioctl) {
+ return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
+ }
+ return NULL;
+}
+
+void *qemu_blockalign(BlockDriverState *bs, size_t size)
+{
+ return qemu_memalign(bdrv_opt_mem_align(bs), size);
+}
+
+void *qemu_blockalign0(BlockDriverState *bs, size_t size)
+{
+ return memset(qemu_blockalign(bs, size), 0, size);
+}
+
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
+{
+ size_t align = bdrv_opt_mem_align(bs);
+
+ /* Ensure that NULL is never returned on success */
+ assert(align > 0);
+ if (size == 0) {
+ size = align;
+ }
+
+ return qemu_try_memalign(align, size);
+}
+
+void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
+{
+ void *mem = qemu_try_blockalign(bs, size);
+
+ if (mem) {
+ memset(mem, 0, size);
+ }
+
+ return mem;
+}
+
+/*
+ * Check if all memory in this vector is sector aligned.
+ */
+bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
+{
+ int i;
+ size_t alignment = bdrv_opt_mem_align(bs);
+
+ for (i = 0; i < qiov->niov; i++) {
+ if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
+ return false;
+ }
+ if (qiov->iov[i].iov_len % alignment) {
+ return false;
+ }
+ }
+
+ return true;
+}
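[Editor's note: a typical consumer is the O_DIRECT path in the protocol drivers: if the guest's vector fails this check, the request has to be bounced through an aligned buffer. A fragment-level sketch, assuming a qiov and error handling along the lines of raw-posix:

    if (!bdrv_qiov_is_aligned(bs, qiov)) {
        void *bounce = qemu_try_blockalign(bs, qiov->size);

        if (bounce == NULL) {
            return -ENOMEM;
        }
        qemu_iovec_to_buf(qiov, 0, bounce, qiov->size);
        /* ... submit the bounce buffer instead of the original qiov,
         * and for reads copy back with qemu_iovec_from_buf() ... */
    }
]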
+
+void bdrv_add_before_write_notifier(BlockDriverState *bs,
+ NotifierWithReturn *notifier)
+{
+ notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
+}
+
+void bdrv_io_plug(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (drv && drv->bdrv_io_plug) {
+ drv->bdrv_io_plug(bs);
+ } else if (bs->file) {
+ bdrv_io_plug(bs->file);
+ }
+}
+
+void bdrv_io_unplug(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (drv && drv->bdrv_io_unplug) {
+ drv->bdrv_io_unplug(bs);
+ } else if (bs->file) {
+ bdrv_io_unplug(bs->file);
+ }
+}
+
+void bdrv_flush_io_queue(BlockDriverState *bs)
+{
+ BlockDriver *drv = bs->drv;
+ if (drv && drv->bdrv_flush_io_queue) {
+ drv->bdrv_flush_io_queue(bs);
+ } else if (bs->file) {
+ bdrv_flush_io_queue(bs->file);
+ }
+}
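[Editor's note: plug/unplug exists so that emulated devices can batch several submissions into one backend operation where the backend supports it (e.g. linux-aio). A sketch of the intended call pattern; write_done and the request array are placeholders:

    bdrv_io_plug(bs);
    for (i = 0; i < n; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
                        reqs[i].nb_sectors, write_done, &reqs[i]);
    }
    bdrv_io_unplug(bs);   /* backend may now issue the queued I/O at once */
]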
diff --git a/block/iscsi.c b/block/iscsi.c
index ba33290000..8fca1d32cb 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
* QEMU Block driver for iSCSI images
*
* Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
- * Copyright (c) 2012-2014 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -57,9 +57,6 @@ typedef struct IscsiLun {
int events;
QEMUTimer *nop_timer;
QEMUTimer *event_timer;
- uint8_t lbpme;
- uint8_t lbprz;
- uint8_t has_write_same;
struct scsi_inquiry_logical_block_provisioning lbp;
struct scsi_inquiry_block_limits bl;
unsigned char *zeroblock;
@@ -67,6 +64,11 @@ typedef struct IscsiLun {
int cluster_sectors;
bool use_16_for_rw;
bool write_protected;
+ bool lbpme;
+ bool lbprz;
+ bool dpofua;
+ bool has_write_same;
+ bool force_next_flush;
} IscsiLun;
typedef struct IscsiTask {
@@ -79,6 +81,7 @@ typedef struct IscsiTask {
QEMUBH *bh;
IscsiLun *iscsilun;
QEMUTimer retry_timer;
+ bool force_next_flush;
} IscsiTask;
typedef struct IscsiAIOCB {
@@ -100,7 +103,7 @@ typedef struct IscsiAIOCB {
#define NOP_INTERVAL 5000
#define MAX_NOP_FAILURES 3
#define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times)
-static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048};
+static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768};
/* this threshold is a trade-off knob to choose between
* the potential additional overhead of an extra GET_LBA_STATUS request
@@ -183,10 +186,13 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
iTask->do_retry = 1;
goto out;
}
- if (status == SCSI_STATUS_BUSY) {
+ /* status 0x28 is SCSI_TASK_SET_FULL. It was first introduced
+ * in libiscsi 1.10.0. Hardcode this value here to avoid
+ * the need to bump the libiscsi requirement to 1.10.0 */
+ if (status == SCSI_STATUS_BUSY || status == 0x28) {
unsigned retry_time =
exp_random(iscsi_retry_times[iTask->retries - 1]);
- error_report("iSCSI Busy (retry #%u in %u ms): %s",
+ error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s",
iTask->retries, retry_time,
iscsi_get_error(iscsi));
aio_timer_init(iTask->iscsilun->aio_context,
@@ -199,6 +205,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
}
}
error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
+ } else {
+ iTask->iscsilun->force_next_flush |= iTask->force_next_flush;
}
out:
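[Editor's note: the retry ladder grows from five to seven steps here; each entry, in milliseconds, parameterises the randomised back-off (exp_random) for that attempt, so the later attempts can now back off for tens of seconds. A standalone printout of the ladder and its running total:

    #include <stdio.h>

    int main(void)
    {
        static const unsigned times[] = {8, 32, 128, 512, 2048, 8192, 32768};
        unsigned i, total = 0;

        for (i = 0; i < 7; i++) {
            total += times[i];
            printf("retry %u: up to ~%u ms (running total %u ms)\n",
                   i + 1, times[i], total);
        }
        return 0;   /* running total across all seven steps: ~43.7 s */
    }
]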
@@ -369,6 +377,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
struct IscsiTask iTask;
uint64_t lba;
uint32_t num_sectors;
+ int fua;
if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
return -EINVAL;
@@ -384,15 +393,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs,
num_sectors = sector_qemu2lun(nb_sectors, iscsilun);
iscsi_co_init_iscsitask(iscsilun, &iTask);
retry:
+ fua = iscsilun->dpofua && !bs->enable_write_cache;
+ iTask.force_next_flush = !fua;
if (iscsilun->use_16_for_rw) {
iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba,
NULL, num_sectors * iscsilun->block_size,
- iscsilun->block_size, 0, 0, 0, 0, 0,
+ iscsilun->block_size, 0, 0, fua, 0, 0,
iscsi_co_generic_cb, &iTask);
} else {
iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba,
NULL, num_sectors * iscsilun->block_size,
- iscsilun->block_size, 0, 0, 0, 0, 0,
+ iscsilun->block_size, 0, 0, fua, 0, 0,
iscsi_co_generic_cb, &iTask);
}
if (iTask.task == NULL) {
@@ -460,7 +471,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
*pnum = nb_sectors;
/* LUN does not support logical block provisioning */
- if (iscsilun->lbpme == 0) {
+ if (!iscsilun->lbpme) {
goto out;
}
@@ -620,8 +631,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
return 0;
}
- iscsi_co_init_iscsitask(iscsilun, &iTask);
+ if (!iscsilun->force_next_flush) {
+ return 0;
+ }
+ iscsilun->force_next_flush = false;
+ iscsi_co_init_iscsitask(iscsilun, &iTask);
retry:
if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
0, iscsi_co_generic_cb, &iTask) == NULL) {
@@ -917,6 +932,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
}
iscsi_co_init_iscsitask(iscsilun, &iTask);
+ iTask.force_next_flush = true;
retry:
if (use_16_for_ws) {
iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba,
@@ -1121,8 +1137,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
} else {
iscsilun->block_size = rc16->block_length;
iscsilun->num_blocks = rc16->returned_lba + 1;
- iscsilun->lbpme = rc16->lbpme;
- iscsilun->lbprz = rc16->lbprz;
+ iscsilun->lbpme = !!rc16->lbpme;
+ iscsilun->lbprz = !!rc16->lbprz;
iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff);
}
}
@@ -1253,11 +1269,12 @@ static void iscsi_attach_aio_context(BlockDriverState *bs,
iscsi_timed_set_events, iscsilun);
}
-static bool iscsi_is_write_protected(IscsiLun *iscsilun)
+static void iscsi_modesense_sync(IscsiLun *iscsilun)
{
struct scsi_task *task;
struct scsi_mode_sense *ms = NULL;
- bool wrprotected = false;
+ iscsilun->write_protected = false;
+ iscsilun->dpofua = false;
task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun,
1, SCSI_MODESENSE_PC_CURRENT,
@@ -1278,13 +1295,13 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun)
iscsi_get_error(iscsilun->iscsi));
goto out;
}
- wrprotected = ms->device_specific_parameter & 0x80;
+ iscsilun->write_protected = ms->device_specific_parameter & 0x80;
+ iscsilun->dpofua = ms->device_specific_parameter & 0x10;
out:
if (task) {
scsi_free_scsi_task(task);
}
- return wrprotected;
}
/*
@@ -1403,7 +1420,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
scsi_free_scsi_task(task);
task = NULL;
- iscsilun->write_protected = iscsi_is_write_protected(iscsilun);
+ iscsi_modesense_sync(iscsilun);
+
/* Check the write protect flag of the LUN if we want to write */
if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) &&
iscsilun->write_protected) {
@@ -1481,7 +1499,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
iscsilun->block_size) >> BDRV_SECTOR_BITS;
- if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) {
+ if (iscsilun->lbprz) {
iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun);
if (iscsilun->allocationmap == NULL) {
ret = -ENOMEM;
@@ -1501,6 +1519,9 @@ out:
if (ret) {
if (iscsi != NULL) {
+ if (iscsi_is_logged_in(iscsi)) {
+ iscsi_logout_sync(iscsi);
+ }
iscsi_destroy_context(iscsi);
}
memset(iscsilun, 0, sizeof(IscsiLun));
@@ -1514,6 +1535,9 @@ static void iscsi_close(BlockDriverState *bs)
struct iscsi_context *iscsi = iscsilun->iscsi;
iscsi_detach_aio_context(bs);
+ if (iscsi_is_logged_in(iscsi)) {
+ iscsi_logout_sync(iscsi);
+ }
iscsi_destroy_context(iscsi);
g_free(iscsilun->zeroblock);
g_free(iscsilun->allocationmap);
@@ -1649,7 +1673,7 @@ out:
static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
IscsiLun *iscsilun = bs->opaque;
- bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz;
+ bdi->unallocated_blocks_are_zero = iscsilun->lbprz;
bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws;
bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
return 0;
diff --git a/block/mirror.c b/block/mirror.c
index 405616422b..58f391a6d6 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -125,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret)
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
- BlockDriverState *source = s->common.bs;
BlockErrorAction action;
- bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
- op->nb_sectors);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, false, -ret);
if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
@@ -143,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret)
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
- BlockDriverState *source = s->common.bs;
BlockErrorAction action;
- bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num,
- op->nb_sectors);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, true, -ret);
if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
@@ -170,10 +166,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
s->sector_num = hbitmap_iter_next(&s->hbi);
if (s->sector_num < 0) {
- bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi);
+ bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
s->sector_num = hbitmap_iter_next(&s->hbi);
- trace_mirror_restart_iter(s,
- bdrv_get_dirty_count(source, s->dirty_bitmap));
+ trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
assert(s->sector_num >= 0);
}
@@ -288,8 +283,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
next_sector += sectors_per_chunk;
}
- bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num,
- nb_sectors);
+ bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);
/* Copy the dirty cluster. */
s->in_flight++;
@@ -446,7 +440,7 @@ static void coroutine_fn mirror_run(void *opaque)
assert(n > 0);
if (ret == 1) {
- bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n);
+ bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
sector_num = next;
} else {
sector_num += n;
@@ -454,7 +448,7 @@ static void coroutine_fn mirror_run(void *opaque)
}
}
- bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi);
+ bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
for (;;) {
uint64_t delay_ns = 0;
@@ -466,7 +460,7 @@ static void coroutine_fn mirror_run(void *opaque)
goto immediate_exit;
}
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
/* s->common.offset contains the number of bytes already processed so
* far, cnt is the number of dirty sectors remaining and
* s->sectors_in_flight is the number of sectors currently being
@@ -475,7 +469,7 @@ static void coroutine_fn mirror_run(void *opaque)
(cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;
/* Note that even when no rate limit is applied we need to yield
- * periodically with no pending I/O so that qemu_aio_flush() returns.
+ * periodically with no pending I/O so that bdrv_drain_all() returns.
* We do so every SLICE_TIME nanoseconds, or when there is an error,
* or when the source is clean, whichever comes first.
*/
@@ -488,9 +482,6 @@ static void coroutine_fn mirror_run(void *opaque)
continue;
} else if (cnt != 0) {
delay_ns = mirror_iteration(s);
- if (delay_ns == 0) {
- continue;
- }
}
}
@@ -516,7 +507,7 @@ static void coroutine_fn mirror_run(void *opaque)
should_complete = s->should_complete ||
block_job_is_cancelled(&s->common);
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
}
}
@@ -531,7 +522,7 @@ static void coroutine_fn mirror_run(void *opaque)
*/
trace_mirror_before_drain(s, cnt);
bdrv_drain(bs);
- cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap);
+ cnt = bdrv_get_dirty_count(s->dirty_bitmap);
}
ret = 0;
@@ -634,7 +625,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
}
s->should_complete = true;
- block_job_resume(job);
+ block_job_enter(&s->common);
}
static const BlockJobDriver mirror_job_driver = {
@@ -656,7 +647,7 @@ static const BlockJobDriver commit_active_job_driver = {
static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
- int64_t speed, int64_t granularity,
+ int64_t speed, uint32_t granularity,
int64_t buf_size,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
@@ -668,15 +659,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
MirrorBlockJob *s;
if (granularity == 0) {
- /* Choose the default granularity based on the target file's cluster
- * size, clamped between 4k and 64k. */
- BlockDriverInfo bdi;
- if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
- granularity = MAX(4096, bdi.cluster_size);
- granularity = MIN(65536, granularity);
- } else {
- granularity = 65536;
- }
+ granularity = bdrv_get_default_bitmap_granularity(target);
}
assert ((granularity & (granularity - 1)) == 0);
@@ -703,7 +686,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
s->granularity = granularity;
s->buf_size = MAX(buf_size, granularity);
- s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp);
+ s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
if (!s->dirty_bitmap) {
return;
}
@@ -717,7 +700,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
- int64_t speed, int64_t granularity, int64_t buf_size,
+ int64_t speed, uint32_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockCompletionFunc *cb,
@@ -726,6 +709,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
bool is_none_mode;
BlockDriverState *base;
+ if (mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) {
+ error_setg(errp, "Sync mode 'dirty-bitmap' not supported");
+ return;
+ }
is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
mirror_start_job(bs, target, replaces,
diff --git a/block/null.c b/block/null.c
index ec2bd27a4b..7d083233fb 100644
--- a/block/null.c
+++ b/block/null.c
@@ -12,8 +12,11 @@
#include "block/block_int.h"
+#define NULL_OPT_LATENCY "latency-ns"
+
typedef struct {
int64_t length;
+ int64_t latency_ns;
} BDRVNullState;
static QemuOptsList runtime_opts = {
@@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = {
.type = QEMU_OPT_SIZE,
.help = "size of the null block",
},
+ {
+ .name = NULL_OPT_LATENCY,
+ .type = QEMU_OPT_NUMBER,
+ .help = "nanoseconds (approximated) to wait "
+ "before completing request",
+ },
{ /* end of list */ }
},
};
@@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
{
QemuOpts *opts;
BDRVNullState *s = bs->opaque;
+ int ret = 0;
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &error_abort);
s->length =
qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30);
+ s->latency_ns =
+ qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0);
+ if (s->latency_ns < 0) {
+ error_setg(errp, "latency-ns is invalid");
+ ret = -EINVAL;
+ }
qemu_opts_del(opts);
- return 0;
+ return ret;
}
static void null_close(BlockDriverState *bs)
@@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs)
return s->length;
}
+static coroutine_fn int null_co_common(BlockDriverState *bs)
+{
+ BDRVNullState *s = bs->opaque;
+
+ if (s->latency_ns) {
+ co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME,
+ s->latency_ns);
+ }
+ return 0;
+}
+
static coroutine_fn int null_co_readv(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov)
{
- return 0;
+ return null_co_common(bs);
}
static coroutine_fn int null_co_writev(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov)
{
- return 0;
+ return null_co_common(bs);
}
static coroutine_fn int null_co_flush(BlockDriverState *bs)
{
- return 0;
+ return null_co_common(bs);
}
typedef struct {
BlockAIOCB common;
QEMUBH *bh;
+ QEMUTimer timer;
} NullAIOCB;
static const AIOCBInfo null_aiocb_info = {
@@ -94,15 +122,33 @@ static void null_bh_cb(void *opaque)
qemu_aio_unref(acb);
}
+static void null_timer_cb(void *opaque)
+{
+ NullAIOCB *acb = opaque;
+ acb->common.cb(acb->common.opaque, 0);
+ timer_deinit(&acb->timer);
+ qemu_aio_unref(acb);
+}
+
static inline BlockAIOCB *null_aio_common(BlockDriverState *bs,
BlockCompletionFunc *cb,
void *opaque)
{
NullAIOCB *acb;
+ BDRVNullState *s = bs->opaque;
acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque);
- acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
- qemu_bh_schedule(acb->bh);
+ /* Only emulate latency after vcpu is running. */
+ if (s->latency_ns) {
+ aio_timer_init(bdrv_get_aio_context(bs), &acb->timer,
+ QEMU_CLOCK_REALTIME, SCALE_NS,
+ null_timer_cb, acb);
+ timer_mod_ns(&acb->timer,
+ qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns);
+ } else {
+ acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb);
+ qemu_bh_schedule(acb->bh);
+ }
return &acb->common;
}
@@ -131,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs,
return null_aio_common(bs, cb, opaque);
}
+static int null_reopen_prepare(BDRVReopenState *reopen_state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static BlockDriver bdrv_null_co = {
.format_name = "null-co",
.protocol_name = "null-co",
@@ -143,6 +195,7 @@ static BlockDriver bdrv_null_co = {
.bdrv_co_readv = null_co_readv,
.bdrv_co_writev = null_co_writev,
.bdrv_co_flush_to_disk = null_co_flush,
+ .bdrv_reopen_prepare = null_reopen_prepare,
};
static BlockDriver bdrv_null_aio = {
@@ -157,6 +210,7 @@ static BlockDriver bdrv_null_aio = {
.bdrv_aio_readv = null_aio_readv,
.bdrv_aio_writev = null_aio_writev,
.bdrv_aio_flush = null_aio_flush,
+ .bdrv_reopen_prepare = null_reopen_prepare,
};
static void bdrv_null_init(void)
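[Editor's note: with this option the null drivers become useful for latency experiments: an illustrative invocation such as -drive if=none,driver=null-co,size=1G,latency-ns=100000 makes every read, write and flush take roughly 100 microseconds, which is handy for exercising the block layer without a real backend.]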
diff --git a/block/qapi.c b/block/qapi.c
index 8a19aed446..063dd1bc1f 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -31,8 +31,10 @@
#include "qapi/qmp/types.h"
#include "sysemu/block-backend.h"
-BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
{
+ ImageInfo **p_image_info;
+ BlockDriverState *bs0;
BlockDeviceInfo *info = g_malloc0(sizeof(*info));
info->file = g_strdup(bs->filename);
@@ -92,6 +94,25 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
info->write_threshold = bdrv_write_threshold_get(bs);
+ bs0 = bs;
+ p_image_info = &info->image;
+ while (1) {
+ Error *local_err = NULL;
+ bdrv_query_image_info(bs0, p_image_info, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ qapi_free_BlockDeviceInfo(info);
+ return NULL;
+ }
+ if (bs0->drv && bs0->backing_hd) {
+ bs0 = bs0->backing_hd;
+ (*p_image_info)->has_backing_image = true;
+ p_image_info = &((*p_image_info)->backing_image);
+ } else {
+ break;
+ }
+ }
+
return info;
}
@@ -264,9 +285,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
{
BlockInfo *info = g_malloc0(sizeof(*info));
BlockDriverState *bs = blk_bs(blk);
- BlockDriverState *bs0;
- ImageInfo **p_image_info;
- Error *local_err = NULL;
info->device = g_strdup(blk_name(blk));
info->type = g_strdup("unknown");
info->locked = blk_dev_is_medium_locked(blk);
@@ -289,23 +307,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info,
if (bs->drv) {
info->has_inserted = true;
- info->inserted = bdrv_block_device_info(bs);
-
- bs0 = bs;
- p_image_info = &info->inserted->image;
- while (1) {
- bdrv_query_image_info(bs0, p_image_info, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- goto err;
- }
- if (bs0->drv && bs0->backing_hd) {
- bs0 = bs0->backing_hd;
- (*p_image_info)->has_backing_image = true;
- p_image_info = &((*p_image_info)->backing_image);
- } else {
- break;
- }
+ info->inserted = bdrv_block_device_info(bs, errp);
+ if (info->inserted == NULL) {
+ goto err;
}
}
diff --git a/block/qcow.c b/block/qcow.c
index 055896910e..ab893284d2 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -124,7 +124,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
snprintf(version, sizeof(version), "QCOW version %" PRIu32,
header.version);
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "qcow", version);
+ bdrv_get_device_or_node_name(bs), "qcow", version);
ret = -ENOTSUP;
goto fail;
}
@@ -229,9 +229,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
}
/* Disable migration when qcow images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "qcow", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
qemu_co_mutex_init(&s->lock);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 6cbae1d205..f47260b808 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -2450,7 +2450,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
if (ret < 0) {
return ret;
} else if (ret > 0) {
- int metadata_ol_bitnr = ffs(ret) - 1;
+ int metadata_ol_bitnr = ctz32(ret);
assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 2aa9dcb1d1..17bb2119b2 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -351,10 +351,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
memset(sn, 0, sizeof(*sn));
- /* Generate an ID if it wasn't passed */
- if (sn_info->id_str[0] == '\0') {
- find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
- }
+ /* Generate an ID */
+ find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
/* Check that the ID is unique */
if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) {
diff --git a/block/qcow2.c b/block/qcow2.c
index 316a8db22b..b9a72e39d4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -208,7 +208,7 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
va_end(ap);
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "qcow2", msg);
+ bdrv_get_device_or_node_name(bs), "qcow2", msg);
}
static void report_unsupported_feature(BlockDriverState *bs,
@@ -1802,7 +1802,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
{
/* Calculate cluster_bits */
int cluster_bits;
- cluster_bits = ffs(cluster_size) - 1;
+ cluster_bits = ctz32(cluster_size);
if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
(1 << cluster_bits) != cluster_size)
{
@@ -2110,7 +2110,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
goto finish;
}
- refcount_order = ffs(refcount_bits) - 1;
+ refcount_order = ctz32(refcount_bits);
ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
cluster_size, prealloc, opts, version, refcount_order,
@@ -2824,6 +2824,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
int64_t size, const char *message_format, ...)
{
BDRVQcowState *s = bs->opaque;
+ const char *node_name;
char *message;
va_list ap;
@@ -2847,8 +2848,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
"corruption events will be suppressed\n", message);
}
- qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message,
- offset >= 0, offset, size >= 0, size,
+ node_name = bdrv_get_node_name(bs);
+ qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
+ *node_name != '\0', node_name,
+ message, offset >= 0, offset,
+ size >= 0, size,
fatal, &error_abort);
g_free(message);
diff --git a/block/qed.c b/block/qed.c
index 892b13c806..5bbe069ce9 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -408,7 +408,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
snprintf(buf, sizeof(buf), "%" PRIx64,
s->header.features & ~QED_FEATURE_MASK);
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "QED", buf);
+ bdrv_get_device_or_node_name(bs), "QED", buf);
return -ENOTSUP;
}
if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
@@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
s->table_nelems = (s->header.cluster_size * s->header.table_size) /
sizeof(uint64_t);
- s->l2_shift = ffs(s->header.cluster_size) - 1;
+ s->l2_shift = ctz32(s->header.cluster_size);
s->l2_mask = s->table_nelems - 1;
- s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
+ s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
/* Header size calculation must not overflow uint32_t */
if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
diff --git a/block/quorum.c b/block/quorum.c
index 437b12251d..f91ef75a84 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)
static void quorum_report_failure(QuorumAIOCB *acb)
{
- const char *reference = bdrv_get_device_name(acb->common.bs)[0] ?
- bdrv_get_device_name(acb->common.bs) :
- acb->common.bs->node_name;
-
+ const char *reference = bdrv_get_device_or_node_name(acb->common.bs);
qapi_event_send_quorum_failure(reference, acb->sector_num,
acb->nb_sectors, &error_abort);
}
diff --git a/block/rbd.c b/block/rbd.c
index f3ab2ddd5a..fbe87e035b 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -325,7 +325,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
error_setg(errp, "obj size too small");
return -EINVAL;
}
- obj_order = ffs(objsize) - 1;
+ obj_order = ctz32(objsize);
}
clientname = qemu_rbd_parse_clientname(conf, clientname_buf);
diff --git a/block/sheepdog.c b/block/sheepdog.c
index c14172cfa6..bd7cbed048 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1716,7 +1716,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
if ((object_size - 1) & object_size) { /* not a power of 2? */
return -EINVAL;
}
- obj_order = ffs(object_size) - 1;
+ obj_order = ctz32(object_size);
if (obj_order < 20 || obj_order > 31) {
return -EINVAL;
}
@@ -2341,6 +2341,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
if (ret < 0) {
error_report("failed to create inode for snapshot: %s",
error_get_pretty(local_err));
+ error_free(local_err);
goto cleanup;
}
diff --git a/block/snapshot.c b/block/snapshot.c
index 698e1a1d58..50ae610139 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -246,9 +246,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
if (bs->file) {
return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp);
}
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- drv->format_name, bdrv_get_device_name(bs),
- "internal snapshot deletion");
+ error_setg(errp, "Block format '%s' used by device '%s' "
+ "does not support internal snapshot deletion",
+ drv->format_name, bdrv_get_device_name(bs));
return -ENOTSUP;
}
@@ -329,9 +329,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs,
if (drv->bdrv_snapshot_load_tmp) {
return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp);
}
- error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- drv->format_name, bdrv_get_device_name(bs),
- "temporarily load internal snapshot");
+ error_setg(errp, "Block format '%s' used by device '%s' "
+ "does not support temporarily loading internal snapshots",
+ drv->format_name, bdrv_get_device_name(bs));
return -ENOTSUP;
}
diff --git a/block/vdi.c b/block/vdi.c
index 53bd02fe22..7642ef3597 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
}
/* Disable migration when vdi images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vdi", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
qemu_co_mutex_init(&s->write_lock);
diff --git a/block/vhdx.c b/block/vhdx.c
index bb3ed45d5c..0776de7174 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
/* TODO: differencing files */
/* Disable migration when VHDX images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vhdx", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vhdx format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
return 0;
@@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
iov1.iov_base = qemu_blockalign(bs, iov1.iov_len);
memset(iov1.iov_base, 0, iov1.iov_len);
qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0,
- sinfo.block_offset);
+ iov1.iov_len);
sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS;
}
@@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
iov2.iov_base = qemu_blockalign(bs, iov2.iov_len);
memset(iov2.iov_base, 0, iov2.iov_len);
qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0,
- sinfo.block_offset);
+ iov2.iov_len);
sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS;
}
}
diff --git a/block/vmdk.c b/block/vmdk.c
index 8410a158a2..1c5e2ef1b3 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -523,7 +523,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
}
ret = vmdk_add_extent(bs, file, false,
le32_to_cpu(header.disk_sectors),
- le32_to_cpu(header.l1dir_offset) << 9,
+ (int64_t)le32_to_cpu(header.l1dir_offset) << 9,
0,
le32_to_cpu(header.l1dir_size),
4096,
@@ -669,7 +669,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
le32_to_cpu(header.version));
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
- bdrv_get_device_name(bs), "vmdk", buf);
+ bdrv_get_device_or_node_name(bs), "vmdk", buf);
return -ENOTSUP;
} else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
/* VMware KB 2064959 explains that version 3 added support for
@@ -962,9 +962,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
qemu_co_mutex_init(&s->lock);
/* Disable migration when VMDK images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vmdk", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
g_free(buf);
return 0;
diff --git a/block/vpc.c b/block/vpc.c
index 43e768ee76..37572bab86 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -318,9 +318,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
qemu_co_mutex_init(&s->lock);
/* Disable migration when VHD images are used */
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vpc", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
return 0;
diff --git a/block/vvfat.c b/block/vvfat.c
index 9be632f404..e803589675 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1180,9 +1180,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags,
/* Disable migration when vvfat is used rw */
if (s->qcow) {
- error_set(&s->migration_blocker,
- QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
- "vvfat (rw)", bdrv_get_device_name(bs), "live migration");
+ error_setg(&s->migration_blocker,
+ "The vvfat (rw) format used by node '%s' "
+ "does not support live migration",
+ bdrv_get_device_or_node_name(bs));
migrate_add_blocker(s->migration_blocker);
}