diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile.objs | 11 | ||||
-rw-r--r-- | block/blkdebug.c | 107 | ||||
-rw-r--r-- | block/iscsi.c | 152 | ||||
-rw-r--r-- | block/qcow2-cache.c | 25 | ||||
-rw-r--r-- | block/qcow2-cluster.c | 18 | ||||
-rw-r--r-- | block/qcow2-refcount.c | 64 | ||||
-rw-r--r-- | block/qcow2-snapshot.c | 6 | ||||
-rw-r--r-- | block/qcow2.c | 31 | ||||
-rw-r--r-- | block/qcow2.h | 8 | ||||
-rw-r--r-- | block/qed-check.c | 2 | ||||
-rw-r--r-- | block/qed.c | 7 | ||||
-rw-r--r-- | block/raw-posix.c | 105 | ||||
-rw-r--r-- | block/raw.c | 10 | ||||
-rw-r--r-- | block/rbd.c | 19 | ||||
-rw-r--r-- | block/sheepdog.c | 139 | ||||
-rw-r--r-- | block/stream.c | 109 | ||||
-rw-r--r-- | block/vdi.c | 7 |
17 files changed, 547 insertions, 273 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs new file mode 100644 index 0000000000..b5754d39bf --- /dev/null +++ b/block/Makefile.objs @@ -0,0 +1,11 @@ +block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o +block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o +block-obj-y += qed-check.o +block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o +block-obj-y += stream.o +block-obj-$(CONFIG_WIN32) += raw-win32.o +block-obj-$(CONFIG_POSIX) += raw-posix.o +block-obj-$(CONFIG_LIBISCSI) += iscsi.o +block-obj-$(CONFIG_CURL) += curl.o +block-obj-$(CONFIG_RBD) += rbd.o diff --git a/block/blkdebug.c b/block/blkdebug.c index e56e37da51..59dcea0650 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -26,24 +26,10 @@ #include "block_int.h" #include "module.h" -typedef struct BlkdebugVars { - int state; - - /* If inject_errno != 0, an error is injected for requests */ - int inject_errno; - - /* Decides if all future requests fail (false) or only the next one and - * after the next request inject_errno is reset to 0 (true) */ - bool inject_once; - - /* Decides if aio_readv/writev fails right away (true) or returns an error - * return value only in the callback (false) */ - bool inject_immediately; -} BlkdebugVars; - typedef struct BDRVBlkdebugState { - BlkdebugVars vars; - QLIST_HEAD(list, BlkdebugRule) rules[BLKDBG_EVENT_MAX]; + int state; + QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX]; + QSIMPLEQ_HEAD(, BlkdebugRule) active_rules; } BDRVBlkdebugState; typedef struct BlkdebugAIOCB { @@ -73,12 +59,14 @@ typedef struct BlkdebugRule { int error; int immediately; int once; + int64_t sector; } inject; struct { int new_state; } set_state; } options; QLIST_ENTRY(BlkdebugRule) next; + QSIMPLEQ_ENTRY(BlkdebugRule) active_next; } BlkdebugRule; static QemuOptsList inject_error_opts = { @@ -98,6 +86,10 @@ static QemuOptsList inject_error_opts = { .type = QEMU_OPT_NUMBER, }, { + .name = "sector", + .type = QEMU_OPT_NUMBER, + }, + { .name = "once", .type = QEMU_OPT_BOOL, }, @@ -147,9 +139,7 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_L2_ALLOC_COW_READ] = "l2_alloc.cow_read", [BLKDBG_L2_ALLOC_WRITE] = "l2_alloc.write", - [BLKDBG_READ] = "read", [BLKDBG_READ_AIO] = "read_aio", - [BLKDBG_READ_BACKING] = "read_backing", [BLKDBG_READ_BACKING_AIO] = "read_backing_aio", [BLKDBG_READ_COMPRESSED] = "read_compressed", @@ -228,6 +218,7 @@ static int add_rule(QemuOpts *opts, void *opaque) rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0); rule->options.inject.immediately = qemu_opt_get_bool(opts, "immediately", 0); + rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1); break; case ACTION_SET_STATE: @@ -302,7 +293,7 @@ static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags) filename = c + 1; /* Set initial state */ - s->vars.state = 1; + s->state = 1; /* Open the backing file */ ret = bdrv_file_open(&bs->file, filename, flags); @@ -328,18 +319,18 @@ static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb) } static BlockDriverAIOCB *inject_error(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) + BlockDriverCompletionFunc *cb, void *opaque, BlkdebugRule *rule) { BDRVBlkdebugState *s = bs->opaque; - int error = s->vars.inject_errno; + int error = rule->options.inject.error; struct BlkdebugAIOCB *acb; QEMUBH *bh; - if (s->vars.inject_once) { - s->vars.inject_errno = 0; + if (rule->options.inject.once) { + QSIMPLEQ_INIT(&s->active_rules); } - if (s->vars.inject_immediately) { + if (rule->options.inject.immediately) { return NULL; } @@ -358,14 +349,21 @@ static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule = NULL; - if (s->vars.inject_errno) { - return inject_error(bs, cb, opaque); + QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { + if (rule->options.inject.sector == -1 || + (rule->options.inject.sector >= sector_num && + rule->options.inject.sector < sector_num + nb_sectors)) { + break; + } + } + + if (rule && rule->options.inject.error) { + return inject_error(bs, cb, opaque, rule); } - BlockDriverAIOCB *acb = - bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque); - return acb; + return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque); } static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs, @@ -373,14 +371,21 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule = NULL; + + QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { + if (rule->options.inject.sector == -1 || + (rule->options.inject.sector >= sector_num && + rule->options.inject.sector < sector_num + nb_sectors)) { + break; + } + } - if (s->vars.inject_errno) { - return inject_error(bs, cb, opaque); + if (rule && rule->options.inject.error) { + return inject_error(bs, cb, opaque, rule); } - BlockDriverAIOCB *acb = - bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque); - return acb; + return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque); } static void blkdebug_close(BlockDriverState *bs) @@ -397,44 +402,53 @@ static void blkdebug_close(BlockDriverState *bs) } } -static void process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, - BlkdebugVars *old_vars) +static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, + int old_state, bool injected) { BDRVBlkdebugState *s = bs->opaque; - BlkdebugVars *vars = &s->vars; /* Only process rules for the current state */ - if (rule->state && rule->state != old_vars->state) { - return; + if (rule->state && rule->state != old_state) { + return injected; } /* Take the action */ switch (rule->action) { case ACTION_INJECT_ERROR: - vars->inject_errno = rule->options.inject.error; - vars->inject_once = rule->options.inject.once; - vars->inject_immediately = rule->options.inject.immediately; + if (!injected) { + QSIMPLEQ_INIT(&s->active_rules); + injected = true; + } + QSIMPLEQ_INSERT_HEAD(&s->active_rules, rule, active_next); break; case ACTION_SET_STATE: - vars->state = rule->options.set_state.new_state; + s->state = rule->options.set_state.new_state; break; } + return injected; } static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event) { BDRVBlkdebugState *s = bs->opaque; struct BlkdebugRule *rule; - BlkdebugVars old_vars = s->vars; + int old_state = s->state; + bool injected; assert((int)event >= 0 && event < BLKDBG_EVENT_MAX); + injected = false; QLIST_FOREACH(rule, &s->rules[event], next) { - process_rule(bs, rule, &old_vars); + injected = process_rule(bs, rule, old_state, injected); } } +static int64_t blkdebug_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file); +} + static BlockDriver bdrv_blkdebug = { .format_name = "blkdebug", .protocol_name = "blkdebug", @@ -443,6 +457,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_file_open = blkdebug_open, .bdrv_close = blkdebug_close, + .bdrv_getlength = blkdebug_getlength, .bdrv_aio_readv = blkdebug_aio_readv, .bdrv_aio_writev = blkdebug_aio_writev, diff --git a/block/iscsi.c b/block/iscsi.c index ecb7a2211e..993a86d829 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -35,6 +35,10 @@ #include <iscsi/iscsi.h> #include <iscsi/scsi-lowlevel.h> +#ifdef __linux__ +#include <scsi/sg.h> +#include <hw/scsi-defs.h> +#endif typedef struct IscsiLun { struct iscsi_context *iscsi; @@ -56,6 +60,9 @@ typedef struct IscsiAIOCB { int canceled; size_t read_size; size_t read_offset; +#ifdef __linux__ + sg_io_hdr_t *ioh; +#endif } IscsiAIOCB; struct IscsiTask { @@ -514,6 +521,136 @@ iscsi_aio_discard(BlockDriverState *bs, return &acb->common; } +#ifdef __linux__ +static void +iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + IscsiAIOCB *acb = opaque; + + if (acb->canceled != 0) { + qemu_aio_release(acb); + scsi_free_scsi_task(acb->task); + acb->task = NULL; + return; + } + + acb->status = 0; + if (status < 0) { + error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s", + iscsi_get_error(iscsi)); + acb->status = -EIO; + } + + acb->ioh->driver_status = 0; + acb->ioh->host_status = 0; + acb->ioh->resid = 0; + +#define SG_ERR_DRIVER_SENSE 0x08 + + if (status == SCSI_STATUS_CHECK_CONDITION && acb->task->datain.size >= 2) { + int ss; + + acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE; + + acb->ioh->sb_len_wr = acb->task->datain.size - 2; + ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ? + acb->ioh->mx_sb_len : acb->ioh->sb_len_wr; + memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss); + } + + iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb); + scsi_free_scsi_task(acb->task); + acb->task = NULL; +} + +static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + struct iscsi_data data; + IscsiAIOCB *acb; + + assert(req == SG_IO); + + acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque); + + acb->iscsilun = iscsilun; + acb->canceled = 0; + acb->buf = NULL; + acb->ioh = buf; + + acb->task = malloc(sizeof(struct scsi_task)); + if (acb->task == NULL) { + error_report("iSCSI: Failed to allocate task for scsi command. %s", + iscsi_get_error(iscsi)); + qemu_aio_release(acb); + return NULL; + } + memset(acb->task, 0, sizeof(struct scsi_task)); + + switch (acb->ioh->dxfer_direction) { + case SG_DXFER_TO_DEV: + acb->task->xfer_dir = SCSI_XFER_WRITE; + break; + case SG_DXFER_FROM_DEV: + acb->task->xfer_dir = SCSI_XFER_READ; + break; + default: + acb->task->xfer_dir = SCSI_XFER_NONE; + break; + } + + acb->task->cdb_size = acb->ioh->cmd_len; + memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len); + acb->task->expxferlen = acb->ioh->dxfer_len; + + if (acb->task->xfer_dir == SCSI_XFER_WRITE) { + data.data = acb->ioh->dxferp; + data.size = acb->ioh->dxfer_len; + } + if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, + iscsi_aio_ioctl_cb, + (acb->task->xfer_dir == SCSI_XFER_WRITE) ? + &data : NULL, + acb) != 0) { + scsi_free_scsi_task(acb->task); + qemu_aio_release(acb); + return NULL; + } + + /* tell libiscsi to read straight into the buffer we got from ioctl */ + if (acb->task->xfer_dir == SCSI_XFER_READ) { + scsi_task_add_data_in_buffer(acb->task, + acb->ioh->dxfer_len, + acb->ioh->dxferp); + } + + iscsi_set_events(iscsilun); + + return &acb->common; +} + +static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + IscsiLun *iscsilun = bs->opaque; + + switch (req) { + case SG_GET_VERSION_NUM: + *(int *)buf = 30000; + break; + case SG_GET_SCSI_ID: + ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; + break; + default: + return -1; + } + return 0; +} +#endif + static int64_t iscsi_getlength(BlockDriverState *bs) { @@ -884,6 +1021,16 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags) if (iscsi_url != NULL) { iscsi_destroy_url(iscsi_url); } + + /* Medium changer or tape. We dont have any emulation for this so this must + * be sg ioctl compatible. We force it to be sg, otherwise qemu will try + * to read from the device to guess the image format. + */ + if (iscsilun->type == TYPE_MEDIUM_CHANGER || + iscsilun->type == TYPE_TAPE) { + bs->sg = 1; + } + return 0; failed: @@ -925,6 +1072,11 @@ static BlockDriver bdrv_iscsi = { .bdrv_aio_flush = iscsi_aio_flush, .bdrv_aio_discard = iscsi_aio_discard, + +#ifdef __linux__ + .bdrv_ioctl = iscsi_ioctl, + .bdrv_aio_ioctl = iscsi_aio_ioctl, +#endif }; static void iscsi_block_init(void) diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 710d4b1828..2d4322a8dd 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -40,11 +40,9 @@ struct Qcow2Cache { struct Qcow2Cache* depends; int size; bool depends_on_flush; - bool writethrough; }; -Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables, - bool writethrough) +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) { BDRVQcowState *s = bs->opaque; Qcow2Cache *c; @@ -53,7 +51,6 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables, c = g_malloc0(sizeof(*c)); c->size = num_tables; c->entries = g_malloc0(sizeof(*c->entries) * num_tables); - c->writethrough = writethrough; for (i = 0; i < c->size; i++) { c->entries[i].table = qemu_blockalign(bs, s->cluster_size); @@ -307,12 +304,7 @@ found: *table = NULL; assert(c->entries[i].ref >= 0); - - if (c->writethrough) { - return qcow2_cache_entry_flush(bs, c, i); - } else { - return 0; - } + return 0; } void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) @@ -329,16 +321,3 @@ void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table) found: c->entries[i].dirty = true; } - -bool qcow2_cache_set_writethrough(BlockDriverState *bs, Qcow2Cache *c, - bool enable) -{ - bool old = c->writethrough; - - if (!old && enable) { - qcow2_cache_flush(bs, c); - } - - c->writethrough = enable; - return old; -} diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4b3345b11b..d7e0e19d9c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -471,6 +471,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); *cluster_offset &= L2E_OFFSET_MASK; break; + default: + abort(); } qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); @@ -538,7 +540,6 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, if (l2_offset) { qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t)); } - l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; } /* find the cluster offset for the given disk offset */ @@ -641,11 +642,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } if (m->nb_available & (s->cluster_sectors - 1)) { - uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1); cow = true; qemu_co_mutex_unlock(&s->lock); - ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9), - m->nb_available - end, s->cluster_sectors); + ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available, + align_offset(m->nb_available, s->cluster_sectors)); qemu_co_mutex_lock(&s->lock); if (ret < 0) goto err; @@ -947,8 +947,16 @@ again: /* save info needed for meta data update */ if (nb_clusters > 0) { + /* + * requested_sectors: Number of sectors from the start of the first + * newly allocated cluster to the end of the (possibly shortened + * before) write request. + * + * avail_sectors: Number of sectors from the start of the first + * newly allocated to the end of the last newly allocated cluster. + */ int requested_sectors = n_end - keep_clusters * s->cluster_sectors; - int avail_sectors = (keep_clusters + nb_clusters) + int avail_sectors = nb_clusters << (s->cluster_bits - BDRV_SECTOR_BITS); *m = (QCowL2Meta) { diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 812c93c5c7..5e3f9153fb 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -367,7 +367,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } for(i = 0; i < table_size; i++) { - cpu_to_be64s(&new_table[i]); + be64_to_cpus(&new_table[i]); } /* Hook up the new refcount table in the qcow2 header */ @@ -627,10 +627,11 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); assert(size > 0 && size <= s->cluster_size); if (s->free_byte_offset == 0) { - s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (s->free_byte_offset < 0) { - return s->free_byte_offset; + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; } + s->free_byte_offset = offset; } redo: free_in_cluster = s->cluster_size - @@ -726,13 +727,6 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t old_offset, old_l2_offset; int i, j, l1_modified = 0, nb_csectors, refcount; int ret; - bool old_l2_writethrough, old_refcount_writethrough; - - /* Switch caches to writeback mode during update */ - old_l2_writethrough = - qcow2_cache_set_writethrough(bs, s->l2_table_cache, false); - old_refcount_writethrough = - qcow2_cache_set_writethrough(bs, s->refcount_block_cache, false); l2_table = NULL; l1_table = NULL; @@ -856,11 +850,6 @@ fail: qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); } - /* Enable writethrough cache mode again */ - qcow2_cache_set_writethrough(bs, s->l2_table_cache, old_l2_writethrough); - qcow2_cache_set_writethrough(bs, s->refcount_block_cache, - old_refcount_writethrough); - /* Update L1 only if it isn't deleted anyway (addend = -1) */ if (addend >= 0 && l1_modified) { for(i = 0; i < l1_size; i++) @@ -1122,11 +1111,12 @@ fail: * Returns 0 if no errors are found, the number of errors in case the image is * detected as corrupted, and -errno when an internal error occurred. */ -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res) +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) { BDRVQcowState *s = bs->opaque; - int64_t size; - int nb_clusters, refcount1, refcount2, i; + int64_t size, i; + int nb_clusters, refcount1, refcount2; QCowSnapshot *sn; uint16_t *refcount_table; int ret; @@ -1170,14 +1160,15 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res) /* Refcount blocks are cluster aligned */ if (offset & (s->cluster_size - 1)) { - fprintf(stderr, "ERROR refcount block %d is not " + fprintf(stderr, "ERROR refcount block %" PRId64 " is not " "cluster aligned; refcount table entry corrupted\n", i); res->corruptions++; continue; } if (cluster >= nb_clusters) { - fprintf(stderr, "ERROR refcount block %d is outside image\n", i); + fprintf(stderr, "ERROR refcount block %" PRId64 + " is outside image\n", i); res->corruptions++; continue; } @@ -1186,7 +1177,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res) inc_refcounts(bs, res, refcount_table, nb_clusters, offset, s->cluster_size); if (refcount_table[cluster] != 1) { - fprintf(stderr, "ERROR refcount block %d refcount=%d\n", + fprintf(stderr, "ERROR refcount block %" PRId64 + " refcount=%d\n", i, refcount_table[cluster]); res->corruptions++; } @@ -1197,7 +1189,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res) for(i = 0; i < nb_clusters; i++) { refcount1 = get_refcount(bs, i); if (refcount1 < 0) { - fprintf(stderr, "Can't get refcount for cluster %d: %s\n", + fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", i, strerror(-refcount1)); res->check_errors++; continue; @@ -1205,9 +1197,31 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res) refcount2 = refcount_table[i]; if (refcount1 != refcount2) { - fprintf(stderr, "%s cluster %d refcount=%d reference=%d\n", - refcount1 < refcount2 ? "ERROR" : "Leaked", + + /* Check if we're allowed to fix the mismatch */ + int *num_fixed = NULL; + if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { + num_fixed = &res->leaks_fixed; + } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { + num_fixed = &res->corruptions_fixed; + } + + fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n", + num_fixed != NULL ? "Repairing" : + refcount1 < refcount2 ? "ERROR" : + "Leaked", i, refcount1, refcount2); + + if (num_fixed) { + ret = update_refcount(bs, i << s->cluster_bits, 1, + refcount2 - refcount1); + if (ret >= 0) { + (*num_fixed)++; + continue; + } + } + + /* And if we couldn't, print an error */ if (refcount1 < refcount2) { res->corruptions++; } else { diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 4561a2abf9..4e7c93b8b3 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -405,7 +405,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result); + qcow2_check_refcounts(bs, &result, 0); } #endif return 0; @@ -522,7 +522,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result); + qcow2_check_refcounts(bs, &result, 0); } #endif return 0; @@ -582,7 +582,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result); + qcow2_check_refcounts(bs, &result, 0); } #endif return 0; diff --git a/block/qcow2.c b/block/qcow2.c index 1b5b36cfc1..870148ddf8 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -220,7 +220,6 @@ static int qcow2_open(BlockDriverState *bs, int flags) int len, i, ret = 0; QCowHeader header; uint64_t ext_end; - bool writethrough; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { @@ -298,14 +297,6 @@ static int qcow2_open(BlockDriverState *bs, int flags) goto fail; } - if (!bs->read_only && s->autoclear_features != 0) { - s->autoclear_features = 0; - ret = qcow2_update_header(bs); - if (ret < 0) { - goto fail; - } - } - /* Check support for various header values */ if (header.refcount_order != 4) { report_unsupported(bs, "%d bit reference counts", @@ -367,10 +358,8 @@ static int qcow2_open(BlockDriverState *bs, int flags) } /* alloc L2 table/refcount block cache */ - writethrough = ((flags & BDRV_O_CACHE_WB) == 0); - s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE, writethrough); - s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE, - writethrough); + s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); + s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); s->cluster_cache = g_malloc(s->cluster_size); /* one more sector for decompressed data alignment */ @@ -411,13 +400,22 @@ static int qcow2_open(BlockDriverState *bs, int flags) goto fail; } + /* Clear unknown autoclear feature bits */ + if (!bs->read_only && s->autoclear_features != 0) { + s->autoclear_features = 0; + ret = qcow2_update_header(bs); + if (ret < 0) { + goto fail; + } + } + /* Initialise locks */ qemu_co_mutex_init(&s->lock); #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result); + qcow2_check_refcounts(bs, &result, 0); } #endif return ret; @@ -1467,9 +1465,10 @@ static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) } -static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result) +static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) { - return qcow2_check_refcounts(bs, result); + return qcow2_check_refcounts(bs, result, fix); } #if 0 diff --git a/block/qcow2.h b/block/qcow2.h index 93567f6451..455b6d7cfe 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -261,7 +261,8 @@ void qcow2_free_any_clusters(BlockDriverState *bs, int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend); -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res); +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); /* qcow2-cluster.c functions */ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size); @@ -296,11 +297,8 @@ void qcow2_free_snapshots(BlockDriverState *bs); int qcow2_read_snapshots(BlockDriverState *bs); /* qcow2-cache.c functions */ -Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables, - bool writethrough); +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); -bool qcow2_cache_set_writethrough(BlockDriverState *bs, Qcow2Cache *c, - bool enable); void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table); int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); diff --git a/block/qed-check.c b/block/qed-check.c index 94327ff5b3..5edf60775b 100644 --- a/block/qed-check.c +++ b/block/qed-check.c @@ -87,6 +87,7 @@ static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) if (!qed_check_cluster_offset(s, offset)) { if (check->fix) { table->offsets[i] = 0; + check->result->corruptions_fixed++; } else { check->result->corruptions++; } @@ -127,6 +128,7 @@ static int qed_check_l1_table(QEDCheck *check, QEDTable *table) /* Clear invalid offset */ if (check->fix) { table->offsets[i] = 0; + check->result->corruptions_fixed++; } else { check->result->corruptions++; } diff --git a/block/qed.c b/block/qed.c index 847417a3d6..5f3eefa3af 100644 --- a/block/qed.c +++ b/block/qed.c @@ -748,7 +748,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, /* If the read straddles the end of the backing file, shorten it */ size = MIN((uint64_t)backing_length - pos, qiov->size); - BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING); + BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, qiov, size / BDRV_SECTOR_SIZE, cb, opaque); } @@ -1517,11 +1517,12 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs) bdrv_qed_open(bs, bs->open_flags); } -static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result) +static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) { BDRVQEDState *s = bs->opaque; - return qed_check(s, result, false); + return qed_check(s, result, !!fix); } static QEMUOptionParameter qed_create_options[] = { diff --git a/block/raw-posix.c b/block/raw-posix.c index 03fcfcc031..0dce089be5 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -52,6 +52,10 @@ #include <sys/param.h> #include <linux/cdrom.h> #include <linux/fd.h> +#include <linux/fs.h> +#endif +#ifdef CONFIG_FIEMAP +#include <linux/fiemap.h> #endif #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) #include <sys/disk.h> @@ -583,6 +587,106 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) return result; } +/* + * Returns true iff the specified sector is present in the disk image. Drivers + * not implementing the functionality are assumed to not support backing files, + * hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + off_t start, data, hole; + int ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + start = sector_num * BDRV_SECTOR_SIZE; + +#ifdef CONFIG_FIEMAP + + BDRVRawState *s = bs->opaque; + struct { + struct fiemap fm; + struct fiemap_extent fe; + } f; + + f.fm.fm_start = start; + f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE; + f.fm.fm_flags = 0; + f.fm.fm_extent_count = 1; + f.fm.fm_reserved = 0; + if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { + /* Assume everything is allocated. */ + *pnum = nb_sectors; + return 1; + } + + if (f.fm.fm_mapped_extents == 0) { + /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length. + * f.fm.fm_start + f.fm.fm_length must be clamped to the file size! + */ + off_t length = lseek(s->fd, 0, SEEK_END); + hole = f.fm.fm_start; + data = MIN(f.fm.fm_start + f.fm.fm_length, length); + } else { + data = f.fe.fe_logical; + hole = f.fe.fe_logical + f.fe.fe_length; + } + +#elif defined SEEK_HOLE && defined SEEK_DATA + + BDRVRawState *s = bs->opaque; + + hole = lseek(s->fd, start, SEEK_HOLE); + if (hole == -1) { + /* -ENXIO indicates that sector_num was past the end of the file. + * There is a virtual hole there. */ + assert(errno != -ENXIO); + + /* Most likely EINVAL. Assume everything is allocated. */ + *pnum = nb_sectors; + return 1; + } + + if (hole > start) { + data = start; + } else { + /* On a hole. We need another syscall to find its end. */ + data = lseek(s->fd, start, SEEK_DATA); + if (data == -1) { + data = lseek(s->fd, 0, SEEK_END); + } + } +#else + *pnum = nb_sectors; + return 1; +#endif + + if (data <= start) { + /* On a data extent, compute sectors to the end of the extent. */ + *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); + return 1; + } else { + /* On a hole, compute sectors to the beginning of the next extent. */ + *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); + return 0; + } +} + #ifdef CONFIG_XFS static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors) { @@ -634,6 +738,7 @@ static BlockDriver bdrv_file = { .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_co_discard = raw_co_discard, + .bdrv_co_is_allocated = raw_co_is_allocated, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, diff --git a/block/raw.c b/block/raw.c index 7086e314a6..ff34ea41e7 100644 --- a/block/raw.c +++ b/block/raw.c @@ -12,12 +12,14 @@ static int raw_open(BlockDriverState *bs, int flags) static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov); } static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov); } @@ -25,6 +27,13 @@ static void raw_close(BlockDriverState *bs) { } +static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum); +} + static int64_t raw_getlength(BlockDriverState *bs) { return bdrv_getlength(bs->file); @@ -108,6 +117,7 @@ static BlockDriver bdrv_raw = { .bdrv_co_readv = raw_co_readv, .bdrv_co_writev = raw_co_writev, + .bdrv_co_is_allocated = raw_co_is_allocated, .bdrv_co_discard = raw_co_discard, .bdrv_probe = raw_probe, diff --git a/block/rbd.c b/block/rbd.c index 49a4787b55..5a0f79fc8f 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -476,6 +476,25 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags) s->snap = g_strdup(snap_buf); } + /* + * Fallback to more conservative semantics if setting cache + * options fails. Ignore errors from setting rbd_cache because the + * only possible error is that the option does not exist, and + * librbd defaults to no caching. If write through caching cannot + * be set up, fall back to no caching. + */ + if (flags & BDRV_O_NOCACHE) { + rados_conf_set(s->cluster, "rbd_cache", "false"); + } else { + rados_conf_set(s->cluster, "rbd_cache", "true"); + if (!(flags & BDRV_O_CACHE_WB)) { + r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0"); + if (r < 0) { + rados_conf_set(s->cluster, "rbd_cache", "false"); + } + } + } + if (strstr(conf, "conf=") == NULL) { /* try default location, but ignore failure */ rados_conf_read_file(s->cluster, NULL); diff --git a/block/sheepdog.c b/block/sheepdog.c index 2c7aecec4f..809df39d9e 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -259,8 +259,7 @@ typedef struct AIOReq { uint8_t flags; uint32_t id; - QLIST_ENTRY(AIOReq) outstanding_aio_siblings; - QLIST_ENTRY(AIOReq) aioreq_siblings; + QLIST_ENTRY(AIOReq) aio_siblings; } AIOReq; enum AIOCBState { @@ -283,8 +282,7 @@ struct SheepdogAIOCB { void (*aio_done_func)(SheepdogAIOCB *); int canceled; - - QLIST_HEAD(aioreq_head, AIOReq) aioreq_head; + int nr_pending; }; typedef struct BDRVSheepdogState { @@ -307,7 +305,8 @@ typedef struct BDRVSheepdogState { Coroutine *co_recv; uint32_t aioreq_seq_num; - QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head; + QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; + QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head; } BDRVSheepdogState; static const char * sd_strerror(int err) @@ -358,7 +357,7 @@ static const char * sd_strerror(int err) * Sheepdog I/O handling: * * 1. In sd_co_rw_vector, we send the I/O requests to the server and - * link the requests to the outstanding_list in the + * link the requests to the inflight_list in the * BDRVSheepdogState. The function exits without waiting for * receiving the response. * @@ -386,21 +385,18 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, aio_req->flags = flags; aio_req->id = s->aioreq_seq_num++; - QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req, - outstanding_aio_siblings); - QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings); - + acb->nr_pending++; return aio_req; } -static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) +static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) { SheepdogAIOCB *acb = aio_req->aiocb; - QLIST_REMOVE(aio_req, outstanding_aio_siblings); - QLIST_REMOVE(aio_req, aioreq_siblings); + + QLIST_REMOVE(aio_req, aio_siblings); g_free(aio_req); - return !QLIST_EMPTY(&acb->aioreq_head); + acb->nr_pending--; } static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) @@ -446,7 +442,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->canceled = 0; acb->coroutine = qemu_coroutine_self(); acb->ret = 0; - QLIST_INIT(&acb->aioreq_head); + acb->nr_pending = 0; return acb; } @@ -522,8 +518,8 @@ static int send_req(int sockfd, SheepdogReq *hdr, void *data, return ret; } -static int send_co_req(int sockfd, SheepdogReq *hdr, void *data, - unsigned int *wlen) +static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, + unsigned int *wlen) { int ret; @@ -540,11 +536,19 @@ static int send_co_req(int sockfd, SheepdogReq *hdr, void *data, return ret; } + +static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data, + unsigned int *wlen, unsigned int *rlen); + static int do_req(int sockfd, SheepdogReq *hdr, void *data, unsigned int *wlen, unsigned int *rlen) { int ret; + if (qemu_in_coroutine()) { + return do_co_req(sockfd, hdr, data, wlen, rlen); + } + socket_set_block(sockfd); ret = send_req(sockfd, hdr, data, wlen); if (ret < 0) { @@ -576,10 +580,21 @@ out: return ret; } -static int do_co_req(int sockfd, SheepdogReq *hdr, void *data, - unsigned int *wlen, unsigned int *rlen) +static void restart_co_req(void *opaque) +{ + Coroutine *co = opaque; + + qemu_coroutine_enter(co, NULL); +} + +static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data, + unsigned int *wlen, unsigned int *rlen) { int ret; + Coroutine *co; + + co = qemu_coroutine_self(); + qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co); socket_set_block(sockfd); ret = send_co_req(sockfd, hdr, data, wlen); @@ -587,6 +602,8 @@ static int do_co_req(int sockfd, SheepdogReq *hdr, void *data, goto out; } + qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co); + ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); if (ret < sizeof(*hdr)) { error_report("failed to get a rsp, %s", strerror(errno)); @@ -608,6 +625,7 @@ static int do_co_req(int sockfd, SheepdogReq *hdr, void *data, } ret = 0; out: + qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); socket_set_nonblock(sockfd); return ret; } @@ -616,32 +634,41 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, int create, enum AIOCBState aiocb_type); + +static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) +{ + AIOReq *aio_req; + + QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) { + if (aio_req->oid == oid) { + return aio_req; + } + } + + return NULL; +} + /* * This function searchs pending requests to the object `oid', and * sends them. */ -static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id) +static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) { - AIOReq *aio_req, *next; + AIOReq *aio_req; SheepdogAIOCB *acb; int ret; - QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head, - outstanding_aio_siblings, next) { - if (id == aio_req->id) { - continue; - } - if (aio_req->oid != oid) { - continue; - } - + while ((aio_req = find_pending_req(s, oid)) != NULL) { acb = aio_req->aiocb; + /* move aio_req from pending list to inflight one */ + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, 0, acb->aiocb_type); if (ret < 0) { error_report("add_aio_request is failed"); free_aio_req(s, aio_req); - if (QLIST_EMPTY(&acb->aioreq_head)) { + if (!acb->nr_pending) { sd_finish_aiocb(acb); } } @@ -662,10 +689,9 @@ static void coroutine_fn aio_read_response(void *opaque) int ret; AIOReq *aio_req = NULL; SheepdogAIOCB *acb; - int rest; unsigned long idx; - if (QLIST_EMPTY(&s->outstanding_aio_head)) { + if (QLIST_EMPTY(&s->inflight_aio_head)) { goto out; } @@ -676,8 +702,8 @@ static void coroutine_fn aio_read_response(void *opaque) goto out; } - /* find the right aio_req from the outstanding_aio list */ - QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) { + /* find the right aio_req from the inflight aio list */ + QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) { if (aio_req->id == rsp.id) { break; } @@ -715,7 +741,7 @@ static void coroutine_fn aio_read_response(void *opaque) * create requests are not allowed, so we search the * pending requests here. */ - send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id); + send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx)); } break; case AIOCB_READ_UDATA: @@ -733,8 +759,8 @@ static void coroutine_fn aio_read_response(void *opaque) error_report("%s", sd_strerror(rsp.result)); } - rest = free_aio_req(s, aio_req); - if (!rest) { + free_aio_req(s, aio_req); + if (!acb->nr_pending) { /* * We've finished all requests which belong to the AIOCB, so * we can switch back to sd_co_readv/writev now. @@ -767,7 +793,8 @@ static int aio_flush_request(void *opaque) { BDRVSheepdogState *s = opaque; - return !QLIST_EMPTY(&s->outstanding_aio_head); + return !QLIST_EMPTY(&s->inflight_aio_head) || + !QLIST_EMPTY(&s->pending_aio_head); } static int set_nodelay(int fd) @@ -1084,7 +1111,8 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) strstart(filename, "sheepdog:", (const char **)&filename); - QLIST_INIT(&s->outstanding_aio_head); + QLIST_INIT(&s->inflight_aio_head); + QLIST_INIT(&s->pending_aio_head); s->fd = -1; memset(vdi, 0, sizeof(vdi)); @@ -1446,6 +1474,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) iov.iov_len = sizeof(s->inode); aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), data_len, offset, 0, 0, offset); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA); if (ret) { free_aio_req(s, aio_req); @@ -1514,7 +1543,7 @@ out: * Send I/O requests to the server. * * This function sends requests to the server, links the requests to - * the outstanding_list in BDRVSheepdogState, and exits without + * the inflight_list in BDRVSheepdogState, and exits without * waiting the response. The responses are received in the * `aio_read_response' function which is called from the main loop as * a fd handler. @@ -1546,6 +1575,12 @@ static int coroutine_fn sd_co_rw_vector(void *p) } } + /* + * Make sure we don't free the aiocb before we are done with all requests. + * This additional reference is dropped at the end of this function. + */ + acb->nr_pending++; + while (done != total) { uint8_t flags = 0; uint64_t old_oid = 0; @@ -1570,22 +1605,18 @@ static int coroutine_fn sd_co_rw_vector(void *p) } if (create) { - dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64 - " %" PRIu64 "\n", inode->vdi_id, oid, + dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n", + inode->vdi_id, oid, vid_to_data_oid(inode->data_vdi_id[idx], idx), idx); oid = vid_to_data_oid(inode->vdi_id, idx); - dprintf("new oid %lx\n", oid); + dprintf("new oid %" PRIx64 "\n", oid); } aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done); if (create) { AIOReq *areq; - QLIST_FOREACH(areq, &s->outstanding_aio_head, - outstanding_aio_siblings) { - if (areq == aio_req) { - continue; - } + QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { if (areq->oid == oid) { /* * Sheepdog cannot handle simultaneous create @@ -1595,11 +1626,14 @@ static int coroutine_fn sd_co_rw_vector(void *p) */ aio_req->flags = 0; aio_req->base_oid = 0; + QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, + aio_siblings); goto done; } } } + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, acb->aiocb_type); if (ret < 0) { @@ -1614,7 +1648,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) done += len; } out: - if (QLIST_EMPTY(&acb->aioreq_head)) { + if (!--acb->nr_pending) { return acb->ret; } return 1; @@ -1627,7 +1661,6 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, int ret; if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { - /* TODO: shouldn't block here */ ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE); if (ret < 0) { return ret; @@ -1695,7 +1728,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) hdr.opcode = SD_OP_FLUSH_VDI; hdr.oid = vid_to_vdi_oid(inode->vdi_id); - ret = do_co_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen); + ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen); if (ret) { error_report("failed to send a request to the sheep"); return ret; @@ -1725,7 +1758,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) SheepdogInode *inode; unsigned int datalen; - dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d " + dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " " "is_snapshot %d\n", sn_info->name, sn_info->id_str, s->name, sn_info->vm_state_size, s->is_snapshot); diff --git a/block/stream.c b/block/stream.c index 8e5832273b..37c46525d2 100644 --- a/block/stream.c +++ b/block/stream.c @@ -13,6 +13,7 @@ #include "trace.h" #include "block_int.h" +#include "qemu/ratelimit.h" enum { /* @@ -25,34 +26,6 @@ enum { #define SLICE_TIME 100000000ULL /* ns */ -typedef struct { - int64_t next_slice_time; - uint64_t slice_quota; - uint64_t dispatched; -} RateLimit; - -static int64_t ratelimit_calculate_delay(RateLimit *limit, uint64_t n) -{ - int64_t now = qemu_get_clock_ns(rt_clock); - - if (limit->next_slice_time < now) { - limit->next_slice_time = now + SLICE_TIME; - limit->dispatched = 0; - } - if (limit->dispatched == 0 || limit->dispatched + n <= limit->slice_quota) { - limit->dispatched += n; - return 0; - } else { - limit->dispatched = n; - return limit->next_slice_time - now; - } -} - -static void ratelimit_set_speed(RateLimit *limit, uint64_t speed) -{ - limit->slice_quota = speed / (1000000000ULL / SLICE_TIME); -} - typedef struct StreamBlockJob { BlockJob common; RateLimit limit; @@ -98,67 +71,6 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, top->backing_hd = base; } -/* - * Given an image chain: [BASE] -> [INTER1] -> [INTER2] -> [TOP] - * - * Return true if the given sector is allocated in top. - * Return false if the given sector is allocated in intermediate images. - * Return true otherwise. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - */ -static int coroutine_fn is_allocated_base(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *intermediate; - int ret, n; - - ret = bdrv_co_is_allocated(top, sector_num, nb_sectors, &n); - if (ret) { - *pnum = n; - return ret; - } - - /* - * Is the unallocated chunk [sector_num, n] also - * unallocated between base and top? - */ - intermediate = top->backing_hd; - - while (intermediate != base) { - int pnum_inter; - - ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); - if (ret < 0) { - return ret; - } else if (ret) { - *pnum = pnum_inter; - return 0; - } - - /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. - */ - if (n > pnum_inter) { - n = pnum_inter; - } - - intermediate = intermediate->backing_hd; - } - - *pnum = n; - return 1; -} - static void coroutine_fn stream_run(void *opaque) { StreamBlockJob *s = opaque; @@ -189,6 +101,7 @@ static void coroutine_fn stream_run(void *opaque) for (sector_num = 0; sector_num < end; sector_num += n) { uint64_t delay_ns = 0; + bool copy; wait: /* Note that even when no rate limit is applied we need to yield @@ -199,10 +112,20 @@ wait: break; } - ret = is_allocated_base(bs, base, sector_num, - STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + ret = bdrv_co_is_allocated(bs, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + if (ret == 1) { + /* Allocated in the top, no need to copy. */ + copy = false; + } else { + /* Copy if allocated in the intermediate images. Limit to the + * known-unallocated area [sector_num, sector_num+n). */ + ret = bdrv_co_is_allocated_above(bs->backing_hd, base, + sector_num, n, &n); + copy = (ret == 1); + } trace_stream_one_iteration(s, sector_num, n, ret); - if (ret == 0) { + if (ret >= 0 && copy) { if (s->common.speed) { delay_ns = ratelimit_calculate_delay(&s->limit, n); if (delay_ns > 0) { @@ -248,7 +171,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) error_set(errp, QERR_INVALID_PARAMETER, "speed"); return; } - ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE); + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } static BlockJobType stream_job_type = { diff --git a/block/vdi.c b/block/vdi.c index 119d3c74da..57325d65c4 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -277,7 +277,8 @@ static void vdi_header_print(VdiHeader *header) } #endif -static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res) +static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) { /* TODO: additional checks possible. */ BDRVVdiState *s = (BDRVVdiState *)bs->opaque; @@ -286,6 +287,10 @@ static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res) uint32_t *bmap; logout("\n"); + if (fix) { + return -ENOTSUP; + } + bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t)); memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t)); |