Diffstat (limited to 'block')
-rw-r--r--  block/Makefile.objs      11
-rw-r--r--  block/blkdebug.c        107
-rw-r--r--  block/iscsi.c           152
-rw-r--r--  block/qcow2-cache.c      25
-rw-r--r--  block/qcow2-cluster.c    18
-rw-r--r--  block/qcow2-refcount.c   64
-rw-r--r--  block/qcow2-snapshot.c    6
-rw-r--r--  block/qcow2.c            31
-rw-r--r--  block/qcow2.h             8
-rw-r--r--  block/qed-check.c         2
-rw-r--r--  block/qed.c               7
-rw-r--r--  block/raw-posix.c       105
-rw-r--r--  block/raw.c              10
-rw-r--r--  block/rbd.c              19
-rw-r--r--  block/sheepdog.c        139
-rw-r--r--  block/stream.c          109
-rw-r--r--  block/vdi.c               7
17 files changed, 547 insertions(+), 273 deletions(-)
diff --git a/block/Makefile.objs b/block/Makefile.objs
new file mode 100644
index 0000000000..b5754d39bf
--- /dev/null
+++ b/block/Makefile.objs
@@ -0,0 +1,11 @@
+block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
+block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
+block-obj-y += qed-check.o
+block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-obj-y += stream.o
+block-obj-$(CONFIG_WIN32) += raw-win32.o
+block-obj-$(CONFIG_POSIX) += raw-posix.o
+block-obj-$(CONFIG_LIBISCSI) += iscsi.o
+block-obj-$(CONFIG_CURL) += curl.o
+block-obj-$(CONFIG_RBD) += rbd.o
diff --git a/block/blkdebug.c b/block/blkdebug.c
index e56e37da51..59dcea0650 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -26,24 +26,10 @@
#include "block_int.h"
#include "module.h"
-typedef struct BlkdebugVars {
- int state;
-
- /* If inject_errno != 0, an error is injected for requests */
- int inject_errno;
-
- /* Decides if all future requests fail (false) or only the next one and
- * after the next request inject_errno is reset to 0 (true) */
- bool inject_once;
-
- /* Decides if aio_readv/writev fails right away (true) or returns an error
- * return value only in the callback (false) */
- bool inject_immediately;
-} BlkdebugVars;
-
typedef struct BDRVBlkdebugState {
- BlkdebugVars vars;
- QLIST_HEAD(list, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
+ int state;
+ QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
+ QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
} BDRVBlkdebugState;
typedef struct BlkdebugAIOCB {
@@ -73,12 +59,14 @@ typedef struct BlkdebugRule {
int error;
int immediately;
int once;
+ int64_t sector;
} inject;
struct {
int new_state;
} set_state;
} options;
QLIST_ENTRY(BlkdebugRule) next;
+ QSIMPLEQ_ENTRY(BlkdebugRule) active_next;
} BlkdebugRule;
static QemuOptsList inject_error_opts = {
@@ -98,6 +86,10 @@ static QemuOptsList inject_error_opts = {
.type = QEMU_OPT_NUMBER,
},
{
+ .name = "sector",
+ .type = QEMU_OPT_NUMBER,
+ },
+ {
.name = "once",
.type = QEMU_OPT_BOOL,
},
@@ -147,9 +139,7 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
[BLKDBG_L2_ALLOC_COW_READ] = "l2_alloc.cow_read",
[BLKDBG_L2_ALLOC_WRITE] = "l2_alloc.write",
- [BLKDBG_READ] = "read",
[BLKDBG_READ_AIO] = "read_aio",
- [BLKDBG_READ_BACKING] = "read_backing",
[BLKDBG_READ_BACKING_AIO] = "read_backing_aio",
[BLKDBG_READ_COMPRESSED] = "read_compressed",
@@ -228,6 +218,7 @@ static int add_rule(QemuOpts *opts, void *opaque)
rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0);
rule->options.inject.immediately =
qemu_opt_get_bool(opts, "immediately", 0);
+ rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1);
break;
case ACTION_SET_STATE:
@@ -302,7 +293,7 @@ static int blkdebug_open(BlockDriverState *bs, const char *filename, int flags)
filename = c + 1;
/* Set initial state */
- s->vars.state = 1;
+ s->state = 1;
/* Open the backing file */
ret = bdrv_file_open(&bs->file, filename, flags);
@@ -328,18 +319,18 @@ static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb)
}
static BlockDriverAIOCB *inject_error(BlockDriverState *bs,
- BlockDriverCompletionFunc *cb, void *opaque)
+ BlockDriverCompletionFunc *cb, void *opaque, BlkdebugRule *rule)
{
BDRVBlkdebugState *s = bs->opaque;
- int error = s->vars.inject_errno;
+ int error = rule->options.inject.error;
struct BlkdebugAIOCB *acb;
QEMUBH *bh;
- if (s->vars.inject_once) {
- s->vars.inject_errno = 0;
+ if (rule->options.inject.once) {
+ QSIMPLEQ_INIT(&s->active_rules);
}
- if (s->vars.inject_immediately) {
+ if (rule->options.inject.immediately) {
return NULL;
}
@@ -358,14 +349,21 @@ static BlockDriverAIOCB *blkdebug_aio_readv(BlockDriverState *bs,
BlockDriverCompletionFunc *cb, void *opaque)
{
BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugRule *rule = NULL;
- if (s->vars.inject_errno) {
- return inject_error(bs, cb, opaque);
+ QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+ if (rule->options.inject.sector == -1 ||
+ (rule->options.inject.sector >= sector_num &&
+ rule->options.inject.sector < sector_num + nb_sectors)) {
+ break;
+ }
+ }
+
+ if (rule && rule->options.inject.error) {
+ return inject_error(bs, cb, opaque, rule);
}
- BlockDriverAIOCB *acb =
- bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
- return acb;
+ return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
}
static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
@@ -373,14 +371,21 @@ static BlockDriverAIOCB *blkdebug_aio_writev(BlockDriverState *bs,
BlockDriverCompletionFunc *cb, void *opaque)
{
BDRVBlkdebugState *s = bs->opaque;
+ BlkdebugRule *rule = NULL;
+
+ QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+ if (rule->options.inject.sector == -1 ||
+ (rule->options.inject.sector >= sector_num &&
+ rule->options.inject.sector < sector_num + nb_sectors)) {
+ break;
+ }
+ }
- if (s->vars.inject_errno) {
- return inject_error(bs, cb, opaque);
+ if (rule && rule->options.inject.error) {
+ return inject_error(bs, cb, opaque, rule);
}
- BlockDriverAIOCB *acb =
- bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
- return acb;
+ return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
}
static void blkdebug_close(BlockDriverState *bs)
@@ -397,44 +402,53 @@ static void blkdebug_close(BlockDriverState *bs)
}
}
-static void process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
- BlkdebugVars *old_vars)
+static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
+ int old_state, bool injected)
{
BDRVBlkdebugState *s = bs->opaque;
- BlkdebugVars *vars = &s->vars;
/* Only process rules for the current state */
- if (rule->state && rule->state != old_vars->state) {
- return;
+ if (rule->state && rule->state != old_state) {
+ return injected;
}
/* Take the action */
switch (rule->action) {
case ACTION_INJECT_ERROR:
- vars->inject_errno = rule->options.inject.error;
- vars->inject_once = rule->options.inject.once;
- vars->inject_immediately = rule->options.inject.immediately;
+ if (!injected) {
+ QSIMPLEQ_INIT(&s->active_rules);
+ injected = true;
+ }
+ QSIMPLEQ_INSERT_HEAD(&s->active_rules, rule, active_next);
break;
case ACTION_SET_STATE:
- vars->state = rule->options.set_state.new_state;
+ s->state = rule->options.set_state.new_state;
break;
}
+ return injected;
}
static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
BDRVBlkdebugState *s = bs->opaque;
struct BlkdebugRule *rule;
- BlkdebugVars old_vars = s->vars;
+ int old_state = s->state;
+ bool injected;
assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
+ injected = false;
QLIST_FOREACH(rule, &s->rules[event], next) {
- process_rule(bs, rule, &old_vars);
+ injected = process_rule(bs, rule, old_state, injected);
}
}
+static int64_t blkdebug_getlength(BlockDriverState *bs)
+{
+ return bdrv_getlength(bs->file);
+}
+
static BlockDriver bdrv_blkdebug = {
.format_name = "blkdebug",
.protocol_name = "blkdebug",
@@ -443,6 +457,7 @@ static BlockDriver bdrv_blkdebug = {
.bdrv_file_open = blkdebug_open,
.bdrv_close = blkdebug_close,
+ .bdrv_getlength = blkdebug_getlength,
.bdrv_aio_readv = blkdebug_aio_readv,
.bdrv_aio_writev = blkdebug_aio_writev,
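
The interesting part of the blkdebug change is the per-rule sector match: a rule now fires either for every request (sector option omitted, stored as -1) or only for requests whose range [sector_num, sector_num + nb_sectors) contains the configured sector. A standalone sketch of that predicate (hypothetical names, not part of the patch):

/* rule_matches: the sector filter used by blkdebug_aio_readv/writev above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool rule_matches(int64_t rule_sector, int64_t sector_num, int nb_sectors)
{
    return rule_sector == -1 ||
           (rule_sector >= sector_num && rule_sector < sector_num + nb_sectors);
}

int main(void)
{
    printf("%d\n", rule_matches(100, 96, 8));  /* 1: sector 100 is in [96, 104) */
    printf("%d\n", rule_matches(100, 0, 8));   /* 0: request does not cover it */
    printf("%d\n", rule_matches(-1, 0, 8));    /* 1: unrestricted rule */
    return 0;
}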
diff --git a/block/iscsi.c b/block/iscsi.c
index ecb7a2211e..993a86d829 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -35,6 +35,10 @@
#include <iscsi/iscsi.h>
#include <iscsi/scsi-lowlevel.h>
+#ifdef __linux__
+#include <scsi/sg.h>
+#include <hw/scsi-defs.h>
+#endif
typedef struct IscsiLun {
struct iscsi_context *iscsi;
@@ -56,6 +60,9 @@ typedef struct IscsiAIOCB {
int canceled;
size_t read_size;
size_t read_offset;
+#ifdef __linux__
+ sg_io_hdr_t *ioh;
+#endif
} IscsiAIOCB;
struct IscsiTask {
@@ -514,6 +521,136 @@ iscsi_aio_discard(BlockDriverState *bs,
return &acb->common;
}
+#ifdef __linux__
+static void
+iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *opaque)
+{
+ IscsiAIOCB *acb = opaque;
+
+ if (acb->canceled != 0) {
+ qemu_aio_release(acb);
+ scsi_free_scsi_task(acb->task);
+ acb->task = NULL;
+ return;
+ }
+
+ acb->status = 0;
+ if (status < 0) {
+ error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s",
+ iscsi_get_error(iscsi));
+ acb->status = -EIO;
+ }
+
+ acb->ioh->driver_status = 0;
+ acb->ioh->host_status = 0;
+ acb->ioh->resid = 0;
+
+#define SG_ERR_DRIVER_SENSE 0x08
+
+ if (status == SCSI_STATUS_CHECK_CONDITION && acb->task->datain.size >= 2) {
+ int ss;
+
+ acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE;
+
+ acb->ioh->sb_len_wr = acb->task->datain.size - 2;
+ ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ?
+ acb->ioh->mx_sb_len : acb->ioh->sb_len_wr;
+ memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss);
+ }
+
+ iscsi_schedule_bh(iscsi_readv_writev_bh_cb, acb);
+ scsi_free_scsi_task(acb->task);
+ acb->task = NULL;
+}
+
+static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ IscsiLun *iscsilun = bs->opaque;
+ struct iscsi_context *iscsi = iscsilun->iscsi;
+ struct iscsi_data data;
+ IscsiAIOCB *acb;
+
+ assert(req == SG_IO);
+
+ acb = qemu_aio_get(&iscsi_aio_pool, bs, cb, opaque);
+
+ acb->iscsilun = iscsilun;
+ acb->canceled = 0;
+ acb->buf = NULL;
+ acb->ioh = buf;
+
+ acb->task = malloc(sizeof(struct scsi_task));
+ if (acb->task == NULL) {
+ error_report("iSCSI: Failed to allocate task for scsi command. %s",
+ iscsi_get_error(iscsi));
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ memset(acb->task, 0, sizeof(struct scsi_task));
+
+ switch (acb->ioh->dxfer_direction) {
+ case SG_DXFER_TO_DEV:
+ acb->task->xfer_dir = SCSI_XFER_WRITE;
+ break;
+ case SG_DXFER_FROM_DEV:
+ acb->task->xfer_dir = SCSI_XFER_READ;
+ break;
+ default:
+ acb->task->xfer_dir = SCSI_XFER_NONE;
+ break;
+ }
+
+ acb->task->cdb_size = acb->ioh->cmd_len;
+ memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len);
+ acb->task->expxferlen = acb->ioh->dxfer_len;
+
+ if (acb->task->xfer_dir == SCSI_XFER_WRITE) {
+ data.data = acb->ioh->dxferp;
+ data.size = acb->ioh->dxfer_len;
+ }
+ if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task,
+ iscsi_aio_ioctl_cb,
+ (acb->task->xfer_dir == SCSI_XFER_WRITE) ?
+ &data : NULL,
+ acb) != 0) {
+ scsi_free_scsi_task(acb->task);
+ qemu_aio_release(acb);
+ return NULL;
+ }
+
+ /* tell libiscsi to read straight into the buffer we got from ioctl */
+ if (acb->task->xfer_dir == SCSI_XFER_READ) {
+ scsi_task_add_data_in_buffer(acb->task,
+ acb->ioh->dxfer_len,
+ acb->ioh->dxferp);
+ }
+
+ iscsi_set_events(iscsilun);
+
+ return &acb->common;
+}
+
+static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+ IscsiLun *iscsilun = bs->opaque;
+
+ switch (req) {
+ case SG_GET_VERSION_NUM:
+ *(int *)buf = 30000;
+ break;
+ case SG_GET_SCSI_ID:
+ ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
+ break;
+ default:
+ return -1;
+ }
+ return 0;
+}
+#endif
+
static int64_t
iscsi_getlength(BlockDriverState *bs)
{
@@ -884,6 +1021,16 @@ static int iscsi_open(BlockDriverState *bs, const char *filename, int flags)
if (iscsi_url != NULL) {
iscsi_destroy_url(iscsi_url);
}
+
+ /* Medium changer or tape. We don't have any emulation for this so this must
+ * be sg ioctl compatible. We force it to be sg, otherwise qemu will try
+ * to read from the device to guess the image format.
+ */
+ if (iscsilun->type == TYPE_MEDIUM_CHANGER ||
+ iscsilun->type == TYPE_TAPE) {
+ bs->sg = 1;
+ }
+
return 0;
failed:
@@ -925,6 +1072,11 @@ static BlockDriver bdrv_iscsi = {
.bdrv_aio_flush = iscsi_aio_flush,
.bdrv_aio_discard = iscsi_aio_discard,
+
+#ifdef __linux__
+ .bdrv_ioctl = iscsi_ioctl,
+ .bdrv_aio_ioctl = iscsi_aio_ioctl,
+#endif
};
static void iscsi_block_init(void)
diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 710d4b1828..2d4322a8dd 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -40,11 +40,9 @@ struct Qcow2Cache {
struct Qcow2Cache* depends;
int size;
bool depends_on_flush;
- bool writethrough;
};
-Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
- bool writethrough)
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
{
BDRVQcowState *s = bs->opaque;
Qcow2Cache *c;
@@ -53,7 +51,6 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
c = g_malloc0(sizeof(*c));
c->size = num_tables;
c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
- c->writethrough = writethrough;
for (i = 0; i < c->size; i++) {
c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
@@ -307,12 +304,7 @@ found:
*table = NULL;
assert(c->entries[i].ref >= 0);
-
- if (c->writethrough) {
- return qcow2_cache_entry_flush(bs, c, i);
- } else {
- return 0;
- }
+ return 0;
}
void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
@@ -329,16 +321,3 @@ void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
found:
c->entries[i].dirty = true;
}
-
-bool qcow2_cache_set_writethrough(BlockDriverState *bs, Qcow2Cache *c,
- bool enable)
-{
- bool old = c->writethrough;
-
- if (!old && enable) {
- qcow2_cache_flush(bs, c);
- }
-
- c->writethrough = enable;
- return old;
-}
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 4b3345b11b..d7e0e19d9c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -471,6 +471,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
*cluster_offset &= L2E_OFFSET_MASK;
break;
+ default:
+ abort();
}
qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
@@ -538,7 +540,6 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
if (l2_offset) {
qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
}
- l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
}
/* find the cluster offset for the given disk offset */
@@ -641,11 +642,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
}
if (m->nb_available & (s->cluster_sectors - 1)) {
- uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
cow = true;
qemu_co_mutex_unlock(&s->lock);
- ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
- m->nb_available - end, s->cluster_sectors);
+ ret = copy_sectors(bs, start_sect, cluster_offset, m->nb_available,
+ align_offset(m->nb_available, s->cluster_sectors));
qemu_co_mutex_lock(&s->lock);
if (ret < 0)
goto err;
@@ -947,8 +947,16 @@ again:
/* save info needed for meta data update */
if (nb_clusters > 0) {
+ /*
+ * requested_sectors: Number of sectors from the start of the first
+ * newly allocated cluster to the end of the (possibly shortened
+ * before) write request.
+ *
+ * avail_sectors: Number of sectors from the start of the first
+ * newly allocated to the end of the last newly allocated cluster.
+ */
int requested_sectors = n_end - keep_clusters * s->cluster_sectors;
- int avail_sectors = (keep_clusters + nb_clusters)
+ int avail_sectors = nb_clusters
<< (s->cluster_bits - BDRV_SECTOR_BITS);
*m = (QCowL2Meta) {
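
The copy_sectors() call above now covers the copy-on-write tail of the last partially written cluster, from m->nb_available up to the next cluster boundary, using align_offset() to round up. A minimal sketch of that rounding, assuming a power-of-two cluster size (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Round n up to the next multiple of a power-of-two alignment. */
static int64_t align_up(int64_t n, int64_t alignment)
{
    return (n + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
    int cluster_sectors = 128;   /* 64 KiB clusters with 512-byte sectors */
    int nb_available = 300;      /* sectors of guest data in the allocation */
    /* COW region handled above: [nb_available, aligned end) = [300, 384) */
    printf("%lld\n", (long long)align_up(nb_available, cluster_sectors));
    return 0;
}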
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 812c93c5c7..5e3f9153fb 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -367,7 +367,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
}
for(i = 0; i < table_size; i++) {
- cpu_to_be64s(&new_table[i]);
+ be64_to_cpus(&new_table[i]);
}
/* Hook up the new refcount table in the qcow2 header */
@@ -627,10 +627,11 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
assert(size > 0 && size <= s->cluster_size);
if (s->free_byte_offset == 0) {
- s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size);
- if (s->free_byte_offset < 0) {
- return s->free_byte_offset;
+ offset = qcow2_alloc_clusters(bs, s->cluster_size);
+ if (offset < 0) {
+ return offset;
}
+ s->free_byte_offset = offset;
}
redo:
free_in_cluster = s->cluster_size -
@@ -726,13 +727,6 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
int64_t old_offset, old_l2_offset;
int i, j, l1_modified = 0, nb_csectors, refcount;
int ret;
- bool old_l2_writethrough, old_refcount_writethrough;
-
- /* Switch caches to writeback mode during update */
- old_l2_writethrough =
- qcow2_cache_set_writethrough(bs, s->l2_table_cache, false);
- old_refcount_writethrough =
- qcow2_cache_set_writethrough(bs, s->refcount_block_cache, false);
l2_table = NULL;
l1_table = NULL;
@@ -856,11 +850,6 @@ fail:
qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
}
- /* Enable writethrough cache mode again */
- qcow2_cache_set_writethrough(bs, s->l2_table_cache, old_l2_writethrough);
- qcow2_cache_set_writethrough(bs, s->refcount_block_cache,
- old_refcount_writethrough);
-
/* Update L1 only if it isn't deleted anyway (addend = -1) */
if (addend >= 0 && l1_modified) {
for(i = 0; i < l1_size; i++)
@@ -1122,11 +1111,12 @@ fail:
* Returns 0 if no errors are found, the number of errors in case the image is
* detected as corrupted, and -errno when an internal error occurred.
*/
-int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res)
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
{
BDRVQcowState *s = bs->opaque;
- int64_t size;
- int nb_clusters, refcount1, refcount2, i;
+ int64_t size, i;
+ int nb_clusters, refcount1, refcount2;
QCowSnapshot *sn;
uint16_t *refcount_table;
int ret;
@@ -1170,14 +1160,15 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res)
/* Refcount blocks are cluster aligned */
if (offset & (s->cluster_size - 1)) {
- fprintf(stderr, "ERROR refcount block %d is not "
+ fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
"cluster aligned; refcount table entry corrupted\n", i);
res->corruptions++;
continue;
}
if (cluster >= nb_clusters) {
- fprintf(stderr, "ERROR refcount block %d is outside image\n", i);
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " is outside image\n", i);
res->corruptions++;
continue;
}
@@ -1186,7 +1177,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res)
inc_refcounts(bs, res, refcount_table, nb_clusters,
offset, s->cluster_size);
if (refcount_table[cluster] != 1) {
- fprintf(stderr, "ERROR refcount block %d refcount=%d\n",
+ fprintf(stderr, "ERROR refcount block %" PRId64
+ " refcount=%d\n",
i, refcount_table[cluster]);
res->corruptions++;
}
@@ -1197,7 +1189,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res)
for(i = 0; i < nb_clusters; i++) {
refcount1 = get_refcount(bs, i);
if (refcount1 < 0) {
- fprintf(stderr, "Can't get refcount for cluster %d: %s\n",
+ fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
i, strerror(-refcount1));
res->check_errors++;
continue;
@@ -1205,9 +1197,31 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res)
refcount2 = refcount_table[i];
if (refcount1 != refcount2) {
- fprintf(stderr, "%s cluster %d refcount=%d reference=%d\n",
- refcount1 < refcount2 ? "ERROR" : "Leaked",
+
+ /* Check if we're allowed to fix the mismatch */
+ int *num_fixed = NULL;
+ if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
+ num_fixed = &res->leaks_fixed;
+ } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
+ num_fixed = &res->corruptions_fixed;
+ }
+
+ fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
+ num_fixed != NULL ? "Repairing" :
+ refcount1 < refcount2 ? "ERROR" :
+ "Leaked",
i, refcount1, refcount2);
+
+ if (num_fixed) {
+ ret = update_refcount(bs, i << s->cluster_bits, 1,
+ refcount2 - refcount1);
+ if (ret >= 0) {
+ (*num_fixed)++;
+ continue;
+ }
+ }
+
+ /* And if we couldn't, print an error */
if (refcount1 < refcount2) {
res->corruptions++;
} else {
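
The repair logic added above classifies each mismatch by its direction: an on-disk refcount (refcount1) higher than the computed reference count (refcount2) is a leak and may be fixed under BDRV_FIX_LEAKS, the opposite is a corruption fixable under BDRV_FIX_ERRORS, and the fix itself adjusts the refcount by refcount2 - refcount1. A compressed sketch of that decision, with made-up names rather than the QEMU API:

#include <stdbool.h>
#include <stdio.h>

enum { FIX_LEAKS = 1, FIX_ERRORS = 2 };

static const char *classify(int on_disk, int computed, int fix, bool *repair)
{
    *repair = false;
    if (on_disk == computed) {
        return "ok";
    }
    if (on_disk > computed) {       /* referenced more often than needed */
        *repair = fix & FIX_LEAKS;
        return *repair ? "Repairing" : "Leaked";
    }
    *repair = fix & FIX_ERRORS;     /* referenced less often than needed */
    return *repair ? "Repairing" : "ERROR";
}

int main(void)
{
    bool repair;
    /* on-disk 2, computed 1, leak fixing enabled: repaired with delta -1 */
    printf("%s, delta %d\n", classify(2, 1, FIX_LEAKS, &repair), 1 - 2);
    /* on-disk 0, computed 1, no fix flags: only reported */
    printf("%s\n", classify(0, 1, 0, &repair));
    return 0;
}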
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 4561a2abf9..4e7c93b8b3 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -405,7 +405,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
#ifdef DEBUG_ALLOC
{
BdrvCheckResult result = {0};
- qcow2_check_refcounts(bs, &result);
+ qcow2_check_refcounts(bs, &result, 0);
}
#endif
return 0;
@@ -522,7 +522,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
#ifdef DEBUG_ALLOC
{
BdrvCheckResult result = {0};
- qcow2_check_refcounts(bs, &result);
+ qcow2_check_refcounts(bs, &result, 0);
}
#endif
return 0;
@@ -582,7 +582,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
#ifdef DEBUG_ALLOC
{
BdrvCheckResult result = {0};
- qcow2_check_refcounts(bs, &result);
+ qcow2_check_refcounts(bs, &result, 0);
}
#endif
return 0;
diff --git a/block/qcow2.c b/block/qcow2.c
index 1b5b36cfc1..870148ddf8 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -220,7 +220,6 @@ static int qcow2_open(BlockDriverState *bs, int flags)
int len, i, ret = 0;
QCowHeader header;
uint64_t ext_end;
- bool writethrough;
ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
if (ret < 0) {
@@ -298,14 +297,6 @@ static int qcow2_open(BlockDriverState *bs, int flags)
goto fail;
}
- if (!bs->read_only && s->autoclear_features != 0) {
- s->autoclear_features = 0;
- ret = qcow2_update_header(bs);
- if (ret < 0) {
- goto fail;
- }
- }
-
/* Check support for various header values */
if (header.refcount_order != 4) {
report_unsupported(bs, "%d bit reference counts",
@@ -367,10 +358,8 @@ static int qcow2_open(BlockDriverState *bs, int flags)
}
/* alloc L2 table/refcount block cache */
- writethrough = ((flags & BDRV_O_CACHE_WB) == 0);
- s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE, writethrough);
- s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE,
- writethrough);
+ s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
+ s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
s->cluster_cache = g_malloc(s->cluster_size);
/* one more sector for decompressed data alignment */
@@ -411,13 +400,22 @@ static int qcow2_open(BlockDriverState *bs, int flags)
goto fail;
}
+ /* Clear unknown autoclear feature bits */
+ if (!bs->read_only && s->autoclear_features != 0) {
+ s->autoclear_features = 0;
+ ret = qcow2_update_header(bs);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
/* Initialise locks */
qemu_co_mutex_init(&s->lock);
#ifdef DEBUG_ALLOC
{
BdrvCheckResult result = {0};
- qcow2_check_refcounts(bs, &result);
+ qcow2_check_refcounts(bs, &result, 0);
}
#endif
return ret;
@@ -1467,9 +1465,10 @@ static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
}
-static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result)
+static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
{
- return qcow2_check_refcounts(bs, result);
+ return qcow2_check_refcounts(bs, result, fix);
}
#if 0
diff --git a/block/qcow2.h b/block/qcow2.h
index 93567f6451..455b6d7cfe 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -261,7 +261,8 @@ void qcow2_free_any_clusters(BlockDriverState *bs,
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
int64_t l1_table_offset, int l1_size, int addend);
-int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res);
+int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix);
/* qcow2-cluster.c functions */
int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size);
@@ -296,11 +297,8 @@ void qcow2_free_snapshots(BlockDriverState *bs);
int qcow2_read_snapshots(BlockDriverState *bs);
/* qcow2-cache.c functions */
-Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
- bool writethrough);
+Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables);
int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c);
-bool qcow2_cache_set_writethrough(BlockDriverState *bs, Qcow2Cache *c,
- bool enable);
void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table);
int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c);
diff --git a/block/qed-check.c b/block/qed-check.c
index 94327ff5b3..5edf60775b 100644
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -87,6 +87,7 @@ static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
if (!qed_check_cluster_offset(s, offset)) {
if (check->fix) {
table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
} else {
check->result->corruptions++;
}
@@ -127,6 +128,7 @@ static int qed_check_l1_table(QEDCheck *check, QEDTable *table)
/* Clear invalid offset */
if (check->fix) {
table->offsets[i] = 0;
+ check->result->corruptions_fixed++;
} else {
check->result->corruptions++;
}
diff --git a/block/qed.c b/block/qed.c
index 847417a3d6..5f3eefa3af 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -748,7 +748,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
/* If the read straddles the end of the backing file, shorten it */
size = MIN((uint64_t)backing_length - pos, qiov->size);
- BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING);
+ BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
}
@@ -1517,11 +1517,12 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
bdrv_qed_open(bs, bs->open_flags);
}
-static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
+static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
{
BDRVQEDState *s = bs->opaque;
- return qed_check(s, result, false);
+ return qed_check(s, result, !!fix);
}
static QEMUOptionParameter qed_create_options[] = {
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 03fcfcc031..0dce089be5 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -52,6 +52,10 @@
#include <sys/param.h>
#include <linux/cdrom.h>
#include <linux/fd.h>
+#include <linux/fs.h>
+#endif
+#ifdef CONFIG_FIEMAP
+#include <linux/fiemap.h>
#endif
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
#include <sys/disk.h>
@@ -583,6 +587,106 @@ static int raw_create(const char *filename, QEMUOptionParameter *options)
return result;
}
+/*
+ * Returns true iff the specified sector is present in the disk image. Drivers
+ * not implementing the functionality are assumed to not support backing files,
+ * hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ off_t start, data, hole;
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ start = sector_num * BDRV_SECTOR_SIZE;
+
+#ifdef CONFIG_FIEMAP
+
+ BDRVRawState *s = bs->opaque;
+ struct {
+ struct fiemap fm;
+ struct fiemap_extent fe;
+ } f;
+
+ f.fm.fm_start = start;
+ f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
+ f.fm.fm_flags = 0;
+ f.fm.fm_extent_count = 1;
+ f.fm.fm_reserved = 0;
+ if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
+ /* Assume everything is allocated. */
+ *pnum = nb_sectors;
+ return 1;
+ }
+
+ if (f.fm.fm_mapped_extents == 0) {
+ /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
+ * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
+ */
+ off_t length = lseek(s->fd, 0, SEEK_END);
+ hole = f.fm.fm_start;
+ data = MIN(f.fm.fm_start + f.fm.fm_length, length);
+ } else {
+ data = f.fe.fe_logical;
+ hole = f.fe.fe_logical + f.fe.fe_length;
+ }
+
+#elif defined SEEK_HOLE && defined SEEK_DATA
+
+ BDRVRawState *s = bs->opaque;
+
+ hole = lseek(s->fd, start, SEEK_HOLE);
+ if (hole == -1) {
+ /* -ENXIO indicates that sector_num was past the end of the file.
+ * There is a virtual hole there. */
+ assert(errno != -ENXIO);
+
+ /* Most likely EINVAL. Assume everything is allocated. */
+ *pnum = nb_sectors;
+ return 1;
+ }
+
+ if (hole > start) {
+ data = start;
+ } else {
+ /* On a hole. We need another syscall to find its end. */
+ data = lseek(s->fd, start, SEEK_DATA);
+ if (data == -1) {
+ data = lseek(s->fd, 0, SEEK_END);
+ }
+ }
+#else
+ *pnum = nb_sectors;
+ return 1;
+#endif
+
+ if (data <= start) {
+ /* On a data extent, compute sectors to the end of the extent. */
+ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
+ return 1;
+ } else {
+ /* On a hole, compute sectors to the beginning of the next extent. */
+ *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+ return 0;
+ }
+}
+
#ifdef CONFIG_XFS
static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
{
@@ -634,6 +738,7 @@ static BlockDriver bdrv_file = {
.bdrv_close = raw_close,
.bdrv_create = raw_create,
.bdrv_co_discard = raw_co_discard,
+ .bdrv_co_is_allocated = raw_co_is_allocated,
.bdrv_aio_readv = raw_aio_readv,
.bdrv_aio_writev = raw_aio_writev,
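
The SEEK_HOLE/SEEK_DATA branch of raw_co_is_allocated() above can be tried out on an ordinary file. The following standalone sketch (not QEMU code; assumes a libc that defines SEEK_HOLE/SEEK_DATA, e.g. glibc on Linux) reports whether a byte offset lies in a data extent or in a hole:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <file> <byte-offset>\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    off_t start = strtoll(argv[2], NULL, 0);

#if defined(SEEK_HOLE) && defined(SEEK_DATA)
    off_t hole = lseek(fd, start, SEEK_HOLE);
    if (hole == -1) {
        /* e.g. EINVAL on filesystems without hole support: treat as data */
        printf("assuming allocated\n");
    } else if (hole > start) {
        printf("data until byte %lld\n", (long long)hole);
    } else {
        /* start sits inside a hole; find where data resumes */
        off_t data = lseek(fd, start, SEEK_DATA);
        if (data == -1) {
            data = lseek(fd, 0, SEEK_END);   /* hole reaches end of file */
        }
        printf("hole until byte %lld\n", (long long)data);
    }
#else
    printf("SEEK_HOLE/SEEK_DATA not available; assuming allocated\n");
#endif
    close(fd);
    return 0;
}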
diff --git a/block/raw.c b/block/raw.c
index 7086e314a6..ff34ea41e7 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -12,12 +12,14 @@ static int raw_open(BlockDriverState *bs, int flags)
static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
{
+ BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov);
}
static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
{
+ BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov);
}
@@ -25,6 +27,13 @@ static void raw_close(BlockDriverState *bs)
{
}
+static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum);
+}
+
static int64_t raw_getlength(BlockDriverState *bs)
{
return bdrv_getlength(bs->file);
@@ -108,6 +117,7 @@ static BlockDriver bdrv_raw = {
.bdrv_co_readv = raw_co_readv,
.bdrv_co_writev = raw_co_writev,
+ .bdrv_co_is_allocated = raw_co_is_allocated,
.bdrv_co_discard = raw_co_discard,
.bdrv_probe = raw_probe,
diff --git a/block/rbd.c b/block/rbd.c
index 49a4787b55..5a0f79fc8f 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -476,6 +476,25 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
s->snap = g_strdup(snap_buf);
}
+ /*
+ * Fall back to more conservative semantics if setting cache
+ * options fails. Ignore errors from setting rbd_cache because the
+ * only possible error is that the option does not exist, and
+ * librbd defaults to no caching. If write through caching cannot
+ * be set up, fall back to no caching.
+ */
+ if (flags & BDRV_O_NOCACHE) {
+ rados_conf_set(s->cluster, "rbd_cache", "false");
+ } else {
+ rados_conf_set(s->cluster, "rbd_cache", "true");
+ if (!(flags & BDRV_O_CACHE_WB)) {
+ r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0");
+ if (r < 0) {
+ rados_conf_set(s->cluster, "rbd_cache", "false");
+ }
+ }
+ }
+
if (strstr(conf, "conf=") == NULL) {
/* try default location, but ignore failure */
rados_conf_read_file(s->cluster, NULL);
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 2c7aecec4f..809df39d9e 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -259,8 +259,7 @@ typedef struct AIOReq {
uint8_t flags;
uint32_t id;
- QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
- QLIST_ENTRY(AIOReq) aioreq_siblings;
+ QLIST_ENTRY(AIOReq) aio_siblings;
} AIOReq;
enum AIOCBState {
@@ -283,8 +282,7 @@ struct SheepdogAIOCB {
void (*aio_done_func)(SheepdogAIOCB *);
int canceled;
-
- QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
+ int nr_pending;
};
typedef struct BDRVSheepdogState {
@@ -307,7 +305,8 @@ typedef struct BDRVSheepdogState {
Coroutine *co_recv;
uint32_t aioreq_seq_num;
- QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
+ QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
+ QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
} BDRVSheepdogState;
static const char * sd_strerror(int err)
@@ -358,7 +357,7 @@ static const char * sd_strerror(int err)
* Sheepdog I/O handling:
*
* 1. In sd_co_rw_vector, we send the I/O requests to the server and
- * link the requests to the outstanding_list in the
+ * link the requests to the inflight_list in the
* BDRVSheepdogState. The function exits without waiting for
* receiving the response.
*
@@ -386,21 +385,18 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
aio_req->flags = flags;
aio_req->id = s->aioreq_seq_num++;
- QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req,
- outstanding_aio_siblings);
- QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings);
-
+ acb->nr_pending++;
return aio_req;
}
-static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
+static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
{
SheepdogAIOCB *acb = aio_req->aiocb;
- QLIST_REMOVE(aio_req, outstanding_aio_siblings);
- QLIST_REMOVE(aio_req, aioreq_siblings);
+
+ QLIST_REMOVE(aio_req, aio_siblings);
g_free(aio_req);
- return !QLIST_EMPTY(&acb->aioreq_head);
+ acb->nr_pending--;
}
static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
@@ -446,7 +442,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
acb->canceled = 0;
acb->coroutine = qemu_coroutine_self();
acb->ret = 0;
- QLIST_INIT(&acb->aioreq_head);
+ acb->nr_pending = 0;
return acb;
}
@@ -522,8 +518,8 @@ static int send_req(int sockfd, SheepdogReq *hdr, void *data,
return ret;
}
-static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
- unsigned int *wlen)
+static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
+ unsigned int *wlen)
{
int ret;
@@ -540,11 +536,19 @@ static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
return ret;
}
+
+static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
+ unsigned int *wlen, unsigned int *rlen);
+
static int do_req(int sockfd, SheepdogReq *hdr, void *data,
unsigned int *wlen, unsigned int *rlen)
{
int ret;
+ if (qemu_in_coroutine()) {
+ return do_co_req(sockfd, hdr, data, wlen, rlen);
+ }
+
socket_set_block(sockfd);
ret = send_req(sockfd, hdr, data, wlen);
if (ret < 0) {
@@ -576,10 +580,21 @@ out:
return ret;
}
-static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
- unsigned int *wlen, unsigned int *rlen)
+static void restart_co_req(void *opaque)
+{
+ Coroutine *co = opaque;
+
+ qemu_coroutine_enter(co, NULL);
+}
+
+static coroutine_fn int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
+ unsigned int *wlen, unsigned int *rlen)
{
int ret;
+ Coroutine *co;
+
+ co = qemu_coroutine_self();
+ qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, NULL, co);
socket_set_block(sockfd);
ret = send_co_req(sockfd, hdr, data, wlen);
@@ -587,6 +602,8 @@ static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
goto out;
}
+ qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, NULL, co);
+
ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
if (ret < sizeof(*hdr)) {
error_report("failed to get a rsp, %s", strerror(errno));
@@ -608,6 +625,7 @@ static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
}
ret = 0;
out:
+ qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL);
socket_set_nonblock(sockfd);
return ret;
}
@@ -616,32 +634,41 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
struct iovec *iov, int niov, int create,
enum AIOCBState aiocb_type);
+
+static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
+{
+ AIOReq *aio_req;
+
+ QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
+ if (aio_req->oid == oid) {
+ return aio_req;
+ }
+ }
+
+ return NULL;
+}
+
/*
* This function searches pending requests to the object `oid', and
* sends them.
*/
-static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
+static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
{
- AIOReq *aio_req, *next;
+ AIOReq *aio_req;
SheepdogAIOCB *acb;
int ret;
- QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
- outstanding_aio_siblings, next) {
- if (id == aio_req->id) {
- continue;
- }
- if (aio_req->oid != oid) {
- continue;
- }
-
+ while ((aio_req = find_pending_req(s, oid)) != NULL) {
acb = aio_req->aiocb;
+ /* move aio_req from pending list to inflight one */
+ QLIST_REMOVE(aio_req, aio_siblings);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
ret = add_aio_request(s, aio_req, acb->qiov->iov,
acb->qiov->niov, 0, acb->aiocb_type);
if (ret < 0) {
error_report("add_aio_request is failed");
free_aio_req(s, aio_req);
- if (QLIST_EMPTY(&acb->aioreq_head)) {
+ if (!acb->nr_pending) {
sd_finish_aiocb(acb);
}
}
@@ -662,10 +689,9 @@ static void coroutine_fn aio_read_response(void *opaque)
int ret;
AIOReq *aio_req = NULL;
SheepdogAIOCB *acb;
- int rest;
unsigned long idx;
- if (QLIST_EMPTY(&s->outstanding_aio_head)) {
+ if (QLIST_EMPTY(&s->inflight_aio_head)) {
goto out;
}
@@ -676,8 +702,8 @@ static void coroutine_fn aio_read_response(void *opaque)
goto out;
}
- /* find the right aio_req from the outstanding_aio list */
- QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
+ /* find the right aio_req from the inflight aio list */
+ QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
if (aio_req->id == rsp.id) {
break;
}
@@ -715,7 +741,7 @@ static void coroutine_fn aio_read_response(void *opaque)
* create requests are not allowed, so we search the
* pending requests here.
*/
- send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
+ send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx));
}
break;
case AIOCB_READ_UDATA:
@@ -733,8 +759,8 @@ static void coroutine_fn aio_read_response(void *opaque)
error_report("%s", sd_strerror(rsp.result));
}
- rest = free_aio_req(s, aio_req);
- if (!rest) {
+ free_aio_req(s, aio_req);
+ if (!acb->nr_pending) {
/*
* We've finished all requests which belong to the AIOCB, so
* we can switch back to sd_co_readv/writev now.
@@ -767,7 +793,8 @@ static int aio_flush_request(void *opaque)
{
BDRVSheepdogState *s = opaque;
- return !QLIST_EMPTY(&s->outstanding_aio_head);
+ return !QLIST_EMPTY(&s->inflight_aio_head) ||
+ !QLIST_EMPTY(&s->pending_aio_head);
}
static int set_nodelay(int fd)
@@ -1084,7 +1111,8 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
strstart(filename, "sheepdog:", (const char **)&filename);
- QLIST_INIT(&s->outstanding_aio_head);
+ QLIST_INIT(&s->inflight_aio_head);
+ QLIST_INIT(&s->pending_aio_head);
s->fd = -1;
memset(vdi, 0, sizeof(vdi));
@@ -1446,6 +1474,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
iov.iov_len = sizeof(s->inode);
aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
data_len, offset, 0, 0, offset);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
if (ret) {
free_aio_req(s, aio_req);
@@ -1514,7 +1543,7 @@ out:
* Send I/O requests to the server.
*
* This function sends requests to the server, links the requests to
- * the outstanding_list in BDRVSheepdogState, and exits without
+ * the inflight_list in BDRVSheepdogState, and exits without
* waiting the response. The responses are received in the
* `aio_read_response' function which is called from the main loop as
* a fd handler.
@@ -1546,6 +1575,12 @@ static int coroutine_fn sd_co_rw_vector(void *p)
}
}
+ /*
+ * Make sure we don't free the aiocb before we are done with all requests.
+ * This additional reference is dropped at the end of this function.
+ */
+ acb->nr_pending++;
+
while (done != total) {
uint8_t flags = 0;
uint64_t old_oid = 0;
@@ -1570,22 +1605,18 @@ static int coroutine_fn sd_co_rw_vector(void *p)
}
if (create) {
- dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
- " %" PRIu64 "\n", inode->vdi_id, oid,
+ dprintf("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n",
+ inode->vdi_id, oid,
vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
oid = vid_to_data_oid(inode->vdi_id, idx);
- dprintf("new oid %lx\n", oid);
+ dprintf("new oid %" PRIx64 "\n", oid);
}
aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
if (create) {
AIOReq *areq;
- QLIST_FOREACH(areq, &s->outstanding_aio_head,
- outstanding_aio_siblings) {
- if (areq == aio_req) {
- continue;
- }
+ QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
if (areq->oid == oid) {
/*
* Sheepdog cannot handle simultaneous create
@@ -1595,11 +1626,14 @@ static int coroutine_fn sd_co_rw_vector(void *p)
*/
aio_req->flags = 0;
aio_req->base_oid = 0;
+ QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req,
+ aio_siblings);
goto done;
}
}
}
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
create, acb->aiocb_type);
if (ret < 0) {
@@ -1614,7 +1648,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
done += len;
}
out:
- if (QLIST_EMPTY(&acb->aioreq_head)) {
+ if (!--acb->nr_pending) {
return acb->ret;
}
return 1;
@@ -1627,7 +1661,6 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
int ret;
if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
- /* TODO: shouldn't block here */
ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE);
if (ret < 0) {
return ret;
@@ -1695,7 +1728,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
hdr.opcode = SD_OP_FLUSH_VDI;
hdr.oid = vid_to_vdi_oid(inode->vdi_id);
- ret = do_co_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
+ ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
if (ret) {
error_report("failed to send a request to the sheep");
return ret;
@@ -1725,7 +1758,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
SheepdogInode *inode;
unsigned int datalen;
- dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
+ dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " "
"is_snapshot %d\n", sn_info->name, sn_info->id_str,
s->name, sn_info->vm_state_size, s->is_snapshot);
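
The per-AIOCB request list is replaced above by a plain nr_pending counter that behaves like a reference count: sd_co_rw_vector() takes one extra reference before it starts queuing, so the AIOCB cannot be completed while requests are still being issued, and the AIOCB is finished only when the count reaches zero. A tiny standalone sketch of the pattern (hypothetical names):

#include <stdio.h>

struct fake_acb {
    int nr_pending;
    int done;
};

static void finish(struct fake_acb *acb)
{
    acb->done = 1;
}

static void req_complete(struct fake_acb *acb)
{
    if (--acb->nr_pending == 0) {
        finish(acb);
    }
}

int main(void)
{
    struct fake_acb acb = { .nr_pending = 1 };  /* submitter's own reference */

    acb.nr_pending++;                           /* queue request 1 */
    acb.nr_pending++;                           /* queue request 2 */
    req_complete(&acb);                         /* request 1 completes early */

    if (--acb.nr_pending == 0) {                /* drop submitter's reference */
        finish(&acb);
    }
    printf("done after queuing: %d\n", acb.done);          /* 0 */
    req_complete(&acb);                         /* request 2 completes */
    printf("done after last completion: %d\n", acb.done);  /* 1 */
    return 0;
}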
diff --git a/block/stream.c b/block/stream.c
index 8e5832273b..37c46525d2 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -13,6 +13,7 @@
#include "trace.h"
#include "block_int.h"
+#include "qemu/ratelimit.h"
enum {
/*
@@ -25,34 +26,6 @@ enum {
#define SLICE_TIME 100000000ULL /* ns */
-typedef struct {
- int64_t next_slice_time;
- uint64_t slice_quota;
- uint64_t dispatched;
-} RateLimit;
-
-static int64_t ratelimit_calculate_delay(RateLimit *limit, uint64_t n)
-{
- int64_t now = qemu_get_clock_ns(rt_clock);
-
- if (limit->next_slice_time < now) {
- limit->next_slice_time = now + SLICE_TIME;
- limit->dispatched = 0;
- }
- if (limit->dispatched == 0 || limit->dispatched + n <= limit->slice_quota) {
- limit->dispatched += n;
- return 0;
- } else {
- limit->dispatched = n;
- return limit->next_slice_time - now;
- }
-}
-
-static void ratelimit_set_speed(RateLimit *limit, uint64_t speed)
-{
- limit->slice_quota = speed / (1000000000ULL / SLICE_TIME);
-}
-
typedef struct StreamBlockJob {
BlockJob common;
RateLimit limit;
@@ -98,67 +71,6 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
top->backing_hd = base;
}
-/*
- * Given an image chain: [BASE] -> [INTER1] -> [INTER2] -> [TOP]
- *
- * Return true if the given sector is allocated in top.
- * Return false if the given sector is allocated in intermediate images.
- * Return true otherwise.
- *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
- * allocated/unallocated state.
- *
- */
-static int coroutine_fn is_allocated_base(BlockDriverState *top,
- BlockDriverState *base,
- int64_t sector_num,
- int nb_sectors, int *pnum)
-{
- BlockDriverState *intermediate;
- int ret, n;
-
- ret = bdrv_co_is_allocated(top, sector_num, nb_sectors, &n);
- if (ret) {
- *pnum = n;
- return ret;
- }
-
- /*
- * Is the unallocated chunk [sector_num, n] also
- * unallocated between base and top?
- */
- intermediate = top->backing_hd;
-
- while (intermediate != base) {
- int pnum_inter;
-
- ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
- &pnum_inter);
- if (ret < 0) {
- return ret;
- } else if (ret) {
- *pnum = pnum_inter;
- return 0;
- }
-
- /*
- * [sector_num, nb_sectors] is unallocated on top but intermediate
- * might have
- *
- * [sector_num+x, nr_sectors] allocated.
- */
- if (n > pnum_inter) {
- n = pnum_inter;
- }
-
- intermediate = intermediate->backing_hd;
- }
-
- *pnum = n;
- return 1;
-}
-
static void coroutine_fn stream_run(void *opaque)
{
StreamBlockJob *s = opaque;
@@ -189,6 +101,7 @@ static void coroutine_fn stream_run(void *opaque)
for (sector_num = 0; sector_num < end; sector_num += n) {
uint64_t delay_ns = 0;
+ bool copy;
wait:
/* Note that even when no rate limit is applied we need to yield
@@ -199,10 +112,20 @@ wait:
break;
}
- ret = is_allocated_base(bs, base, sector_num,
- STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
+ ret = bdrv_co_is_allocated(bs, sector_num,
+ STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
+ if (ret == 1) {
+ /* Allocated in the top, no need to copy. */
+ copy = false;
+ } else {
+ /* Copy if allocated in the intermediate images. Limit to the
+ * known-unallocated area [sector_num, sector_num+n). */
+ ret = bdrv_co_is_allocated_above(bs->backing_hd, base,
+ sector_num, n, &n);
+ copy = (ret == 1);
+ }
trace_stream_one_iteration(s, sector_num, n, ret);
- if (ret == 0) {
+ if (ret >= 0 && copy) {
if (s->common.speed) {
delay_ns = ratelimit_calculate_delay(&s->limit, n);
if (delay_ns > 0) {
@@ -248,7 +171,7 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
error_set(errp, QERR_INVALID_PARAMETER, "speed");
return;
}
- ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE);
+ ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}
static BlockJobType stream_job_type = {
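
stream_run() above now splits the copy decision in two: if bdrv_co_is_allocated() reports the range as already allocated in the top image, nothing is copied; otherwise bdrv_co_is_allocated_above() checks whether any intermediate image between the top's backing file and 'base' provides the data, and only in that case is it streamed into the top. A rough sketch of that decision over a simplified backing chain (made-up helper, not the QEMU API):

#include <stdbool.h>
#include <stdio.h>

/* layers[0] is the top image, layers[1..nr-1] the intermediates above base;
 * 1 means the sector range is allocated in that layer. */
static bool copy_needed(const int *layers, int nr_layers)
{
    if (layers[0]) {
        return false;               /* already present in the top image */
    }
    for (int i = 1; i < nr_layers; i++) {
        if (layers[i]) {
            return true;            /* data lives in an intermediate image */
        }
    }
    return false;                   /* unallocated everywhere above base */
}

int main(void)
{
    int chain1[] = {0, 1, 0};       /* hole in top, data in an intermediate */
    int chain2[] = {1, 0, 0};       /* already allocated in top */
    printf("%d %d\n", copy_needed(chain1, 3), copy_needed(chain2, 3)); /* 1 0 */
    return 0;
}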
diff --git a/block/vdi.c b/block/vdi.c
index 119d3c74da..57325d65c4 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -277,7 +277,8 @@ static void vdi_header_print(VdiHeader *header)
}
#endif
-static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res)
+static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
{
/* TODO: additional checks possible. */
BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
@@ -286,6 +287,10 @@ static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res)
uint32_t *bmap;
logout("\n");
+ if (fix) {
+ return -ENOTSUP;
+ }
+
bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t));
memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t));