From 95c6bff3561eedaf7c7de287bc4a002720605a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Fri, 21 Feb 2014 22:21:15 +0100 Subject: quorum: Add quorum mechanism. This patchset enables the core of the quorum mechanism. The num_children reads are compared to get the majority version and if this version exists more than threshold times the guest won't see the error at all. If a block is corrupted or if an error occurs during an IO or if the quorum cannot be established QMP events are used to report to the management. Use gnutls's SHA-256 to compare versions. --enable-quorum must be used to enable the feature. Signed-off-by: Benoit Canet Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/Makefile.objs | 2 +- block/quorum.c | 391 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 391 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/Makefile.objs b/block/Makefile.objs index 716556f6ec..425d7fbb59 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-c block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o -block-obj-y += quorum.o +block-obj-$(CONFIG_QUORUM) += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o block-obj-y += snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o diff --git a/block/quorum.c b/block/quorum.c index fd19662bcb..4beee96ffa 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -13,7 +13,43 @@ * See the COPYING file in the top-level directory. */ +#include +#include #include "block/block_int.h" +#include "qapi/qmp/qjson.h" + +#define HASH_LENGTH 32 + +/* This union holds a vote hash value */ +typedef union QuorumVoteValue { + char h[HASH_LENGTH]; /* SHA-256 hash */ + int64_t l; /* simpler 64 bits hash */ +} QuorumVoteValue; + +/* A vote item */ +typedef struct QuorumVoteItem { + int index; + QLIST_ENTRY(QuorumVoteItem) next; +} QuorumVoteItem; + +/* this structure is a vote version. A version is the set of votes sharing the + * same vote value. + * The set of votes will be tracked with the items field and its cardinality is + * vote_count. + */ +typedef struct QuorumVoteVersion { + QuorumVoteValue value; + int index; + int vote_count; + QLIST_HEAD(, QuorumVoteItem) items; + QLIST_ENTRY(QuorumVoteVersion) next; +} QuorumVoteVersion; + +/* this structure holds a group of vote versions together */ +typedef struct QuorumVotes { + QLIST_HEAD(, QuorumVoteVersion) vote_list; + bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); +} QuorumVotes; /* the following structure holds the state of one quorum instance */ typedef struct BDRVQuorumState { @@ -65,10 +101,14 @@ struct QuorumAIOCB { int count; /* number of completed AIOCB */ int success_count; /* number of successfully completed AIOCB */ + QuorumVotes votes; + bool is_read; int vote_ret; }; +static void quorum_vote(QuorumAIOCB *acb); + static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) { QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); @@ -94,6 +134,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) BDRVQuorumState *s = acb->common.bs->opaque; int i, ret = 0; + if (acb->vote_ret) { + ret = acb->vote_ret; + } + acb->common.cb(acb->common.opaque, ret); if (acb->is_read) { @@ -107,6 +151,16 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) qemu_aio_release(acb); } +static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return !memcmp(a->h, b->h, HASH_LENGTH); +} + +static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return a->l == b->l; +} + static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, BlockDriverState *bs, QEMUIOVector *qiov, @@ -125,6 +179,8 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, acb->qcrs = g_new0(QuorumChildRequest, s->num_children); acb->count = 0; acb->success_count = 0; + acb->votes.compare = quorum_sha256_compare; + QLIST_INIT(&acb->votes.vote_list); acb->is_read = false; acb->vote_ret = 0; @@ -137,6 +193,48 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, return acb; } +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +{ + QObject *data; + assert(node_name); + data = qobject_from_jsonf("{ 'ret': %d" + ", 'node-name': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + ret, node_name, acb->sector_num, acb->nb_sectors); + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); + qobject_decref(data); +} + +static void quorum_report_failure(QuorumAIOCB *acb) +{ + QObject *data; + const char *reference = acb->common.bs->device_name[0] ? + acb->common.bs->device_name : + acb->common.bs->node_name; + data = qobject_from_jsonf("{ 'reference': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + reference, acb->sector_num, acb->nb_sectors); + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); + qobject_decref(data); +} + +static int quorum_vote_error(QuorumAIOCB *acb); + +static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + + if (acb->success_count < s->threshold) { + acb->vote_ret = quorum_vote_error(acb); + quorum_report_failure(acb); + return true; + } + + return false; +} + static void quorum_aio_cb(void *opaque, int ret) { QuorumChildRequest *sacb = opaque; @@ -147,6 +245,8 @@ static void quorum_aio_cb(void *opaque, int ret) acb->count++; if (ret == 0) { acb->success_count++; + } else { + quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); } assert(acb->count <= s->num_children); assert(acb->success_count <= s->num_children); @@ -154,9 +254,298 @@ static void quorum_aio_cb(void *opaque, int ret) return; } + /* Do the vote on read */ + if (acb->is_read) { + quorum_vote(acb); + } else { + quorum_has_too_much_io_failed(acb); + } + quorum_aio_finalize(acb); } +static void quorum_report_bad_versions(BDRVQuorumState *s, + QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + quorum_report_bad(acb, s->bs[item->index]->node_name, 0); + } + } +} + +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) +{ + int i; + assert(dest->niov == source->niov); + assert(dest->size == source->size); + for (i = 0; i < source->niov; i++) { + assert(dest->iov[i].iov_len == source->iov[i].iov_len); + memcpy(dest->iov[i].iov_base, + source->iov[i].iov_base, + source->iov[i].iov_len); + } +} + +static void quorum_count_vote(QuorumVotes *votes, + QuorumVoteValue *value, + int index) +{ + QuorumVoteVersion *v = NULL, *version = NULL; + QuorumVoteItem *item; + + /* look if we have something with this hash */ + QLIST_FOREACH(v, &votes->vote_list, next) { + if (votes->compare(&v->value, value)) { + version = v; + break; + } + } + + /* It's a version not yet in the list add it */ + if (!version) { + version = g_new0(QuorumVoteVersion, 1); + QLIST_INIT(&version->items); + memcpy(&version->value, value, sizeof(version->value)); + version->index = index; + version->vote_count = 0; + QLIST_INSERT_HEAD(&votes->vote_list, version, next); + } + + version->vote_count++; + + item = g_new0(QuorumVoteItem, 1); + item->index = index; + QLIST_INSERT_HEAD(&version->items, item, next); +} + +static void quorum_free_vote_list(QuorumVotes *votes) +{ + QuorumVoteVersion *version, *next_version; + QuorumVoteItem *item, *next_item; + + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { + QLIST_REMOVE(version, next); + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { + QLIST_REMOVE(item, next); + g_free(item); + } + g_free(version); + } +} + +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) +{ + int j, ret; + gnutls_hash_hd_t dig; + QEMUIOVector *qiov = &acb->qcrs[i].qiov; + + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); + + if (ret < 0) { + return ret; + } + + for (j = 0; j < qiov->niov; j++) { + ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len); + if (ret < 0) { + break; + } + } + + gnutls_hash_deinit(dig, (void *) hash); + return ret; +} + +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) +{ + int max = 0; + QuorumVoteVersion *candidate, *winner = NULL; + + QLIST_FOREACH(candidate, &votes->vote_list, next) { + if (candidate->vote_count > max) { + max = candidate->vote_count; + winner = candidate; + } + } + + return winner; +} + +/* qemu_iovec_compare is handy for blkverify mode because it returns the first + * differing byte location. Yet it is handcoded to compare vectors one byte + * after another so it does not benefit from the libc SIMD optimizations. + * quorum_iovec_compare is written for speed and should be used in the non + * blkverify mode of quorum. + */ +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + int result; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + assert(a->iov[i].iov_len == b->iov[i].iov_len); + result = memcmp(a->iov[i].iov_base, + b->iov[i].iov_base, + a->iov[i].iov_len); + if (result) { + return false; + } + } + + return true; +} + +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", + acb->sector_num, acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +static bool quorum_compare(QuorumAIOCB *acb, + QEMUIOVector *a, + QEMUIOVector *b) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + ssize_t offset; + + /* This driver will replace blkverify in this particular case */ + if (s->is_blkverify) { + offset = qemu_iovec_compare(a, b); + if (offset != -1) { + quorum_err(acb, "contents mismatch in sector %" PRId64, + acb->sector_num + + (uint64_t)(offset / BDRV_SECTOR_SIZE)); + } + return true; + } + + return quorum_iovec_compare(a, b); +} + +/* Do a vote to get the error code */ +static int quorum_vote_error(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i, ret = 0; + bool error = false; + + QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + ret = acb->qcrs[i].ret; + if (ret) { + error = true; + result_value.l = ret; + quorum_count_vote(&error_votes, &result_value, i); + } + } + + if (error) { + winner = quorum_get_vote_winner(&error_votes); + ret = winner->value.l; + } + + quorum_free_vote_list(&error_votes); + + return ret; +} + +static void quorum_vote(QuorumAIOCB *acb) +{ + bool quorum = true; + int i, j, ret; + QuorumVoteValue hash; + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner; + + if (quorum_has_too_much_io_failed(acb)) { + return; + } + + /* get the index of the first successful read */ + for (i = 0; i < s->num_children; i++) { + if (!acb->qcrs[i].ret) { + break; + } + } + + assert(i < s->num_children); + + /* compare this read with all other successful reads stopping at quorum + * failure + */ + for (j = i + 1; j < s->num_children; j++) { + if (acb->qcrs[j].ret) { + continue; + } + quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov); + if (!quorum) { + break; + } + } + + /* Every successful read agrees */ + if (quorum) { + quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); + return; + } + + /* compute hashes for each successful read, also store indexes */ + for (i = 0; i < s->num_children; i++) { + if (acb->qcrs[i].ret) { + continue; + } + ret = quorum_compute_hash(acb, i, &hash); + /* if ever the hash computation failed */ + if (ret < 0) { + acb->vote_ret = ret; + goto free_exit; + } + quorum_count_vote(&acb->votes, &hash, i); + } + + /* vote to select the most represented version */ + winner = quorum_get_vote_winner(&acb->votes); + + /* if the winner count is smaller than threshold the read fails */ + if (winner->vote_count < s->threshold) { + quorum_report_failure(acb); + acb->vote_ret = -EIO; + goto free_exit; + } + + /* we have a winner: copy it */ + quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov); + + /* some versions are bad print them */ + quorum_report_bad_versions(s, acb, &winner->value); + +free_exit: + /* free lists */ + quorum_free_vote_list(&acb->votes); +} + static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, @@ -178,7 +567,7 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, } for (i = 0; i < s->num_children; i++) { - bdrv_aio_readv(s->bs[i], sector_num, qiov, nb_sectors, + bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors, quorum_aio_cb, &acb->qcrs[i]); } -- cgit v1.2.1