From 4c37ef022381e777251d7084591978a4dc622efe Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:39 +0100
Subject: host-utils: add ffsl

We can provide fast versions based on the other functions defined
by host-utils.h.  Some care is required on glibc, which provides
ffsl already.

Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/qemu/host-utils.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index 81c9a754ae..2a32be4cc0 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -26,6 +26,7 @@
 #define HOST_UTILS_H 1
 
 #include "qemu/compiler.h"   /* QEMU_GNUC_PREREQ */
+#include <string.h>     /* ffsl */
 
 #if defined(__x86_64__)
 #define __HAVE_FAST_MULU64__
@@ -237,4 +238,29 @@ static inline int ctpop64(uint64_t val)
 #endif
 }
 
+/* glibc does not provide an inline version of ffsl, so always define
+ * ours.  We need to give it a different name, however.
+ */
+#ifdef __GLIBC__
+#define ffsl qemu_ffsl
+#endif
+static inline int ffsl(long val)
+{
+    if (!val) {
+        return 0;
+    }
+
+#if QEMU_GNUC_PREREQ(3, 4)
+    return __builtin_ctzl(val) + 1;
+#else
+    if (sizeof(long) == 4) {
+        return ctz32(val) + 1;
+    } else if (sizeof(long) == 8) {
+        return ctz64(val) + 1;
+    } else {
+        abort();
+    }
+#endif
+}
+
 #endif
-- 
cgit v1.2.1


From e7c033c3fa22a1e42d9ba57fed6ddecfbce3a01c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:40 +0100
Subject: add hierarchical bitmap data type and test cases

HBitmaps provides an array of bits.  The bits are stored as usual in an
array of unsigned longs, but HBitmap is also optimized to provide fast
iteration over set bits; going from one bit to the next is O(logB n)
worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough
that the number of levels is in fact fixed.

In order to do this, it stacks multiple bitmaps with progressively coarser
granularity; in all levels except the last, bit N is set iff the N-th
unsigned long is nonzero in the immediately next level.  When iteration
completes on the last level it can examine the 2nd-last level to quickly
skip entire words, and even do so recursively to skip blocks of 64 words or
powers thereof (32 on 32-bit machines).

Given an index in the bitmap, it can be split in group of bits like
this (for the 64-bit case):

     bits 0-57 => word in the last bitmap     | bits 58-63 => bit in the word
     bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word
     bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word

So it is easy to move up simply by shifting the index right by
log2(BITS_PER_LONG) bits.  To move down, you shift the index left
similarly, and add the word index within the group.  Iteration uses
ffs (find first set bit) to find the next word to examine; this
operation can be done in constant time in most current architectures.

Setting or clearing a range of m bits on all levels, the work to perform
is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap.

When iterating on a bitmap, each bit (on any level) is only visited
once.  Hence, The total cost of visiting a bitmap with m bits in it is
the number of bits that are set in all bitmaps.  Unless the bitmap is
extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized
cost of advancing from one bit to the next is usually constant.

Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/qemu/hbitmap.h | 207 +++++++++++++++++++++++++
 tests/Makefile         |   3 +
 tests/test-hbitmap.c   | 408 +++++++++++++++++++++++++++++++++++++++++++++++++
 trace-events           |   5 +
 util/Makefile.objs     |   2 +-
 util/hbitmap.c         | 400 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 1024 insertions(+), 1 deletion(-)
 create mode 100644 include/qemu/hbitmap.h
 create mode 100644 tests/test-hbitmap.c
 create mode 100644 util/hbitmap.c

diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
new file mode 100644
index 0000000000..7ddfb66808
--- /dev/null
+++ b/include/qemu/hbitmap.h
@@ -0,0 +1,207 @@
+/*
+ * Hierarchical Bitmap Data Type
+ *
+ * Copyright Red Hat, Inc., 2012
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef HBITMAP_H
+#define HBITMAP_H 1
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include "bitops.h"
+
+typedef struct HBitmap HBitmap;
+typedef struct HBitmapIter HBitmapIter;
+
+#define BITS_PER_LEVEL         (BITS_PER_LONG == 32 ? 5 : 6)
+
+/* For 32-bit, the largest that fits in a 4 GiB address space.
+ * For 64-bit, the number of sectors in 1 PiB.  Good luck, in
+ * either case... :)
+ */
+#define HBITMAP_LOG_MAX_SIZE   (BITS_PER_LONG == 32 ? 34 : 41)
+
+/* We need to place a sentinel in level 0 to speed up iteration.  Thus,
+ * we do this instead of HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL.  The
+ * difference is that it allocates an extra level when HBITMAP_LOG_MAX_SIZE
+ * is an exact multiple of BITS_PER_LEVEL.
+ */
+#define HBITMAP_LEVELS         ((HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL) + 1)
+
+struct HBitmapIter {
+    const HBitmap *hb;
+
+    /* Copied from hb for access in the inline functions (hb is opaque).  */
+    int granularity;
+
+    /* Entry offset into the last-level array of longs.  */
+    size_t pos;
+
+    /* The currently-active path in the tree.  Each item of cur[i] stores
+     * the bits (i.e. the subtrees) yet to be processed under that node.
+     */
+    unsigned long cur[HBITMAP_LEVELS];
+};
+
+/**
+ * hbitmap_alloc:
+ * @size: Number of bits in the bitmap.
+ * @granularity: Granularity of the bitmap.  Aligned groups of 2^@granularity
+ * bits will be represented by a single bit.  Each operation on a
+ * range of bits first rounds the bits to determine which group they land
+ * in, and then affect the entire set; iteration will only visit the first
+ * bit of each group.
+ *
+ * Allocate a new HBitmap.
+ */
+HBitmap *hbitmap_alloc(uint64_t size, int granularity);
+
+/**
+ * hbitmap_empty:
+ * @hb: HBitmap to operate on.
+ *
+ * Return whether the bitmap is empty.
+ */
+bool hbitmap_empty(const HBitmap *hb);
+
+/**
+ * hbitmap_granularity:
+ * @hb: HBitmap to operate on.
+ *
+ * Return the granularity of the HBitmap.
+ */
+int hbitmap_granularity(const HBitmap *hb);
+
+/**
+ * hbitmap_count:
+ * @hb: HBitmap to operate on.
+ *
+ * Return the number of bits set in the HBitmap.
+ */
+uint64_t hbitmap_count(const HBitmap *hb);
+
+/**
+ * hbitmap_set:
+ * @hb: HBitmap to operate on.
+ * @start: First bit to set (0-based).
+ * @count: Number of bits to set.
+ *
+ * Set a consecutive range of bits in an HBitmap.
+ */
+void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_reset:
+ * @hb: HBitmap to operate on.
+ * @start: First bit to reset (0-based).
+ * @count: Number of bits to reset.
+ *
+ * Reset a consecutive range of bits in an HBitmap.
+ */
+void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_get:
+ * @hb: HBitmap to operate on.
+ * @item: Bit to query (0-based).
+ *
+ * Return whether the @item-th bit in an HBitmap is set.
+ */
+bool hbitmap_get(const HBitmap *hb, uint64_t item);
+
+/**
+ * hbitmap_free:
+ * @hb: HBitmap to operate on.
+ *
+ * Free an HBitmap and all of its associated memory.
+ */
+void hbitmap_free(HBitmap *hb);
+
+/**
+ * hbitmap_iter_init:
+ * @hbi: HBitmapIter to initialize.
+ * @hb: HBitmap to iterate on.
+ * @first: First bit to visit (0-based).
+ *
+ * Set up @hbi to iterate on the HBitmap @hb.  hbitmap_iter_next will return
+ * the lowest-numbered bit that is set in @hb, starting at @first.
+ *
+ * Concurrent setting of bits is acceptable, and will at worst cause the
+ * iteration to miss some of those bits.  Resetting bits before the current
+ * position of the iterator is also okay.  However, concurrent resetting of
+ * bits can lead to unexpected behavior if the iterator has not yet reached
+ * those bits.
+ */
+void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first);
+
+/* hbitmap_iter_skip_words:
+ * @hbi: HBitmapIter to operate on.
+ *
+ * Internal function used by hbitmap_iter_next and hbitmap_iter_next_word.
+ */
+unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi);
+
+/**
+ * hbitmap_iter_next:
+ * @hbi: HBitmapIter to operate on.
+ *
+ * Return the next bit that is set in @hbi's associated HBitmap,
+ * or -1 if all remaining bits are zero.
+ */
+static inline int64_t hbitmap_iter_next(HBitmapIter *hbi)
+{
+    unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1];
+    int64_t item;
+
+    if (cur == 0) {
+        cur = hbitmap_iter_skip_words(hbi);
+        if (cur == 0) {
+            return -1;
+        }
+    }
+
+    /* The next call will resume work from the next bit.  */
+    hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
+    item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ffsl(cur) - 1;
+
+    return item << hbi->granularity;
+}
+
+/**
+ * hbitmap_iter_next_word:
+ * @hbi: HBitmapIter to operate on.
+ * @p_cur: Location where to store the next non-zero word.
+ *
+ * Return the index of the next nonzero word that is set in @hbi's
+ * associated HBitmap, and set *p_cur to the content of that word
+ * (bits before the index that was passed to hbitmap_iter_init are
+ * trimmed on the first call).  Return -1, and set *p_cur to zero,
+ * if all remaining words are zero.
+ */
+static inline size_t hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_cur)
+{
+    unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1];
+
+    if (cur == 0) {
+        cur = hbitmap_iter_skip_words(hbi);
+        if (cur == 0) {
+            *p_cur = 0;
+            return -1;
+        }
+    }
+
+    /* The next call will resume work from the next word.  */
+    hbi->cur[HBITMAP_LEVELS - 1] = 0;
+    *p_cur = cur;
+    return hbi->pos;
+}
+
+
+#endif
diff --git a/tests/Makefile b/tests/Makefile
index d86e95a400..b3a6d8697b 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -45,6 +45,8 @@ gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
 check-unit-y += tests/test-thread-pool$(EXESUF)
 gcov-files-test-thread-pool-y = thread-pool.c
+gcov-files-test-hbitmap-y = util/hbitmap.c
+check-unit-y += tests/test-hbitmap$(EXESUF)
 
 check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh
 
@@ -86,6 +88,7 @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(block-obj-y) libqemuutil
 tests/test-aio$(EXESUF): tests/test-aio.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-iov$(EXESUF): tests/test-iov.o libqemuutil.a
+tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o libqemuutil.a libqemustub.a
 
 tests/test-qapi-types.c tests/test-qapi-types.h :\
 $(SRC_PATH)/qapi-schema-test.json $(SRC_PATH)/scripts/qapi-types.py
diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
new file mode 100644
index 0000000000..fcc6a00fc3
--- /dev/null
+++ b/tests/test-hbitmap.c
@@ -0,0 +1,408 @@
+/*
+ * Hierarchical bitmap unit-tests.
+ *
+ * Copyright (C) 2012 Red Hat Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include <glib.h>
+#include <stdarg.h>
+#include "qemu/hbitmap.h"
+
+#define LOG_BITS_PER_LONG          (BITS_PER_LONG == 32 ? 5 : 6)
+
+#define L1                         BITS_PER_LONG
+#define L2                         (BITS_PER_LONG * L1)
+#define L3                         (BITS_PER_LONG * L2)
+
+typedef struct TestHBitmapData {
+    HBitmap       *hb;
+    unsigned long *bits;
+    size_t         size;
+    int            granularity;
+} TestHBitmapData;
+
+
+/* Check that the HBitmap and the shadow bitmap contain the same data,
+ * ignoring the same "first" bits.
+ */
+static void hbitmap_test_check(TestHBitmapData *data,
+                               uint64_t first)
+{
+    uint64_t count = 0;
+    size_t pos;
+    int bit;
+    HBitmapIter hbi;
+    int64_t i, next;
+
+    hbitmap_iter_init(&hbi, data->hb, first);
+
+    i = first;
+    for (;;) {
+        next = hbitmap_iter_next(&hbi);
+        if (next < 0) {
+            next = data->size;
+        }
+
+        while (i < next) {
+            pos = i >> LOG_BITS_PER_LONG;
+            bit = i & (BITS_PER_LONG - 1);
+            i++;
+            g_assert_cmpint(data->bits[pos] & (1UL << bit), ==, 0);
+        }
+
+        if (next == data->size) {
+            break;
+        }
+
+        pos = i >> LOG_BITS_PER_LONG;
+        bit = i & (BITS_PER_LONG - 1);
+        i++;
+        count++;
+        g_assert_cmpint(data->bits[pos] & (1UL << bit), !=, 0);
+    }
+
+    if (first == 0) {
+        g_assert_cmpint(count << data->granularity, ==, hbitmap_count(data->hb));
+    }
+}
+
+/* This is provided instead of a test setup function so that the sizes
+   are kept in the test functions (and not in main()) */
+static void hbitmap_test_init(TestHBitmapData *data,
+                              uint64_t size, int granularity)
+{
+    size_t n;
+    data->hb = hbitmap_alloc(size, granularity);
+
+    n = (size + BITS_PER_LONG - 1) / BITS_PER_LONG;
+    if (n == 0) {
+        n = 1;
+    }
+    data->bits = g_new0(unsigned long, n);
+    data->size = size;
+    data->granularity = granularity;
+    hbitmap_test_check(data, 0);
+}
+
+static void hbitmap_test_teardown(TestHBitmapData *data,
+                                  const void *unused)
+{
+    if (data->hb) {
+        hbitmap_free(data->hb);
+        data->hb = NULL;
+    }
+    if (data->bits) {
+        g_free(data->bits);
+        data->bits = NULL;
+    }
+}
+
+/* Set a range in the HBitmap and in the shadow "simple" bitmap.
+ * The two bitmaps are then tested against each other.
+ */
+static void hbitmap_test_set(TestHBitmapData *data,
+                             uint64_t first, uint64_t count)
+{
+    hbitmap_set(data->hb, first, count);
+    while (count-- != 0) {
+        size_t pos = first >> LOG_BITS_PER_LONG;
+        int bit = first & (BITS_PER_LONG - 1);
+        first++;
+
+        data->bits[pos] |= 1UL << bit;
+    }
+
+    if (data->granularity == 0) {
+        hbitmap_test_check(data, 0);
+    }
+}
+
+/* Reset a range in the HBitmap and in the shadow "simple" bitmap.
+ */
+static void hbitmap_test_reset(TestHBitmapData *data,
+                               uint64_t first, uint64_t count)
+{
+    hbitmap_reset(data->hb, first, count);
+    while (count-- != 0) {
+        size_t pos = first >> LOG_BITS_PER_LONG;
+        int bit = first & (BITS_PER_LONG - 1);
+        first++;
+
+        data->bits[pos] &= ~(1UL << bit);
+    }
+
+    if (data->granularity == 0) {
+        hbitmap_test_check(data, 0);
+    }
+}
+
+static void hbitmap_test_check_get(TestHBitmapData *data)
+{
+    uint64_t count = 0;
+    uint64_t i;
+
+    for (i = 0; i < data->size; i++) {
+        size_t pos = i >> LOG_BITS_PER_LONG;
+        int bit = i & (BITS_PER_LONG - 1);
+        unsigned long val = data->bits[pos] & (1UL << bit);
+        count += hbitmap_get(data->hb, i);
+        g_assert_cmpint(hbitmap_get(data->hb, i), ==, val != 0);
+    }
+    g_assert_cmpint(count, ==, hbitmap_count(data->hb));
+}
+
+static void test_hbitmap_zero(TestHBitmapData *data,
+                               const void *unused)
+{
+    hbitmap_test_init(data, 0, 0);
+}
+
+static void test_hbitmap_unaligned(TestHBitmapData *data,
+                                   const void *unused)
+{
+    hbitmap_test_init(data, L3 + 23, 0);
+    hbitmap_test_set(data, 0, 1);
+    hbitmap_test_set(data, L3 + 22, 1);
+}
+
+static void test_hbitmap_iter_empty(TestHBitmapData *data,
+                                    const void *unused)
+{
+    hbitmap_test_init(data, L1, 0);
+}
+
+static void test_hbitmap_iter_partial(TestHBitmapData *data,
+                                      const void *unused)
+{
+    hbitmap_test_init(data, L3, 0);
+    hbitmap_test_set(data, 0, L3);
+    hbitmap_test_check(data, 1);
+    hbitmap_test_check(data, L1 - 1);
+    hbitmap_test_check(data, L1);
+    hbitmap_test_check(data, L1 * 2 - 1);
+    hbitmap_test_check(data, L2 - 1);
+    hbitmap_test_check(data, L2);
+    hbitmap_test_check(data, L2 + 1);
+    hbitmap_test_check(data, L2 + L1);
+    hbitmap_test_check(data, L2 + L1 * 2 - 1);
+    hbitmap_test_check(data, L2 * 2 - 1);
+    hbitmap_test_check(data, L2 * 2);
+    hbitmap_test_check(data, L2 * 2 + 1);
+    hbitmap_test_check(data, L2 * 2 + L1);
+    hbitmap_test_check(data, L2 * 2 + L1 * 2 - 1);
+    hbitmap_test_check(data, L3 / 2);
+}
+
+static void test_hbitmap_iter_past(TestHBitmapData *data,
+                                    const void *unused)
+{
+    hbitmap_test_init(data, L3, 0);
+    hbitmap_test_set(data, 0, L3);
+    hbitmap_test_check(data, L3);
+}
+
+static void test_hbitmap_set_all(TestHBitmapData *data,
+                                 const void *unused)
+{
+    hbitmap_test_init(data, L3, 0);
+    hbitmap_test_set(data, 0, L3);
+}
+
+static void test_hbitmap_get_all(TestHBitmapData *data,
+                                 const void *unused)
+{
+    hbitmap_test_init(data, L3, 0);
+    hbitmap_test_set(data, 0, L3);
+    hbitmap_test_check_get(data);
+}
+
+static void test_hbitmap_get_some(TestHBitmapData *data,
+                                  const void *unused)
+{
+    hbitmap_test_init(data, 2 * L2, 0);
+    hbitmap_test_set(data, 10, 1);
+    hbitmap_test_check_get(data);
+    hbitmap_test_set(data, L1 - 1, 1);
+    hbitmap_test_check_get(data);
+    hbitmap_test_set(data, L1, 1);
+    hbitmap_test_check_get(data);
+    hbitmap_test_set(data, L2 - 1, 1);
+    hbitmap_test_check_get(data);
+    hbitmap_test_set(data, L2, 1);
+    hbitmap_test_check_get(data);
+}
+
+static void test_hbitmap_set_one(TestHBitmapData *data,
+                                 const void *unused)
+{
+    hbitmap_test_init(data, 2 * L2, 0);
+    hbitmap_test_set(data, 10, 1);
+    hbitmap_test_set(data, L1 - 1, 1);
+    hbitmap_test_set(data, L1, 1);
+    hbitmap_test_set(data, L2 - 1, 1);
+    hbitmap_test_set(data, L2, 1);
+}
+
+static void test_hbitmap_set_two_elem(TestHBitmapData *data,
+                                      const void *unused)
+{
+    hbitmap_test_init(data, 2 * L2, 0);
+    hbitmap_test_set(data, L1 - 1, 2);
+    hbitmap_test_set(data, L1 * 2 - 1, 4);
+    hbitmap_test_set(data, L1 * 4, L1 + 1);
+    hbitmap_test_set(data, L1 * 8 - 1, L1 + 1);
+    hbitmap_test_set(data, L2 - 1, 2);
+    hbitmap_test_set(data, L2 + L1 - 1, 8);
+    hbitmap_test_set(data, L2 + L1 * 4, L1 + 1);
+    hbitmap_test_set(data, L2 + L1 * 8 - 1, L1 + 1);
+}
+
+static void test_hbitmap_set(TestHBitmapData *data,
+                             const void *unused)
+{
+    hbitmap_test_init(data, L3 * 2, 0);
+    hbitmap_test_set(data, L1 - 1, L1 + 2);
+    hbitmap_test_set(data, L1 * 3 - 1, L1 + 2);
+    hbitmap_test_set(data, L1 * 5, L1 * 2 + 1);
+    hbitmap_test_set(data, L1 * 8 - 1, L1 * 2 + 1);
+    hbitmap_test_set(data, L2 - 1, L1 + 2);
+    hbitmap_test_set(data, L2 + L1 * 2 - 1, L1 + 2);
+    hbitmap_test_set(data, L2 + L1 * 4, L1 * 2 + 1);
+    hbitmap_test_set(data, L2 + L1 * 7 - 1, L1 * 2 + 1);
+    hbitmap_test_set(data, L2 * 2 - 1, L3 * 2 - L2 * 2);
+}
+
+static void test_hbitmap_set_twice(TestHBitmapData *data,
+                                   const void *unused)
+{
+    hbitmap_test_init(data, L1 * 3, 0);
+    hbitmap_test_set(data, 0, L1 * 3);
+    hbitmap_test_set(data, L1, 1);
+}
+
+static void test_hbitmap_set_overlap(TestHBitmapData *data,
+                                     const void *unused)
+{
+    hbitmap_test_init(data, L3 * 2, 0);
+    hbitmap_test_set(data, L1 - 1, L1 + 2);
+    hbitmap_test_set(data, L1 * 2 - 1, L1 * 2 + 2);
+    hbitmap_test_set(data, 0, L1 * 3);
+    hbitmap_test_set(data, L1 * 8 - 1, L2);
+    hbitmap_test_set(data, L2, L1);
+    hbitmap_test_set(data, L2 - L1 - 1, L1 * 8 + 2);
+    hbitmap_test_set(data, L2, L3 - L2 + 1);
+    hbitmap_test_set(data, L3 - L1, L1 * 3);
+    hbitmap_test_set(data, L3 - 1, 3);
+    hbitmap_test_set(data, L3 - 1, L2);
+}
+
+static void test_hbitmap_reset_empty(TestHBitmapData *data,
+                                     const void *unused)
+{
+    hbitmap_test_init(data, L3, 0);
+    hbitmap_test_reset(data, 0, L3);
+}
+
+static void test_hbitmap_reset(TestHBitmapData *data,
+                               const void *unused)
+{
+    hbitmap_test_init(data, L3 * 2, 0);
+    hbitmap_test_set(data, L1 - 1, L1 + 2);
+    hbitmap_test_reset(data, L1 * 2 - 1, L1 * 2 + 2);
+    hbitmap_test_set(data, 0, L1 * 3);
+    hbitmap_test_reset(data, L1 * 8 - 1, L2);
+    hbitmap_test_set(data, L2, L1);
+    hbitmap_test_reset(data, L2 - L1 - 1, L1 * 8 + 2);
+    hbitmap_test_set(data, L2, L3 - L2 + 1);
+    hbitmap_test_reset(data, L3 - L1, L1 * 3);
+    hbitmap_test_set(data, L3 - 1, 3);
+    hbitmap_test_reset(data, L3 - 1, L2);
+    hbitmap_test_set(data, 0, L3 * 2);
+    hbitmap_test_reset(data, 0, L1);
+    hbitmap_test_reset(data, 0, L2);
+    hbitmap_test_reset(data, L3, L3);
+    hbitmap_test_set(data, L3 / 2, L3);
+}
+
+static void test_hbitmap_granularity(TestHBitmapData *data,
+                                     const void *unused)
+{
+    /* Note that hbitmap_test_check has to be invoked manually in this test.  */
+    hbitmap_test_init(data, L1, 1);
+    hbitmap_test_set(data, 0, 1);
+    g_assert_cmpint(hbitmap_count(data->hb), ==, 2);
+    hbitmap_test_check(data, 0);
+    hbitmap_test_set(data, 2, 1);
+    g_assert_cmpint(hbitmap_count(data->hb), ==, 4);
+    hbitmap_test_check(data, 0);
+    hbitmap_test_set(data, 0, 3);
+    g_assert_cmpint(hbitmap_count(data->hb), ==, 4);
+    hbitmap_test_reset(data, 0, 1);
+    g_assert_cmpint(hbitmap_count(data->hb), ==, 2);
+}
+
+static void test_hbitmap_iter_granularity(TestHBitmapData *data,
+                                          const void *unused)
+{
+    HBitmapIter hbi;
+
+    /* Note that hbitmap_test_check has to be invoked manually in this test.  */
+    hbitmap_test_init(data, 131072 << 7, 7);
+    hbitmap_iter_init(&hbi, data->hb, 0);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+
+    hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8);
+    hbitmap_iter_init(&hbi, data->hb, 0);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+
+    hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+
+    hbitmap_test_set(data, (131072 << 7) - 8, 8);
+    hbitmap_iter_init(&hbi, data->hb, 0);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+
+    hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
+    g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
+}
+
+static void hbitmap_test_add(const char *testpath,
+                                   void (*test_func)(TestHBitmapData *data, const void *user_data))
+{
+    g_test_add(testpath, TestHBitmapData, NULL, NULL, test_func,
+               hbitmap_test_teardown);
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    hbitmap_test_add("/hbitmap/size/0", test_hbitmap_zero);
+    hbitmap_test_add("/hbitmap/size/unaligned", test_hbitmap_unaligned);
+    hbitmap_test_add("/hbitmap/iter/empty", test_hbitmap_iter_empty);
+    hbitmap_test_add("/hbitmap/iter/past", test_hbitmap_iter_past);
+    hbitmap_test_add("/hbitmap/iter/partial", test_hbitmap_iter_partial);
+    hbitmap_test_add("/hbitmap/iter/granularity", test_hbitmap_iter_granularity);
+    hbitmap_test_add("/hbitmap/get/all", test_hbitmap_get_all);
+    hbitmap_test_add("/hbitmap/get/some", test_hbitmap_get_some);
+    hbitmap_test_add("/hbitmap/set/all", test_hbitmap_set_all);
+    hbitmap_test_add("/hbitmap/set/one", test_hbitmap_set_one);
+    hbitmap_test_add("/hbitmap/set/two-elem", test_hbitmap_set_two_elem);
+    hbitmap_test_add("/hbitmap/set/general", test_hbitmap_set);
+    hbitmap_test_add("/hbitmap/set/twice", test_hbitmap_set_twice);
+    hbitmap_test_add("/hbitmap/set/overlap", test_hbitmap_set_overlap);
+    hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty);
+    hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset);
+    hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity);
+    g_test_run();
+
+    return 0;
+}
diff --git a/trace-events b/trace-events
index 09091e6d17..732cb12a00 100644
--- a/trace-events
+++ b/trace-events
@@ -1060,3 +1060,8 @@ xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]"
 xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x"
 xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
 xics_ics_eoi(int nr) "ics_eoi: irq %#x"
+
+# hbitmap.c
+hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
+hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
+hbitmap_set(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 5baeb53af6..495a178557 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -2,7 +2,7 @@ util-obj-y = osdep.o cutils.o qemu-timer-common.o
 util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o event_notifier-win32.o
 util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o event_notifier-posix.o
 util-obj-y += envlist.o path.o host-utils.o cache-utils.o module.o
-util-obj-y += bitmap.o bitops.o
+util-obj-y += bitmap.o bitops.o hbitmap.o
 util-obj-y += acl.o
 util-obj-y += error.o qemu-error.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
diff --git a/util/hbitmap.c b/util/hbitmap.c
new file mode 100644
index 0000000000..fb7e01e8c5
--- /dev/null
+++ b/util/hbitmap.c
@@ -0,0 +1,400 @@
+/*
+ * Hierarchical Bitmap Data Type
+ *
+ * Copyright Red Hat, Inc., 2012
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include <string.h>
+#include <glib.h>
+#include <assert.h>
+#include "qemu/osdep.h"
+#include "qemu/hbitmap.h"
+#include "qemu/host-utils.h"
+#include "trace.h"
+
+/* HBitmaps provides an array of bits.  The bits are stored as usual in an
+ * array of unsigned longs, but HBitmap is also optimized to provide fast
+ * iteration over set bits; going from one bit to the next is O(logB n)
+ * worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough
+ * that the number of levels is in fact fixed.
+ *
+ * In order to do this, it stacks multiple bitmaps with progressively coarser
+ * granularity; in all levels except the last, bit N is set iff the N-th
+ * unsigned long is nonzero in the immediately next level.  When iteration
+ * completes on the last level it can examine the 2nd-last level to quickly
+ * skip entire words, and even do so recursively to skip blocks of 64 words or
+ * powers thereof (32 on 32-bit machines).
+ *
+ * Given an index in the bitmap, it can be split in group of bits like
+ * this (for the 64-bit case):
+ *
+ *   bits 0-57 => word in the last bitmap     | bits 58-63 => bit in the word
+ *   bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word
+ *   bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word
+ *
+ * So it is easy to move up simply by shifting the index right by
+ * log2(BITS_PER_LONG) bits.  To move down, you shift the index left
+ * similarly, and add the word index within the group.  Iteration uses
+ * ffs (find first set bit) to find the next word to examine; this
+ * operation can be done in constant time in most current architectures.
+ *
+ * Setting or clearing a range of m bits on all levels, the work to perform
+ * is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap.
+ *
+ * When iterating on a bitmap, each bit (on any level) is only visited
+ * once.  Hence, The total cost of visiting a bitmap with m bits in it is
+ * the number of bits that are set in all bitmaps.  Unless the bitmap is
+ * extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized
+ * cost of advancing from one bit to the next is usually constant (worst case
+ * O(logB n) as in the non-amortized complexity).
+ */
+
+struct HBitmap {
+    /* Number of total bits in the bottom level.  */
+    uint64_t size;
+
+    /* Number of set bits in the bottom level.  */
+    uint64_t count;
+
+    /* A scaling factor.  Given a granularity of G, each bit in the bitmap will
+     * will actually represent a group of 2^G elements.  Each operation on a
+     * range of bits first rounds the bits to determine which group they land
+     * in, and then affect the entire page; iteration will only visit the first
+     * bit of each group.  Here is an example of operations in a size-16,
+     * granularity-1 HBitmap:
+     *
+     *    initial state            00000000
+     *    set(start=0, count=9)    11111000 (iter: 0, 2, 4, 6, 8)
+     *    reset(start=1, count=3)  00111000 (iter: 4, 6, 8)
+     *    set(start=9, count=2)    00111100 (iter: 4, 6, 8, 10)
+     *    reset(start=5, count=5)  00000000
+     *
+     * From an implementation point of view, when setting or resetting bits,
+     * the bitmap will scale bit numbers right by this amount of bits.  When
+     * iterating, the bitmap will scale bit numbers left by this amount of
+     * bits.
+     */
+    int granularity;
+
+    /* A number of progressively less coarse bitmaps (i.e. level 0 is the
+     * coarsest).  Each bit in level N represents a word in level N+1 that
+     * has a set bit, except the last level where each bit represents the
+     * actual bitmap.
+     *
+     * Note that all bitmaps have the same number of levels.  Even a 1-bit
+     * bitmap will still allocate HBITMAP_LEVELS arrays.
+     */
+    unsigned long *levels[HBITMAP_LEVELS];
+};
+
+static inline int popcountl(unsigned long l)
+{
+    return BITS_PER_LONG == 32 ? ctpop32(l) : ctpop64(l);
+}
+
+/* Advance hbi to the next nonzero word and return it.  hbi->pos
+ * is updated.  Returns zero if we reach the end of the bitmap.
+ */
+unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi)
+{
+    size_t pos = hbi->pos;
+    const HBitmap *hb = hbi->hb;
+    unsigned i = HBITMAP_LEVELS - 1;
+
+    unsigned long cur;
+    do {
+        cur = hbi->cur[--i];
+        pos >>= BITS_PER_LEVEL;
+    } while (cur == 0);
+
+    /* Check for end of iteration.  We always use fewer than BITS_PER_LONG
+     * bits in the level 0 bitmap; thus we can repurpose the most significant
+     * bit as a sentinel.  The sentinel is set in hbitmap_alloc and ensures
+     * that the above loop ends even without an explicit check on i.
+     */
+
+    if (i == 0 && cur == (1UL << (BITS_PER_LONG - 1))) {
+        return 0;
+    }
+    for (; i < HBITMAP_LEVELS - 1; i++) {
+        /* Shift back pos to the left, matching the right shifts above.
+         * The index of this word's least significant set bit provides
+         * the low-order bits.
+         */
+        pos = (pos << BITS_PER_LEVEL) + ffsl(cur) - 1;
+        hbi->cur[i] = cur & (cur - 1);
+
+        /* Set up next level for iteration.  */
+        cur = hb->levels[i + 1][pos];
+    }
+
+    hbi->pos = pos;
+    trace_hbitmap_iter_skip_words(hbi->hb, hbi, pos, cur);
+
+    assert(cur);
+    return cur;
+}
+
+void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first)
+{
+    unsigned i, bit;
+    uint64_t pos;
+
+    hbi->hb = hb;
+    pos = first >> hb->granularity;
+    hbi->pos = pos >> BITS_PER_LEVEL;
+    hbi->granularity = hb->granularity;
+
+    for (i = HBITMAP_LEVELS; i-- > 0; ) {
+        bit = pos & (BITS_PER_LONG - 1);
+        pos >>= BITS_PER_LEVEL;
+
+        /* Drop bits representing items before first.  */
+        hbi->cur[i] = hb->levels[i][pos] & ~((1UL << bit) - 1);
+
+        /* We have already added level i+1, so the lowest set bit has
+         * been processed.  Clear it.
+         */
+        if (i != HBITMAP_LEVELS - 1) {
+            hbi->cur[i] &= ~(1UL << bit);
+        }
+    }
+}
+
+bool hbitmap_empty(const HBitmap *hb)
+{
+    return hb->count == 0;
+}
+
+int hbitmap_granularity(const HBitmap *hb)
+{
+    return hb->granularity;
+}
+
+uint64_t hbitmap_count(const HBitmap *hb)
+{
+    return hb->count << hb->granularity;
+}
+
+/* Count the number of set bits between start and end, not accounting for
+ * the granularity.  Also an example of how to use hbitmap_iter_next_word.
+ */
+static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last)
+{
+    HBitmapIter hbi;
+    uint64_t count = 0;
+    uint64_t end = last + 1;
+    unsigned long cur;
+    size_t pos;
+
+    hbitmap_iter_init(&hbi, hb, start << hb->granularity);
+    for (;;) {
+        pos = hbitmap_iter_next_word(&hbi, &cur);
+        if (pos >= (end >> BITS_PER_LEVEL)) {
+            break;
+        }
+        count += popcountl(cur);
+    }
+
+    if (pos == (end >> BITS_PER_LEVEL)) {
+        /* Drop bits representing the END-th and subsequent items.  */
+        int bit = end & (BITS_PER_LONG - 1);
+        cur &= (1UL << bit) - 1;
+        count += popcountl(cur);
+    }
+
+    return count;
+}
+
+/* Setting starts at the last layer and propagates up if an element
+ * changes from zero to non-zero.
+ */
+static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last)
+{
+    unsigned long mask;
+    bool changed;
+
+    assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
+    assert(start <= last);
+
+    mask = 2UL << (last & (BITS_PER_LONG - 1));
+    mask -= 1UL << (start & (BITS_PER_LONG - 1));
+    changed = (*elem == 0);
+    *elem |= mask;
+    return changed;
+}
+
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
+static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last)
+{
+    size_t pos = start >> BITS_PER_LEVEL;
+    size_t lastpos = last >> BITS_PER_LEVEL;
+    bool changed = false;
+    size_t i;
+
+    i = pos;
+    if (i < lastpos) {
+        uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
+        changed |= hb_set_elem(&hb->levels[level][i], start, next - 1);
+        for (;;) {
+            start = next;
+            next += BITS_PER_LONG;
+            if (++i == lastpos) {
+                break;
+            }
+            changed |= (hb->levels[level][i] == 0);
+            hb->levels[level][i] = ~0UL;
+        }
+    }
+    changed |= hb_set_elem(&hb->levels[level][i], start, last);
+
+    /* If there was any change in this layer, we may have to update
+     * the one above.
+     */
+    if (level > 0 && changed) {
+        hb_set_between(hb, level - 1, pos, lastpos);
+    }
+}
+
+void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
+{
+    /* Compute range in the last layer.  */
+    uint64_t last = start + count - 1;
+
+    trace_hbitmap_set(hb, start, count,
+                      start >> hb->granularity, last >> hb->granularity);
+
+    start >>= hb->granularity;
+    last >>= hb->granularity;
+    count = last - start + 1;
+
+    hb->count += count - hb_count_between(hb, start, last);
+    hb_set_between(hb, HBITMAP_LEVELS - 1, start, last);
+}
+
+/* Resetting works the other way round: propagate up if the new
+ * value is zero.
+ */
+static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last)
+{
+    unsigned long mask;
+    bool blanked;
+
+    assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
+    assert(start <= last);
+
+    mask = 2UL << (last & (BITS_PER_LONG - 1));
+    mask -= 1UL << (start & (BITS_PER_LONG - 1));
+    blanked = *elem != 0 && ((*elem & ~mask) == 0);
+    *elem &= ~mask;
+    return blanked;
+}
+
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
+static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t last)
+{
+    size_t pos = start >> BITS_PER_LEVEL;
+    size_t lastpos = last >> BITS_PER_LEVEL;
+    bool changed = false;
+    size_t i;
+
+    i = pos;
+    if (i < lastpos) {
+        uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
+
+        /* Here we need a more complex test than when setting bits.  Even if
+         * something was changed, we must not blank bits in the upper level
+         * unless the lower-level word became entirely zero.  So, remove pos
+         * from the upper-level range if bits remain set.
+         */
+        if (hb_reset_elem(&hb->levels[level][i], start, next - 1)) {
+            changed = true;
+        } else {
+            pos++;
+        }
+
+        for (;;) {
+            start = next;
+            next += BITS_PER_LONG;
+            if (++i == lastpos) {
+                break;
+            }
+            changed |= (hb->levels[level][i] != 0);
+            hb->levels[level][i] = 0UL;
+        }
+    }
+
+    /* Same as above, this time for lastpos.  */
+    if (hb_reset_elem(&hb->levels[level][i], start, last)) {
+        changed = true;
+    } else {
+        lastpos--;
+    }
+
+    if (level > 0 && changed) {
+        hb_reset_between(hb, level - 1, pos, lastpos);
+    }
+}
+
+void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
+{
+    /* Compute range in the last layer.  */
+    uint64_t last = start + count - 1;
+
+    trace_hbitmap_reset(hb, start, count,
+                        start >> hb->granularity, last >> hb->granularity);
+
+    start >>= hb->granularity;
+    last >>= hb->granularity;
+
+    hb->count -= hb_count_between(hb, start, last);
+    hb_reset_between(hb, HBITMAP_LEVELS - 1, start, last);
+}
+
+bool hbitmap_get(const HBitmap *hb, uint64_t item)
+{
+    /* Compute position and bit in the last layer.  */
+    uint64_t pos = item >> hb->granularity;
+    unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1));
+
+    return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
+}
+
+void hbitmap_free(HBitmap *hb)
+{
+    unsigned i;
+    for (i = HBITMAP_LEVELS; i-- > 0; ) {
+        g_free(hb->levels[i]);
+    }
+    g_free(hb);
+}
+
+HBitmap *hbitmap_alloc(uint64_t size, int granularity)
+{
+    HBitmap *hb = g_malloc0(sizeof (struct HBitmap));
+    unsigned i;
+
+    assert(granularity >= 0 && granularity < 64);
+    size = (size + (1ULL << granularity) - 1) >> granularity;
+    assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE));
+
+    hb->size = size;
+    hb->granularity = granularity;
+    for (i = HBITMAP_LEVELS; i-- > 0; ) {
+        size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+        hb->levels[i] = g_malloc0(size * sizeof(unsigned long));
+    }
+
+    /* We necessarily have free bits in level 0 due to the definition
+     * of HBITMAP_LEVELS, so use one for a sentinel.  This speeds up
+     * hbitmap_iter_skip_words.
+     */
+    assert(size == 1);
+    hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
+    return hb;
+}
-- 
cgit v1.2.1


From 8f0720ecbc3677e13fc7531588fc3831cc972ee4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:41 +0100
Subject: block: implement dirty bitmap using HBitmap

This actually uses the dirty bitmap in the block layer, and converts
mirroring to use an HBitmapIter.

Reviewed-by: Laszlo Ersek <lersek@redhat.com> (except block/mirror.c parts)
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                   | 96 ++++++++---------------------------------------
 block/mirror.c            | 12 +++++-
 include/block/block.h     |  6 ++-
 include/block/block_int.h |  4 +-
 trace-events              |  1 +
 5 files changed, 33 insertions(+), 86 deletions(-)

diff --git a/block.c b/block.c
index 6fa7c90144..e0ff736403 100644
--- a/block.c
+++ b/block.c
@@ -1286,7 +1286,6 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     bs_dest->iostatus           = bs_src->iostatus;
 
     /* dirty bitmap */
-    bs_dest->dirty_count        = bs_src->dirty_count;
     bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;
 
     /* job */
@@ -2035,36 +2034,6 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
-#define BITS_PER_LONG  (sizeof(unsigned long) * 8)
-
-static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
-                             int nb_sectors, int dirty)
-{
-    int64_t start, end;
-    unsigned long val, idx, bit;
-
-    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
-    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
-
-    for (; start <= end; start++) {
-        idx = start / BITS_PER_LONG;
-        bit = start % BITS_PER_LONG;
-        val = bs->dirty_bitmap[idx];
-        if (dirty) {
-            if (!(val & (1UL << bit))) {
-                bs->dirty_count++;
-                val |= 1UL << bit;
-            }
-        } else {
-            if (val & (1UL << bit)) {
-                bs->dirty_count--;
-                val &= ~(1UL << bit);
-            }
-        }
-        bs->dirty_bitmap[idx] = val;
-    }
-}
-
 /* Return < 0 if error. Important errors are:
   -EIO         generic I/O error (may happen for all errors)
   -ENOMEDIUM   No media inserted.
@@ -4173,7 +4142,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
     }
 
     if (bs->dirty_bitmap) {
-        set_dirty_bitmap(bs, sector_num, nb_sectors, 0);
+        bdrv_reset_dirty(bs, sector_num, nb_sectors);
     }
 
     if (bs->drv->bdrv_co_discard) {
@@ -4335,18 +4304,15 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
 {
     int64_t bitmap_size;
 
-    bs->dirty_count = 0;
     if (enable) {
         if (!bs->dirty_bitmap) {
-            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
-                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
-            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
-
-            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
+            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
+            bs->dirty_bitmap = hbitmap_alloc(bitmap_size,
+                                             BDRV_LOG_SECTORS_PER_DIRTY_CHUNK);
         }
     } else {
         if (bs->dirty_bitmap) {
-            g_free(bs->dirty_bitmap);
+            hbitmap_free(bs->dirty_bitmap);
             bs->dirty_bitmap = NULL;
         }
     }
@@ -4354,67 +4320,37 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
 
 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
 {
-    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
-
-    if (bs->dirty_bitmap &&
-        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
-        return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] &
-            (1UL << (chunk % BITS_PER_LONG)));
+    if (bs->dirty_bitmap) {
+        return hbitmap_get(bs->dirty_bitmap, sector);
     } else {
         return 0;
     }
 }
 
-int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector)
+void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
 {
-    int64_t chunk;
-    int bit, elem;
-
-    /* Avoid an infinite loop.  */
-    assert(bs->dirty_count > 0);
-
-    sector = (sector | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
-    chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
-
-    QEMU_BUILD_BUG_ON(sizeof(bs->dirty_bitmap[0]) * 8 != BITS_PER_LONG);
-    elem = chunk / BITS_PER_LONG;
-    bit = chunk % BITS_PER_LONG;
-    for (;;) {
-        if (sector >= bs->total_sectors) {
-            sector = 0;
-            bit = elem = 0;
-        }
-        if (bit == 0 && bs->dirty_bitmap[elem] == 0) {
-            sector += BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
-            elem++;
-        } else {
-            if (bs->dirty_bitmap[elem] & (1UL << bit)) {
-                return sector;
-            }
-            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
-            if (++bit == BITS_PER_LONG) {
-                bit = 0;
-                elem++;
-            }
-        }
-    }
+    hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
 }
 
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                     int nr_sectors)
 {
-    set_dirty_bitmap(bs, cur_sector, nr_sectors, 1);
+    hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
 }
 
 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                       int nr_sectors)
 {
-    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
+    hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
 }
 
 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
 {
-    return bs->dirty_count;
+    if (bs->dirty_bitmap) {
+        return hbitmap_count(bs->dirty_bitmap) >> BDRV_LOG_SECTORS_PER_DIRTY_CHUNK;
+    } else {
+        return 0;
+    }
 }
 
 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
diff --git a/block/mirror.c b/block/mirror.c
index 6180aa30e5..20cb1e777f 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -36,6 +36,7 @@ typedef struct MirrorBlockJob {
     bool synced;
     bool should_complete;
     int64_t sector_num;
+    HBitmapIter hbi;
     uint8_t *buf;
 } MirrorBlockJob;
 
@@ -62,8 +63,15 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
     int64_t end;
     struct iovec iov;
 
+    s->sector_num = hbitmap_iter_next(&s->hbi);
+    if (s->sector_num < 0) {
+        bdrv_dirty_iter_init(source, &s->hbi);
+        s->sector_num = hbitmap_iter_next(&s->hbi);
+        trace_mirror_restart_iter(s, bdrv_get_dirty_count(source));
+        assert(s->sector_num >= 0);
+    }
+
     end = s->common.len >> BDRV_SECTOR_BITS;
-    s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
     nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
     bdrv_reset_dirty(source, s->sector_num, nb_sectors);
 
@@ -136,7 +144,7 @@ static void coroutine_fn mirror_run(void *opaque)
         }
     }
 
-    s->sector_num = -1;
+    bdrv_dirty_iter_init(bs, &s->hbi);
     for (;;) {
         uint64_t delay_ns;
         int64_t cnt;
diff --git a/include/block/block.h b/include/block/block.h
index ffd193637d..678fc60dc1 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -351,13 +351,15 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
 
-#define BDRV_SECTORS_PER_DIRTY_CHUNK 2048
+#define BDRV_SECTORS_PER_DIRTY_CHUNK     (1 << BDRV_LOG_SECTORS_PER_DIRTY_CHUNK)
+#define BDRV_LOG_SECTORS_PER_DIRTY_CHUNK 11
 
+struct HBitmapIter;
 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable);
 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
-int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector);
+void bdrv_dirty_iter_init(BlockDriverState *bs, struct HBitmapIter *hbi);
 int64_t bdrv_get_dirty_count(BlockDriverState *bs);
 
 void bdrv_enable_copy_on_read(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f83ffb8a08..b81c061cd9 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -32,6 +32,7 @@
 #include "qapi-types.h"
 #include "qapi/qmp/qerror.h"
 #include "monitor/monitor.h"
+#include "qemu/hbitmap.h"
 
 #define BLOCK_FLAG_ENCRYPT          1
 #define BLOCK_FLAG_COMPAT6          4
@@ -275,8 +276,7 @@ struct BlockDriverState {
     bool iostatus_enabled;
     BlockDeviceIoStatus iostatus;
     char device_name[32];
-    unsigned long *dirty_bitmap;
-    int64_t dirty_count;
+    HBitmap *dirty_bitmap;
     int in_use; /* users other than guest access, eg. block migration */
     QTAILQ_ENTRY(BlockDriverState) list;
 
diff --git a/trace-events b/trace-events
index 732cb12a00..61ed349765 100644
--- a/trace-events
+++ b/trace-events
@@ -79,6 +79,7 @@ commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "
 
 # block/mirror.c
 mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
+mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64
 mirror_before_flush(void *s) "s %p"
 mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
 mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64" synced %d"
-- 
cgit v1.2.1


From 343bded4ecfc467012e2ab675da75749f1d90f70 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:42 +0100
Subject: block: make round_to_clusters public

This is needed in the following patch.

Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c               | 16 ++++++++--------
 include/block/block.h |  4 ++++
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/block.c b/block.c
index e0ff736403..4a4ab162f7 100644
--- a/block.c
+++ b/block.c
@@ -1673,10 +1673,10 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
 /**
  * Round a region to cluster boundaries
  */
-static void round_to_clusters(BlockDriverState *bs,
-                              int64_t sector_num, int nb_sectors,
-                              int64_t *cluster_sector_num,
-                              int *cluster_nb_sectors)
+void bdrv_round_to_clusters(BlockDriverState *bs,
+                            int64_t sector_num, int nb_sectors,
+                            int64_t *cluster_sector_num,
+                            int *cluster_nb_sectors)
 {
     BlockDriverInfo bdi;
 
@@ -1718,8 +1718,8 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
      * CoR read and write operations are atomic and guest writes cannot
      * interleave between them.
      */
-    round_to_clusters(bs, sector_num, nb_sectors,
-                      &cluster_sector_num, &cluster_nb_sectors);
+    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+                           &cluster_sector_num, &cluster_nb_sectors);
 
     do {
         retry = false;
@@ -2185,8 +2185,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
     /* Cover entire cluster so no additional backing file I/O is required when
      * allocating cluster in the image file.
      */
-    round_to_clusters(bs, sector_num, nb_sectors,
-                      &cluster_sector_num, &cluster_nb_sectors);
+    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
+                           &cluster_sector_num, &cluster_nb_sectors);
 
     trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                    cluster_sector_num, cluster_nb_sectors);
diff --git a/include/block/block.h b/include/block/block.h
index 678fc60dc1..9ee9068aec 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -309,6 +309,10 @@ int bdrv_get_flags(BlockDriverState *bs);
 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                           const uint8_t *buf, int nb_sectors);
 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+void bdrv_round_to_clusters(BlockDriverState *bs,
+                            int64_t sector_num, int nb_sectors,
+                            int64_t *cluster_sector_num,
+                            int *cluster_nb_sectors);
 
 const char *bdrv_get_encrypted_filename(BlockDriverState *bs);
 void bdrv_get_backing_filename(BlockDriverState *bs,
-- 
cgit v1.2.1


From b812f6719c21921a819709098dc018ed151c999b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:43 +0100
Subject: mirror: perform COW if the cluster size is bigger than the
 granularity

When mirroring runs, the backing files for the target may not yet be
ready.  However, this means that a copy-on-write operation on the target
would fill the missing sectors with zeros.  Copy-on-write only happens
if the granularity of the dirty bitmap is smaller than the cluster size
(and only for clusters that are allocated in the source after the job
has started copying).  So far, the granularity was fixed to 1MB; to avoid
the problem we detected the situation and required the backing files to
be available in that case only.

However, we want to lower the granularity for efficiency, so we need
a better solution.  The solution is to always copy a whole cluster the
first time it is touched.  The code keeps a bitmap of clusters that
have already been allocated by the mirroring job, and only does "manual"
copy-on-write if the chunk being copied is zero in the bitmap.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c             | 62 +++++++++++++++++++++++++++++++++++++++-------
 blockdev.c                 | 15 +++--------
 tests/qemu-iotests/041     | 50 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/041.out |  4 +--
 trace-events               |  1 +
 5 files changed, 109 insertions(+), 23 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 20cb1e777f..307bcf101e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -15,6 +15,7 @@
 #include "block/blockjob.h"
 #include "block/block_int.h"
 #include "qemu/ratelimit.h"
+#include "qemu/bitmap.h"
 
 enum {
     /*
@@ -36,6 +37,8 @@ typedef struct MirrorBlockJob {
     bool synced;
     bool should_complete;
     int64_t sector_num;
+    size_t buf_size;
+    unsigned long *cow_bitmap;
     HBitmapIter hbi;
     uint8_t *buf;
 } MirrorBlockJob;
@@ -60,7 +63,7 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
     BlockDriverState *target = s->target;
     QEMUIOVector qiov;
     int ret, nb_sectors;
-    int64_t end;
+    int64_t end, sector_num, chunk_num;
     struct iovec iov;
 
     s->sector_num = hbitmap_iter_next(&s->hbi);
@@ -71,32 +74,53 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
         assert(s->sector_num >= 0);
     }
 
+    /* If we have no backing file yet in the destination, and the cluster size
+     * is very large, we need to do COW ourselves.  The first time a cluster is
+     * copied, copy it entirely.
+     *
+     * Because both BDRV_SECTORS_PER_DIRTY_CHUNK and the cluster size are
+     * powers of two, the number of sectors to copy cannot exceed one cluster.
+     */
+    sector_num = s->sector_num;
+    nb_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
+    chunk_num = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    if (s->cow_bitmap && !test_bit(chunk_num, s->cow_bitmap)) {
+        trace_mirror_cow(s, sector_num);
+        bdrv_round_to_clusters(s->target,
+                               sector_num, BDRV_SECTORS_PER_DIRTY_CHUNK,
+                               &sector_num, &nb_sectors);
+    }
+
     end = s->common.len >> BDRV_SECTOR_BITS;
-    nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
-    bdrv_reset_dirty(source, s->sector_num, nb_sectors);
+    nb_sectors = MIN(nb_sectors, end - sector_num);
+    bdrv_reset_dirty(source, sector_num, nb_sectors);
 
     /* Copy the dirty cluster.  */
     iov.iov_base = s->buf;
     iov.iov_len  = nb_sectors * 512;
     qemu_iovec_init_external(&qiov, &iov, 1);
 
-    trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
-    ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
+    trace_mirror_one_iteration(s, sector_num, nb_sectors);
+    ret = bdrv_co_readv(source, sector_num, nb_sectors, &qiov);
     if (ret < 0) {
         *p_action = mirror_error_action(s, true, -ret);
         goto fail;
     }
-    ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
+    ret = bdrv_co_writev(target, sector_num, nb_sectors, &qiov);
     if (ret < 0) {
         *p_action = mirror_error_action(s, false, -ret);
         s->synced = false;
         goto fail;
     }
+    if (s->cow_bitmap) {
+        bitmap_set(s->cow_bitmap, sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK,
+                   nb_sectors / BDRV_SECTORS_PER_DIRTY_CHUNK);
+    }
     return 0;
 
 fail:
     /* Try again later.  */
-    bdrv_set_dirty(source, s->sector_num, nb_sectors);
+    bdrv_set_dirty(source, sector_num, nb_sectors);
     return ret;
 }
 
@@ -104,7 +128,9 @@ static void coroutine_fn mirror_run(void *opaque)
 {
     MirrorBlockJob *s = opaque;
     BlockDriverState *bs = s->common.bs;
-    int64_t sector_num, end;
+    int64_t sector_num, end, length;
+    BlockDriverInfo bdi;
+    char backing_filename[1024];
     int ret = 0;
     int n;
 
@@ -118,8 +144,23 @@ static void coroutine_fn mirror_run(void *opaque)
         return;
     }
 
+    /* If we have no backing file yet in the destination, we cannot let
+     * the destination do COW.  Instead, we copy sectors around the
+     * dirty data if needed.  We need a bitmap to do that.
+     */
+    bdrv_get_backing_filename(s->target, backing_filename,
+                              sizeof(backing_filename));
+    if (backing_filename[0] && !s->target->backing_hd) {
+        bdrv_get_info(s->target, &bdi);
+        if (s->buf_size < bdi.cluster_size) {
+            s->buf_size = bdi.cluster_size;
+            length = (bdrv_getlength(bs) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            s->cow_bitmap = bitmap_new(length);
+        }
+    }
+
     end = s->common.len >> BDRV_SECTOR_BITS;
-    s->buf = qemu_blockalign(bs, BLOCK_SIZE);
+    s->buf = qemu_blockalign(bs, s->buf_size);
 
     if (s->mode != MIRROR_SYNC_MODE_NONE) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -234,6 +275,7 @@ static void coroutine_fn mirror_run(void *opaque)
 
 immediate_exit:
     qemu_vfree(s->buf);
+    g_free(s->cow_bitmap);
     bdrv_set_dirty_tracking(bs, false);
     bdrv_iostatus_disable(s->target);
     if (s->should_complete && ret == 0) {
@@ -320,6 +362,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->on_target_error = on_target_error;
     s->target = target;
     s->mode = mode;
+    s->buf_size = BLOCK_SIZE;
+
     bdrv_set_dirty_tracking(bs, true);
     bdrv_set_enable_write_cache(s->target, true);
     bdrv_set_on_error(s->target, on_target_error, on_target_error);
diff --git a/blockdev.c b/blockdev.c
index 9126587c45..1eb62b637c 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1197,7 +1197,6 @@ void qmp_drive_mirror(const char *device, const char *target,
                       bool has_on_target_error, BlockdevOnError on_target_error,
                       Error **errp)
 {
-    BlockDriverInfo bdi;
     BlockDriverState *bs;
     BlockDriverState *source, *target_bs;
     BlockDriver *proto_drv;
@@ -1288,6 +1287,9 @@ void qmp_drive_mirror(const char *device, const char *target,
         return;
     }
 
+    /* Mirroring takes care of copy-on-write using the source's backing
+     * file.
+     */
     target_bs = bdrv_new("");
     ret = bdrv_open(target_bs, target, flags | BDRV_O_NO_BACKING, drv);
 
@@ -1297,17 +1299,6 @@ void qmp_drive_mirror(const char *device, const char *target,
         return;
     }
 
-    /* We need a backing file if we will copy parts of a cluster.  */
-    if (bdrv_get_info(target_bs, &bdi) >= 0 && bdi.cluster_size != 0 &&
-        bdi.cluster_size >= BDRV_SECTORS_PER_DIRTY_CHUNK * 512) {
-        ret = bdrv_open_backing_file(target_bs);
-        if (ret < 0) {
-            bdrv_delete(target_bs);
-            error_set(errp, QERR_OPEN_FILE_FAILED, target);
-            return;
-        }
-    }
-
     mirror_start(bs, target_bs, speed, sync, on_source_error, on_target_error,
                  block_job_cb, bs, &local_err);
     if (local_err != NULL) {
diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041
index c6eb851871..a1299b348e 100755
--- a/tests/qemu-iotests/041
+++ b/tests/qemu-iotests/041
@@ -292,6 +292,27 @@ class TestMirrorNoBacking(ImageMirroringTestCase):
         self.assertTrue(self.compare_images(test_img, target_img),
                         'target image does not match source after mirroring')
 
+    def test_large_cluster(self):
+        self.assert_no_active_mirrors()
+
+        # qemu-img create fails if the image is not there
+        qemu_img('create', '-f', iotests.imgfmt, '-o', 'size=%d'
+                        %(TestMirrorNoBacking.image_len), target_backing_img)
+        qemu_img('create', '-f', iotests.imgfmt, '-o', 'cluster_size=%d,backing_file=%s'
+                        % (TestMirrorNoBacking.image_len, target_backing_img), target_img)
+        os.remove(target_backing_img)
+
+        result = self.vm.qmp('drive-mirror', device='drive0', sync='full',
+                             mode='existing', target=target_img)
+        self.assert_qmp(result, 'return', {})
+
+        self.complete_and_wait()
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/inserted/file', target_img)
+        self.vm.shutdown()
+        self.assertTrue(self.compare_images(test_img, target_img),
+                        'target image does not match source after mirroring')
+
 class TestReadErrors(ImageMirroringTestCase):
     image_len = 2 * 1024 * 1024 # MB
 
@@ -330,6 +351,9 @@ new_state = "1"
                  '-o', 'backing_file=blkdebug:%s:%s,backing_fmt=raw'
                        % (self.blkdebug_file, backing_img),
                  test_img)
+        # Write something for tests that use sync='top'
+        qemu_io('-c', 'write %d 512' % (self.MIRROR_GRANULARITY + 65536),
+                        test_img)
         self.vm = iotests.VM().add_drive(test_img)
         self.vm.launch()
 
@@ -383,6 +407,32 @@ new_state = "1"
         self.complete_and_wait()
         self.vm.shutdown()
 
+    def test_large_cluster(self):
+        self.assert_no_active_mirrors()
+
+        # Test COW into the target image.  The first half of the
+        # cluster at MIRROR_GRANULARITY has to be copied from
+        # backing_img, even though sync='top'.
+        qemu_img('create', '-f', iotests.imgfmt, '-ocluster_size=131072,backing_file=%s' %(backing_img), target_img)
+        result = self.vm.qmp('drive-mirror', device='drive0', sync='top',
+                             on_source_error='ignore',
+                             mode='existing', target=target_img)
+        self.assert_qmp(result, 'return', {})
+
+        event = self.vm.get_qmp_event(wait=True)
+        self.assertEquals(event['event'], 'BLOCK_JOB_ERROR')
+        self.assert_qmp(event, 'data/device', 'drive0')
+        self.assert_qmp(event, 'data/operation', 'read')
+        result = self.vm.qmp('query-block-jobs')
+        self.assert_qmp(result, 'return[0]/paused', False)
+        self.complete_and_wait()
+        self.vm.shutdown()
+
+        # Detach blkdebug to compare images successfully
+        qemu_img('rebase', '-f', iotests.imgfmt, '-u', '-b', backing_img, test_img)
+        self.assertTrue(self.compare_images(test_img, target_img),
+                        'target image does not match source after mirroring')
+
     def test_stop_read(self):
         self.assert_no_active_mirrors()
 
diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out
index 71009c239f..3a89159833 100644
--- a/tests/qemu-iotests/041.out
+++ b/tests/qemu-iotests/041.out
@@ -1,5 +1,5 @@
-..................
+....................
 ----------------------------------------------------------------------
-Ran 18 tests
+Ran 20 tests
 
 OK
diff --git a/trace-events b/trace-events
index 61ed349765..ffa27568ee 100644
--- a/trace-events
+++ b/trace-events
@@ -84,6 +84,7 @@ mirror_before_flush(void *s) "s %p"
 mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
 mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64" synced %d"
 mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
+mirror_cow(void *s, int64_t sector_num) "s %p sector_num %"PRId64
 
 # blockdev.c
 qmp_block_job_cancel(void *job) "job %p"
-- 
cgit v1.2.1


From acc906c6c5d5745fe7a3a2ed1bb5f0b1d6d1f21b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:44 +0100
Subject: block: return count of dirty sectors, not chunks

Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block-migration.c | 2 +-
 block.c           | 5 ++---
 block/mirror.c    | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 6acf3e1682..9d0b037029 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -478,7 +478,7 @@ static int64_t get_remaining_dirty(void)
         dirty += bdrv_get_dirty_count(bmds->bs);
     }
 
-    return dirty * BLOCK_SIZE;
+    return dirty << BDRV_SECTOR_BITS;
 }
 
 static void blk_mig_cleanup(void)
diff --git a/block.c b/block.c
index 4a4ab162f7..a27454453e 100644
--- a/block.c
+++ b/block.c
@@ -2832,8 +2832,7 @@ BlockInfo *bdrv_query_info(BlockDriverState *bs)
     if (bs->dirty_bitmap) {
         info->has_dirty = true;
         info->dirty = g_malloc0(sizeof(*info->dirty));
-        info->dirty->count = bdrv_get_dirty_count(bs) *
-            BDRV_SECTORS_PER_DIRTY_CHUNK * BDRV_SECTOR_SIZE;
+        info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
     }
 
     if (bs->drv) {
@@ -4347,7 +4346,7 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
 {
     if (bs->dirty_bitmap) {
-        return hbitmap_count(bs->dirty_bitmap) >> BDRV_LOG_SECTORS_PER_DIRTY_CHUNK;
+        return hbitmap_count(bs->dirty_bitmap);
     } else {
         return 0;
     }
diff --git a/block/mirror.c b/block/mirror.c
index 307bcf101e..7884b3bc78 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -245,7 +245,7 @@ static void coroutine_fn mirror_run(void *opaque)
         trace_mirror_before_sleep(s, cnt, s->synced);
         if (!s->synced) {
             /* Publish progress */
-            s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+            s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
 
             if (s->common.speed) {
                 delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
-- 
cgit v1.2.1


From 50717e941b9f306a45292621999eeafbaa954418 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:45 +0100
Subject: block: allow customizing the granularity of the dirty bitmap

Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block-migration.c     |  5 +++--
 block.c               | 17 ++++++++++-------
 block/mirror.c        | 14 ++++----------
 include/block/block.h |  5 +----
 qapi-schema.json      |  4 +++-
 5 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 9d0b037029..9ac7de6d78 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -23,7 +23,8 @@
 #include "sysemu/blockdev.h"
 #include <assert.h>
 
-#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
+#define BLOCK_SIZE                       (1 << 20)
+#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)
 
 #define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
 #define BLK_MIG_FLAG_EOS                0x02
@@ -254,7 +255,7 @@ static void set_dirty_tracking(int enable)
     BlkMigDevState *bmds;
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        bdrv_set_dirty_tracking(bmds->bs, enable);
+        bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0);
     }
 }
 
diff --git a/block.c b/block.c
index a27454453e..ba67c0def2 100644
--- a/block.c
+++ b/block.c
@@ -2833,6 +2833,8 @@ BlockInfo *bdrv_query_info(BlockDriverState *bs)
         info->has_dirty = true;
         info->dirty = g_malloc0(sizeof(*info->dirty));
         info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
+        info->dirty->granularity =
+            ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
     }
 
     if (bs->drv) {
@@ -4299,16 +4301,17 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
     return true;
 }
 
-void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
+void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
 {
     int64_t bitmap_size;
 
-    if (enable) {
-        if (!bs->dirty_bitmap) {
-            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
-            bs->dirty_bitmap = hbitmap_alloc(bitmap_size,
-                                             BDRV_LOG_SECTORS_PER_DIRTY_CHUNK);
-        }
+    assert((granularity & (granularity - 1)) == 0);
+
+    if (granularity) {
+        granularity >>= BDRV_SECTOR_BITS;
+        assert(!bs->dirty_bitmap);
+        bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
+        bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
     } else {
         if (bs->dirty_bitmap) {
             hbitmap_free(bs->dirty_bitmap);
diff --git a/block/mirror.c b/block/mirror.c
index 7884b3bc78..e425927ee5 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -17,14 +17,8 @@
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
 
-enum {
-    /*
-     * Size of data buffer for populating the image file.  This should be large
-     * enough to process multiple clusters in a single call, so that populating
-     * contiguous regions of the image is efficient.
-     */
-    BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
-};
+#define BLOCK_SIZE                       (1 << 20)
+#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)
 
 #define SLICE_TIME 100000000ULL /* ns */
 
@@ -276,7 +270,7 @@ static void coroutine_fn mirror_run(void *opaque)
 immediate_exit:
     qemu_vfree(s->buf);
     g_free(s->cow_bitmap);
-    bdrv_set_dirty_tracking(bs, false);
+    bdrv_set_dirty_tracking(bs, 0);
     bdrv_iostatus_disable(s->target);
     if (s->should_complete && ret == 0) {
         if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
@@ -364,7 +358,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->mode = mode;
     s->buf_size = BLOCK_SIZE;
 
-    bdrv_set_dirty_tracking(bs, true);
+    bdrv_set_dirty_tracking(bs, BLOCK_SIZE);
     bdrv_set_enable_write_cache(s->target, true);
     bdrv_set_on_error(s->target, on_target_error, on_target_error);
     bdrv_iostatus_enable(s->target);
diff --git a/include/block/block.h b/include/block/block.h
index 9ee9068aec..5c3b911c1b 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -355,11 +355,8 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
 
-#define BDRV_SECTORS_PER_DIRTY_CHUNK     (1 << BDRV_LOG_SECTORS_PER_DIRTY_CHUNK)
-#define BDRV_LOG_SECTORS_PER_DIRTY_CHUNK 11
-
 struct HBitmapIter;
-void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable);
+void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity);
 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
diff --git a/qapi-schema.json b/qapi-schema.json
index 6d7252b9e8..ce4f901dc6 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -667,10 +667,12 @@
 #
 # @count: number of dirty bytes according to the dirty bitmap
 #
+# @granularity: granularity of the dirty bitmap in bytes (since 1.4)
+#
 # Since: 1.3
 ##
 { 'type': 'BlockDirtyInfo',
-  'data': {'count': 'int'} }
+  'data': {'count': 'int', 'granularity': 'int'} }
 
 ##
 # @BlockInfo:
-- 
cgit v1.2.1


From eee13dfe302833944d1176677d12a6ea421a94ea Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 21 Jan 2013 17:09:46 +0100
Subject: mirror: allow customizing the granularity

The desired granularity may be very different depending on the kind of
operation (e.g. continuous replication vs. collapse-to-raw) and whether
the VM is expected to perform lots of I/O while mirroring is in progress.

Allow the user to customize it, while providing a sane default so that
in general there will be no extra allocated space in the target compared
to the source.

Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c            | 52 ++++++++++++++++++++++++++++++-----------------
 blockdev.c                | 15 +++++++++++++-
 hmp.c                     |  2 +-
 include/block/block_int.h |  3 ++-
 qapi-schema.json          |  8 +++++++-
 qmp-commands.hx           |  8 +++++++-
 6 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index e425927ee5..0fecb40c10 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -17,9 +17,6 @@
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
 
-#define BLOCK_SIZE                       (1 << 20)
-#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)
-
 #define SLICE_TIME 100000000ULL /* ns */
 
 typedef struct MirrorBlockJob {
@@ -31,6 +28,7 @@ typedef struct MirrorBlockJob {
     bool synced;
     bool should_complete;
     int64_t sector_num;
+    int64_t granularity;
     size_t buf_size;
     unsigned long *cow_bitmap;
     HBitmapIter hbi;
@@ -56,7 +54,7 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
     BlockDriverState *source = s->common.bs;
     BlockDriverState *target = s->target;
     QEMUIOVector qiov;
-    int ret, nb_sectors;
+    int ret, nb_sectors, sectors_per_chunk;
     int64_t end, sector_num, chunk_num;
     struct iovec iov;
 
@@ -72,16 +70,16 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
      * is very large, we need to do COW ourselves.  The first time a cluster is
      * copied, copy it entirely.
      *
-     * Because both BDRV_SECTORS_PER_DIRTY_CHUNK and the cluster size are
-     * powers of two, the number of sectors to copy cannot exceed one cluster.
+     * Because both the granularity and the cluster size are powers of two, the
+     * number of sectors to copy cannot exceed one cluster.
      */
     sector_num = s->sector_num;
-    nb_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
-    chunk_num = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    sectors_per_chunk = nb_sectors = s->granularity >> BDRV_SECTOR_BITS;
+    chunk_num = sector_num / sectors_per_chunk;
     if (s->cow_bitmap && !test_bit(chunk_num, s->cow_bitmap)) {
         trace_mirror_cow(s, sector_num);
         bdrv_round_to_clusters(s->target,
-                               sector_num, BDRV_SECTORS_PER_DIRTY_CHUNK,
+                               sector_num, sectors_per_chunk,
                                &sector_num, &nb_sectors);
     }
 
@@ -107,8 +105,8 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
         goto fail;
     }
     if (s->cow_bitmap) {
-        bitmap_set(s->cow_bitmap, sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK,
-                   nb_sectors / BDRV_SECTORS_PER_DIRTY_CHUNK);
+        bitmap_set(s->cow_bitmap, sector_num / sectors_per_chunk,
+                   nb_sectors / sectors_per_chunk);
     }
     return 0;
 
@@ -122,7 +120,7 @@ static void coroutine_fn mirror_run(void *opaque)
 {
     MirrorBlockJob *s = opaque;
     BlockDriverState *bs = s->common.bs;
-    int64_t sector_num, end, length;
+    int64_t sector_num, end, sectors_per_chunk, length;
     BlockDriverInfo bdi;
     char backing_filename[1024];
     int ret = 0;
@@ -146,22 +144,23 @@ static void coroutine_fn mirror_run(void *opaque)
                               sizeof(backing_filename));
     if (backing_filename[0] && !s->target->backing_hd) {
         bdrv_get_info(s->target, &bdi);
-        if (s->buf_size < bdi.cluster_size) {
+        if (s->granularity < bdi.cluster_size) {
             s->buf_size = bdi.cluster_size;
-            length = (bdrv_getlength(bs) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
             s->cow_bitmap = bitmap_new(length);
         }
     }
 
     end = s->common.len >> BDRV_SECTOR_BITS;
     s->buf = qemu_blockalign(bs, s->buf_size);
+    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
 
     if (s->mode != MIRROR_SYNC_MODE_NONE) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
         BlockDriverState *base;
         base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
         for (sector_num = 0; sector_num < end; ) {
-            int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+            int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
             ret = bdrv_co_is_allocated_above(bs, base,
                                              sector_num, next - sector_num, &n);
 
@@ -242,7 +241,7 @@ static void coroutine_fn mirror_run(void *opaque)
             s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
 
             if (s->common.speed) {
-                delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
+                delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk);
             } else {
                 delay_ns = 0;
             }
@@ -332,7 +331,7 @@ static BlockJobType mirror_job_type = {
 };
 
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, MirrorSyncMode mode,
+                  int64_t speed, int64_t granularity, MirrorSyncMode mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockDriverCompletionFunc *cb,
@@ -340,6 +339,20 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
 {
     MirrorBlockJob *s;
 
+    if (granularity == 0) {
+        /* Choose the default granularity based on the target file's cluster
+         * size, clamped between 4k and 64k.  */
+        BlockDriverInfo bdi;
+        if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
+            granularity = MAX(4096, bdi.cluster_size);
+            granularity = MIN(65536, granularity);
+        } else {
+            granularity = 65536;
+        }
+    }
+
+    assert ((granularity & (granularity - 1)) == 0);
+
     if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
          on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
         !bdrv_iostatus_is_enabled(bs)) {
@@ -356,9 +369,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->on_target_error = on_target_error;
     s->target = target;
     s->mode = mode;
-    s->buf_size = BLOCK_SIZE;
+    s->granularity = granularity;
+    s->buf_size = granularity;
 
-    bdrv_set_dirty_tracking(bs, BLOCK_SIZE);
+    bdrv_set_dirty_tracking(bs, granularity);
     bdrv_set_enable_write_cache(s->target, true);
     bdrv_set_on_error(s->target, on_target_error, on_target_error);
     bdrv_iostatus_enable(s->target);
diff --git a/blockdev.c b/blockdev.c
index 1eb62b637c..07fd3273ed 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1193,6 +1193,7 @@ void qmp_drive_mirror(const char *device, const char *target,
                       enum MirrorSyncMode sync,
                       bool has_mode, enum NewImageMode mode,
                       bool has_speed, int64_t speed,
+                      bool has_granularity, uint32_t granularity,
                       bool has_on_source_error, BlockdevOnError on_source_error,
                       bool has_on_target_error, BlockdevOnError on_target_error,
                       Error **errp)
@@ -1218,6 +1219,17 @@ void qmp_drive_mirror(const char *device, const char *target,
     if (!has_mode) {
         mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
     }
+    if (!has_granularity) {
+        granularity = 0;
+    }
+    if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
+        error_set(errp, QERR_INVALID_PARAMETER, device);
+        return;
+    }
+    if (granularity & (granularity - 1)) {
+        error_set(errp, QERR_INVALID_PARAMETER, device);
+        return;
+    }
 
     bs = bdrv_find(device);
     if (!bs) {
@@ -1299,7 +1311,8 @@ void qmp_drive_mirror(const char *device, const char *target,
         return;
     }
 
-    mirror_start(bs, target_bs, speed, sync, on_source_error, on_target_error,
+    mirror_start(bs, target_bs, speed, granularity, sync,
+                 on_source_error, on_target_error,
                  block_job_cb, bs, &local_err);
     if (local_err != NULL) {
         bdrv_delete(target_bs);
diff --git a/hmp.c b/hmp.c
index c7b6ba02fc..0f3347dd76 100644
--- a/hmp.c
+++ b/hmp.c
@@ -796,7 +796,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict)
 
     qmp_drive_mirror(device, filename, !!format, format,
                      full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP,
-                     true, mode, false, 0,
+                     true, mode, false, 0, false, 0,
                      false, 0, false, 0, &errp);
     hmp_handle_error(mon, &errp);
 }
diff --git a/include/block/block_int.h b/include/block/block_int.h
index b81c061cd9..1165339fd1 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -344,6 +344,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
  * @bs: Block device to operate on.
  * @target: Block device to write to.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @granularity: The chosen granularity for the dirty bitmap.
  * @mode: Whether to collapse all images in the chain to the target.
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
@@ -357,7 +358,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
  * @bs will be switched to read from @target.
  */
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, MirrorSyncMode mode,
+                  int64_t speed, int64_t granularity, MirrorSyncMode mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockDriverCompletionFunc *cb,
diff --git a/qapi-schema.json b/qapi-schema.json
index ce4f901dc6..fd5ec93c03 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1636,6 +1636,11 @@
 #        (all the disk, only the sectors allocated in the topmost image, or
 #        only new I/O).
 #
+# @granularity: #optional granularity of the dirty bitmap, default is 64K
+#               if the image format doesn't have clusters, 4K if the clusters
+#               are smaller than that, else the cluster size.  Must be a
+#               power of 2 between 512 and 64M (since 1.4).
+#
 # @on-source-error: #optional the action to take on an error on the source,
 #                   default 'report'.  'stop' and 'enospc' can only be used
 #                   if the block device supports io-status (see BlockInfo).
@@ -1652,7 +1657,8 @@
 { 'command': 'drive-mirror',
   'data': { 'device': 'str', 'target': 'str', '*format': 'str',
             'sync': 'MirrorSyncMode', '*mode': 'NewImageMode',
-            '*speed': 'int', '*on-source-error': 'BlockdevOnError',
+            '*speed': 'int', '*granularity': 'uint32',
+            '*on-source-error': 'BlockdevOnError',
             '*on-target-error': 'BlockdevOnError' } }
 
 ##
diff --git a/qmp-commands.hx b/qmp-commands.hx
index cbf12804be..835ea26e9d 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -938,7 +938,8 @@ EQMP
     {
         .name       = "drive-mirror",
         .args_type  = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?,"
-                      "on-source-error:s?,on-target-error:s?",
+                      "on-source-error:s?,on-target-error:s?,"
+                      "granularity:i?",
         .mhandler.cmd_new = qmp_marshal_input_drive_mirror,
     },
 
@@ -962,6 +963,7 @@ Arguments:
   file/device (NewImageMode, optional, default 'absolute-paths')
 - "speed": maximum speed of the streaming job, in bytes per second
   (json-int)
+- "granularity": granularity of the dirty bitmap, in bytes (json-int, optional)
 - "sync": what parts of the disk image should be copied to the destination;
   possibilities include "full" for all the disk, "top" for only the sectors
   allocated in the topmost image, or "none" to only replicate new I/O
@@ -971,6 +973,10 @@ Arguments:
 - "on-target-error": the action to take on an error on the target
   (BlockdevOnError, default 'report')
 
+The default value of the granularity is the image cluster size clamped
+between 4096 and 65536, if the image format defines one.  If the format
+does not define a cluster size, the default value of the granularity
+is 65536.
 
 
 Example:
-- 
cgit v1.2.1


From bd48bde8f0fa08dfc8edcafc2bc8aa6d43734463 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 22 Jan 2013 09:03:12 +0100
Subject: mirror: switch mirror_iteration to AIO

There is really no change in the behavior of the job here, since
there is still a maximum of one in-flight I/O operation between
the source and the target.  However, this patch already introduces
the AIO callbacks (which are unmodified in the next patch)
and some of the logic to count in-flight operations and only
complete the job when there is none.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c | 167 +++++++++++++++++++++++++++++++++++++++++++--------------
 trace-events   |   2 +
 2 files changed, 128 insertions(+), 41 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 0fecb40c10..fc6b9b7118 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -33,8 +33,19 @@ typedef struct MirrorBlockJob {
     unsigned long *cow_bitmap;
     HBitmapIter hbi;
     uint8_t *buf;
+
+    int in_flight;
+    int ret;
 } MirrorBlockJob;
 
+typedef struct MirrorOp {
+    MirrorBlockJob *s;
+    QEMUIOVector qiov;
+    struct iovec iov;
+    int64_t sector_num;
+    int nb_sectors;
+} MirrorOp;
+
 static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                             int error)
 {
@@ -48,15 +59,70 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
     }
 }
 
-static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
-                                         BlockErrorAction *p_action)
+static void mirror_iteration_done(MirrorOp *op, int ret)
+{
+    MirrorBlockJob *s = op->s;
+    int64_t chunk_num;
+    int nb_chunks, sectors_per_chunk;
+
+    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);
+
+    s->in_flight--;
+    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    chunk_num = op->sector_num / sectors_per_chunk;
+    nb_chunks = op->nb_sectors / sectors_per_chunk;
+    if (s->cow_bitmap && ret >= 0) {
+        bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+    }
+
+    g_slice_free(MirrorOp, op);
+    qemu_coroutine_enter(s->common.co, NULL);
+}
+
+static void mirror_write_complete(void *opaque, int ret)
+{
+    MirrorOp *op = opaque;
+    MirrorBlockJob *s = op->s;
+    if (ret < 0) {
+        BlockDriverState *source = s->common.bs;
+        BlockErrorAction action;
+
+        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+        action = mirror_error_action(s, false, -ret);
+        if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
+            s->ret = ret;
+        }
+    }
+    mirror_iteration_done(op, ret);
+}
+
+static void mirror_read_complete(void *opaque, int ret)
+{
+    MirrorOp *op = opaque;
+    MirrorBlockJob *s = op->s;
+    if (ret < 0) {
+        BlockDriverState *source = s->common.bs;
+        BlockErrorAction action;
+
+        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
+        action = mirror_error_action(s, true, -ret);
+        if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
+            s->ret = ret;
+        }
+
+        mirror_iteration_done(op, ret);
+        return;
+    }
+    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
+                    mirror_write_complete, op);
+}
+
+static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
     BlockDriverState *source = s->common.bs;
-    BlockDriverState *target = s->target;
-    QEMUIOVector qiov;
-    int ret, nb_sectors, sectors_per_chunk;
+    int nb_sectors, sectors_per_chunk;
     int64_t end, sector_num, chunk_num;
-    struct iovec iov;
+    MirrorOp *op;
 
     s->sector_num = hbitmap_iter_next(&s->hbi);
     if (s->sector_num < 0) {
@@ -85,35 +151,30 @@ static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
 
     end = s->common.len >> BDRV_SECTOR_BITS;
     nb_sectors = MIN(nb_sectors, end - sector_num);
+
+    /* Allocate a MirrorOp that is used as an AIO callback.  */
+    op = g_slice_new(MirrorOp);
+    op->s = s;
+    op->iov.iov_base = s->buf;
+    op->iov.iov_len  = nb_sectors * 512;
+    op->sector_num = sector_num;
+    op->nb_sectors = nb_sectors;
+    qemu_iovec_init_external(&op->qiov, &op->iov, 1);
+
     bdrv_reset_dirty(source, sector_num, nb_sectors);
 
     /* Copy the dirty cluster.  */
-    iov.iov_base = s->buf;
-    iov.iov_len  = nb_sectors * 512;
-    qemu_iovec_init_external(&qiov, &iov, 1);
-
+    s->in_flight++;
     trace_mirror_one_iteration(s, sector_num, nb_sectors);
-    ret = bdrv_co_readv(source, sector_num, nb_sectors, &qiov);
-    if (ret < 0) {
-        *p_action = mirror_error_action(s, true, -ret);
-        goto fail;
-    }
-    ret = bdrv_co_writev(target, sector_num, nb_sectors, &qiov);
-    if (ret < 0) {
-        *p_action = mirror_error_action(s, false, -ret);
-        s->synced = false;
-        goto fail;
-    }
-    if (s->cow_bitmap) {
-        bitmap_set(s->cow_bitmap, sector_num / sectors_per_chunk,
-                   nb_sectors / sectors_per_chunk);
-    }
-    return 0;
+    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+                   mirror_read_complete, op);
+}
 
-fail:
-    /* Try again later.  */
-    bdrv_set_dirty(source, sector_num, nb_sectors);
-    return ret;
+static void mirror_drain(MirrorBlockJob *s)
+{
+    while (s->in_flight > 0) {
+        qemu_coroutine_yield();
+    }
 }
 
 static void coroutine_fn mirror_run(void *opaque)
@@ -121,6 +182,7 @@ static void coroutine_fn mirror_run(void *opaque)
     MirrorBlockJob *s = opaque;
     BlockDriverState *bs = s->common.bs;
     int64_t sector_num, end, sectors_per_chunk, length;
+    uint64_t last_pause_ns;
     BlockDriverInfo bdi;
     char backing_filename[1024];
     int ret = 0;
@@ -179,23 +241,38 @@ static void coroutine_fn mirror_run(void *opaque)
     }
 
     bdrv_dirty_iter_init(bs, &s->hbi);
+    last_pause_ns = qemu_get_clock_ns(rt_clock);
     for (;;) {
         uint64_t delay_ns;
         int64_t cnt;
         bool should_complete;
 
+        if (s->ret < 0) {
+            ret = s->ret;
+            goto immediate_exit;
+        }
+
         cnt = bdrv_get_dirty_count(bs);
-        if (cnt != 0) {
-            BlockErrorAction action = BDRV_ACTION_REPORT;
-            ret = mirror_iteration(s, &action);
-            if (ret < 0 && action == BDRV_ACTION_REPORT) {
-                goto immediate_exit;
+
+        /* Note that even when no rate limit is applied we need to yield
+         * periodically with no pending I/O so that qemu_aio_flush() returns.
+         * We do so every SLICE_TIME nanoseconds, or when there is an error,
+         * or when the source is clean, whichever comes first.
+         */
+        if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
+            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+            if (s->in_flight > 0) {
+                trace_mirror_yield(s, s->in_flight, cnt);
+                qemu_coroutine_yield();
+                continue;
+            } else if (cnt != 0) {
+                mirror_iteration(s);
+                continue;
             }
-            cnt = bdrv_get_dirty_count(bs);
         }
 
         should_complete = false;
-        if (cnt == 0) {
+        if (s->in_flight == 0 && cnt == 0) {
             trace_mirror_before_flush(s);
             ret = bdrv_flush(s->target);
             if (ret < 0) {
@@ -246,15 +323,12 @@ static void coroutine_fn mirror_run(void *opaque)
                 delay_ns = 0;
             }
 
-            /* Note that even when no rate limit is applied we need to yield
-             * with no pending I/O here so that bdrv_drain_all() returns.
-             */
             block_job_sleep_ns(&s->common, rt_clock, delay_ns);
             if (block_job_is_cancelled(&s->common)) {
                 break;
             }
         } else if (!should_complete) {
-            delay_ns = (cnt == 0 ? SLICE_TIME : 0);
+            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
             block_job_sleep_ns(&s->common, rt_clock, delay_ns);
         } else if (cnt == 0) {
             /* The two disks are in sync.  Exit and report successful
@@ -264,9 +338,20 @@ static void coroutine_fn mirror_run(void *opaque)
             s->common.cancelled = false;
             break;
         }
+        last_pause_ns = qemu_get_clock_ns(rt_clock);
     }
 
 immediate_exit:
+    if (s->in_flight > 0) {
+        /* We get here only if something went wrong.  Either the job failed,
+         * or it was cancelled prematurely so that we do not guarantee that
+         * the target is a copy of the source.
+         */
+        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
+        mirror_drain(s);
+    }
+
+    assert(s->in_flight == 0);
     qemu_vfree(s->buf);
     g_free(s->cow_bitmap);
     bdrv_set_dirty_tracking(bs, 0);
diff --git a/trace-events b/trace-events
index ffa27568ee..43e6b73fa5 100644
--- a/trace-events
+++ b/trace-events
@@ -85,6 +85,8 @@ mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
 mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64" synced %d"
 mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
 mirror_cow(void *s, int64_t sector_num) "s %p sector_num %"PRId64
+mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d"
+mirror_yield(void *s, int64_t cnt, int in_flight) "s %p dirty count %"PRId64" in_flight %d"
 
 # blockdev.c
 qmp_block_job_cancel(void *job) "job %p"
-- 
cgit v1.2.1


From 08e4ed6cdeeee7912072cf14aa8ab6c60dacb4fb Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 22 Jan 2013 09:03:13 +0100
Subject: mirror: add buf-size argument to drive-mirror

This makes sense when the next commit starts using the extra buffer space
to perform many I/O operations asynchronously.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c             |  8 ++++----
 blockdev.c                 |  9 ++++++++-
 hmp.c                      |  2 +-
 include/block/block_int.h  |  5 +++--
 qapi-schema.json           |  5 ++++-
 qmp-commands.hx            |  4 +++-
 tests/qemu-iotests/041     | 31 +++++++++++++++++++++++++++++++
 tests/qemu-iotests/041.out |  4 ++--
 8 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index fc6b9b7118..896972c297 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -207,7 +207,7 @@ static void coroutine_fn mirror_run(void *opaque)
     if (backing_filename[0] && !s->target->backing_hd) {
         bdrv_get_info(s->target, &bdi);
         if (s->granularity < bdi.cluster_size) {
-            s->buf_size = bdi.cluster_size;
+            s->buf_size = MAX(s->buf_size, bdi.cluster_size);
             length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
             s->cow_bitmap = bitmap_new(length);
         }
@@ -416,8 +416,8 @@ static BlockJobType mirror_job_type = {
 };
 
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, int64_t granularity, MirrorSyncMode mode,
-                  BlockdevOnError on_source_error,
+                  int64_t speed, int64_t granularity, int64_t buf_size,
+                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp)
@@ -455,7 +455,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->target = target;
     s->mode = mode;
     s->granularity = granularity;
-    s->buf_size = granularity;
+    s->buf_size = MAX(buf_size, granularity);
 
     bdrv_set_dirty_tracking(bs, granularity);
     bdrv_set_enable_write_cache(s->target, true);
diff --git a/blockdev.c b/blockdev.c
index 07fd3273ed..ad25b9b86e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1188,12 +1188,15 @@ void qmp_block_commit(const char *device,
     drive_get_ref(drive_get_by_blockdev(bs));
 }
 
+#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)
+
 void qmp_drive_mirror(const char *device, const char *target,
                       bool has_format, const char *format,
                       enum MirrorSyncMode sync,
                       bool has_mode, enum NewImageMode mode,
                       bool has_speed, int64_t speed,
                       bool has_granularity, uint32_t granularity,
+                      bool has_buf_size, int64_t buf_size,
                       bool has_on_source_error, BlockdevOnError on_source_error,
                       bool has_on_target_error, BlockdevOnError on_target_error,
                       Error **errp)
@@ -1222,6 +1225,10 @@ void qmp_drive_mirror(const char *device, const char *target,
     if (!has_granularity) {
         granularity = 0;
     }
+    if (!has_buf_size) {
+        buf_size = DEFAULT_MIRROR_BUF_SIZE;
+    }
+
     if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
         error_set(errp, QERR_INVALID_PARAMETER, device);
         return;
@@ -1311,7 +1318,7 @@ void qmp_drive_mirror(const char *device, const char *target,
         return;
     }
 
-    mirror_start(bs, target_bs, speed, granularity, sync,
+    mirror_start(bs, target_bs, speed, granularity, buf_size, sync,
                  on_source_error, on_target_error,
                  block_job_cb, bs, &local_err);
     if (local_err != NULL) {
diff --git a/hmp.c b/hmp.c
index 0f3347dd76..99fd89206b 100644
--- a/hmp.c
+++ b/hmp.c
@@ -796,7 +796,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict)
 
     qmp_drive_mirror(device, filename, !!format, format,
                      full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP,
-                     true, mode, false, 0, false, 0,
+                     true, mode, false, 0, false, 0, false, 0,
                      false, 0, false, 0, &errp);
     hmp_handle_error(mon, &errp);
 }
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 1165339fd1..f7279b978a 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -345,6 +345,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
  * @target: Block device to write to.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @granularity: The chosen granularity for the dirty bitmap.
+ * @buf_size: The amount of data that can be in flight at one time.
  * @mode: Whether to collapse all images in the chain to the target.
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
@@ -358,8 +359,8 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
  * @bs will be switched to read from @target.
  */
 void mirror_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, int64_t granularity, MirrorSyncMode mode,
-                  BlockdevOnError on_source_error,
+                  int64_t speed, int64_t granularity, int64_t buf_size,
+                  MirrorSyncMode mode, BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp);
diff --git a/qapi-schema.json b/qapi-schema.json
index fd5ec93c03..ba75c4de12 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1641,6 +1641,9 @@
 #               are smaller than that, else the cluster size.  Must be a
 #               power of 2 between 512 and 64M (since 1.4).
 #
+# @buf-size: #optional maximum amount of data in flight from source to
+#            target (since 1.4).
+#
 # @on-source-error: #optional the action to take on an error on the source,
 #                   default 'report'.  'stop' and 'enospc' can only be used
 #                   if the block device supports io-status (see BlockInfo).
@@ -1658,7 +1661,7 @@
   'data': { 'device': 'str', 'target': 'str', '*format': 'str',
             'sync': 'MirrorSyncMode', '*mode': 'NewImageMode',
             '*speed': 'int', '*granularity': 'uint32',
-            '*on-source-error': 'BlockdevOnError',
+            '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
             '*on-target-error': 'BlockdevOnError' } }
 
 ##
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 835ea26e9d..273b4a67ba 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -939,7 +939,7 @@ EQMP
         .name       = "drive-mirror",
         .args_type  = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?,"
                       "on-source-error:s?,on-target-error:s?,"
-                      "granularity:i?",
+                      "granularity:i?,buf-size:i?",
         .mhandler.cmd_new = qmp_marshal_input_drive_mirror,
     },
 
@@ -964,6 +964,8 @@ Arguments:
 - "speed": maximum speed of the streaming job, in bytes per second
   (json-int)
 - "granularity": granularity of the dirty bitmap, in bytes (json-int, optional)
+- "buf_size": maximum amount of data in flight from source to target, in bytes
+  (json-int, default 10M)
 - "sync": what parts of the disk image should be copied to the destination;
   possibilities include "full" for all the disk, "top" for only the sectors
   allocated in the topmost image, or "none" to only replicate new I/O
diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041
index a1299b348e..b040820c51 100755
--- a/tests/qemu-iotests/041
+++ b/tests/qemu-iotests/041
@@ -207,6 +207,37 @@ class TestSingleDrive(ImageMirroringTestCase):
         self.assertTrue(self.compare_images(test_img, target_img),
                         'target image does not match source after mirroring')
 
+    def test_small_buffer(self):
+        self.assert_no_active_mirrors()
+
+        # A small buffer is rounded up automatically
+        result = self.vm.qmp('drive-mirror', device='drive0', sync='full',
+                             buf_size=4096, target=target_img)
+        self.assert_qmp(result, 'return', {})
+
+        self.complete_and_wait()
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/inserted/file', target_img)
+        self.vm.shutdown()
+        self.assertTrue(self.compare_images(test_img, target_img),
+                        'target image does not match source after mirroring')
+
+    def test_small_buffer2(self):
+        self.assert_no_active_mirrors()
+
+        qemu_img('create', '-f', iotests.imgfmt, '-o', 'cluster_size=%d,size=%d'
+                        % (TestSingleDrive.image_len, TestSingleDrive.image_len), target_img)
+        result = self.vm.qmp('drive-mirror', device='drive0', sync='full',
+                             buf_size=65536, mode='existing', target=target_img)
+        self.assert_qmp(result, 'return', {})
+
+        self.complete_and_wait()
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/inserted/file', target_img)
+        self.vm.shutdown()
+        self.assertTrue(self.compare_images(test_img, target_img),
+                        'target image does not match source after mirroring')
+
     def test_large_cluster(self):
         self.assert_no_active_mirrors()
 
diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out
index 3a89159833..84bfd63fba 100644
--- a/tests/qemu-iotests/041.out
+++ b/tests/qemu-iotests/041.out
@@ -1,5 +1,5 @@
-....................
+......................
 ----------------------------------------------------------------------
-Ran 20 tests
+Ran 22 tests
 
 OK
-- 
cgit v1.2.1


From 402a47411bff5e849dc880dd08ba7e6564e6e4f4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 22 Jan 2013 09:03:14 +0100
Subject: mirror: support more than one in-flight AIO operation

With AIO support in place, we can start copying more than one chunk
in parallel.  This patch introduces the required infrastructure for
this: the buffer is split into multiple granularity-sized chunks,
and there is a free list to access them.

Because of copy-on-write, a single operation may already require
multiple chunks to be available on the free list.

In addition, two different iterations on the HBitmap may want to
copy the same cluster.  We avoid this by keeping a bitmap of in-flight
I/O operations, and blocking until the previous iteration completes.
This should be a pretty rare occurrence, though; as long as there is
no overlap the next iteration can start before the previous one finishes.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 trace-events   |   4 ++-
 2 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 896972c297..4696dc84ea 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -17,7 +17,15 @@
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
 
-#define SLICE_TIME 100000000ULL /* ns */
+#define SLICE_TIME    100000000ULL /* ns */
+#define MAX_IN_FLIGHT 16
+
+/* The mirroring buffer is a list of granularity-sized chunks.
+ * Free chunks are organized in a list.
+ */
+typedef struct MirrorBuffer {
+    QSIMPLEQ_ENTRY(MirrorBuffer) next;
+} MirrorBuffer;
 
 typedef struct MirrorBlockJob {
     BlockJob common;
@@ -33,7 +41,10 @@ typedef struct MirrorBlockJob {
     unsigned long *cow_bitmap;
     HBitmapIter hbi;
     uint8_t *buf;
+    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
+    int buf_free_count;
 
+    unsigned long *in_flight_bitmap;
     int in_flight;
     int ret;
 } MirrorBlockJob;
@@ -41,7 +52,6 @@ typedef struct MirrorBlockJob {
 typedef struct MirrorOp {
     MirrorBlockJob *s;
     QEMUIOVector qiov;
-    struct iovec iov;
     int64_t sector_num;
     int nb_sectors;
 } MirrorOp;
@@ -62,15 +72,24 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
 static void mirror_iteration_done(MirrorOp *op, int ret)
 {
     MirrorBlockJob *s = op->s;
+    struct iovec *iov;
     int64_t chunk_num;
-    int nb_chunks, sectors_per_chunk;
+    int i, nb_chunks, sectors_per_chunk;
 
     trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);
 
     s->in_flight--;
+    iov = op->qiov.iov;
+    for (i = 0; i < op->qiov.niov; i++) {
+        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
+        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
+        s->buf_free_count++;
+    }
+
     sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
     chunk_num = op->sector_num / sectors_per_chunk;
     nb_chunks = op->nb_sectors / sectors_per_chunk;
+    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
     if (s->cow_bitmap && ret >= 0) {
         bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
     }
@@ -120,8 +139,8 @@ static void mirror_read_complete(void *opaque, int ret)
 static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
     BlockDriverState *source = s->common.bs;
-    int nb_sectors, sectors_per_chunk;
-    int64_t end, sector_num, chunk_num;
+    int nb_sectors, sectors_per_chunk, nb_chunks;
+    int64_t end, sector_num, chunk_num, next_sector, hbitmap_next_sector;
     MirrorOp *op;
 
     s->sector_num = hbitmap_iter_next(&s->hbi);
@@ -132,6 +151,8 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
         assert(s->sector_num >= 0);
     }
 
+    hbitmap_next_sector = s->sector_num;
+
     /* If we have no backing file yet in the destination, and the cluster size
      * is very large, we need to do COW ourselves.  The first time a cluster is
      * copied, copy it entirely.
@@ -147,19 +168,56 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
         bdrv_round_to_clusters(s->target,
                                sector_num, sectors_per_chunk,
                                &sector_num, &nb_sectors);
+
+        /* The rounding may make us copy sectors before the
+         * first dirty one.
+         */
+        chunk_num = sector_num / sectors_per_chunk;
+    }
+
+    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
+    while (test_bit(chunk_num, s->in_flight_bitmap)) {
+        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+        qemu_coroutine_yield();
     }
 
     end = s->common.len >> BDRV_SECTOR_BITS;
     nb_sectors = MIN(nb_sectors, end - sector_num);
+    nb_chunks = (nb_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
+    while (s->buf_free_count < nb_chunks) {
+        trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
+        qemu_coroutine_yield();
+    }
+
+    /* We have enough free space to copy these sectors.  */
+    bitmap_set(s->in_flight_bitmap, chunk_num, nb_chunks);
 
     /* Allocate a MirrorOp that is used as an AIO callback.  */
     op = g_slice_new(MirrorOp);
     op->s = s;
-    op->iov.iov_base = s->buf;
-    op->iov.iov_len  = nb_sectors * 512;
     op->sector_num = sector_num;
     op->nb_sectors = nb_sectors;
-    qemu_iovec_init_external(&op->qiov, &op->iov, 1);
+
+    /* Now make a QEMUIOVector taking enough granularity-sized chunks
+     * from s->buf_free.
+     */
+    qemu_iovec_init(&op->qiov, nb_chunks);
+    next_sector = sector_num;
+    while (nb_chunks-- > 0) {
+        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
+        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
+        s->buf_free_count--;
+        qemu_iovec_add(&op->qiov, buf, s->granularity);
+
+        /* Advance the HBitmapIter in parallel, so that we do not examine
+         * the same sector twice.
+         */
+        if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) {
+            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
+        }
+
+        next_sector += sectors_per_chunk;
+    }
 
     bdrv_reset_dirty(source, sector_num, nb_sectors);
 
@@ -170,6 +228,23 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
                    mirror_read_complete, op);
 }
 
+static void mirror_free_init(MirrorBlockJob *s)
+{
+    int granularity = s->granularity;
+    size_t buf_size = s->buf_size;
+    uint8_t *buf = s->buf;
+
+    assert(s->buf_free_count == 0);
+    QSIMPLEQ_INIT(&s->buf_free);
+    while (buf_size != 0) {
+        MirrorBuffer *cur = (MirrorBuffer *)buf;
+        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
+        s->buf_free_count++;
+        buf_size -= granularity;
+        buf += granularity;
+    }
+}
+
 static void mirror_drain(MirrorBlockJob *s)
 {
     while (s->in_flight > 0) {
@@ -198,6 +273,9 @@ static void coroutine_fn mirror_run(void *opaque)
         return;
     }
 
+    length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
+    s->in_flight_bitmap = bitmap_new(length);
+
     /* If we have no backing file yet in the destination, we cannot let
      * the destination do COW.  Instead, we copy sectors around the
      * dirty data if needed.  We need a bitmap to do that.
@@ -208,7 +286,6 @@ static void coroutine_fn mirror_run(void *opaque)
         bdrv_get_info(s->target, &bdi);
         if (s->granularity < bdi.cluster_size) {
             s->buf_size = MAX(s->buf_size, bdi.cluster_size);
-            length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
             s->cow_bitmap = bitmap_new(length);
         }
     }
@@ -216,6 +293,7 @@ static void coroutine_fn mirror_run(void *opaque)
     end = s->common.len >> BDRV_SECTOR_BITS;
     s->buf = qemu_blockalign(bs, s->buf_size);
     sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    mirror_free_init(s);
 
     if (s->mode != MIRROR_SYNC_MODE_NONE) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -261,8 +339,9 @@ static void coroutine_fn mirror_run(void *opaque)
          */
         if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
             s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
-            if (s->in_flight > 0) {
-                trace_mirror_yield(s, s->in_flight, cnt);
+            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
+                (cnt == 0 && s->in_flight > 0)) {
+                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                 qemu_coroutine_yield();
                 continue;
             } else if (cnt != 0) {
@@ -354,6 +433,7 @@ immediate_exit:
     assert(s->in_flight == 0);
     qemu_vfree(s->buf);
     g_free(s->cow_bitmap);
+    g_free(s->in_flight_bitmap);
     bdrv_set_dirty_tracking(bs, 0);
     bdrv_iostatus_disable(s->target);
     if (s->should_complete && ret == 0) {
diff --git a/trace-events b/trace-events
index 43e6b73fa5..b4a23cdc2e 100644
--- a/trace-events
+++ b/trace-events
@@ -86,7 +86,9 @@ mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64
 mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
 mirror_cow(void *s, int64_t sector_num) "s %p sector_num %"PRId64
 mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d"
-mirror_yield(void *s, int64_t cnt, int in_flight) "s %p dirty count %"PRId64" in_flight %d"
+mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
+mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
+mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
 
 # blockdev.c
 qmp_block_job_cancel(void *job) "job %p"
-- 
cgit v1.2.1


From 884fea4e87fbc6daf4e6df618bb3cadc188dcc6b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 22 Jan 2013 09:03:15 +0100
Subject: mirror: support arbitrarily-sized iterations

Yet another optimization is to extend the mirroring iteration to include more
adjacent dirty blocks.  This limits the number of I/O operations and makes
mirroring efficient even with a small granularity.  Most of the infrastructure
is already in place; we only need to put a loop around the computation of
the origin and sector count of the iteration.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c | 97 ++++++++++++++++++++++++++++++++++++++++------------------
 trace-events   |  1 +
 2 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 4696dc84ea..9347533b9b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -140,7 +140,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
     BlockDriverState *source = s->common.bs;
     int nb_sectors, sectors_per_chunk, nb_chunks;
-    int64_t end, sector_num, chunk_num, next_sector, hbitmap_next_sector;
+    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
     MirrorOp *op;
 
     s->sector_num = hbitmap_iter_next(&s->hbi);
@@ -152,45 +152,82 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
     }
 
     hbitmap_next_sector = s->sector_num;
+    sector_num = s->sector_num;
+    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    end = s->common.len >> BDRV_SECTOR_BITS;
 
-    /* If we have no backing file yet in the destination, and the cluster size
-     * is very large, we need to do COW ourselves.  The first time a cluster is
-     * copied, copy it entirely.
+    /* Extend the QEMUIOVector to include all adjacent blocks that will
+     * be copied in this operation.
+     *
+     * We have to do this if we have no backing file yet in the destination,
+     * and the cluster size is very large.  Then we need to do COW ourselves.
+     * The first time a cluster is copied, copy it entirely.  Note that,
+     * because both the granularity and the cluster size are powers of two,
+     * the number of sectors to copy cannot exceed one cluster.
      *
-     * Because both the granularity and the cluster size are powers of two, the
-     * number of sectors to copy cannot exceed one cluster.
+     * We also want to extend the QEMUIOVector to include more adjacent
+     * dirty blocks if possible, to limit the number of I/O operations and
+     * run efficiently even with a small granularity.
      */
-    sector_num = s->sector_num;
-    sectors_per_chunk = nb_sectors = s->granularity >> BDRV_SECTOR_BITS;
-    chunk_num = sector_num / sectors_per_chunk;
-    if (s->cow_bitmap && !test_bit(chunk_num, s->cow_bitmap)) {
-        trace_mirror_cow(s, sector_num);
-        bdrv_round_to_clusters(s->target,
-                               sector_num, sectors_per_chunk,
-                               &sector_num, &nb_sectors);
-
-        /* The rounding may make us copy sectors before the
-         * first dirty one.
-         */
-        chunk_num = sector_num / sectors_per_chunk;
-    }
+    nb_chunks = 0;
+    nb_sectors = 0;
+    next_sector = sector_num;
+    next_chunk = sector_num / sectors_per_chunk;
 
     /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
-    while (test_bit(chunk_num, s->in_flight_bitmap)) {
+    while (test_bit(next_chunk, s->in_flight_bitmap)) {
         trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
         qemu_coroutine_yield();
     }
 
-    end = s->common.len >> BDRV_SECTOR_BITS;
-    nb_sectors = MIN(nb_sectors, end - sector_num);
-    nb_chunks = (nb_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
-    while (s->buf_free_count < nb_chunks) {
-        trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
-        qemu_coroutine_yield();
-    }
+    do {
+        int added_sectors, added_chunks;
+
+        if (!bdrv_get_dirty(source, next_sector) ||
+            test_bit(next_chunk, s->in_flight_bitmap)) {
+            assert(nb_sectors > 0);
+            break;
+        }
+
+        added_sectors = sectors_per_chunk;
+        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
+            bdrv_round_to_clusters(s->target,
+                                   next_sector, added_sectors,
+                                   &next_sector, &added_sectors);
+
+            /* On the first iteration, the rounding may make us copy
+             * sectors before the first dirty one.
+             */
+            if (next_sector < sector_num) {
+                assert(nb_sectors == 0);
+                sector_num = next_sector;
+                next_chunk = next_sector / sectors_per_chunk;
+            }
+        }
+
+        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
+        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
+
+        /* When doing COW, it may happen that there is not enough space for
+         * a full cluster.  Wait if that is the case.
+         */
+        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
+            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
+            qemu_coroutine_yield();
+        }
+        if (s->buf_free_count < nb_chunks + added_chunks) {
+            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
+            break;
+        }
+
+        /* We have enough free space to copy these sectors.  */
+        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);
 
-    /* We have enough free space to copy these sectors.  */
-    bitmap_set(s->in_flight_bitmap, chunk_num, nb_chunks);
+        nb_sectors += added_sectors;
+        nb_chunks += added_chunks;
+        next_sector += added_sectors;
+        next_chunk += added_chunks;
+    } while (next_sector < end);
 
     /* Allocate a MirrorOp that is used as an AIO callback.  */
     op = g_slice_new(MirrorOp);
diff --git a/trace-events b/trace-events
index b4a23cdc2e..2b28076e45 100644
--- a/trace-events
+++ b/trace-events
@@ -89,6 +89,7 @@ mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %
 mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
 mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
 mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
+mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
 
 # blockdev.c
 qmp_block_job_cancel(void *job) "job %p"
-- 
cgit v1.2.1


From 02582abd48aa3d860015e9a8fcd0d7ec1c34ec62 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 17 Jan 2013 21:45:24 +0100
Subject: block: Add special error code for wrong format

The block drivers need a special error code for "wrong format".
From the available error codes EMEDIUMTYPE fits best.
It is not available on all platforms, so a definition in
qemu-common.h and a specific error report are needed.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c            | 9 +++++++--
 include/qemu-common.h | 3 +++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index ad25b9b86e..ac396f3c94 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -617,8 +617,13 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type)
 
     ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv);
     if (ret < 0) {
-        error_report("could not open disk image %s: %s",
-                     file, strerror(-ret));
+        if (ret == -EMEDIUMTYPE) {
+            error_report("could not open disk image %s: not in %s format",
+                         file, drv->format_name);
+        } else {
+            error_report("could not open disk image %s: %s",
+                         file, strerror(-ret));
+        }
         goto err;
     }
 
diff --git a/include/qemu-common.h b/include/qemu-common.h
index ca464bb367..af2379ff38 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -68,6 +68,9 @@
 #if !defined(ECANCELED)
 #define ECANCELED 4097
 #endif
+#if !defined(EMEDIUMTYPE)
+#define EMEDIUMTYPE 4098
+#endif
 #ifndef TIME_MAX
 #define TIME_MAX LONG_MAX
 #endif
-- 
cgit v1.2.1


From 15bac0d54f78adb5e255155a69e56ab7f6d8c8ea Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 17 Jan 2013 21:45:25 +0100
Subject: block: Use error code EMEDIUMTYPE for wrong format in some block
 drivers

This improves error reports for bochs, cow, qcow, qcow2, qed and vmdk
when a file with the wrong format is selected.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/bochs.c | 2 +-
 block/cow.c   | 2 +-
 block/qcow.c  | 2 +-
 block/qcow2.c | 2 +-
 block/qed.c   | 2 +-
 block/vmdk.c  | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/block/bochs.c b/block/bochs.c
index 1b1d9cdbe5..37375834e9 100644
--- a/block/bochs.c
+++ b/block/bochs.c
@@ -126,7 +126,7 @@ static int bochs_open(BlockDriverState *bs, int flags)
         strcmp(bochs.subtype, GROWING_TYPE) ||
 	((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
 	(le32_to_cpu(bochs.version) != HEADER_V1))) {
-        goto fail;
+        return -EMEDIUMTYPE;
     }
 
     if (le32_to_cpu(bochs.version) == HEADER_V1) {
diff --git a/block/cow.c b/block/cow.c
index a33ce950d4..4baf9042fe 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -73,7 +73,7 @@ static int cow_open(BlockDriverState *bs, int flags)
     }
 
     if (be32_to_cpu(cow_header.magic) != COW_MAGIC) {
-        ret = -EINVAL;
+        ret = -EMEDIUMTYPE;
         goto fail;
     }
 
diff --git a/block/qcow.c b/block/qcow.c
index 4276610afd..a7135eea47 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -112,7 +112,7 @@ static int qcow_open(BlockDriverState *bs, int flags)
     be64_to_cpus(&header.l1_table_offset);
 
     if (header.magic != QCOW_MAGIC) {
-        ret = -EINVAL;
+        ret = -EMEDIUMTYPE;
         goto fail;
     }
     if (header.version != QCOW_VERSION) {
diff --git a/block/qcow2.c b/block/qcow2.c
index f6abff6111..7610e569e3 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -311,7 +311,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
     be32_to_cpus(&header.nb_snapshots);
 
     if (header.magic != QCOW_MAGIC) {
-        ret = -EINVAL;
+        ret = -EMEDIUMTYPE;
         goto fail;
     }
     if (header.version < 2 || header.version > 3) {
diff --git a/block/qed.c b/block/qed.c
index cf85d8f2b4..b8515e58b3 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -390,7 +390,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
     qed_header_le_to_cpu(&le_header, &s->header);
 
     if (s->header.magic != QED_MAGIC) {
-        return -EINVAL;
+        return -EMEDIUMTYPE;
     }
     if (s->header.features & ~QED_FEATURE_MASK) {
         /* image uses unsupported feature bits */
diff --git a/block/vmdk.c b/block/vmdk.c
index 19298c2a3e..8333afb5e3 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -616,7 +616,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
             return vmdk_open_vmdk4(bs, file, flags);
             break;
         default:
-            return -EINVAL;
+            return -EMEDIUMTYPE;
             break;
     }
 }
@@ -718,7 +718,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
     }
     buf[2047] = '\0';
     if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
-        return -EINVAL;
+        return -EMEDIUMTYPE;
     }
     if (strcmp(ct, "monolithicFlat") &&
         strcmp(ct, "twoGbMaxExtentSparse") &&
-- 
cgit v1.2.1


From 9f0470bb2d7942c28977296ff2598cdf30886e07 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 17 Jan 2013 21:45:26 +0100
Subject: block/vdi: Improve debug output for signature

The signature is a 32 bit value and needs up to 8 hex digits for printing.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/vdi.c b/block/vdi.c
index 021abaa227..0e1ed617f3 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -246,7 +246,7 @@ static void vdi_header_print(VdiHeader *header)
 {
     char uuid[37];
     logout("text        %s", header->text);
-    logout("signature   0x%04x\n", header->signature);
+    logout("signature   0x%08x\n", header->signature);
     logout("header size 0x%04x\n", header->header_size);
     logout("image type  0x%04x\n", header->image_type);
     logout("image flags 0x%04x\n", header->image_flags);
-- 
cgit v1.2.1


From 8937f8222c14ab42140a5645c34e17cc620c05bb Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 17 Jan 2013 21:45:27 +0100
Subject: block/vdi: Improved return values from vdi_open

vdi_open returned -1 in case of any error, but it should return an
error code (negative value of errno or -EMEDIUMTYPE).

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index 0e1ed617f3..8b768bfa49 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -369,10 +369,12 @@ static int vdi_open(BlockDriverState *bs, int flags)
     BDRVVdiState *s = bs->opaque;
     VdiHeader header;
     size_t bmap_size;
+    int ret;
 
     logout("\n");
 
-    if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) {
+    ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1);
+    if (ret < 0) {
         goto fail;
     }
 
@@ -393,30 +395,38 @@ static int vdi_open(BlockDriverState *bs, int flags)
     if (header.version != VDI_VERSION_1_1) {
         logout("unsupported version %u.%u\n",
                header.version >> 16, header.version & 0xffff);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.offset_bmap % SECTOR_SIZE != 0) {
         /* We only support block maps which start on a sector boundary. */
         logout("unsupported block map offset 0x%x B\n", header.offset_bmap);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.offset_data % SECTOR_SIZE != 0) {
         /* We only support data blocks which start on a sector boundary. */
         logout("unsupported data offset 0x%x B\n", header.offset_data);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.sector_size != SECTOR_SIZE) {
         logout("unsupported sector size %u B\n", header.sector_size);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.block_size != 1 * MiB) {
         logout("unsupported block size %u B\n", header.block_size);
+        ret = -ENOTSUP;
         goto fail;
     } else if (header.disk_size >
                (uint64_t)header.blocks_in_image * header.block_size) {
         logout("unsupported disk size %" PRIu64 " B\n", header.disk_size);
+        ret = -ENOTSUP;
         goto fail;
     } else if (!uuid_is_null(header.uuid_link)) {
         logout("link uuid != 0, unsupported\n");
+        ret = -ENOTSUP;
         goto fail;
     } else if (!uuid_is_null(header.uuid_parent)) {
         logout("parent uuid != 0, unsupported\n");
+        ret = -ENOTSUP;
         goto fail;
     }
 
@@ -432,7 +442,8 @@ static int vdi_open(BlockDriverState *bs, int flags)
     if (bmap_size > 0) {
         s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
     }
-    if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) {
+    ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size);
+    if (ret < 0) {
         goto fail_free_bmap;
     }
 
@@ -448,7 +459,7 @@ static int vdi_open(BlockDriverState *bs, int flags)
     g_free(s->bmap);
 
  fail:
-    return -1;
+    return ret;
 }
 
 static int vdi_reopen_prepare(BDRVReopenState *state,
-- 
cgit v1.2.1


From 0e87ba2ccbf900cc5a56b95e0671e5a5d2c2f6a0 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 17 Jan 2013 21:45:28 +0100
Subject: block/vdi: Check for bad signature

vdi_open did not check for a bad signature.
This check was only in vdi_probe.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/vdi.c b/block/vdi.c
index 8b768bfa49..257a592ea9 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -392,7 +392,11 @@ static int vdi_open(BlockDriverState *bs, int flags)
         header.disk_size &= ~(SECTOR_SIZE - 1);
     }
 
-    if (header.version != VDI_VERSION_1_1) {
+    if (header.signature != VDI_SIGNATURE) {
+        logout("bad vdi signature %08x\n", header.signature);
+        ret = -EMEDIUMTYPE;
+        goto fail;
+    } else if (header.version != VDI_VERSION_1_1) {
         logout("unsupported version %u.%u\n",
                header.version >> 16, header.version & 0xffff);
         ret = -ENOTSUP;
-- 
cgit v1.2.1


From 88ff0e48eedd679a9dc1122676d8aa29f8d07571 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 22 Jan 2013 15:01:11 +0100
Subject: mirror: do nothing on zero-sized disk

On a zero-sized disk we need to break out of the job successfully
before bdrv_dirty_iter_init is called, otherwise you will get an
assertion failure with the next patch.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/mirror.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/mirror.c b/block/mirror.c
index 9347533b9b..a62ad86c28 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -305,7 +305,7 @@ static void coroutine_fn mirror_run(void *opaque)
     }
 
     s->common.len = bdrv_getlength(bs);
-    if (s->common.len < 0) {
+    if (s->common.len <= 0) {
         block_job_completed(&s->common, s->common.len);
         return;
     }
-- 
cgit v1.2.1


From 1b0952445522af73b0e78420a9078b3653923703 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 22 Jan 2013 15:01:12 +0100
Subject: hbitmap: add assertion on hbitmap_iter_init

hbitmap_iter_init causes an out-of-bounds access when the "first"
argument is or greater than or equal to the size of the bitmap.
Forbid this with an assertion, and remove the failing testcase.

Reported-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 include/qemu/hbitmap.h |  3 ++-
 tests/test-hbitmap.c   | 13 +++----------
 util/hbitmap.c         |  1 +
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 7ddfb66808..73f5d1d8d3 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -128,7 +128,8 @@ void hbitmap_free(HBitmap *hb);
  * hbitmap_iter_init:
  * @hbi: HBitmapIter to initialize.
  * @hb: HBitmap to iterate on.
- * @first: First bit to visit (0-based).
+ * @first: First bit to visit (0-based, must be strictly less than the
+ * size of the bitmap).
  *
  * Set up @hbi to iterate on the HBitmap @hb.  hbitmap_iter_next will return
  * the lowest-numbered bit that is set in @hb, starting at @first.
diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
index fcc6a00fc3..8c902f2055 100644
--- a/tests/test-hbitmap.c
+++ b/tests/test-hbitmap.c
@@ -86,7 +86,9 @@ static void hbitmap_test_init(TestHBitmapData *data,
     data->bits = g_new0(unsigned long, n);
     data->size = size;
     data->granularity = granularity;
-    hbitmap_test_check(data, 0);
+    if (size) {
+        hbitmap_test_check(data, 0);
+    }
 }
 
 static void hbitmap_test_teardown(TestHBitmapData *data,
@@ -198,14 +200,6 @@ static void test_hbitmap_iter_partial(TestHBitmapData *data,
     hbitmap_test_check(data, L3 / 2);
 }
 
-static void test_hbitmap_iter_past(TestHBitmapData *data,
-                                    const void *unused)
-{
-    hbitmap_test_init(data, L3, 0);
-    hbitmap_test_set(data, 0, L3);
-    hbitmap_test_check(data, L3);
-}
-
 static void test_hbitmap_set_all(TestHBitmapData *data,
                                  const void *unused)
 {
@@ -388,7 +382,6 @@ int main(int argc, char **argv)
     hbitmap_test_add("/hbitmap/size/0", test_hbitmap_zero);
     hbitmap_test_add("/hbitmap/size/unaligned", test_hbitmap_unaligned);
     hbitmap_test_add("/hbitmap/iter/empty", test_hbitmap_iter_empty);
-    hbitmap_test_add("/hbitmap/iter/past", test_hbitmap_iter_past);
     hbitmap_test_add("/hbitmap/iter/partial", test_hbitmap_iter_partial);
     hbitmap_test_add("/hbitmap/iter/granularity", test_hbitmap_iter_granularity);
     hbitmap_test_add("/hbitmap/get/all", test_hbitmap_get_all);
diff --git a/util/hbitmap.c b/util/hbitmap.c
index fb7e01e8c5..2aa487db74 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -147,6 +147,7 @@ void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first)
 
     hbi->hb = hb;
     pos = first >> hb->granularity;
+    assert(pos < hb->size);
     hbi->pos = pos >> BITS_PER_LEVEL;
     hbi->granularity = hb->granularity;
 
-- 
cgit v1.2.1


From 1147bb15a715a907a91195c2ed601fc926e43d46 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Fri, 4 Jan 2013 14:44:41 -0500
Subject: ahci: Remove unused AHCIDevice fields

'dma_status' and 'dma_cb' are written to, but never read.
Remove these fields in preparation for AHCI migration bits.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/ahci.c | 8 ++------
 hw/ide/ahci.h | 2 --
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 21f50ea5be..2d185cb20f 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1035,11 +1035,10 @@ out:
 static void ahci_start_dma(IDEDMA *dma, IDEState *s,
                            BlockDriverCompletionFunc *dma_cb)
 {
+#ifdef DEBUG_AHCI
     AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
-
+#endif
     DPRINTF(ad->port_no, "\n");
-    ad->dma_cb = dma_cb;
-    ad->dma_status |= BM_STATUS_DMAING;
     s->io_buffer_offset = 0;
     dma_cb(s, 0);
 }
@@ -1095,7 +1094,6 @@ static int ahci_dma_set_unit(IDEDMA *dma, int unit)
 static int ahci_dma_add_status(IDEDMA *dma, int status)
 {
     AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
-    ad->dma_status |= status;
     DPRINTF(ad->port_no, "set status: %x\n", status);
 
     if (status & BM_STATUS_INT) {
@@ -1114,8 +1112,6 @@ static int ahci_dma_set_inactive(IDEDMA *dma)
     /* update d2h status */
     ahci_write_fis_d2h(ad, NULL);
 
-    ad->dma_cb = NULL;
-
     if (!ad->check_bh) {
         /* maybe we still have something to process, check later */
         ad->check_bh = qemu_bh_new(ahci_check_cmd_bh, ad);
diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h
index 1200a56ada..735b379e82 100644
--- a/hw/ide/ahci.h
+++ b/hw/ide/ahci.h
@@ -281,11 +281,9 @@ struct AHCIDevice {
     QEMUBH *check_bh;
     uint8_t *lst;
     uint8_t *res_fis;
-    int dma_status;
     int done_atapi_packet;
     int busy_slot;
     int init_d2h_sent;
-    BlockDriverCompletionFunc *dma_cb;
     AHCICmdHdr *cur_cmd;
     NCQTransferState ncq_tfs[AHCI_MAX_CMDS];
 };
-- 
cgit v1.2.1


From 4ac557c89b04d506c876a0a378e815d822261c8a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 15 Jan 2013 16:12:09 +0100
Subject: ahci: Change data types in preparation for migration

The size of an int depends on the host, so in order to be able to
migrate these fields, make them either int32_t or bool, depending on the
use.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/ahci.c | 8 ++++----
 hw/ide/ahci.h | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 2d185cb20f..f91cff2b4d 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -241,7 +241,7 @@ static void  ahci_port_write(AHCIState *s, int port, int offset, uint32_t val)
             if ((pr->cmd & PORT_CMD_FIS_ON) &&
                 !s->dev[port].init_d2h_sent) {
                 ahci_init_d2h(&s->dev[port]);
-                s->dev[port].init_d2h_sent = 1;
+                s->dev[port].init_d2h_sent = true;
             }
 
             check_cmd(s, port);
@@ -494,7 +494,7 @@ static void ahci_reset_port(AHCIState *s, int port)
     pr->scr_err = 0;
     pr->scr_act = 0;
     d->busy_slot = -1;
-    d->init_d2h_sent = 0;
+    d->init_d2h_sent = false;
 
     ide_state = &s->dev[port].port.ifs[0];
     if (!ide_state->bs) {
@@ -946,7 +946,7 @@ static int handle_cmd(AHCIState *s, int port, int slot)
             ide_state->hcyl = 0xeb;
             debug_print_fis(ide_state->io_buffer, 0x10);
             ide_state->feature = IDE_FEATURE_DMA;
-            s->dev[port].done_atapi_packet = 0;
+            s->dev[port].done_atapi_packet = false;
             /* XXX send PIO setup FIS */
         }
 
@@ -991,7 +991,7 @@ static int ahci_start_transfer(IDEDMA *dma)
 
     if (is_atapi && !ad->done_atapi_packet) {
         /* already prepopulated iobuffer */
-        ad->done_atapi_packet = 1;
+        ad->done_atapi_packet = true;
         goto out;
     }
 
diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h
index 735b379e82..70d3b57adf 100644
--- a/hw/ide/ahci.h
+++ b/hw/ide/ahci.h
@@ -281,9 +281,9 @@ struct AHCIDevice {
     QEMUBH *check_bh;
     uint8_t *lst;
     uint8_t *res_fis;
-    int done_atapi_packet;
-    int busy_slot;
-    int init_d2h_sent;
+    bool done_atapi_packet;
+    int32_t busy_slot;
+    bool init_d2h_sent;
     AHCICmdHdr *cur_cmd;
     NCQTransferState ncq_tfs[AHCI_MAX_CMDS];
 };
@@ -295,7 +295,7 @@ typedef struct AHCIState {
     MemoryRegion idp;       /* Index-Data Pair I/O port space */
     unsigned idp_offset;    /* Offset of index in I/O port space */
     uint32_t idp_index;     /* Current IDP index */
-    int ports;
+    int32_t ports;
     qemu_irq irq;
     DMAContext *dma;
 } AHCIState;
-- 
cgit v1.2.1


From a26230218d7d66ec5cb1aec101ceaf0e7400ef7f Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Fri, 4 Jan 2013 14:44:42 -0500
Subject: ahci: Add migration support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Jason tested these patches by migrating Windows 7 and Fedora 17 guests
(while under I/O) on both piix with ahci attached and on q35 (which has
a built-in AHCI controller).

Signed-off-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/ahci.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/ide/ahci.h | 10 ++++++++
 hw/ide/ich.c  | 14 +++++++---
 3 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index f91cff2b4d..ad0094f532 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1199,6 +1199,82 @@ void ahci_reset(AHCIState *s)
     }
 }
 
+static const VMStateDescription vmstate_ahci_device = {
+    .name = "ahci port",
+    .version_id = 1,
+    .fields = (VMStateField []) {
+        VMSTATE_IDE_BUS(port, AHCIDevice),
+        VMSTATE_UINT32(port_state, AHCIDevice),
+        VMSTATE_UINT32(finished, AHCIDevice),
+        VMSTATE_UINT32(port_regs.lst_addr, AHCIDevice),
+        VMSTATE_UINT32(port_regs.lst_addr_hi, AHCIDevice),
+        VMSTATE_UINT32(port_regs.fis_addr, AHCIDevice),
+        VMSTATE_UINT32(port_regs.fis_addr_hi, AHCIDevice),
+        VMSTATE_UINT32(port_regs.irq_stat, AHCIDevice),
+        VMSTATE_UINT32(port_regs.irq_mask, AHCIDevice),
+        VMSTATE_UINT32(port_regs.cmd, AHCIDevice),
+        VMSTATE_UINT32(port_regs.tfdata, AHCIDevice),
+        VMSTATE_UINT32(port_regs.sig, AHCIDevice),
+        VMSTATE_UINT32(port_regs.scr_stat, AHCIDevice),
+        VMSTATE_UINT32(port_regs.scr_ctl, AHCIDevice),
+        VMSTATE_UINT32(port_regs.scr_err, AHCIDevice),
+        VMSTATE_UINT32(port_regs.scr_act, AHCIDevice),
+        VMSTATE_UINT32(port_regs.cmd_issue, AHCIDevice),
+        VMSTATE_BOOL(done_atapi_packet, AHCIDevice),
+        VMSTATE_INT32(busy_slot, AHCIDevice),
+        VMSTATE_BOOL(init_d2h_sent, AHCIDevice),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static int ahci_state_post_load(void *opaque, int version_id)
+{
+    int i;
+    struct AHCIDevice *ad;
+    AHCIState *s = opaque;
+
+    for (i = 0; i < s->ports; i++) {
+        ad = &s->dev[i];
+        AHCIPortRegs *pr = &ad->port_regs;
+
+        map_page(&ad->lst,
+                 ((uint64_t)pr->lst_addr_hi << 32) | pr->lst_addr, 1024);
+        map_page(&ad->res_fis,
+                 ((uint64_t)pr->fis_addr_hi << 32) | pr->fis_addr, 256);
+        /*
+         * All pending i/o should be flushed out on a migrate. However,
+         * we might not have cleared the busy_slot since this is done
+         * in a bh. Also, issue i/o against any slots that are pending.
+         */
+        if ((ad->busy_slot != -1) &&
+            !(ad->port.ifs[0].status & (BUSY_STAT|DRQ_STAT))) {
+            pr->cmd_issue &= ~(1 << ad->busy_slot);
+            ad->busy_slot = -1;
+        }
+        check_cmd(s, i);
+    }
+
+    return 0;
+}
+
+const VMStateDescription vmstate_ahci = {
+    .name = "ahci",
+    .version_id = 1,
+    .post_load = ahci_state_post_load,
+    .fields = (VMStateField []) {
+        VMSTATE_STRUCT_VARRAY_POINTER_INT32(dev, AHCIState, ports,
+                                     vmstate_ahci_device, AHCIDevice),
+        VMSTATE_UINT32(control_regs.cap, AHCIState),
+        VMSTATE_UINT32(control_regs.ghc, AHCIState),
+        VMSTATE_UINT32(control_regs.irqstatus, AHCIState),
+        VMSTATE_UINT32(control_regs.impl, AHCIState),
+        VMSTATE_UINT32(control_regs.version, AHCIState),
+        VMSTATE_UINT32(idp_index, AHCIState),
+        VMSTATE_INT32(ports, AHCIState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 typedef struct SysbusAHCIState {
     SysBusDevice busdev;
     AHCIState ahci;
@@ -1207,7 +1283,11 @@ typedef struct SysbusAHCIState {
 
 static const VMStateDescription vmstate_sysbus_ahci = {
     .name = "sysbus-ahci",
-    .unmigratable = 1,
+    .unmigratable = 1, /* Still buggy under I/O load */
+    .fields = (VMStateField []) {
+        VMSTATE_AHCI(ahci, AHCIPCIState),
+        VMSTATE_END_OF_LIST()
+    },
 };
 
 static void sysbus_ahci_reset(DeviceState *dev)
diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h
index 70d3b57adf..85f37fe99d 100644
--- a/hw/ide/ahci.h
+++ b/hw/ide/ahci.h
@@ -305,6 +305,16 @@ typedef struct AHCIPCIState {
     AHCIState ahci;
 } AHCIPCIState;
 
+extern const VMStateDescription vmstate_ahci;
+
+#define VMSTATE_AHCI(_field, _state) {                               \
+    .name       = (stringify(_field)),                               \
+    .size       = sizeof(AHCIState),                                 \
+    .vmsd       = &vmstate_ahci,                                     \
+    .flags      = VMS_STRUCT,                                        \
+    .offset     = vmstate_offset_value(_state, _field, AHCIState),   \
+}
+
 typedef struct NCQFrame {
     uint8_t fis_type;
     uint8_t c;
diff --git a/hw/ide/ich.c b/hw/ide/ich.c
index 1fb803d340..cc30adc701 100644
--- a/hw/ide/ich.c
+++ b/hw/ide/ich.c
@@ -79,9 +79,15 @@
 #define ICH9_IDP_INDEX          0x10
 #define ICH9_IDP_INDEX_LOG2     0x04
 
-static const VMStateDescription vmstate_ahci = {
-    .name = "ahci",
-    .unmigratable = 1,
+static const VMStateDescription vmstate_ich9_ahci = {
+    .name = "ich9_ahci",
+    .unmigratable = 1, /* Still buggy under I/O load */
+    .version_id = 1,
+    .fields = (VMStateField []) {
+        VMSTATE_PCI_DEVICE(card, AHCIPCIState),
+        VMSTATE_AHCI(ahci, AHCIPCIState),
+        VMSTATE_END_OF_LIST()
+    },
 };
 
 static void pci_ich9_reset(DeviceState *dev)
@@ -152,7 +158,7 @@ static void ich_ahci_class_init(ObjectClass *klass, void *data)
     k->device_id = PCI_DEVICE_ID_INTEL_82801IR;
     k->revision = 0x02;
     k->class_id = PCI_CLASS_STORAGE_SATA;
-    dc->vmsd = &vmstate_ahci;
+    dc->vmsd = &vmstate_ich9_ahci;
     dc->reset = pci_ich9_reset;
 }
 
-- 
cgit v1.2.1


From 8689907266b649b757c2203d9652cbe928a3ae0b Mon Sep 17 00:00:00 2001
From: Vishvananda Ishaya <vishvananda@gmail.com>
Date: Thu, 24 Jan 2013 10:00:40 -0800
Subject: block: Create proper size file for disk mirror

The qmp monitor command to mirror a disk was passing -1 for size
along with the disk's backing file. This size of the resulting disk
is the size of the backing file, which is incorrect if the disk
has been resized. Therefore we should always pass in the size of
the current disk.

Signed-off-by: Vishvananda Ishaya <vishvananda@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index ac396f3c94..fdc573f519 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1282,11 +1282,11 @@ void qmp_drive_mirror(const char *device, const char *target,
         return;
     }
 
+    bdrv_get_geometry(bs, &size);
+    size *= 512;
     if (sync == MIRROR_SYNC_MODE_FULL && mode != NEW_IMAGE_MODE_EXISTING) {
         /* create new image w/o backing file */
         assert(format && drv);
-        bdrv_get_geometry(bs, &size);
-        size *= 512;
         bdrv_img_create(target, format,
                         NULL, NULL, NULL, size, flags, &local_err);
     } else {
@@ -1299,7 +1299,7 @@ void qmp_drive_mirror(const char *device, const char *target,
             bdrv_img_create(target, format,
                             source->filename,
                             source->drv->format_name,
-                            NULL, -1, flags, &local_err);
+                            NULL, size, flags, &local_err);
             break;
         default:
             abort();
-- 
cgit v1.2.1


From 67bec53d9f2ccd3aa7d37a7e0689122587929220 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 24 Jan 2013 12:50:28 +0100
Subject: ide: Add fall through annotations

Add comments to help static analysers detect that these cases are
intentional, and clean up some whitespace in the environment of these
comments.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 hw/ide/core.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 14ad0799c3..3743dc3b55 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -1149,8 +1149,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         }
         ide_set_irq(s->bus);
         break;
+
     case WIN_VERIFY_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_VERIFY:
     case WIN_VERIFY_ONCE:
         /* do sector number check ? */
@@ -1158,8 +1160,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         s->status = READY_STAT | SEEK_STAT;
         ide_set_irq(s->bus);
         break;
+
     case WIN_READ_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_READ:
     case WIN_READ_ONCE:
         if (s->drive_kind == IDE_CD) {
@@ -1173,8 +1177,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         s->req_nb_sectors = 1;
         ide_sector_read(s);
         break;
+
     case WIN_WRITE_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_WRITE:
     case WIN_WRITE_ONCE:
     case CFA_WRITE_SECT_WO_ERASE:
@@ -1189,8 +1195,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         ide_transfer_start(s, s->io_buffer, 512, ide_sector_write);
         s->media_changed = 1;
         break;
+
     case WIN_MULTREAD_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_MULTREAD:
         if (!s->bs) {
             goto abort_cmd;
@@ -1202,8 +1210,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         s->req_nb_sectors = s->mult_sectors;
         ide_sector_read(s);
         break;
+
     case WIN_MULTWRITE_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_MULTWRITE:
     case CFA_WRITE_MULTI_WO_ERASE:
         if (!s->bs) {
@@ -1222,8 +1232,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         ide_transfer_start(s, s->io_buffer, 512 * n, ide_sector_write);
         s->media_changed = 1;
         break;
+
     case WIN_READDMA_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_READDMA:
     case WIN_READDMA_ONCE:
         if (!s->bs) {
@@ -1232,8 +1244,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
 	ide_cmd_lba48_transform(s, lba48);
         ide_sector_start_dma(s, IDE_DMA_READ);
         break;
+
     case WIN_WRITEDMA_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_WRITEDMA:
     case WIN_WRITEDMA_ONCE:
         if (!s->bs) {
@@ -1243,14 +1257,17 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         ide_sector_start_dma(s, IDE_DMA_WRITE);
         s->media_changed = 1;
         break;
+
     case WIN_READ_NATIVE_MAX_EXT:
-	lba48 = 1;
+        lba48 = 1;
+        /* fall through */
     case WIN_READ_NATIVE_MAX:
 	ide_cmd_lba48_transform(s, lba48);
         ide_set_sector(s, s->nb_sectors - 1);
         s->status = READY_STAT | SEEK_STAT;
         ide_set_irq(s->bus);
         break;
+
     case WIN_CHECKPOWERMODE1:
     case WIN_CHECKPOWERMODE2:
         s->error = 0;
-- 
cgit v1.2.1