summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2015-11-10 17:49:39 +0000
committerPeter Maydell <peter.maydell@linaro.org>2015-11-10 17:49:39 +0000
commita77067f6ac9b17beefea506ce5f514072fe3fcf4 (patch)
treea494e1154c79ed0d8ed5f947d321dfde26c186d1
parenta1a88589dc982f9f8b6c717c2ac98dd71dd4353d (diff)
parent15b3b8eaae8dbcc903bb164311ea0066c77536a7 (diff)
downloadqemu-a77067f6ac9b17beefea506ce5f514072fe3fcf4.tar.gz
Merge remote-tracking branch 'remotes/juanquintela/tags/migration/20151110' into staging
migration/next for 20151110 # gpg: Signature made Tue 10 Nov 2015 14:23:26 GMT using RSA key ID 5872D723 # gpg: Good signature from "Juan Quintela <quintela@redhat.com>" # gpg: aka "Juan Quintela <quintela@trasno.org>" * remotes/juanquintela/tags/migration/20151110: (57 commits) migration: qemu_savevm_state_cleanup becomes mandatory operation Inhibit ballooning during postcopy Disable mlock around incoming postcopy End of migration for postcopy Postcopy: Mark nohugepage before discard postcopy: Wire up loadvm_postcopy_handle_ commands Start up a postcopy/listener thread ready for incoming page data Postcopy; Handle userfault requests Round up RAMBlock sizes to host page sizes Host page!=target page: Cleanup bitmaps Don't iterate on precopy-only devices during postcopy Don't sync dirty bitmaps in postcopy postcopy: Check order of received target pages Postcopy: Use helpers to map pages during migration postcopy_ram.c: place_page and helpers Page request: Consume pages off the post-copy queue Page request: Process incoming page request Page request: Add MIG_RP_MSG_REQ_PAGES reverse command Postcopy: End of iteration Postcopy: Postcopy startup in migration thread ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--balloon.c11
-rw-r--r--docs/migration.txt191
-rw-r--r--exec.c92
-rw-r--r--hmp-commands.hx15
-rw-r--r--hmp.c7
-rw-r--r--hmp.h1
-rw-r--r--hw/ppc/spapr.c2
-rw-r--r--hw/virtio/virtio-balloon.c4
-rw-r--r--include/exec/cpu-common.h4
-rw-r--r--include/exec/exec-all.h1
-rw-r--r--include/exec/ram_addr.h2
-rw-r--r--include/migration/migration.h121
-rw-r--r--include/migration/postcopy-ram.h99
-rw-r--r--include/migration/qemu-file.h10
-rw-r--r--include/migration/vmstate.h8
-rw-r--r--include/qemu-common.h1
-rw-r--r--include/qemu/osdep.h9
-rw-r--r--include/qemu/typedefs.h3
-rw-r--r--include/sysemu/balloon.h2
-rw-r--r--include/sysemu/sysemu.h46
-rw-r--r--kvm-all.c1
-rw-r--r--linux-headers/linux/userfaultfd.h167
-rw-r--r--migration/Makefile.objs2
-rw-r--r--migration/block.c9
-rw-r--r--migration/migration.c723
-rw-r--r--migration/postcopy-ram.c767
-rw-r--r--migration/qemu-file-unix.c111
-rw-r--r--migration/qemu-file.c64
-rw-r--r--migration/ram.c997
-rw-r--r--migration/savevm.c826
-rw-r--r--qapi-schema.json18
-rw-r--r--qmp-commands.hx19
-rw-r--r--qtest.c1
-rw-r--r--trace-events84
-rw-r--r--vl.c1
35 files changed, 4168 insertions, 251 deletions
diff --git a/balloon.c b/balloon.c
index 5d69e8a00b..0f45d1b5c4 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,17 @@
static QEMUBalloonEvent *balloon_event_fn;
static QEMUBalloonStatus *balloon_stat_fn;
static void *balloon_opaque;
+static bool balloon_inhibited;
+
+bool qemu_balloon_is_inhibited(void)
+{
+ return balloon_inhibited;
+}
+
+void qemu_balloon_inhibit(bool state)
+{
+ balloon_inhibited = state;
+}
static bool have_balloon(Error **errp)
{
diff --git a/docs/migration.txt b/docs/migration.txt
index f6df4beb2a..fda8d61d69 100644
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -291,3 +291,194 @@ save/send this state when we are in the middle of a pio operation
(that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is
not enabled, the values on that fields are garbage and don't need to
be sent.
+
+= Return path =
+
+In most migration scenarios there is only a single data path that runs
+from the source VM to the destination, typically along a single fd (although
+possibly with another fd or similar for some fast way of throwing pages across).
+
+However, some uses need two way communication; in particular the Postcopy
+destination needs to be able to request pages on demand from the source.
+
+For these scenarios there is a 'return path' from the destination to the source;
+qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return
+path.
+
+ Source side
+ Forward path - written by migration thread
+ Return path - opened by main thread, read by return-path thread
+
+ Destination side
+ Forward path - read by main thread
+ Return path - opened by main thread, written by main thread AND postcopy
+ thread (protected by rp_mutex)
+
+= Postcopy =
+'Postcopy' migration is a way to deal with migrations that refuse to converge
+(or take too long to converge). Its plus side is that there is an upper bound
+on the amount of migration traffic and time it takes; the down side is that
+during the postcopy phase, a failure of *either* side or the network connection
+causes the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has been
+transferred, and accesses to pages that are yet to be transferred cause
+a fault that's translated by QEMU into a request to the source QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that if precopy
+doesn't finish in a given time the switch is made to postcopy.
+
+=== Enabling postcopy ===
+
+To enable postcopy, issue this command on the monitor prior to the
+start of migration:
+
+migrate_set_capability x-postcopy-ram on
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode. Issuing:
+
+migrate_start_postcopy
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on. Issuing it after the end of a migration is harmless.
+
+Note: During the postcopy phase, the bandwidth limit set using
+migrate_set_speed is ignored (to avoid delaying requested pages that
+the destination is waiting for).
+
+=== Postcopy device transfer ===
+
+Loading of device data may cause the device emulation to access guest RAM,
+which may trigger faults that have to be resolved by the source. As such,
+the migration stream has to be able to respond with page data *during* the
+device load; hence the device data has to be read from the stream completely
+before the device load begins, to free the stream up. This is achieved by
+'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
+When postcopy starts the source sends the page discard data and then
+forms the 'package' containing:
+
+ Command: 'postcopy listen'
+ The device state
+ A series of sections, identical to the precopy stream's device state stream
+ containing everything except postcopiable devices (i.e. RAM)
+ Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the
+contents are formatted in the same way as the main migration stream.
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy);
+however, when a page request is received from the destination, the dirty page
+scanning restarts from the requested location. This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+------------------------------------------------------------------------------
+                        1      2   3       4 5                     6  7
+main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+thread                             |       |
+                                   |     (page request)
+                                   |        \___
+                                   v            \
+listen thread:                     --- page -- page -- page -- page -- page --
+
+                                   a   b        c
+------------------------------------------------------------------------------
+
+On receipt of CMD_PACKAGED (1)
+ All the data associated with the package - the ( ... ) section in the
+diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
+recurses into qemu_loadvm_state_main to process the contents of the package (2)
+which contains commands (3,6) and devices (4...)
+
+On receipt of 'postcopy listen' (3) - i.e. the 1st command in the package -
+a new thread (a) is started that takes over servicing the migration stream,
+while the main thread carries on loading the package. The new thread loads
+normal background page data (b), but if a fault happens during a device load
+(5) the returned page (c) is loaded by the listen thread, allowing the main
+thread's device load to carry on.
+
+The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination
+CPUs start running.
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour
+and is no longer used by migration, while the listen thread carries
+on servicing page data until the end of migration.
+
+=== Postcopy states ===
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+ Advise: Set at the start of migration if postcopy is enabled, even
+ if it hasn't had the start command; here the destination
+ checks that its OS has the support needed for postcopy, and performs
+ setup to ensure the RAM mappings are suitable for later postcopy.
+ The destination will fail early in migration at this point if the
+ required OS support is not present.
+ (Triggered by reception of POSTCOPY_ADVISE command)
+
+ Discard: Entered on receipt of the first 'discard' command; prior to
+ the first Discard being performed, hugepages are switched off
+ (using madvise) to ensure that no new huge pages are created
+ during the postcopy phase, and to cause any huge pages that
+ have discards on them to be broken.
+
+ Listen: The first command in the package, POSTCOPY_LISTEN, switches
+ the destination state to Listen, and starts a new thread
+ (the 'listen thread') which takes over the job of receiving
+ pages off the migration stream, while the main thread carries
+ on processing the blob. With this thread able to process page
+ reception, the destination now 'sensitises' the RAM to detect
+ any access to missing pages (on Linux using the 'userfault'
+ system).
+
+ Running: POSTCOPY_RUN causes the destination to synchronise all
+ state and start the CPUs and IO devices running. The main
+ thread now finishes processing the migration package and
+ now carries on as it would for normal precopy migration
+ (although it can't do the cleanup it would do as it
+ finishes a normal migration).
+
+ End: The listen thread can now quit, and perform the cleanup of migration
+ state; the migration is now complete.
+
+=== Source side page maps ===
+
+The source side keeps two bitmaps during postcopy; 'the migration bitmap'
+and 'unsent map'. The 'migration bitmap' is basically the same as in
+the precopy case, and holds a bit to indicate that a page is 'dirty' -
+i.e. needs sending. During the precopy phase this is updated as the CPU
+dirties pages; however, during postcopy the CPUs are stopped and nothing
+should dirty anything any more.
+
+The 'unsent map' is used for the transition to postcopy. It is a bitmap that
+has a bit cleared whenever a page is sent to the destination, however during
+the transition to postcopy mode it is combined with the migration bitmap
+to form a set of pages that:
+ a) Have been sent but then redirtied (which must be discarded)
+ b) Have not yet been sent - which also must be discarded to cause any
+ transparent huge pages built during precopy to be broken.
+
+Note that the contents of the unsent map are sacrificed during the calculation
+of the discard set and thus aren't valid once in postcopy. The migration bitmap
+is still valid and is used to ensure that no page is sent more than once. Any
+request for a page that has already been sent is ignored. Duplicate requests
+such as this can happen as a page is sent at about the same time the
+destination accesses it.
+
diff --git a/exec.c b/exec.c
index a028961587..b09f18b2a4 100644
--- a/exec.c
+++ b/exec.c
@@ -1377,6 +1377,11 @@ static RAMBlock *find_ram_block(ram_addr_t addr)
return NULL;
}
+const char *qemu_ram_get_idstr(RAMBlock *rb)
+{
+ return rb->idstr;
+}
+
/* Called with iothread lock held. */
void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
{
@@ -1447,7 +1452,7 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
assert(block);
- newsize = TARGET_PAGE_ALIGN(newsize);
+ newsize = HOST_PAGE_ALIGN(newsize);
if (block->used_length == newsize) {
return 0;
@@ -1591,7 +1596,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
return -1;
}
- size = TARGET_PAGE_ALIGN(size);
+ size = HOST_PAGE_ALIGN(size);
new_block = g_malloc0(sizeof(*new_block));
new_block->mr = mr;
new_block->used_length = size;
@@ -1627,8 +1632,8 @@ ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
ram_addr_t addr;
Error *local_err = NULL;
- size = TARGET_PAGE_ALIGN(size);
- max_size = TARGET_PAGE_ALIGN(max_size);
+ size = HOST_PAGE_ALIGN(size);
+ max_size = HOST_PAGE_ALIGN(max_size);
new_block = g_malloc0(sizeof(*new_block));
new_block->mr = mr;
new_block->resized = resized;
@@ -1877,8 +1882,16 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
}
}
-/* Some of the softmmu routines need to translate from a host pointer
- * (typically a TLB entry) back to a ram offset.
+/*
+ * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
+ * in that RAMBlock.
+ *
+ * ptr: Host pointer to look up
+ * round_offset: If true round the result offset down to a page boundary
+ * *ram_addr: set to result ram_addr
+ * *offset: set to result offset within the RAMBlock
+ *
+ * Returns: RAMBlock (or NULL if not found)
*
* By the time this function returns, the returned pointer is not protected
* by RCU anymore. If the caller is not within an RCU critical section and
@@ -1886,18 +1899,22 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
* pointer, such as a reference to the region that includes the incoming
* ram_addr_t.
*/
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+ ram_addr_t *ram_addr,
+ ram_addr_t *offset)
{
RAMBlock *block;
uint8_t *host = ptr;
- MemoryRegion *mr;
if (xen_enabled()) {
rcu_read_lock();
*ram_addr = xen_ram_addr_from_mapcache(ptr);
- mr = qemu_get_ram_block(*ram_addr)->mr;
+ block = qemu_get_ram_block(*ram_addr);
+ if (block) {
+ *offset = (host - block->host);
+ }
rcu_read_unlock();
- return mr;
+ return block;
}
rcu_read_lock();
@@ -1920,10 +1937,49 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
return NULL;
found:
- *ram_addr = block->offset + (host - block->host);
- mr = block->mr;
+ *offset = (host - block->host);
+ if (round_offset) {
+ *offset &= TARGET_PAGE_MASK;
+ }
+ *ram_addr = block->offset + *offset;
rcu_read_unlock();
- return mr;
+ return block;
+}
+
+/*
+ * Finds the named RAMBlock
+ *
+ * name: The name of RAMBlock to find
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ */
+RAMBlock *qemu_ram_block_by_name(const char *name)
+{
+ RAMBlock *block;
+
+ QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+ if (!strcmp(name, block->idstr)) {
+ return block;
+ }
+ }
+
+ return NULL;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+ (typically a TLB entry) back to a ram offset. */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+{
+ RAMBlock *block;
+ ram_addr_t offset; /* Not used */
+
+ block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+
+ if (!block) {
+ return NULL;
+ }
+
+ return block->mr;
}
static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
@@ -3502,6 +3558,16 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
}
return 0;
}
+
+/*
+ * Allows code that needs to deal with migration bitmaps etc to still be built
+ * target independent.
+ */
+size_t qemu_target_page_bits(void)
+{
+ return TARGET_PAGE_BITS;
+}
+
#endif
/*
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 3a4ae3950a..8939b9838a 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1008,6 +1008,21 @@ Set the parameter @var{parameter} for migration.
ETEXI
{
+ .name = "migrate_start_postcopy",
+ .args_type = "",
+ .params = "",
+ .help = "Switch migration to postcopy mode",
+ .mhandler.cmd = hmp_migrate_start_postcopy,
+ },
+
+STEXI
+@item migrate_start_postcopy
+@findex migrate_start_postcopy
+Switch in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
+ETEXI
+
+ {
.name = "client_migrate_info",
.args_type = "protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?",
.params = "protocol hostname port tls-port cert-subject",
diff --git a/hmp.c b/hmp.c
index a15d00c18c..e1f854aefe 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1293,6 +1293,13 @@ void hmp_client_migrate_info(Monitor *mon, const QDict *qdict)
hmp_handle_error(mon, &err);
}
+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict)
+{
+ Error *err = NULL;
+ qmp_migrate_start_postcopy(&err);
+ hmp_handle_error(mon, &err);
+}
+
void hmp_set_password(Monitor *mon, const QDict *qdict)
{
const char *protocol = qdict_get_str(qdict, "protocol");
diff --git a/hmp.h b/hmp.h
index 81656c3d82..a8c5b5a9a6 100644
--- a/hmp.h
+++ b/hmp.h
@@ -69,6 +69,7 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict);
void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict);
void hmp_client_migrate_info(Monitor *mon, const QDict *qdict);
+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict);
void hmp_set_password(Monitor *mon, const QDict *qdict);
void hmp_expire_password(Monitor *mon, const QDict *qdict);
void hmp_eject(Monitor *mon, const QDict *qdict);
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0ed8527969..37d071e4d4 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1588,7 +1588,7 @@ static int htab_load(QEMUFile *f, void *opaque, int version_id)
static SaveVMHandlers savevm_htab_handlers = {
.save_live_setup = htab_save_setup,
.save_live_iterate = htab_save_iterate,
- .save_live_complete = htab_save_complete,
+ .save_live_complete_precopy = htab_save_complete,
.load_state = htab_load,
};
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index c419b17143..9671635e63 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -37,9 +37,11 @@
static void balloon_page(void *addr, int deflate)
{
#if defined(__linux__)
- if (!kvm_enabled() || kvm_has_sync_mmu())
+ if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+ kvm_has_sync_mmu())) {
qemu_madvise(addr, TARGET_PAGE_SIZE,
deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+ }
#endif
}
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9fb1d541d4..85aa4033e7 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -64,8 +64,12 @@ typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
/* This should not be used by devices. */
MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+RAMBlock *qemu_ram_block_by_name(const char *name);
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+ ram_addr_t *ram_addr, ram_addr_t *offset);
void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
void qemu_ram_unset_idstr(ram_addr_t addr);
+const char *qemu_ram_get_idstr(RAMBlock *rb);
void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
int len, int is_write);
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index b07de109fb..d900b0d078 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -72,7 +72,6 @@ void restore_state_to_opc(CPUArchState *env, struct TranslationBlock *tb,
void cpu_gen_init(void);
bool cpu_restore_state(CPUState *cpu, uintptr_t searched_pc);
-void page_size_init(void);
void QEMU_NORETURN cpu_resume_from_signal(CPUState *cpu, void *puc);
void QEMU_NORETURN cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 3360ac5fde..7115154bc1 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -22,8 +22,6 @@
#ifndef CONFIG_USER_ONLY
#include "hw/xen/xen.h"
-typedef struct RAMBlock RAMBlock;
-
struct RAMBlock {
struct rcu_head rcu;
struct MemoryRegion *mr;
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 83346210b1..fd018b74a2 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -35,6 +35,7 @@
#define QEMU_VM_SUBSECTION 0x05
#define QEMU_VM_VMDESCRIPTION 0x06
#define QEMU_VM_CONFIGURATION 0x07
+#define QEMU_VM_COMMAND 0x08
#define QEMU_VM_SECTION_FOOTER 0x7e
struct MigrationParams {
@@ -42,13 +43,67 @@ struct MigrationParams {
bool shared;
};
-typedef struct MigrationState MigrationState;
+/* Messages sent on the return path from destination to source */
+enum mig_rp_message_type {
+ MIG_RP_MSG_INVALID = 0, /* Must be 0 */
+ MIG_RP_MSG_SHUT, /* sibling will not send any more RP messages */
+ MIG_RP_MSG_PONG, /* Response to a PING; data (seq: be32 ) */
+
+ MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
+ MIG_RP_MSG_REQ_PAGES, /* data (start: be64, len: be32) */
+
+ MIG_RP_MSG_MAX
+};
typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
+/* The current postcopy state is read/set by postcopy_state_get/set
+ * which update it atomically.
+ * The state is updated as postcopy messages are received, and
+ * in general only one thread should be writing to the state at any one
+ * time, initially the main thread and then the listen thread;
+ * Corner cases are where either thread finishes early and/or errors.
+ * The state is checked as messages are received to ensure that
+ * the source is sending us messages in the correct order.
+ * The state is also used by the RAM reception code to know if it
+ * has to place pages atomically, and the cleanup code at the end of
+ * the main thread to know if it has to delay cleanup until the end
+ * of postcopy.
+ */
+typedef enum {
+ POSTCOPY_INCOMING_NONE = 0, /* Initial state - no postcopy */
+ POSTCOPY_INCOMING_ADVISE,
+ POSTCOPY_INCOMING_DISCARD,
+ POSTCOPY_INCOMING_LISTENING,
+ POSTCOPY_INCOMING_RUNNING,
+ POSTCOPY_INCOMING_END
+} PostcopyState;
+
/* State for the incoming migration */
struct MigrationIncomingState {
- QEMUFile *file;
+ QEMUFile *from_src_file;
+
+ /*
+ * Free at the start of the main state load, set as the main thread finishes
+ * loading state.
+ */
+ QemuEvent main_thread_load_event;
+
+ bool have_fault_thread;
+ QemuThread fault_thread;
+ QemuSemaphore fault_thread_sem;
+
+ bool have_listen_thread;
+ QemuThread listen_thread;
+ QemuSemaphore listen_thread_sem;
+
+ /* For the kernel to send us notifications */
+ int userfault_fd;
+ /* To tell the fault_thread to quit */
+ int userfault_quit_fd;
+ QEMUFile *to_src_file;
+ QemuMutex rp_mutex; /* We send replies from multiple threads */
+ void *postcopy_tmp_page;
/* See savevm.c */
LoadStateEntry_Head loadvm_handlers;
@@ -58,6 +113,18 @@ MigrationIncomingState *migration_incoming_get_current(void);
MigrationIncomingState *migration_incoming_state_new(QEMUFile *f);
void migration_incoming_state_destroy(void);
+/*
+ * An outstanding page request, on the source, having been received
+ * and queued
+ */
+struct MigrationSrcPageRequest {
+ RAMBlock *rb;
+ hwaddr offset;
+ hwaddr len;
+
+ QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req;
+};
+
struct MigrationState
{
int64_t bandwidth_limit;
@@ -70,6 +137,14 @@ struct MigrationState
int state;
MigrationParams params;
+
+ /* State related to return path */
+ struct {
+ QEMUFile *from_dst_file;
+ QemuThread rp_thread;
+ bool error;
+ } rp_state;
+
double mbps;
int64_t total_time;
int64_t downtime;
@@ -80,6 +155,18 @@ struct MigrationState
int64_t xbzrle_cache_size;
int64_t setup_time;
int64_t dirty_sync_count;
+
+ /* Flag set once the migration has been asked to enter postcopy */
+ bool start_postcopy;
+
+ /* Flag set once the migration thread is running (and needs joining) */
+ bool migration_thread_running;
+
+ /* Queue of outstanding page requests from the destination */
+ QemuMutex src_page_req_mutex;
+ QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests;
+ /* The RAMBlock used in the last src_page_request */
+ RAMBlock *last_req_rb;
};
void process_incoming_migration(QEMUFile *f);
@@ -116,9 +203,12 @@ int migrate_fd_close(MigrationState *s);
void add_migration_state_change_notifier(Notifier *notify);
void remove_migration_state_change_notifier(Notifier *notify);
+MigrationState *migrate_init(const MigrationParams *params);
bool migration_in_setup(MigrationState *);
bool migration_has_finished(MigrationState *);
bool migration_has_failed(MigrationState *);
+/* True if outgoing migration has entered postcopy phase */
+bool migration_in_postcopy(MigrationState *);
MigrationState *migrate_get_current(void);
void migrate_compress_threads_create(void);
@@ -145,6 +235,13 @@ uint64_t xbzrle_mig_pages_cache_miss(void);
double xbzrle_mig_cache_miss_rate(void);
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
+void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
+/* For outgoing discard bitmap */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms);
+/* For incoming postcopy discard */
+int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
+ uint64_t start, size_t length);
+int ram_postcopy_incoming_init(MigrationIncomingState *mis);
/**
* @migrate_add_blocker - prevent migration from proceeding
@@ -160,6 +257,7 @@ void migrate_add_blocker(Error *reason);
*/
void migrate_del_blocker(Error *reason);
+bool migrate_postcopy_ram(void);
bool migrate_zero_blocks(void);
bool migrate_auto_converge(void);
@@ -179,6 +277,17 @@ int migrate_compress_threads(void);
int migrate_decompress_threads(void);
bool migrate_use_events(void);
+/* Sending on the return path - generic and then for each message type */
+void migrate_send_rp_message(MigrationIncomingState *mis,
+ enum mig_rp_message_type message_type,
+ uint16_t len, void *data);
+void migrate_send_rp_shut(MigrationIncomingState *mis,
+ uint32_t value);
+void migrate_send_rp_pong(MigrationIncomingState *mis,
+ uint32_t value);
+void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname,
+ ram_addr_t start, size_t len);
+
void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data);
@@ -204,4 +313,12 @@ void global_state_set_optional(void);
void savevm_skip_configuration(void);
int global_state_store(void);
void global_state_store_running(void);
+
+void flush_page_queue(MigrationState *ms);
+int ram_save_queue_pages(MigrationState *ms, const char *rbname,
+ ram_addr_t start, ram_addr_t len);
+
+PostcopyState postcopy_state_get(void);
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(PostcopyState new_state);
#endif
diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
new file mode 100644
index 0000000000..b6a7491f2d
--- /dev/null
+++ b/include/migration/postcopy-ram.h
@@ -0,0 +1,99 @@
+/*
+ * Postcopy migration for RAM
+ *
+ * Copyright 2013 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Dave Gilbert <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_POSTCOPY_RAM_H
+#define QEMU_POSTCOPY_RAM_H
+
+/* Return true if the host supports everything we need to do postcopy-ram */
+bool postcopy_ram_supported_by_host(void);
+
+/*
+ * Make all of RAM sensitive to accesses to areas that haven't yet been written
+ * and wire up anything necessary to deal with it.
+ */
+int postcopy_ram_enable_notify(MigrationIncomingState *mis);
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from ram.c's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages);
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis);
+
+/*
+ * Discard the contents of 'length' bytes from 'start'. We can assume that, if
+ * we've been called, postcopy_ram_supported_by_host() returned true.
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+ size_t length);
+
+/*
+ * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
+ * however leaving it until after precopy means that most of the precopy
+ * data is still THPd
+ */
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis);
+
+/*
+ * Called at the start of each RAMBlock by the bitmap code.
+ * 'offset' is the bitmap offset of the named RAMBlock in the migration
+ * bitmap.
+ * Returns a new PDS
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+ unsigned long offset,
+ const char *name);
+
+/*
+ * Called by the bitmap code for each chunk to discard.
+ * May send a discard message, may just leave it queued to
+ * be sent later.
+ * @start,@length: a range of pages in the migration bitmap in the
+ * RAM block passed to postcopy_discard_send_init() (length=1 is one page)
+ */
+void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
+ unsigned long start, unsigned long length);
+
+/*
+ * Called at the end of each RAMBlock by the bitmap code.
+ * Sends any outstanding discard messages, frees the PDS.
+ */
+void postcopy_discard_send_finish(MigrationState *ms,
+ PostcopyDiscardState *pds);
+
+/*
+ * Place a page (from) at (host) efficiently
+ * There are restrictions on how 'from' must be mapped, in general best
+ * to use other postcopy_ routines to allocate.
+ * returns 0 on success
+ */
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from);
+
+/*
+ * Place a zero page at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host);
+
+/*
+ * Allocate a page of memory that can be mapped at a later point in time
+ * using postcopy_place_page
+ * Returns: Pointer to allocated page
+ */
+void *postcopy_get_tmp_page(MigrationIncomingState *mis);
+
+#endif
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 29a338d0a9..b5d08d217d 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -89,6 +89,11 @@ typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
uint64_t *bytes_sent);
/*
+ * Return a QEMUFile for comms in the opposite direction
+ */
+typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
+
+/*
* Stop any read or write (depending on flags) on the underlying
* transport on the QEMUFile.
* Existing blocking reads/writes must be woken
@@ -106,6 +111,7 @@ typedef struct QEMUFileOps {
QEMURamHookFunc *after_ram_iterate;
QEMURamHookFunc *hook_ram_load;
QEMURamSaveFunc *save_page;
+ QEMURetPathFunc *get_return_path;
QEMUFileShutdownFunc *shut_down;
} QEMUFileOps;
@@ -163,9 +169,11 @@ void qemu_put_be32(QEMUFile *f, unsigned int v);
void qemu_put_be64(QEMUFile *f, uint64_t v);
size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset);
size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size);
+size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size);
ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
int level);
int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src);
+
/*
* Note that you can only peek continuous bytes from where the current pointer
* is; you aren't guaranteed to be able to peak to +n bytes unless you've
@@ -194,7 +202,9 @@ int64_t qemu_file_get_rate_limit(QEMUFile *f);
int qemu_file_get_error(QEMUFile *f);
void qemu_file_set_error(QEMUFile *f, int ret);
int qemu_file_shutdown(QEMUFile *f);
+QEMUFile *qemu_file_get_return_path(QEMUFile *f);
void qemu_fflush(QEMUFile *f);
+void qemu_file_set_blocking(QEMUFile *f, bool block);
static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
{
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index d173b565f5..7267e38c1f 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -40,7 +40,8 @@ typedef struct SaveVMHandlers {
SaveStateHandler *save_state;
void (*cleanup)(void *opaque);
- int (*save_live_complete)(QEMUFile *f, void *opaque);
+ int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);
+ int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);
/* This runs both outside and inside the iothread lock. */
bool (*is_active)(void *opaque);
@@ -54,8 +55,9 @@ typedef struct SaveVMHandlers {
/* This runs outside the iothread lock! */
int (*save_live_setup)(QEMUFile *f, void *opaque);
- uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
-
+ void (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size,
+ uint64_t *non_postcopiable_pending,
+ uint64_t *postcopiable_pending);
LoadStateHandler *load_state;
} SaveVMHandlers;
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 2f74540a87..405364f2b9 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -499,5 +499,6 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len);
int parse_debug_env(const char *name, int max, int initial);
const char *qemu_ether_ntoa(const MACAddr *mac);
+void page_size_init(void);
#endif
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index ab2d5d9d31..861d84b4e4 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -139,6 +139,8 @@ void qemu_anon_ram_free(void *ptr, size_t size);
#if defined(CONFIG_MADVISE)
+#include <sys/mman.h>
+
#define QEMU_MADV_WILLNEED MADV_WILLNEED
#define QEMU_MADV_DONTNEED MADV_DONTNEED
#ifdef MADV_DONTFORK
@@ -171,6 +173,11 @@ void qemu_anon_ram_free(void *ptr, size_t size);
#else
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
#endif
+#ifdef MADV_NOHUGEPAGE
+#define QEMU_MADV_NOHUGEPAGE MADV_NOHUGEPAGE
+#else
+#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
+#endif
#elif defined(CONFIG_POSIX_MADVISE)
@@ -182,6 +189,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
#define QEMU_MADV_DODUMP QEMU_MADV_INVALID
#define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
+#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#else /* no-op */
@@ -193,6 +201,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
#define QEMU_MADV_DODUMP QEMU_MADV_INVALID
#define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
+#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#endif
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 2cdce1866e..6b1093dcfc 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -44,6 +44,7 @@ typedef struct MemoryRegion MemoryRegion;
typedef struct MemoryRegionSection MemoryRegionSection;
typedef struct MigrationIncomingState MigrationIncomingState;
typedef struct MigrationParams MigrationParams;
+typedef struct MigrationState MigrationState;
typedef struct Monitor Monitor;
typedef struct MouseTransformInfo MouseTransformInfo;
typedef struct MSIMessage MSIMessage;
@@ -66,6 +67,7 @@ typedef struct PCMachineState PCMachineState;
typedef struct PCMachineClass PCMachineClass;
typedef struct PCMCIACardState PCMCIACardState;
typedef struct PixelFormat PixelFormat;
+typedef struct PostcopyDiscardState PostcopyDiscardState;
typedef struct PropertyInfo PropertyInfo;
typedef struct Property Property;
typedef struct QEMUBH QEMUBH;
@@ -79,6 +81,7 @@ typedef struct QEMUSizedBuffer QEMUSizedBuffer;
typedef struct QEMUTimerListGroup QEMUTimerListGroup;
typedef struct QEMUTimer QEMUTimer;
typedef struct Range Range;
+typedef struct RAMBlock RAMBlock;
typedef struct SerialState SerialState;
typedef struct SHPCDevice SHPCDevice;
typedef struct SMBusDevice SMBusDevice;
diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h
index 17fe30070d..3f976b49e7 100644
--- a/include/sysemu/balloon.h
+++ b/include/sysemu/balloon.h
@@ -22,5 +22,7 @@ typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info);
int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
QEMUBalloonStatus *stat_func, void *opaque);
void qemu_remove_balloon_handler(void *opaque);
+bool qemu_balloon_is_inhibited(void);
+void qemu_balloon_inhibit(bool state);
#endif
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 5cb0f05068..f992494e10 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -70,6 +70,7 @@ void qemu_system_killed(int signal, pid_t pid);
void qemu_devices_reset(void);
void qemu_system_reset(bool report);
void qemu_system_guest_panicked(void);
+size_t qemu_target_page_bits(void);
void qemu_add_exit_notifier(Notifier *notify);
void qemu_remove_exit_notifier(Notifier *notify);
@@ -83,14 +84,52 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict);
void qemu_announce_self(void);
+/*
+ * Subcommands for QEMU_VM_COMMAND
+ *
+ * Only MIG_CMD_INVALID has an explicit value; the rest are numbered
+ * sequentially after it, so MIG_CMD_MAX is one past the last real command.
+ * NOTE(review): presumably these values are sent in the migration stream,
+ * in which case existing entries must not be reordered - confirm.
+ */
+enum qemu_vm_cmd {
+ MIG_CMD_INVALID = 0, /* Must be 0 */
+ MIG_CMD_OPEN_RETURN_PATH, /* Tell the dest to open the Return path */
+ MIG_CMD_PING, /* Request a PONG on the RP */
+
+ MIG_CMD_POSTCOPY_ADVISE, /* Prior to any page transfers, just
+ warn we might want to do PC */
+ MIG_CMD_POSTCOPY_LISTEN, /* Start listening for incoming
+ pages as it's running. */
+ MIG_CMD_POSTCOPY_RUN, /* Start execution */
+
+ MIG_CMD_POSTCOPY_RAM_DISCARD, /* A list of pages to discard that
+ were previously sent during
+ precopy but are dirty. */
+ MIG_CMD_PACKAGED, /* Send a wrapped stream within this stream */
+ MIG_CMD_MAX
+};
+
+#define MAX_VM_CMD_PACKAGED_SIZE (1ul << 24)
+
bool qemu_savevm_state_blocked(Error **errp);
void qemu_savevm_state_begin(QEMUFile *f,
const MigrationParams *params);
void qemu_savevm_state_header(QEMUFile *f);
-int qemu_savevm_state_iterate(QEMUFile *f);
-void qemu_savevm_state_complete(QEMUFile *f);
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
void qemu_savevm_state_cleanup(void);
-uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
+void qemu_savevm_state_complete_postcopy(QEMUFile *f);
+void qemu_savevm_state_complete_precopy(QEMUFile *f);
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+ uint64_t *res_non_postcopiable,
+ uint64_t *res_postcopiable);
+void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
+ uint16_t len, uint8_t *data);
+void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
+void qemu_savevm_send_open_return_path(QEMUFile *f);
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb);
+void qemu_savevm_send_postcopy_advise(QEMUFile *f);
+void qemu_savevm_send_postcopy_listen(QEMUFile *f);
+void qemu_savevm_send_postcopy_run(QEMUFile *f);
+
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+ uint16_t len,
+ uint64_t *start_list,
+ uint64_t *length_list);
+
int qemu_loadvm_state(QEMUFile *f);
typedef enum DisplayType
@@ -133,6 +172,7 @@ extern int boot_menu;
extern bool boot_strict;
extern uint8_t *boot_splash_filedata;
extern size_t boot_splash_filedata_size;
+extern bool enable_mlock;
extern uint8_t qemu_extra_params_fw[2];
extern QEMUClockType rtc_clock;
extern const char *mem_path;
diff --git a/kvm-all.c b/kvm-all.c
index 1bc1273772..de3c8c48bb 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1461,7 +1461,6 @@ static int kvm_init(MachineState *ms)
* page size for the system though.
*/
assert(TARGET_PAGE_SIZE <= getpagesize());
- page_size_init();
s->sigmask_len = 8;
diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h
new file mode 100644
index 0000000000..9057d7af3a
--- /dev/null
+++ b/linux-headers/linux/userfaultfd.h
@@ -0,0 +1,167 @@
+/*
+ * include/linux/userfaultfd.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ * UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS \
+ ((__u64)1 << _UFFDIO_REGISTER | \
+ (__u64)1 << _UFFDIO_UNREGISTER | \
+ (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F. UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER (0x00)
+#define _UFFDIO_UNREGISTER (0x01)
+#define _UFFDIO_WAKE (0x02)
+#define _UFFDIO_COPY (0x03)
+#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_API (0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
+ struct uffdio_api)
+#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
+ struct uffdio_register)
+#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
+ struct uffdio_range)
+#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
+ struct uffdio_range)
+#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
+ struct uffdio_copy)
+#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
+ struct uffdio_zeropage)
+
+/*
+ * read() structure
+ *
+ * One event record: a one byte event code (e.g. UFFD_EVENT_PAGEFAULT),
+ * seven reserved bytes, then an event specific payload in 'arg'.
+ *
+ * The packed attribute is spelled out in full: the bare __packed shorthand
+ * is not a macro provided by <linux/types.h>, so without it the trailing
+ * identifier would be parsed as the name of a variable of this struct type
+ * instead of as an attribute.
+ */
+struct uffd_msg {
+	__u8	event;
+
+	__u8	reserved1;
+	__u16	reserved2;
+	__u32	reserved3;
+
+	union {
+		/* payload for UFFD_EVENT_PAGEFAULT */
+		struct {
+			__u64	flags;
+			__u64	address;
+		} pagefault;
+
+		struct {
+			/* unused reserved fields */
+			__u64	reserved1;
+			__u64	reserved2;
+			__u64	reserved3;
+		} reserved;
+	} arg;
+} __attribute__((packed));
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT 0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK 0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
+
+/*
+ * Argument of the UFFDIO_API ioctl (declared _IOWR, so it carries data in
+ * both directions): userland fills in 'api', the kernel answers in
+ * 'features' and 'ioctls'.
+ */
+struct uffdio_api {
+ /* userland asks for an API number and the features to enable */
+ __u64 api;
+ /*
+ * Kernel answers below with all the available features for
+ * the API, this notifies userland of which events and/or
+ * which flags for each event are enabled in the current
+ * kernel.
+ *
+ * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+ * are to be considered implicitly always enabled in all kernels as
+ * long as the uffdio_api.api requested matches UFFD_API.
+ */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
+#define UFFD_FEATURE_EVENT_FORK (1<<1)
+#endif
+ __u64 features;
+
+ /* ioctl command bitmask written by UFFDIO_API (cf. UFFD_API_IOCTLS) */
+ __u64 ioctls;
+};
+
+/*
+ * An address range. Used directly as the argument of the UFFDIO_UNREGISTER
+ * and UFFDIO_WAKE ioctls, and embedded in uffdio_register and
+ * uffdio_zeropage.
+ * NOTE(review): 'start' and 'len' are presumably a byte address and a byte
+ * count - confirm against the kernel documentation.
+ */
+struct uffdio_range {
+ __u64 start;
+ __u64 len;
+};
+
+/*
+ * Argument of the UFFDIO_REGISTER ioctl: the range to register and the
+ * UFFDIO_REGISTER_MODE_* bits requested for it; the kernel answers in
+ * 'ioctls'.
+ */
+struct uffdio_register {
+ struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
+ __u64 mode;
+
+ /*
+ * kernel answers which ioctl commands are available for the
+ * range, keep at the end as the last 8 bytes aren't read.
+ */
+ __u64 ioctls;
+};
+
+/*
+ * Argument of the UFFDIO_COPY ioctl: 'dst', 'src' and 'len' describe the
+ * copy, 'mode' takes UFFDIO_COPY_MODE_* bits, and 'copy' is the result
+ * field written back by the ioctl.
+ */
+struct uffdio_copy {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+ /*
+ * There will be a wrprotection flag later that allows to map
+ * pages wrprotected on the fly. And such a flag will be
+ * available if the wrprotection ioctl are implemented for the
+ * range according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "copy" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 copy;
+};
+
+/*
+ * Argument of the UFFDIO_ZEROPAGE ioctl: the target range, the
+ * UFFDIO_ZEROPAGE_MODE_* bits, and the 'zeropage' result field written back
+ * by the ioctl.
+ */
+struct uffdio_zeropage {
+ struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "zeropage" is written by the ioctl and must be at the end:
+ * the copy_from_user will not read the last 8 bytes.
+ */
+ __s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index d929e969ae..0cac6d707a 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -1,7 +1,7 @@
common-obj-y += migration.o tcp.o
common-obj-y += vmstate.o
common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o
-common-obj-y += xbzrle.o
+common-obj-y += xbzrle.o postcopy-ram.o
common-obj-$(CONFIG_RDMA) += rdma.o
common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
diff --git a/migration/block.c b/migration/block.c
index cf9d9f8999..310e2b36dc 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -748,7 +748,9 @@ static int block_save_complete(QEMUFile *f, void *opaque)
return 0;
}
-static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+ uint64_t *non_postcopiable_pending,
+ uint64_t *postcopiable_pending)
{
/* Estimate pending number of bytes to send */
uint64_t pending;
@@ -767,7 +769,8 @@ static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
qemu_mutex_unlock_iothread();
DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
- return pending;
+ /* We don't do postcopy */
+ *non_postcopiable_pending += pending;
}
static int block_load(QEMUFile *f, void *opaque, int version_id)
@@ -876,7 +879,7 @@ static SaveVMHandlers savevm_block_handlers = {
.set_params = block_set_params,
.save_live_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
- .save_live_complete = block_save_complete,
+ .save_live_complete_precopy = block_save_complete,
.save_live_pending = block_save_pending,
.load_state = block_load,
.cleanup = block_migration_cleanup,
diff --git a/migration/migration.c b/migration/migration.c
index f99d3eabf7..c5c977e737 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -21,15 +21,18 @@
#include "sysemu/sysemu.h"
#include "block/block.h"
#include "qapi/qmp/qerror.h"
+#include "qapi/util.h"
#include "qemu/sockets.h"
#include "qemu/rcu.h"
#include "migration/block.h"
+#include "migration/postcopy-ram.h"
#include "qemu/thread.h"
#include "qmp-commands.h"
#include "trace.h"
-#include "qapi/util.h"
#include "qapi-event.h"
#include "qom/cpu.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
#define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */
@@ -57,6 +60,13 @@ static NotifierList migration_state_notifiers =
static bool deferred_incoming;
+/*
+ * Current state of incoming postcopy; note this is not part of
+ * MigrationIncomingState since its state is used during cleanup
+ * at the end as MIS is being freed.
+ */
+static PostcopyState incoming_postcopy_state;
+
/* When we add fault tolerance, we could have several
migrations at once. For now we don't need to add
dynamic creation of migration */
@@ -64,6 +74,7 @@ static bool deferred_incoming;
/* For outgoing */
MigrationState *migrate_get_current(void)
{
+ static bool once;
static MigrationState current_migration = {
.state = MIGRATION_STATUS_NONE,
.bandwidth_limit = MAX_THROTTLE,
@@ -81,6 +92,10 @@ MigrationState *migrate_get_current(void)
DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT,
};
+ if (!once) {
+ qemu_mutex_init(&current_migration.src_page_req_mutex);
+ once = true;
+ }
return &current_migration;
}
@@ -95,14 +110,17 @@ MigrationIncomingState *migration_incoming_get_current(void)
MigrationIncomingState *migration_incoming_state_new(QEMUFile* f)
{
mis_current = g_new0(MigrationIncomingState, 1);
- mis_current->file = f;
+ mis_current->from_src_file = f;
QLIST_INIT(&mis_current->loadvm_handlers);
+ qemu_mutex_init(&mis_current->rp_mutex);
+ qemu_event_init(&mis_current->main_thread_load_event, false);
return mis_current;
}
void migration_incoming_state_destroy(void)
{
+ qemu_event_destroy(&mis_current->main_thread_load_event);
loadvm_free_handlers(mis_current);
g_free(mis_current);
mis_current = NULL;
@@ -248,6 +266,35 @@ static void deferred_incoming_migration(Error **errp)
deferred_incoming = true;
}
+/*
+ * Request a range of pages from the source VM at the given start address.
+ *
+ * The request goes out on the return path as MIG_RP_MSG_REQ_PAGES_ID when a
+ * RAMBlock name is given, or as MIG_RP_MSG_REQ_PAGES when it is not.
+ *
+ * mis:    incoming migration state owning the return path
+ * rbname: Name of the RAMBlock to request the page in, if NULL it's the same
+ *         as the last request (a name must have been given previously).
+ *         Must be shorter than 256 bytes.
+ * start:  Address offset within the RB
+ * len:    Length in bytes required - must be a multiple of pagesize
+ *
+ * Message payload: start (8 bytes, big endian), len (4 bytes, big endian),
+ * then, only when rbname is given, a 1 byte name length followed by the
+ * name itself without a terminating NUL.
+ */
+void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
+                               ram_addr_t start, size_t len)
+{
+    /* start (8), len (4), name length (1), name (up to 255) */
+    uint8_t bufc[12 + 1 + 255];
+    size_t msglen = 12; /* start + len */
+    uint64_t start_be = cpu_to_be64((uint64_t)start);
+    uint32_t len_be = cpu_to_be32((uint32_t)len);
+
+    /*
+     * bufc is a plain byte array with no alignment guarantee, so copy the
+     * fields in rather than storing through casted wider pointers.
+     */
+    memcpy(bufc, &start_be, sizeof(start_be));
+    memcpy(bufc + 8, &len_be, sizeof(len_be));
+
+    if (rbname) {
+        size_t rbname_len = strlen(rbname);
+
+        /* The length has to fit in the single length byte */
+        assert(rbname_len < 256);
+
+        bufc[msglen++] = (uint8_t)rbname_len;
+        memcpy(bufc + msglen, rbname, rbname_len);
+        msglen += rbname_len;
+        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc);
+    } else {
+        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc);
+    }
+}
+
void qemu_start_incoming_migration(const char *uri, Error **errp)
{
const char *p;
@@ -278,12 +325,37 @@ static void process_incoming_migration_co(void *opaque)
{
QEMUFile *f = opaque;
Error *local_err = NULL;
+ MigrationIncomingState *mis;
+ PostcopyState ps;
int ret;
- migration_incoming_state_new(f);
+ mis = migration_incoming_state_new(f);
+ postcopy_state_set(POSTCOPY_INCOMING_NONE);
migrate_generate_event(MIGRATION_STATUS_ACTIVE);
+
ret = qemu_loadvm_state(f);
+ ps = postcopy_state_get();
+ trace_process_incoming_migration_co_end(ret, ps);
+ if (ps != POSTCOPY_INCOMING_NONE) {
+ if (ps == POSTCOPY_INCOMING_ADVISE) {
+ /*
+ * Where a migration had postcopy enabled (and thus went to advise)
+ * but managed to complete within the precopy period, we can use
+ * the normal exit.
+ */
+ postcopy_ram_incoming_cleanup(mis);
+ } else if (ret >= 0) {
+ /*
+ * Postcopy was started, cleanup should happen at the end of the
+ * postcopy thread.
+ */
+ trace_process_incoming_migration_co_postcopy_end_main();
+ return;
+ }
+ /* Else if something went wrong then just fall out of the normal exit */
+ }
+
qemu_fclose(f);
free_xbzrle_decoded_buf();
migration_incoming_state_destroy();
@@ -344,6 +416,50 @@ void process_incoming_migration(QEMUFile *f)
qemu_coroutine_enter(co, f);
}
+/*
+ * Write one message to the return channel (mis->to_src_file), back towards
+ * the source of the migration.
+ *
+ * The message is the type and the payload length, each written with
+ * qemu_put_be16(), followed by 'len' bytes of 'data'. The whole message is
+ * written and flushed while holding mis->rp_mutex.
+ */
+void migrate_send_rp_message(MigrationIncomingState *mis,
+                             enum mig_rp_message_type message_type,
+                             uint16_t len, void *data)
+{
+    QEMUFile *rp;
+
+    trace_migrate_send_rp_message((int)message_type, len);
+
+    qemu_mutex_lock(&mis->rp_mutex);
+    rp = mis->to_src_file;
+    qemu_put_be16(rp, (unsigned int)message_type);
+    qemu_put_be16(rp, len);
+    qemu_put_buffer(rp, data, len);
+    qemu_fflush(rp);
+    qemu_mutex_unlock(&mis->rp_mutex);
+}
+
+/*
+ * Tell the source we have finished with the return path by sending a
+ * 'SHUT' message carrying 'value' as a big endian 32 bit payload.
+ * A non-0 value indicates an error.
+ */
+void migrate_send_rp_shut(MigrationIncomingState *mis,
+                          uint32_t value)
+{
+    uint32_t be_value = cpu_to_be32(value);
+
+    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT,
+                            sizeof(be_value), &be_value);
+}
+
+/*
+ * Send a 'PONG' message on the return channel (normally the answer to a
+ * 'PING'), carrying 'value' as a big endian 32 bit payload.
+ */
+void migrate_send_rp_pong(MigrationIncomingState *mis,
+                          uint32_t value)
+{
+    uint32_t be_value = cpu_to_be32(value);
+
+    migrate_send_rp_message(mis, MIG_RP_MSG_PONG,
+                            sizeof(be_value), &be_value);
+}
+
/* amount of nanoseconds we are willing to wait for migration to be down.
* the choice of nanoseconds is because it is the maximum resolution that
* get_clock() can achieve. It is an internal measure. All user-visible
@@ -399,6 +515,24 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
return params;
}
+/*
+ * Tell whether 'state' is one of the in-progress migration states:
+ * setup, active or postcopy-active. Every other state yields false.
+ */
+static bool migration_is_setup_or_active(int state)
+{
+    return state == MIGRATION_STATUS_ACTIVE ||
+           state == MIGRATION_STATUS_POSTCOPY_ACTIVE ||
+           state == MIGRATION_STATUS_SETUP;
+}
+
+
static void get_xbzrle_cache_stats(MigrationInfo *info)
{
if (migrate_use_xbzrle()) {
@@ -465,6 +599,39 @@ MigrationInfo *qmp_query_migrate(Error **errp)
get_xbzrle_cache_stats(info);
break;
+ case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+ /* Mostly the same as active; TODO add some postcopy stats */
+ info->has_status = true;
+ info->has_total_time = true;
+ info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
+ - s->total_time;
+ info->has_expected_downtime = true;
+ info->expected_downtime = s->expected_downtime;
+ info->has_setup_time = true;
+ info->setup_time = s->setup_time;
+
+ info->has_ram = true;
+ info->ram = g_malloc0(sizeof(*info->ram));
+ info->ram->transferred = ram_bytes_transferred();
+ info->ram->remaining = ram_bytes_remaining();
+ info->ram->total = ram_bytes_total();
+ info->ram->duplicate = dup_mig_pages_transferred();
+ info->ram->skipped = skipped_mig_pages_transferred();
+ info->ram->normal = norm_mig_pages_transferred();
+ info->ram->normal_bytes = norm_mig_bytes_transferred();
+ info->ram->dirty_pages_rate = s->dirty_pages_rate;
+ info->ram->mbps = s->mbps;
+
+ if (blk_mig_active()) {
+ info->has_disk = true;
+ info->disk = g_malloc0(sizeof(*info->disk));
+ info->disk->transferred = blk_mig_bytes_transferred();
+ info->disk->remaining = blk_mig_bytes_remaining();
+ info->disk->total = blk_mig_bytes_total();
+ }
+
+ get_xbzrle_cache_stats(info);
+ break;
case MIGRATION_STATUS_COMPLETED:
get_xbzrle_cache_stats(info);
@@ -506,8 +673,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
MigrationState *s = migrate_get_current();
MigrationCapabilityStatusList *cap;
- if (s->state == MIGRATION_STATUS_ACTIVE ||
- s->state == MIGRATION_STATUS_SETUP) {
+ if (migration_is_setup_or_active(s->state)) {
error_setg(errp, QERR_MIGRATION_ACTIVE);
return;
}
@@ -515,6 +681,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
for (cap = params; cap; cap = cap->next) {
s->enabled_capabilities[cap->value->capability] = cap->value->state;
}
+
+ if (migrate_postcopy_ram()) {
+ if (migrate_use_compression()) {
+ /* The decompression threads asynchronously write into RAM
+ * rather than use the atomic copies needed to avoid
+ * userfaulting. It should be possible to fix the decompression
+ * threads for compatibility in future.
+ */
+ error_report("Postcopy is not currently compatible with "
+ "compression");
+ s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM] =
+ false;
+ }
+ }
}
void qmp_migrate_set_parameters(bool has_compress_level,
@@ -583,6 +763,28 @@ void qmp_migrate_set_parameters(bool has_compress_level,
}
}
+/*
+ * Request that the current outgoing migration switch to postcopy.
+ *
+ * Sets an error in errp and does nothing else if the postcopy capability
+ * is not enabled, or if the migration state is still MIGRATION_STATUS_NONE.
+ * Otherwise it sets s->start_postcopy atomically.
+ * NOTE(review): the flag is presumably picked up by the migration thread -
+ * confirm against the reader of start_postcopy.
+ */
+void qmp_migrate_start_postcopy(Error **errp)
+{
+    MigrationState *s = migrate_get_current();
+
+    if (!migrate_postcopy_ram()) {
+        /* Name the real monitor command so the user can act on the hint */
+        error_setg(errp, "Enable postcopy with migrate_set_capability before"
+                   " the start of migration");
+        return;
+    }
+
+    if (s->state == MIGRATION_STATUS_NONE) {
+        error_setg(errp, "Postcopy must be started after migration has been"
+                   " started");
+        return;
+    }
+    /*
+     * we don't error if migration has finished since that would be racy
+     * with issuing this command.
+     */
+    atomic_set(&s->start_postcopy, true);
+}
+
/* shared migration helpers */
static void migrate_set_state(MigrationState *s, int old_state, int new_state)
@@ -600,10 +802,15 @@ static void migrate_fd_cleanup(void *opaque)
qemu_bh_delete(s->cleanup_bh);
s->cleanup_bh = NULL;
+ flush_page_queue(s);
+
if (s->file) {
trace_migrate_fd_cleanup();
qemu_mutex_unlock_iothread();
- qemu_thread_join(&s->thread);
+ if (s->migration_thread_running) {
+ qemu_thread_join(&s->thread);
+ s->migration_thread_running = false;
+ }
qemu_mutex_lock_iothread();
migrate_compress_threads_join();
@@ -611,7 +818,8 @@ static void migrate_fd_cleanup(void *opaque)
s->file = NULL;
}
- assert(s->state != MIGRATION_STATUS_ACTIVE);
+ assert((s->state != MIGRATION_STATUS_ACTIVE) &&
+ (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));
if (s->state == MIGRATION_STATUS_CANCELLING) {
migrate_set_state(s, MIGRATION_STATUS_CANCELLING,
@@ -635,10 +843,14 @@ static void migrate_fd_cancel(MigrationState *s)
QEMUFile *f = migrate_get_current()->file;
trace_migrate_fd_cancel();
+ if (s->rp_state.from_dst_file) {
+ /* shutdown the rp socket, so causing the rp thread to shutdown */
+ qemu_file_shutdown(s->rp_state.from_dst_file);
+ }
+
do {
old_state = s->state;
- if (old_state != MIGRATION_STATUS_SETUP &&
- old_state != MIGRATION_STATUS_ACTIVE) {
+ if (!migration_is_setup_or_active(old_state)) {
break;
}
migrate_set_state(s, old_state, MIGRATION_STATUS_CANCELLING);
@@ -682,7 +894,12 @@ bool migration_has_failed(MigrationState *s)
s->state == MIGRATION_STATUS_FAILED);
}
-static MigrationState *migrate_init(const MigrationParams *params)
+/* True exactly when the migration is in the postcopy-active state. */
+bool migration_in_postcopy(MigrationState *s)
+{
+    return MIGRATION_STATUS_POSTCOPY_ACTIVE == s->state;
+}
+
+MigrationState *migrate_init(const MigrationParams *params)
{
MigrationState *s = migrate_get_current();
int64_t bandwidth_limit = s->bandwidth_limit;
@@ -719,6 +936,8 @@ static MigrationState *migrate_init(const MigrationParams *params)
s->bandwidth_limit = bandwidth_limit;
migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
+ QSIMPLEQ_INIT(&s->src_page_requests);
+
s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
return s;
}
@@ -770,8 +989,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
params.blk = has_blk && blk;
params.shared = has_inc && inc;
- if (s->state == MIGRATION_STATUS_ACTIVE ||
- s->state == MIGRATION_STATUS_SETUP ||
+ if (migration_is_setup_or_active(s->state) ||
s->state == MIGRATION_STATUS_CANCELLING) {
error_setg(errp, QERR_MIGRATION_ACTIVE);
return;
@@ -890,6 +1108,15 @@ void qmp_migrate_set_downtime(double value, Error **errp)
max_downtime = (uint64_t)value;
}
+/*
+ * Report whether the x-postcopy-ram capability is enabled on the current
+ * (outgoing) migration state.
+ */
+bool migrate_postcopy_ram(void)
+{
+    MigrationState *ms = migrate_get_current();
+
+    return ms->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM];
+}
+
bool migrate_auto_converge(void)
{
MigrationState *s;
@@ -971,36 +1198,376 @@ int64_t migrate_xbzrle_cache_size(void)
return s->xbzrle_cache_size;
}
+/* migration thread support */
+/*
+ * Something bad happened to the RP stream, mark an error
+ * The caller shall print or trace something to indicate why
+ *
+ * Only sets s->rp_state.error; the return-path thread's loop tests that
+ * flag and await_return_path_close_on_source() returns it to its caller.
+ */
+static void mark_source_rp_bad(MigrationState *s)
+{
+ s->rp_state.error = true;
+}
+
+static struct rp_cmd_args {
+ ssize_t len; /* -1 = variable; otherwise the exact payload length the return-path thread requires */
+ const char *name; /* printed in the return-path thread's length-mismatch error */
+} rp_cmd_args[] = {
+ [MIG_RP_MSG_INVALID] = { .len = -1, .name = "INVALID" },
+ [MIG_RP_MSG_SHUT] = { .len = 4, .name = "SHUT" }, /* payload read as one be32 */
+ [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" }, /* payload read as one be32 */
+ [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" }, /* be64 start + be32 len */
+ [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" }, /* as REQ_PAGES plus a length-prefixed idstr */
+ [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" },
+};
+
+/*
+ * Process a request for pages received on the return path,
+ * We're allowed to send more than requested (e.g. to round to our page size)
+ * and we don't need to send pages that have already been sent.
+ *
+ * @ms: outgoing migration state
+ * @rbname: name passed through to ram_save_queue_pages(); the caller passes
+ *          NULL for a REQ_PAGES message that carries no idstr
+ * @start, @len: requested range; both must be multiples of the host page size
+ *
+ * On a misaligned request, or if ram_save_queue_pages() returns non-zero,
+ * the return path is marked bad; nothing is returned to the caller.
+ */
+static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
+                                        ram_addr_t start, size_t len)
+{
+    long our_host_ps = getpagesize();
+
+    trace_migrate_handle_rp_req_pages(rbname, start, len);
+
+    /*
+     * Since we currently insist on matching page sizes, just sanity check
+     * we're being asked for whole host pages.
+     */
+    if (start & (our_host_ps-1) ||
+       (len & (our_host_ps-1))) {
+        /* len is a size_t, so it must be printed with %zu (not %zd) */
+        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
+                     " len: %zu", __func__, start, len);
+        mark_source_rp_bad(ms);
+        return;
+    }
+
+    if (ram_save_queue_pages(ms, rbname, start, len)) {
+        mark_source_rp_bad(ms);
+    }
+}
+
+/*
+ * Handles messages sent on the return path towards the source VM
+ *
+ * Thread entry point; @opaque is the MigrationState.  Loops reading
+ * (be16 type, be16 length, payload) messages from rp_state.from_dst_file
+ * until the return path is marked bad, the file reports an error, the
+ * migration leaves the setup/active states, or a SHUT message arrives.
+ * On exit it clears rp_state.from_dst_file and closes the file.
+ * Always returns NULL; failures are reported through rp_state.error.
+ */
+static void *source_return_path_thread(void *opaque)
+{
+    MigrationState *ms = opaque;
+    QEMUFile *rp = ms->rp_state.from_dst_file;
+    uint16_t header_len, header_type;
+    const int max_len = 512;
+    uint8_t buf[max_len];
+    uint32_t tmp32, sibling_error;
+    ram_addr_t start = 0; /* =0 to silence warning */
+    size_t len = 0, expected_len;
+    int res;
+
+    trace_source_return_path_thread_entry();
+    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
+           migration_is_setup_or_active(ms->state)) {
+        trace_source_return_path_thread_loop_top();
+        header_type = qemu_get_be16(rp);
+        header_len = qemu_get_be16(rp);
+
+        if (header_type >= MIG_RP_MSG_MAX ||
+            header_type == MIG_RP_MSG_INVALID) {
+            error_report("RP: Received invalid message 0x%04x length 0x%04x",
+                         header_type, header_len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        /* Fixed-size messages must match exactly; nothing may overflow buf */
+        if ((rp_cmd_args[header_type].len != -1 &&
+            header_len != rp_cmd_args[header_type].len) ||
+            header_len > max_len) {
+            /* Note the leading space: the two literals are concatenated */
+            error_report("RP: Received '%s' message (0x%04x) with"
+                         " incorrect length %d expecting %zu",
+                         rp_cmd_args[header_type].name, header_type, header_len,
+                         (size_t)rp_cmd_args[header_type].len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        /* We know we've got a valid header by this point */
+        res = qemu_get_buffer(rp, buf, header_len);
+        if (res != header_len) {
+            error_report("RP: Failed reading data for message 0x%04x"
+                         " read %d expected %d",
+                         header_type, res, header_len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        /* OK, we have the message and the data */
+        switch (header_type) {
+        case MIG_RP_MSG_SHUT:
+            sibling_error = be32_to_cpup((uint32_t *)buf);
+            trace_source_return_path_thread_shut(sibling_error);
+            if (sibling_error) {
+                /* sibling_error is a uint32_t, hence %u */
+                error_report("RP: Sibling indicated error %u", sibling_error);
+                mark_source_rp_bad(ms);
+            }
+            /*
+             * We'll let the main thread deal with closing the RP
+             * we could do a shutdown(2) on it, but we're the only user
+             * anyway, so there's nothing gained.
+             */
+            goto out;
+
+        case MIG_RP_MSG_PONG:
+            tmp32 = be32_to_cpup((uint32_t *)buf);
+            trace_source_return_path_thread_pong(tmp32);
+            break;
+
+        case MIG_RP_MSG_REQ_PAGES:
+            start = be64_to_cpup((uint64_t *)buf);
+            len = be32_to_cpup((uint32_t *)(buf + 8));
+            migrate_handle_rp_req_pages(ms, NULL, start, len);
+            break;
+
+        case MIG_RP_MSG_REQ_PAGES_ID:
+            expected_len = 12 + 1; /* header + termination */
+
+            if (header_len >= expected_len) {
+                start = be64_to_cpup((uint64_t *)buf);
+                len = be32_to_cpup((uint32_t *)(buf + 8));
+                /* Now we expect an idstr */
+                tmp32 = buf[12]; /* Length of the following idstr */
+                /* 13 + 255 < max_len, so this store stays inside buf */
+                buf[13 + tmp32] = '\0';
+                expected_len += tmp32;
+            }
+            if (header_len != expected_len) {
+                /* expected_len is a size_t, hence %zu */
+                error_report("RP: Req_Page_id with length %d expecting %zu",
+                             header_len, expected_len);
+                mark_source_rp_bad(ms);
+                goto out;
+            }
+            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
+            break;
+
+        default:
+            break;
+        }
+    }
+    if (rp && qemu_file_get_error(rp)) {
+        trace_source_return_path_thread_bad_end();
+        mark_source_rp_bad(ms);
+    }
+
+    trace_source_return_path_thread_end();
+out:
+    ms->rp_state.from_dst_file = NULL;
+    qemu_fclose(rp);
+    return NULL;
+}
+
+/*
+ * Obtain the return path of ms->file, store it in rp_state.from_dst_file
+ * and start the joinable "return path" thread that reads from it.
+ * Returns 0 on success, -1 if no return path could be obtained.
+ */
+static int open_return_path_on_source(MigrationState *ms)
+{
+    QEMUFile *rp = qemu_file_get_return_path(ms->file);
+
+    /* Publish the file before the thread that reads it is created */
+    ms->rp_state.from_dst_file = rp;
+    if (rp == NULL) {
+        return -1;
+    }
+
+    trace_open_return_path_on_source();
+    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
+                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
+    trace_open_return_path_on_source_continue();
+
+    return 0;
+}
+
+/*
+ * Join the return-path thread started by open_return_path_on_source().
+ * Returns 0 if the RP was ok, otherwise there was an error on the RP
+ * (the value returned is ms->rp_state.error).
+ *
+ * NOTE(review): rp_state.from_dst_file is cleared by the return-path
+ * thread when it exits, so the check below reads a field that another
+ * thread writes - confirm the intended synchronisation.
+ */
+static int await_return_path_close_on_source(MigrationState *ms)
+{
+ /*
+ * If this is a normal exit then the destination will send a SHUT and the
+ * rp_thread will exit, however if there's an error we need to cause
+ * it to exit.
+ */
+ if (qemu_file_get_error(ms->file) && ms->rp_state.from_dst_file) {
+ /*
+ * shutdown(2), if we have it, will cause it to unblock if it's stuck
+ * waiting for the destination.
+ */
+ qemu_file_shutdown(ms->rp_state.from_dst_file);
+ mark_source_rp_bad(ms);
+ }
+ trace_await_return_path_close_on_source_joining();
+ qemu_thread_join(&ms->rp_state.rp_thread); /* blocks until the RP thread has returned */
+ trace_await_return_path_close_on_source_close();
+ return ms->rp_state.error;
+}
+
+/*
+ * Switch from normal iteration to postcopy
+ * Returns non-0 on error
+ *
+ * @ms: outgoing migration state; moved from ACTIVE to POSTCOPY_ACTIVE on
+ *      entry and on to FAILED on every error path
+ * @old_vm_running: set to runstate_is_running() before the VM is stopped
+ *
+ * The iothread lock is taken near the top and released on all paths.
+ * The return value is 0 on success, -1 from the fail labels, or the
+ * qemu_file_get_error() value if the stream errored after the package
+ * was sent.
+ */
+static int postcopy_start(MigrationState *ms, bool *old_vm_running)
+{
+ int ret;
+ const QEMUSizedBuffer *qsb;
+ int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); /* start point for ms->downtime */
+ migrate_set_state(ms, MIGRATION_STATUS_ACTIVE,
+ MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+ trace_postcopy_start();
+ qemu_mutex_lock_iothread();
+ trace_postcopy_start_set_run();
+
+ qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+ *old_vm_running = runstate_is_running();
+ global_state_store(); /* return value is not checked here */
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /*
+ * in Finish migrate and with the io-lock held everything should
+ * be quiet, but we've potentially still got dirty pages and we
+ * need to tell the destination to throw any pages it's already received
+ * that are dirty
+ */
+ if (ram_postcopy_send_discard_bitmap(ms)) {
+ error_report("postcopy send discard bitmap failed");
+ goto fail;
+ }
+
+ /*
+ * send rest of state - note things that are doing postcopy
+ * will notice we're in POSTCOPY_ACTIVE and not actually
+ * wrap their state up here
+ */
+ qemu_file_set_rate_limit(ms->file, INT64_MAX);
+ /* Ping just for debugging, helps line traces up */
+ qemu_savevm_send_ping(ms->file, 2);
+
+ /*
+ * While loading the device state we may trigger page transfer
+ * requests and the fd must be free to process those, and thus
+ * the destination must read the whole device state off the fd before
+ * it starts processing it. Unfortunately the ad-hoc migration format
+ * doesn't allow the destination to know the size to read without fully
+ * parsing it through each devices load-state code (especially the open
+ * coded devices that use get/put).
+ * So we wrap the device state up in a package with a length at the start;
+ * to do this we use a qemu_buf to hold the whole of the device state.
+ */
+ QEMUFile *fb = qemu_bufopen("w", NULL);
+ if (!fb) {
+ error_report("Failed to create buffered file");
+ goto fail;
+ }
+
+ /*
+ * Make sure the receiver can get incoming pages before we send the rest
+ * of the state
+ */
+ qemu_savevm_send_postcopy_listen(fb);
+
+ qemu_savevm_state_complete_precopy(fb);
+ qemu_savevm_send_ping(fb, 3);
+
+ qemu_savevm_send_postcopy_run(fb);
+
+ /* <><> end of stuff going into the package */
+ qsb = qemu_buf_get(fb);
+
+ /* Now send that blob */
+ if (qemu_savevm_send_packaged(ms->file, qsb)) {
+ goto fail_closefb;
+ }
+ qemu_fclose(fb);
+ ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
+
+ qemu_mutex_unlock_iothread();
+
+ /*
+ * Although this ping is just for debug, it could potentially be
+ * used for getting a better measurement of downtime at the source.
+ */
+ qemu_savevm_send_ping(ms->file, 4);
+
+ ret = qemu_file_get_error(ms->file);
+ if (ret) {
+ error_report("postcopy_start: Migration stream errored");
+ migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+ MIGRATION_STATUS_FAILED);
+ }
+
+ return ret;
+
+fail_closefb:
+ qemu_fclose(fb);
+fail:
+ migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+ MIGRATION_STATUS_FAILED);
+ qemu_mutex_unlock_iothread(); /* both fail labels are reached with the lock held */
+ return -1;
+}
+
/**
* migration_completion: Used by migration_thread when there's not much left.
* The caller 'breaks' the loop when this returns.
*
* @s: Current migration state
+ * @current_active_state: The migration state we expect to be in
* @*old_vm_running: Pointer to old_vm_running flag
* @*start_time: Pointer to time to update
*/
-static void migration_completion(MigrationState *s, bool *old_vm_running,
+static void migration_completion(MigrationState *s, int current_active_state,
+ bool *old_vm_running,
int64_t *start_time)
{
int ret;
- qemu_mutex_lock_iothread();
- *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
- qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
- *old_vm_running = runstate_is_running();
+ if (s->state == MIGRATION_STATUS_ACTIVE) {
+ qemu_mutex_lock_iothread();
+ *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+ *old_vm_running = runstate_is_running();
+ ret = global_state_store();
+
+ if (!ret) {
+ ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+ if (ret >= 0) {
+ qemu_file_set_rate_limit(s->file, INT64_MAX);
+ qemu_savevm_state_complete_precopy(s->file);
+ }
+ }
+ qemu_mutex_unlock_iothread();
- ret = global_state_store();
- if (!ret) {
- ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
- if (ret >= 0) {
- qemu_file_set_rate_limit(s->file, INT64_MAX);
- qemu_savevm_state_complete(s->file);
+ if (ret < 0) {
+ goto fail;
}
+ } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+ trace_migration_completion_postcopy_end();
+
+ qemu_savevm_state_complete_postcopy(s->file);
+ trace_migration_completion_postcopy_end_after_complete();
}
- qemu_mutex_unlock_iothread();
- if (ret < 0) {
- goto fail;
+ /*
+ * If rp was opened we must clean up the thread before
+ * cleaning everything else up (since if there are no failures
+ * it will wait for the destination to send its status in
+ * a SHUT command).
+ * Postcopy opens rp if enabled (even if it's not activated)
+ */
+ if (migrate_postcopy_ram()) {
+ int rp_error;
+ trace_migration_completion_postcopy_end_before_rp();
+ rp_error = await_return_path_close_on_source(s);
+ trace_migration_completion_postcopy_end_after_rp(rp_error);
+ if (rp_error) {
+ goto fail;
+ }
}
if (qemu_file_get_error(s->file)) {
@@ -1008,18 +1575,21 @@ static void migration_completion(MigrationState *s, bool *old_vm_running,
goto fail;
}
- migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COMPLETED);
+ migrate_set_state(s, current_active_state, MIGRATION_STATUS_COMPLETED);
return;
fail:
- migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_FAILED);
+ migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
}
-/* migration thread support */
-
+/*
+ * Master migration thread on the source VM.
+ * It drives the migration and pumps the data down the outgoing channel.
+ */
static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
+ /* Used by the bandwidth calcs, updated later */
int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
int64_t initial_bytes = 0;
@@ -1027,34 +1597,79 @@ static void *migration_thread(void *opaque)
int64_t start_time = initial_time;
int64_t end_time;
bool old_vm_running = false;
+ bool entered_postcopy = false;
+ /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
+ enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
rcu_register_thread();
qemu_savevm_state_header(s->file);
+
+ if (migrate_postcopy_ram()) {
+ /* Now tell the dest that it should open its end so it can reply */
+ qemu_savevm_send_open_return_path(s->file);
+
+ /* And do a ping that will make stuff easier to debug */
+ qemu_savevm_send_ping(s->file, 1);
+
+ /*
+ * Tell the destination that we *might* want to do postcopy later;
+ * if the other end can't do postcopy it should fail now, nice and
+ * early.
+ */
+ qemu_savevm_send_postcopy_advise(s->file);
+ }
+
qemu_savevm_state_begin(s->file, &s->params);
s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+ current_active_state = MIGRATION_STATUS_ACTIVE;
migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE);
- while (s->state == MIGRATION_STATUS_ACTIVE) {
+ trace_migration_thread_setup_complete();
+
+ while (s->state == MIGRATION_STATUS_ACTIVE ||
+ s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
int64_t current_time;
uint64_t pending_size;
if (!qemu_file_rate_limit(s->file)) {
- pending_size = qemu_savevm_state_pending(s->file, max_size);
- trace_migrate_pending(pending_size, max_size);
+ uint64_t pend_post, pend_nonpost;
+
+ qemu_savevm_state_pending(s->file, max_size, &pend_nonpost,
+ &pend_post);
+ pending_size = pend_nonpost + pend_post;
+ trace_migrate_pending(pending_size, max_size,
+ pend_post, pend_nonpost);
if (pending_size && pending_size >= max_size) {
- qemu_savevm_state_iterate(s->file);
+ /* Still a significant amount to transfer */
+
+ current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ if (migrate_postcopy_ram() &&
+ s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
+ pend_nonpost <= max_size &&
+ atomic_read(&s->start_postcopy)) {
+
+ if (!postcopy_start(s, &old_vm_running)) {
+ current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+ entered_postcopy = true;
+ }
+
+ continue;
+ }
+ /* Just another iteration step */
+ qemu_savevm_state_iterate(s->file, entered_postcopy);
} else {
trace_migration_thread_low_pending(pending_size);
- migration_completion(s, &old_vm_running, &start_time);
+ migration_completion(s, current_active_state,
+ &old_vm_running, &start_time);
break;
}
}
if (qemu_file_get_error(s->file)) {
- migrate_set_state(s, MIGRATION_STATUS_ACTIVE,
- MIGRATION_STATUS_FAILED);
+ migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
+ trace_migration_thread_file_err();
break;
}
current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1085,6 +1700,7 @@ static void *migration_thread(void *opaque)
}
}
+ trace_migration_thread_after_loop();
/* If we enabled cpu throttling for auto-converge, turn it off. */
cpu_throttle_stop();
end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1094,14 +1710,16 @@ static void *migration_thread(void *opaque)
if (s->state == MIGRATION_STATUS_COMPLETED) {
uint64_t transferred_bytes = qemu_ftell(s->file);
s->total_time = end_time - s->total_time;
- s->downtime = end_time - start_time;
+ if (!entered_postcopy) {
+ s->downtime = end_time - start_time;
+ }
if (s->total_time) {
s->mbps = (((double) transferred_bytes * 8.0) /
((double) s->total_time)) / 1000;
}
runstate_set(RUN_STATE_POSTMIGRATE);
} else {
- if (old_vm_running) {
+ if (old_vm_running && !entered_postcopy) {
vm_start();
}
}
@@ -1124,7 +1742,34 @@ void migrate_fd_connect(MigrationState *s)
/* Notify before starting migration thread */
notifier_list_notify(&migration_state_notifiers, s);
+ /*
+ * Open the return path; currently for postcopy but other things might
+ * also want it.
+ */
+ if (migrate_postcopy_ram()) {
+ if (open_return_path_on_source(s)) {
+ error_report("Unable to open return-path for postcopy");
+ migrate_set_state(s, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_FAILED);
+ migrate_fd_cleanup(s);
+ return;
+ }
+ }
+
migrate_compress_threads_create();
qemu_thread_create(&s->thread, "migration", migration_thread, s,
QEMU_THREAD_JOINABLE);
+ s->migration_thread_running = true;
+}
+
+PostcopyState postcopy_state_get(void)
+{
+ return atomic_mb_read(&incoming_postcopy_state); /* current value of the file's incoming postcopy state variable */
 }
+
+/*
+ * Set the state and return the old state
+ * Both happen in a single atomic_xchg() on incoming_postcopy_state.
+ */
+PostcopyState postcopy_state_set(PostcopyState new_state)
+{
+ return atomic_xchg(&incoming_postcopy_state, new_state);
+}
+
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
new file mode 100644
index 0000000000..22d6b18e63
--- /dev/null
+++ b/migration/postcopy-ram.c
@@ -0,0 +1,767 @@
+/*
+ * Postcopy migration for RAM
+ *
+ * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Dave Gilbert <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * Postcopy is a migration technique where the execution flips from the
+ * source to the destination before all the data has been copied.
+ */
+
+#include <glib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/balloon.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+/* Arbitrary limit on size of each discard command,
+ * keeps them around ~200 bytes
+ */
+#define MAX_DISCARDS_PER_COMMAND 12
+
+struct PostcopyDiscardState {
+ const char *ramblock_name; /* presumably the RAMBlock the queued ranges belong to - TODO confirm against users */
+ uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */
+ uint16_t cur_entry; /* NOTE(review): looks like the number of list slots in use - confirm */
+ /*
+ * Start and length of a discard range (bytes)
+ * Each array holds at most MAX_DISCARDS_PER_COMMAND entries.
+ */
+ uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
+ uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
+ unsigned int nsentwords; /* presumably a statistics counter - users not visible here */
+ unsigned int nsentcmds; /* presumably a statistics counter - users not visible here */
+};
+
+/* Postcopy needs to detect accesses to pages that haven't yet been copied
+ * across, and efficiently map new pages in, the techniques for doing this
+ * are target OS specific.
+ */
+#if defined(__linux__)
+
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <asm/types.h> /* for __u64 */
+#endif
+
+#if defined(__linux__) && defined(__NR_userfaultfd)
+#include <linux/userfaultfd.h>
+
+/*
+ * Perform the UFFDIO_API handshake on @ufd and check that the kernel
+ * offers both the REGISTER and UNREGISTER ioctls.
+ * Returns true when the handshake succeeds and both are present.
+ */
+static bool ufd_version_check(int ufd)
+{
+    const uint64_t required = (__u64)1 << _UFFDIO_REGISTER |
+                              (__u64)1 << _UFFDIO_UNREGISTER;
+    struct uffdio_api api;
+    uint64_t missing;
+
+    api.api = UFFD_API;
+    api.features = 0;
+    if (ioctl(ufd, UFFDIO_API, &api) != 0) {
+        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
+                     strerror(errno));
+        return false;
+    }
+
+    /* Bits we need that the kernel did not advertise */
+    missing = (uint64_t)(~api.ioctls & required);
+    if (missing != 0) {
+        error_report("Missing userfault features: %" PRIx64, missing);
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Probe whether this host can run incoming postcopy: the target page must
+ * not be larger than the host page, userfaultfd must open and pass
+ * ufd_version_check(), and registering an anonymous test page must
+ * advertise the WAKE, COPY and ZEROPAGE ioctls.
+ *
+ * Note: This has the side effect of munlock'ing all of RAM, that's
+ * normally fine since if the postcopy succeeds it gets turned back on at the
+ * end.
+ *
+ * Returns true only if every check passes; each failure is reported with
+ * error_report() and all temporary resources are released.
+ */
+bool postcopy_ram_supported_by_host(void)
+{
+    long pagesize = getpagesize();
+    int ufd = -1;
+    bool ret = false; /* Error unless we change it */
+    void *testarea = NULL;
+    struct uffdio_register reg_struct;
+    struct uffdio_range range_struct;
+    uint64_t feature_mask;
+
+    if ((1ul << qemu_target_page_bits()) > pagesize) {
+        error_report("Target page size bigger than host page size");
+        goto out;
+    }
+
+    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+    if (ufd == -1) {
+        error_report("%s: userfaultfd not available: %s", __func__,
+                     strerror(errno));
+        goto out;
+    }
+
+    /* Version and features check */
+    if (!ufd_version_check(ufd)) {
+        goto out;
+    }
+
+    /*
+     * userfault and mlock don't go together; we'll put it back later if
+     * it was enabled.
+     */
+    if (munlockall()) {
+        error_report("%s: munlockall: %s", __func__, strerror(errno));
+        /*
+         * Must not 'return -1' here: in a bool function that reads as
+         * true (supported) and it would also leak ufd.
+         */
+        goto out;
+    }
+
+    /*
+     * We need to check that the ops we need are supported on anon memory
+     * To do that we need to register a chunk and see the flags that
+     * are returned.
+     */
+    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
+                                    MAP_ANONYMOUS, -1, 0);
+    if (testarea == MAP_FAILED) {
+        error_report("%s: Failed to map test area: %s", __func__,
+                     strerror(errno));
+        /* MAP_FAILED is non-NULL; reset so 'out' does not munmap it */
+        testarea = NULL;
+        goto out;
+    }
+    g_assert(((size_t)testarea & (pagesize-1)) == 0);
+
+    reg_struct.range.start = (uintptr_t)testarea;
+    reg_struct.range.len = pagesize;
+    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
+        error_report("%s userfault register: %s", __func__, strerror(errno));
+        goto out;
+    }
+
+    range_struct.start = (uintptr_t)testarea;
+    range_struct.len = pagesize;
+    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
+        error_report("%s userfault unregister: %s", __func__, strerror(errno));
+        goto out;
+    }
+
+    /* reg_struct.ioctls was filled in by the UFFDIO_REGISTER call above */
+    feature_mask = (__u64)1 << _UFFDIO_WAKE |
+                   (__u64)1 << _UFFDIO_COPY |
+                   (__u64)1 << _UFFDIO_ZEROPAGE;
+    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
+        error_report("Missing userfault map features: %" PRIx64,
+                     (uint64_t)(~reg_struct.ioctls & feature_mask));
+        goto out;
+    }
+
+    /* Success! */
+    ret = true;
+out:
+    if (testarea) {
+        munmap(testarea, pagesize);
+    }
+    if (ufd != -1) {
+        close(ufd);
+    }
+    return ret;
+}
+
+/**
+ * postcopy_ram_discard_range: Discard a range of memory.
+ * We can assume that if we've been called postcopy_ram_supported_by_host
+ * returned true.
+ *
+ * @mis: Current incoming migration state (not read by this function).
+ * @start, @length: range of memory to discard.
+ *
+ * returns: 0 on success, -1 if madvise(MADV_DONTNEED) failed.
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+ size_t length)
+{
+ trace_postcopy_ram_discard_range(start, length);
+ if (madvise(start, length, MADV_DONTNEED)) {
+ error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * qemu_ram_foreach_block() callback: prepare one area of RAM so that it
+ * *can* be used for postcopy later; has to run right at the start, before
+ * any pre-copy data arrives.
+ * opaque should be the MIS.
+ * Returns 0 on success, -1 if the discard failed.
+ */
+static int init_range(const char *block_name, void *host_addr,
+                      ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *incoming = opaque;
+    int discard_failed;
+
+    trace_postcopy_init_range(block_name, host_addr, offset, length);
+
+    /*
+     * Postcopy needs all of RAM to be genuinely empty, so ROMs and any
+     * tables built during init are thrown away here - the source will
+     * send its copy anyway.  (Plain precopy simply overwrites the data
+     * and so has no need for this discard.)
+     */
+    discard_failed = postcopy_ram_discard_range(incoming, host_addr, length);
+
+    return discard_failed ? -1 : 0;
+}
+
+/*
+ * qemu_ram_foreach_block() callback run at the end of migration to undo
+ * the per-block postcopy setup: hugepages are allowed again and the range
+ * is unregistered from userfault.
+ * opaque should be the MIS.
+ * Returns 0 on success, -1 if either step fails.
+ */
+static int cleanup_range(const char *block_name, void *host_addr,
+                        ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *incoming = opaque;
+    struct uffdio_range unreg;
+
+    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
+
+    /*
+     * Hugepages were turned off for the precopy stage when postcopy was
+     * enabled; they can be turned back on now.
+     */
+    if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE) != 0) {
+        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    /*
+     * All pages should be present by now, so userfault can go as well.
+     * (Leaving it registered can help when debugging whether postcopy
+     * really does receive every page.)
+     */
+    unreg.start = (uintptr_t)host_addr;
+    unreg.len = length;
+    if (ioctl(incoming->userfault_fd, UFFDIO_UNREGISTER, &unreg) != 0) {
+        error_report("%s: userfault unregister %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Initialise postcopy-ram: run init_range() over every RAM block so that
+ * RAM is in a state from which postcopy can be entered later; must be
+ * called prior to any precopy.
+ * called from arch_init's similarly named ram_postcopy_incoming_init
+ * Returns 0 on success, -1 if the block walk reports a failure.
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    int walk_failed = qemu_ram_foreach_block(init_range, mis);
+
+    return walk_failed ? -1 : 0;
+}
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ *
+ * If a fault thread exists: runs cleanup_range() over all RAM blocks, asks
+ * the thread to quit through the eventfd, joins it and closes both fds.
+ * Then un-inhibits the balloon, re-applies mlock if it was enabled, moves
+ * the postcopy state to POSTCOPY_INCOMING_END, sends SHUT on the return
+ * path and unmaps the temporary page.
+ *
+ * Returns 0, or -1 if the cleanup_range() walk fails.
+ * NOTE(review): that early return leaves the fault thread running and the
+ * fds open - confirm callers treat it as fatal.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+ trace_postcopy_ram_incoming_cleanup_entry();
+
+ if (mis->have_fault_thread) {
+ uint64_t tmp64;
+
+ if (qemu_ram_foreach_block(cleanup_range, mis)) {
+ return -1;
+ }
+ /*
+ * Tell the fault_thread to exit, it's an eventfd that should
+ * currently be at 0, we're going to increment it to 1
+ */
+ tmp64 = 1;
+ if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
+ trace_postcopy_ram_incoming_cleanup_join();
+ qemu_thread_join(&mis->fault_thread);
+ } else {
+ /* Not much we can do here, but may as well report it */
+ error_report("%s: incrementing userfault_quit_fd: %s", __func__,
+ strerror(errno));
+ }
+ trace_postcopy_ram_incoming_cleanup_closeuf();
+ close(mis->userfault_fd);
+ close(mis->userfault_quit_fd);
+ mis->have_fault_thread = false;
+ }
+
+ qemu_balloon_inhibit(false); /* undoes the inhibit set in postcopy_ram_enable_notify() */
+
+ if (enable_mlock) {
+ if (os_mlock() < 0) {
+ error_report("mlock: %s", strerror(errno));
+ /*
+ * It doesn't feel right to fail at this point, we have a valid
+ * VM state.
+ */
+ }
+ }
+
+ postcopy_state_set(POSTCOPY_INCOMING_END);
+ migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0); /* second argument is 1 if the incoming stream had an error, else 0 */
+
+ if (mis->postcopy_tmp_page) {
+ munmap(mis->postcopy_tmp_page, getpagesize());
+ mis->postcopy_tmp_page = NULL;
+ }
+ trace_postcopy_ram_incoming_cleanup_exit();
+ return 0;
+}
+
+/*
+ * qemu_ram_foreach_block() callback: disable huge pages on one area.
+ * Returns 0 on success, -1 if the madvise fails.
+ */
+static int nhp_range(const char *block_name, void *host_addr,
+                    ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    int rc;
+
+    trace_postcopy_nhp_range(block_name, host_addr, offset, length);
+
+    /*
+     * Discards that follow must really delete parts of a page even where
+     * THP would like to keep a hugepage, so hugepages are forced off
+     * before any discard is done.
+     */
+    rc = qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
+    if (rc == 0) {
+        return 0;
+    }
+
+    error_report("%s: NOHUGEPAGE: %s", __func__, strerror(errno));
+    return -1;
+}
+
+/*
+ * Userfault needs RAM marked NOHUGEPAGE before discards happen; doing it
+ * only now, after precopy, means most of the precopy data is still THPd.
+ * Runs nhp_range() over every RAM block and, on success, moves the
+ * incoming postcopy state to POSTCOPY_INCOMING_DISCARD.
+ * Returns 0 on success, -1 if the block walk fails (state left unchanged).
+ */
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+    int walk_failed = qemu_ram_foreach_block(nhp_range, mis);
+
+    if (!walk_failed) {
+        postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
+        return 0;
+    }
+
+    return -1;
+}
+
+/*
+ * qemu_ram_foreach_block() callback: register one area of RAM with the
+ * userfault fd in MISSING mode, so accesses to unwritten parts of it are
+ * reported.
+ *   host_addr: Base of area to mark
+ *   offset: Offset in the whole ram arena
+ *   length: Length of the section
+ *   opaque: MigrationIncomingState pointer
+ * Returns 0 on success, -1 if the register ioctl fails
+ */
+static int ram_block_enable_notify(const char *block_name, void *host_addr,
+                                   ram_addr_t offset, ram_addr_t length,
+                                   void *opaque)
+{
+    MigrationIncomingState *incoming = opaque;
+    struct uffdio_register reg;
+
+    reg.range.start = (uintptr_t)host_addr;
+    reg.range.len = length;
+    reg.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+    /* Hand responsibility for this area to our userfault_fd */
+    if (ioctl(incoming->userfault_fd, UFFDIO_REGISTER, &reg) != 0) {
+        error_report("%s userfault register: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Handle faults detected by the USERFAULT markings
+ *
+ * Thread entry point; @opaque is the MigrationIncomingState.  Posts
+ * fault_thread_sem once started, then loops polling userfault_fd and
+ * userfault_quit_fd.  Each page fault is turned into a request to the
+ * source for the host page containing the faulting address.  The loop
+ * ends when the quit eventfd fires or on an unrecoverable error.
+ * Always returns NULL.
+ */
+static void *postcopy_ram_fault_thread(void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffd_msg msg;
+    int ret;
+    size_t hostpagesize = getpagesize();
+    RAMBlock *rb = NULL;
+    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
+
+    trace_postcopy_ram_fault_thread_entry();
+    qemu_sem_post(&mis->fault_thread_sem);
+
+    while (true) {
+        ram_addr_t rb_offset;
+        ram_addr_t in_raspace;
+        struct pollfd pfd[2];
+
+        /*
+         * We're mainly waiting for the kernel to give us a faulting HVA,
+         * however we can be told to quit via userfault_quit_fd which is
+         * an eventfd
+         */
+        pfd[0].fd = mis->userfault_fd;
+        pfd[0].events = POLLIN;
+        pfd[0].revents = 0;
+        pfd[1].fd = mis->userfault_quit_fd;
+        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
+        pfd[1].revents = 0;
+
+        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
+            error_report("%s: userfault poll: %s", __func__, strerror(errno));
+            break;
+        }
+
+        if (pfd[1].revents) {
+            trace_postcopy_ram_fault_thread_quit();
+            break;
+        }
+
+        ret = read(mis->userfault_fd, &msg, sizeof(msg));
+        if (ret != sizeof(msg)) {
+            if (ret < 0) {
+                /* errno is only meaningful when read() itself failed */
+                if (errno == EAGAIN) {
+                    /*
+                     * if a wake up happens on the other thread just after
+                     * the poll, there is nothing to read.
+                     */
+                    continue;
+                }
+                error_report("%s: Failed to read full userfault message: %s",
+                             __func__, strerror(errno));
+                break;
+            } else {
+                error_report("%s: Read %d bytes from userfaultfd expected %zu",
+                             __func__, ret, sizeof(msg));
+                break; /* Lost alignment, don't know what we'd read next */
+            }
+        }
+        if (msg.event != UFFD_EVENT_PAGEFAULT) {
+            /* %u, not %ud: the latter prints the number followed by a 'd' */
+            error_report("%s: Read unexpected event %u from userfaultfd",
+                         __func__, msg.event);
+            continue; /* It's not a page fault, shouldn't happen */
+        }
+
+        rb = qemu_ram_block_from_host(
+                 (void *)(uintptr_t)msg.arg.pagefault.address,
+                 true, &in_raspace, &rb_offset);
+        if (!rb) {
+            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
+                         PRIx64, (uint64_t)msg.arg.pagefault.address);
+            break;
+        }
+
+        /* Round the offset down to the start of its host page */
+        rb_offset &= ~(hostpagesize - 1);
+        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
+                                                qemu_ram_get_idstr(rb),
+                                                rb_offset);
+
+        /*
+         * Send the request to the source - we want to request one
+         * of our host page sizes (which is >= TPS)
+         */
+        if (rb != last_rb) {
+            last_rb = rb;
+            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
+                                      rb_offset, hostpagesize);
+        } else {
+            /* Save some space */
+            migrate_send_rp_req_pages(mis, NULL,
+                                      rb_offset, hostpagesize);
+        }
+    }
+    trace_postcopy_ram_fault_thread_exit();
+    return NULL;
+}
+
+/*
+ * Open the userfault fd and the quit eventfd, start the fault thread, then
+ * register every RAM block for missing-page notification and inhibit
+ * ballooning.
+ * Returns 0 on success, -1 on failure.  Failures before the fault thread
+ * exists close whatever fds were opened here; once the thread is running
+ * (have_fault_thread set) teardown is left to
+ * postcopy_ram_incoming_cleanup().
+ */
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+    /* Open the fd for the kernel to give us userfaults */
+    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+    if (mis->userfault_fd == -1) {
+        error_report("%s: Failed to open userfault fd: %s", __func__,
+                     strerror(errno));
+        return -1;
+    }
+
+    /*
+     * Although the host check already tested the API, we need to
+     * do the check again as an ABI handshake on the new fd.
+     */
+    if (!ufd_version_check(mis->userfault_fd)) {
+        /* Don't leak the fd; nothing else will close it on this path */
+        close(mis->userfault_fd);
+        return -1;
+    }
+
+    /* Now an eventfd we use to tell the fault-thread to quit */
+    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
+    if (mis->userfault_quit_fd == -1) {
+        error_report("%s: Opening userfault_quit_fd: %s", __func__,
+                     strerror(errno));
+        close(mis->userfault_fd);
+        return -1;
+    }
+
+    /* The semaphore is only used to wait until the thread has started */
+    qemu_sem_init(&mis->fault_thread_sem, 0);
+    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
+                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
+    qemu_sem_wait(&mis->fault_thread_sem);
+    qemu_sem_destroy(&mis->fault_thread_sem);
+    mis->have_fault_thread = true;
+
+    /* Mark so that we get notified of accesses to unwritten areas */
+    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
+        return -1;
+    }
+
+    /*
+     * Ballooning can mark pages as absent while we're postcopying
+     * that would cause false userfaults.
+     */
+    qemu_balloon_inhibit(true);
+
+    trace_postcopy_ram_enable_notify();
+
+    return 0;
+}
+
+/*
+ * Atomically place one host page, taken from (from), at (host) by means
+ * of UFFDIO_COPY.
+ * returns 0 on success, or the negated errno of the failed ioctl
+ */
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+    struct uffdio_copy req;
+    int saved_errno;
+
+    req.dst = (uint64_t)(uintptr_t)host;
+    req.src = (uint64_t)(uintptr_t)from;
+    req.len = getpagesize();
+    req.mode = 0;
+
+    /*
+     * The copy doubles as the ack that lets the kernel wake the stalled
+     * thread.
+     * TODO: that ack could be inhibited and issued only when requested,
+     *       which would be slightly cheaper, at the cost of having to be
+     *       careful about the order in which our page state is updated.
+     */
+    if (ioctl(mis->userfault_fd, UFFDIO_COPY, &req) == 0) {
+        trace_postcopy_place_page(host);
+        return 0;
+    }
+
+    /* Capture errno before error_report() gets a chance to change it */
+    saved_errno = errno;
+    error_report("%s: %s copy host: %p from: %p",
+                 __func__, strerror(saved_errno), host, from);
+    return -saved_errno;
+}
+
+/*
+ * Atomically map a zero-filled host page at (host) by issuing
+ * UFFDIO_ZEROPAGE on the userfault fd.
+ *
+ * Returns 0 on success, or -errno from the failed ioctl.
+ */
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+    struct uffdio_zeropage req;
+    int saved_errno;
+
+    req.range.start = (uint64_t)(uintptr_t)host;
+    req.range.len = getpagesize();
+    req.mode = 0;
+
+    if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &req) == 0) {
+        trace_postcopy_place_page_zero(host);
+        return 0;
+    }
+
+    /* Capture errno before error_report() has a chance to change it */
+    saved_errno = errno;
+    error_report("%s: %s zero host: %p",
+                 __func__, strerror(saved_errno), host);
+
+    return -saved_errno;
+}
+
+/*
+ * Returns a host-page-sized (getpagesize()) anonymous mapping that can be
+ * mapped at a later point in time using postcopy_place_page.
+ * The mapping is created on first use and cached in mis->postcopy_tmp_page;
+ * the same address is used repeatedly, postcopy_place_page just takes the
+ * backing page away.
+ *
+ * Returns: Pointer to the page, or NULL if the mapping could not be created
+ *          (in which case nothing is cached and a later call retries).
+ */
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+    if (!mis->postcopy_tmp_page) {
+        void *page = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+        /* mmap() signals failure with MAP_FAILED, never with NULL */
+        if (page == MAP_FAILED) {
+            error_report("%s: %s", __func__, strerror(errno));
+            return NULL;
+        }
+        mis->postcopy_tmp_page = page;
+    }
+
+    return mis->postcopy_tmp_page;
+}
+
+#else
+/* No target OS support, stubs just fail */
+/* Reports the lack of OS support and returns false. */
+bool postcopy_ram_supported_by_host(void)
+{
+ error_report("%s: No OS support", __func__);
+ return false;
+}
+
+/* Reports the lack of OS support and returns -1. */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+ error_report("postcopy_ram_incoming_init: No OS support");
+ return -1;
+}
+
+/*
+ * The remaining stubs assert(0): presumably they are unreachable because
+ * postcopy_ram_supported_by_host() returned false - confirm against callers.
+ * The return after each assert only supplies a value when assertions are
+ * compiled out.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+ size_t length)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+ assert(0);
+ return -1;
+}
+
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+ assert(0);
+ return -1;
+}
+
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+ assert(0);
+ return NULL;
+}
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+/**
+ * postcopy_discard_send_init: Called at the start of each RAMBlock before
+ *   asking to discard individual ranges.
+ *
+ * @ms: The current migration state (not used by this function).
+ * @offset: the bitmap offset of the named RAMBlock in the migration
+ *   bitmap.
+ * @name: RAMBlock that discards will operate on.  The pointer is stored,
+ *   not copied, so it must stay valid until postcopy_discard_send_finish().
+ *
+ * returns: a new, zero-initialised PDS; release it with
+ *   postcopy_discard_send_finish().
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+                                                 unsigned long offset,
+                                                 const char *name)
+{
+    /* g_malloc0() aborts on allocation failure, so no NULL check is needed */
+    PostcopyDiscardState *res = g_malloc0(sizeof(*res));
+
+    res->ramblock_name = name;
+    res->offset = offset;
+
+    return res;
+}
+
+/**
+ * postcopy_discard_send_range: Called by the bitmap code for each chunk to
+ *   discard.  The range is appended to the pending lists in @pds; once
+ *   MAX_DISCARDS_PER_COMMAND entries have accumulated they are sent as a
+ *   single discard command and the lists are reset.
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init().
+ * @start,@length: a range of pages in the migration bitmap in the
+ *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
+ */
+void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
+                                 unsigned long start, unsigned long length)
+{
+    size_t page_shift = qemu_target_page_bits();
+
+    /* Bitmap indexes become byte offsets relative to the RAM block start */
+    pds->start_list[pds->cur_entry] = (start - pds->offset) << page_shift;
+    pds->length_list[pds->cur_entry] = length << page_shift;
+    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
+    pds->cur_entry += 1;
+    pds->nsentwords += 1;
+
+    if (pds->cur_entry != MAX_DISCARDS_PER_COMMAND) {
+        /* Still room in the batch; leave it queued */
+        return;
+    }
+
+    /* Batch is full: send it and start a new one */
+    qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+                                          pds->cur_entry,
+                                          pds->start_list,
+                                          pds->length_list);
+    pds->nsentcmds += 1;
+    pds->cur_entry = 0;
+}
+
+/**
+ * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
+ *   bitmap code.  Sends whatever ranges are still queued in @pds, emits the
+ *   summary trace point and frees @pds.
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init(); must not be
+ *   used after this call.
+ */
+void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
+{
+    if (pds->cur_entry != 0) {
+        /* A partially filled batch is still pending */
+        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+                                              pds->cur_entry,
+                                              pds->start_list,
+                                              pds->length_list);
+        pds->nsentcmds += 1;
+    }
+
+    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
+                                       pds->nsentcmds);
+
+    g_free(pds);
+}
diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c
index 809bf070d7..c503b027a9 100644
--- a/migration/qemu-file-unix.c
+++ b/migration/qemu-file-unix.c
@@ -22,6 +22,7 @@
* THE SOFTWARE.
*/
#include "qemu-common.h"
+#include "qemu/error-report.h"
#include "qemu/iov.h"
#include "qemu/sockets.h"
#include "qemu/coroutine.h"
@@ -39,12 +40,43 @@ static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
QEMUFileSocket *s = opaque;
ssize_t len;
ssize_t size = iov_size(iov, iovcnt);
+ ssize_t offset = 0;
+ int err;
- len = iov_send(s->fd, iov, iovcnt, 0, size);
- if (len < size) {
- len = -socket_error();
- }
- return len;
+ while (size > 0) {
+ len = iov_send(s->fd, iov, iovcnt, offset, size);
+
+ if (len > 0) {
+ size -= len;
+ offset += len;
+ }
+
+ if (size > 0) {
+ err = socket_error();
+
+ if (err != EAGAIN && err != EWOULDBLOCK) {
+ error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)",
+ err, (size_t)size, (size_t)len);
+ /*
+ * If I've already sent some but only just got the error, I
+ * could return the amount validly sent so far and wait for the
+ * next call to report the error, but I'd rather flag the error
+ * immediately.
+ */
+ return -err;
+ }
+
+ /* Emulate blocking */
+ GPollFD pfd;
+
+ pfd.fd = s->fd;
+ pfd.events = G_IO_OUT | G_IO_ERR;
+ pfd.revents = 0;
+ g_poll(&pfd, 1 /* 1 fd */, -1 /* no timeout */);
+ }
+ }
+
+ return offset;
}
static int socket_get_fd(void *opaque)
@@ -97,6 +129,56 @@ static int socket_shutdown(void *opaque, bool rd, bool wr)
}
}
+/*
+ * Close callback for a return-path QEMUFile: releases only the
+ * QEMUFileSocket wrapper.  Always returns 0.
+ */
+static int socket_return_close(void *opaque)
+{
+    QEMUFileSocket *wrapper = opaque;
+
+    /*
+     * Note: We don't close the socket, that should be done by the forward
+     * path.
+     */
+    g_free(wrapper);
+
+    return 0;
+}
+
+/*
+ * Ops for the read side of a return path.  Uses socket_return_close so the
+ * wrapper is freed without closing the fd.
+ */
+static const QEMUFileOps socket_return_read_ops = {
+ .get_fd = socket_get_fd,
+ .get_buffer = socket_get_buffer,
+ .close = socket_return_close,
+ .shut_down = socket_shutdown,
+};
+
+/*
+ * Ops for the write side of a return path.  Uses socket_return_close so the
+ * wrapper is freed without closing the fd.
+ */
+static const QEMUFileOps socket_return_write_ops = {
+ .get_fd = socket_get_fd,
+ .writev_buffer = socket_writev_buffer,
+ .close = socket_return_close,
+ .shut_down = socket_shutdown,
+};
+
+/*
+ * Build a QEMUFile on the same socket fd as the forward file, carrying data
+ * in the opposite direction.
+ *
+ * Returns NULL when the forward file is already in error; otherwise the new
+ * QEMUFile, whose close callback frees only its own wrapper.
+ */
+static QEMUFile *socket_get_return_path(void *opaque)
+{
+    QEMUFileSocket *forward = opaque;
+    QEMUFileSocket *reverse;
+    const QEMUFileOps *reverse_ops;
+
+    if (qemu_file_get_error(forward->file)) {
+        /* If the forward file is in error, don't try and open a return */
+        return NULL;
+    }
+
+    reverse = g_malloc0(sizeof(QEMUFileSocket));
+    reverse->fd = forward->fd;
+
+    /*
+     * I don't think there's a better way to tell which direction 'this' is:
+     * a forward file with get_buffer is the read side, so the return path
+     * needs to be able to write, and vice versa.
+     */
+    reverse_ops = (forward->file->ops->get_buffer != NULL)
+                      ? &socket_return_write_ops
+                      : &socket_return_read_ops;
+
+    return qemu_fopen_ops(reverse, reverse_ops);
+}
+
static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
int64_t pos)
{
@@ -206,18 +288,19 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
}
static const QEMUFileOps socket_read_ops = {
- .get_fd = socket_get_fd,
- .get_buffer = socket_get_buffer,
- .close = socket_close,
- .shut_down = socket_shutdown
-
+ .get_fd = socket_get_fd,
+ .get_buffer = socket_get_buffer,
+ .close = socket_close,
+ .shut_down = socket_shutdown,
+ .get_return_path = socket_get_return_path
};
static const QEMUFileOps socket_write_ops = {
- .get_fd = socket_get_fd,
- .writev_buffer = socket_writev_buffer,
- .close = socket_close,
- .shut_down = socket_shutdown
+ .get_fd = socket_get_fd,
+ .writev_buffer = socket_writev_buffer,
+ .close = socket_close,
+ .shut_down = socket_shutdown,
+ .get_return_path = socket_get_return_path
};
QEMUFile *qemu_fopen_socket(int fd, const char *mode)
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index df49023ed8..0bbd2574a8 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -44,6 +44,18 @@ int qemu_file_shutdown(QEMUFile *f)
return f->ops->shut_down(f->opaque, true, true);
}
+/*
+ * Ask the transport behind @f for a 'return path' QEMUFile carrying data in
+ * the opposite direction.
+ *
+ * Result: the return-path QEMUFile*, or NULL if the transport provides no
+ *         get_return_path operation (the operation itself may also return
+ *         NULL).
+ */
+QEMUFile *qemu_file_get_return_path(QEMUFile *f)
+{
+    if (f->ops->get_return_path) {
+        return f->ops->get_return_path(f->opaque);
+    }
+
+    return NULL;
+}
+
bool qemu_file_mode_is_not_valid(const char *mode)
{
if (mode == NULL ||
@@ -434,6 +446,43 @@ size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size)
}
/*
+ * Read 'size' bytes of data from the file, avoiding a copy when possible.
+ * 'size' can be larger than the internal buffer.
+ *
+ * On return the data is in one of two places:
+ *   - the file's internal buffer, in which case *buf is updated to point
+ *     to it and the data is valid until the next qemu_file operation; or
+ *   - the caller's buffer, i.e. the *buf that was passed in.
+ *
+ * The in-place path is only attempted for size < IO_BUF_SIZE and only taken
+ * when the peek returns all 'size' bytes; otherwise this falls back to
+ * qemu_get_buffer().
+ *
+ * It will return size bytes unless there was an error, in which case it will
+ * return as many as it managed to read (assuming blocking fd's which
+ * all current QEMUFile are)
+ *
+ * Note: Since **buf may get changed, the caller should take care to
+ * keep a pointer to the original buffer if it needs to deallocate it.
+ */
+size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
+{
+    if (size < IO_BUF_SIZE) {
+        uint8_t *peeked;
+        size_t got = qemu_peek_buffer(f, &peeked, size, 0);
+
+        if (got == size) {
+            /* Everything is already buffered: consume it where it lies */
+            qemu_file_skip(f, got);
+            *buf = peeked;
+            return got;
+        }
+    }
+
+    return qemu_get_buffer(f, *buf, size);
+}
+
+/*
* Peeks a single byte from the buffer; this isn't guaranteed to work if
* offset leaves a gap after the previous read/peeked data.
*/
@@ -611,3 +660,18 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256])
return res == len ? res : 0;
}
+
+/*
+ * Switch the fd behind the QEMUFile between blocking (block=true) and
+ * non-blocking (block=false) mode.
+ * Note: On some transports the OS only keeps a single blocking state for
+ *       both directions, and thus changing the blocking on the main
+ *       QEMUFile can also affect the return path.
+ */
+void qemu_file_set_blocking(QEMUFile *f, bool block)
+{
+    int fd = qemu_get_fd(f);
+
+    if (!block) {
+        qemu_set_nonblock(fd);
+    } else {
+        qemu_set_block(fd);
+    }
+}
diff --git a/migration/ram.c b/migration/ram.c
index df3df9e3bf..62cf42bfdb 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -32,6 +32,7 @@
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
@@ -237,7 +238,14 @@ typedef struct PageSearchStatus PageSearchStatus;
static struct BitmapRcu {
struct rcu_head rcu;
+ /* Main migration bitmap */
unsigned long *bmap;
+ /* bitmap of pages that haven't been sent even once
+ * only maintained and used in postcopy at the moment
+ * where it's used to send the dirtymap at the start
+ * of the postcopy phase
+ */
+ unsigned long *unsentmap;
} *migration_bitmap_rcu;
struct CompressParam {
@@ -531,10 +539,18 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
return 1;
}
-/* Called with rcu_read_lock() to protect migration_bitmap */
+/* Called with rcu_read_lock() to protect migration_bitmap
+ * rb: The RAMBlock to search for dirty pages in
+ * start: Start address (typically so we can continue from previous page)
+ * ram_addr_abs: Pointer into which to store the address of the dirty page
+ * within the global ram_addr space
+ *
+ * Returns: byte offset within memory region of the start of a dirty page
+ */
static inline
-ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
- ram_addr_t start)
+ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
+ ram_addr_t start,
+ ram_addr_t *ram_addr_abs)
{
unsigned long base = rb->offset >> TARGET_PAGE_BITS;
unsigned long nr = base + (start >> TARGET_PAGE_BITS);
@@ -551,14 +567,24 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
next = find_next_bit(bitmap, size, nr);
}
- if (next < size) {
- clear_bit(next, bitmap);
+ *ram_addr_abs = next << TARGET_PAGE_BITS;
+ return (next - base) << TARGET_PAGE_BITS;
+}
+
+/*
+ * Clear the dirty bit for the target page containing addr (an address in the
+ * global ram_addr space) in the main migration bitmap.
+ *
+ * Returns true if the page was dirty, in which case migration_dirty_pages
+ * is decremented; false if the bit was already clear.
+ */
+static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
+{
+ bool ret;
+ /* unsigned long, not int: the page index must not truncate for large RAM */
+ unsigned long nr = addr >> TARGET_PAGE_BITS;
+ unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+
+ ret = test_and_clear_bit(nr, bitmap);
+
+ if (ret) {
migration_dirty_pages--;
}
- return (next - base) << TARGET_PAGE_BITS;
+ return ret;
}
-/* Called with rcu_read_lock() to protect migration_bitmap */
static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
unsigned long *bitmap;
@@ -951,12 +977,14 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
* @f: Current migration stream.
* @pss: Data about the state of the current dirty page scan.
* @*again: Set to false if the search has scanned the whole of RAM
+ * @ram_addr_abs: Pointer into which to store the address of the dirty page
+ * within the global ram_addr space
*/
static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
- bool *again)
+ bool *again, ram_addr_t *ram_addr_abs)
{
- pss->offset = migration_bitmap_find_and_reset_dirty(pss->block,
- pss->offset);
+ pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
+ ram_addr_abs);
if (pss->complete_round && pss->block == last_seen_block &&
pss->offset >= last_offset) {
/*
@@ -995,6 +1023,276 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
}
}
+/*
+ * Helper for 'get_queued_page' - takes one target page off the head request
+ * of the queue, under src_page_req_mutex.
+ *
+ * A request longer than one target page is shrunk in place and stays queued;
+ * otherwise the request is removed, its memory region reference dropped and
+ * the entry freed.
+ *
+ * ms:           MigrationState in
+ * *offset:      Used to return the offset within the RAMBlock
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns:      block (or NULL if none available, in which case the output
+ *               parameters are not written)
+ */
+static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
+                              ram_addr_t *ram_addr_abs)
+{
+    struct MigrationSrcPageRequest *req;
+    RAMBlock *found = NULL;
+
+    qemu_mutex_lock(&ms->src_page_req_mutex);
+    if (QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
+        goto out;
+    }
+
+    req = QSIMPLEQ_FIRST(&ms->src_page_requests);
+    found = req->rb;
+    *offset = req->offset;
+    *ram_addr_abs = (req->offset + req->rb->offset) &
+                    TARGET_PAGE_MASK;
+
+    if (req->len > TARGET_PAGE_SIZE) {
+        /* More pages remain in this request: advance it by one page */
+        req->len -= TARGET_PAGE_SIZE;
+        req->offset += TARGET_PAGE_SIZE;
+    } else {
+        /* Last page of the request: retire the entry */
+        memory_region_unref(found->mr);
+        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+        g_free(req);
+    }
+
+out:
+    qemu_mutex_unlock(&ms->src_page_req_mutex);
+
+    return found;
+}
+
+/*
+ * Unqueue a page from the queue fed by postcopy page requests; skips pages
+ * that are already sent (!dirty)
+ *
+ * ms: MigrationState in
+ * pss: PageSearchStatus structure updated with found block/offset
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns: true if a queued page is found
+ */
+static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
+ ram_addr_t *ram_addr_abs)
+{
+ RAMBlock *block;
+ ram_addr_t offset;
+ bool dirty;
+
+ do {
+ block = unqueue_page(ms, &offset, ram_addr_abs);
+ /*
+ * We're sending this page, and since it's postcopy nothing else
+ * will dirty it, and we must make sure it doesn't get sent again
+ * even if this queue request was received after the background
+ * search already sent it.
+ */
+ if (block) {
+ unsigned long *bitmap;
+ bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+ dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
+ if (!dirty) {
+ trace_get_queued_page_not_dirty(
+ block->idstr, (uint64_t)offset,
+ (uint64_t)*ram_addr_abs,
+ test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
+ atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
+ } else {
+ trace_get_queued_page(block->idstr,
+ (uint64_t)offset,
+ (uint64_t)*ram_addr_abs);
+ }
+ }
+
+ /* 'dirty' is only read when block != NULL, where it has been assigned */
+ } while (block && !dirty);
+
+ if (block) {
+ /*
+ * As soon as we start servicing pages out of order, then we have
+ * to kill the bulk stage, since the bulk stage assumes
+ * in (migration_bitmap_find_dirty) that every page is
+ * dirty, that's no longer true.
+ */
+ ram_bulk_stage = false;
+
+ /*
+ * We want the background search to continue from the queued page
+ * since the guest is likely to want other pages near to the page
+ * it just requested.
+ */
+ pss->block = block;
+ pss->offset = offset;
+ }
+
+ return !!block;
+}
+
+/**
+ * flush_page_queue: Drop every entry left in the ram request queue,
+ *   releasing the memory region reference each one holds.
+ *   The queue should be empty at the end anyway, but in error cases (e.g. a
+ *   failed migration) there may be some left.
+ *
+ * NOTE(review): unlike unqueue_page() this does not take src_page_req_mutex;
+ *   presumably no other thread can touch the queue by now - confirm.
+ *
+ * ms: MigrationState
+ */
+void flush_page_queue(MigrationState *ms)
+{
+    struct MigrationSrcPageRequest *req;
+
+    rcu_read_lock();
+    while (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
+        req = QSIMPLEQ_FIRST(&ms->src_page_requests);
+        memory_region_unref(req->rb->mr);
+        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+        g_free(req);
+    }
+    rcu_read_unlock();
+}
+
+/**
+ * ram_save_queue_pages: Queue pages for transmission, e.g. a request from
+ *   the postcopy destination.
+ *
+ * @ms: MigrationState in which the queue is held
+ * @rbname: The RAMBlock the request is for - may be NULL (to mean reuse the
+ *   block of the previous request)
+ * @start: Offset from the start of the RAMBlock
+ * @len: Length (in bytes) to send
+ *
+ * The request is rejected if no block can be identified or if
+ * [start, start+len) does not lie within the block's used_length.
+ * A queued entry holds a reference on the block's memory region.
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int ram_save_queue_pages(MigrationState *ms, const char *rbname,
+                         ram_addr_t start, ram_addr_t len)
+{
+    struct MigrationSrcPageRequest *new_entry;
+    RAMBlock *ramblock;
+
+    rcu_read_lock();
+    if (!rbname) {
+        /* Reuse last RAMBlock */
+        ramblock = ms->last_req_rb;
+
+        if (!ramblock) {
+            /*
+             * Shouldn't happen, we can't reuse the last RAMBlock if
+             * it's the 1st request.
+             */
+            error_report("ram_save_queue_pages no previous block");
+            goto err;
+        }
+    } else {
+        ramblock = qemu_ram_block_by_name(rbname);
+
+        if (!ramblock) {
+            /* We shouldn't be asked for a non-existent RAMBlock */
+            error_report("ram_save_queue_pages no block '%s'", rbname);
+            goto err;
+        }
+        ms->last_req_rb = ramblock;
+    }
+    trace_ram_save_queue_pages(ramblock->idstr, start, len);
+
+    /*
+     * start and len come from the other side of the migration, so the range
+     * check must not be defeatable by start + len wrapping around.
+     */
+    if (start > ramblock->used_length ||
+        len > ramblock->used_length - start) {
+        /* ram_addr_t is not necessarily size_t; print via uint64_t */
+        error_report("%s request overrun start=%" PRIx64 " len=%" PRIx64
+                     " blocklen=%" PRIx64,
+                     __func__, (uint64_t)start, (uint64_t)len,
+                     (uint64_t)ramblock->used_length);
+        goto err;
+    }
+
+    new_entry = g_malloc0(sizeof(*new_entry));
+    new_entry->rb = ramblock;
+    new_entry->offset = start;
+    new_entry->len = len;
+
+    /* Dropped again when the entry is retired from the queue */
+    memory_region_ref(ramblock->mr);
+    qemu_mutex_lock(&ms->src_page_req_mutex);
+    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
+    qemu_mutex_unlock(&ms->src_page_req_mutex);
+    rcu_read_unlock();
+
+    return 0;
+
+err:
+    rcu_read_unlock();
+    return -1;
+}
+
+/**
+ * ram_save_target_page: Save one target page, if it is still dirty
+ *
+ * The page's dirty bit is cleared first; a page that was not dirty is
+ * skipped.  After a successful send the page is also cleared in the
+ * unsentmap, when one exists.
+ *
+ * @ms: The current migration state (not used by this function)
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page;
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ *
+ * Returns: Number of pages written (0 if the page was not dirty), or the
+ *          negative value returned by the page-save helper.
+ */
+static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
+                                RAMBlock *block, ram_addr_t offset,
+                                bool last_stage,
+                                uint64_t *bytes_transferred,
+                                ram_addr_t dirty_ram_abs)
+{
+    unsigned long *unsentmap;
+    int res;
+
+    if (!migration_bitmap_clear_dirty(dirty_ram_abs)) {
+        /* Not dirty: nothing to send */
+        return 0;
+    }
+
+    if (compression_switch && migrate_use_compression()) {
+        res = ram_save_compressed_page(f, block, offset,
+                                       last_stage,
+                                       bytes_transferred);
+    } else {
+        res = ram_save_page(f, block, offset, last_stage,
+                            bytes_transferred);
+    }
+    if (res < 0) {
+        return res;
+    }
+
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+    if (unsentmap) {
+        clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
+    }
+
+    return res;
+}
+
+/**
+ * ram_save_host_page: Starting at *offset send pages up to the end
+ *                     of the current host page.  It's valid for the initial
+ *                     offset to point into the middle of a host page
+ *                     in which case the remainder of the host page is sent.
+ *                     Only dirty target pages are sent.
+ *
+ * Returns: Number of pages written, or the negative value from the first
+ *          target page that failed.
+ *
+ * @ms: The current migration state
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page; updated to last target page
+ *          sent
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ */
+static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block,
+                              ram_addr_t *offset, bool last_stage,
+                              uint64_t *bytes_transferred,
+                              ram_addr_t dirty_ram_abs)
+{
+    int total = 0;
+
+    do {
+        int sent = ram_save_target_page(ms, f, block, *offset, last_stage,
+                                        bytes_transferred, dirty_ram_abs);
+        if (sent < 0) {
+            return sent;
+        }
+
+        total += sent;
+        *offset += TARGET_PAGE_SIZE;
+        dirty_ram_abs += TARGET_PAGE_SIZE;
+        /* Keep going until *offset reaches a host page boundary */
+    } while (*offset & (qemu_host_page_size - 1));
+
+    /* The offset we leave with is the last one we looked at */
+    *offset -= TARGET_PAGE_SIZE;
+
+    return total;
+}
+
/**
* ram_find_and_save_block: Finds a dirty page and sends it to f
*
@@ -1006,14 +1304,20 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
* @f: QEMUFile where to send the data
* @last_stage: if we are at the completion stage
* @bytes_transferred: increase it with the number of transferred bytes
+ *
+ * On systems where host-page-size > target-page-size it will send all the
+ * pages in a host page that are dirty.
*/
static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
uint64_t *bytes_transferred)
{
PageSearchStatus pss;
+ MigrationState *ms = migrate_get_current();
int pages = 0;
bool again, found;
+ ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
+ ram_addr_t space */
pss.block = last_seen_block;
pss.offset = last_offset;
@@ -1024,22 +1328,18 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
}
do {
- found = find_dirty_block(f, &pss, &again);
+ again = true;
+ found = get_queued_page(ms, &pss, &dirty_ram_abs);
- if (found) {
- if (compression_switch && migrate_use_compression()) {
- pages = ram_save_compressed_page(f, pss.block, pss.offset,
- last_stage,
- bytes_transferred);
- } else {
- pages = ram_save_page(f, pss.block, pss.offset, last_stage,
- bytes_transferred);
- }
+ if (!found) {
+ /* priority queue empty, so just search for something dirty */
+ found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
+ }
- /* if page is unmodified, continue to the next */
- if (pages > 0) {
- last_sent_block = pss.block;
- }
+ if (found) {
+ pages = ram_save_host_page(ms, f, pss.block, &pss.offset,
+ last_stage, bytes_transferred,
+ dirty_ram_abs);
}
} while (!pages && again);
@@ -1097,6 +1397,7 @@ void free_xbzrle_decoded_buf(void)
static void migration_bitmap_free(struct BitmapRcu *bmap)
{
g_free(bmap->bmap);
+ g_free(bmap->unsentmap);
g_free(bmap);
}
@@ -1153,6 +1454,13 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
qemu_mutex_lock(&migration_bitmap_mutex);
bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
bitmap_set(bitmap->bmap, old, new - old);
+
+ /* We don't have a way to safely extend the unsentmap
+ * with RCU; so mark it as missing, entry to postcopy
+ * will fail.
+ */
+ bitmap->unsentmap = NULL;
+
atomic_rcu_set(&migration_bitmap_rcu, bitmap);
qemu_mutex_unlock(&migration_bitmap_mutex);
migration_dirty_pages += new - old;
@@ -1160,6 +1468,394 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
}
}
+/*
+ * Debug helper: print a bitmap to stderr, 128 pages per line, '1' for a set
+ * bit and '.' for a clear bit, each line prefixed by its first page index.
+ *
+ * 'expected' is the value you expect the bitmap mostly to be full
+ * of; lines consisting entirely of this value are not printed.
+ * If 'todump' is null the migration bitmap is dumped.
+ */
+void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
+{
+    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+    int64_t linelen = 128;
+    char linebuf[129];
+    int64_t line_start;
+
+    if (!todump) {
+        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    }
+
+    for (line_start = 0; line_start < ram_pages; line_start += linelen) {
+        bool interesting = false;
+        int64_t col;
+
+        /*
+         * Last line; catch the case where the line length
+         * is longer than remaining ram
+         */
+        if (line_start + linelen > ram_pages) {
+            linelen = ram_pages - line_start;
+        }
+
+        for (col = 0; col < linelen; col++) {
+            bool bit = test_bit(line_start + col, todump);
+
+            linebuf[col] = bit ? '1' : '.';
+            if (bit != expected) {
+                interesting = true;
+            }
+        }
+
+        if (interesting) {
+            linebuf[col] = '\0';
+            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", line_start, linebuf);
+        }
+    }
+}
+
+/* **** functions for postcopy ***** */
+
+/*
+ * Callback from postcopy_each_ram_send_discard for each RAMBlock
+ * Note: At this point the 'unsentmap' is the processed bitmap combined
+ *       with the dirtymap; so a '1' means it's either dirty or unsent.
+ * start,length: Indexes into the bitmap for the first bit
+ *            representing the named block and length in target-pages
+ *
+ * Each maximal run of set bits inside [start, start+length) is passed to
+ * postcopy_discard_send_range().  Zero-length ranges are never sent.
+ *
+ * Returns: 0 (always)
+ */
+static int postcopy_send_discard_bm_ram(MigrationState *ms,
+                                        PostcopyDiscardState *pds,
+                                        unsigned long start,
+                                        unsigned long length)
+{
+    unsigned long end = start + length; /* one after the end */
+    unsigned long current;
+    unsigned long *unsentmap;
+
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+    for (current = start; current < end; ) {
+        unsigned long one = find_next_bit(unsentmap, end, current);
+        unsigned long zero;
+
+        if (one >= end) {
+            /*
+             * find_next_bit() returns 'end' when no bit is set; treating
+             * that as a hit would queue an empty discard range.
+             */
+            break;
+        }
+
+        zero = find_next_zero_bit(unsentmap, end, one + 1);
+        if (zero > end) {
+            zero = end;
+        }
+
+        postcopy_discard_send_range(ms, pds, one, zero - one);
+        current = zero;
+    }
+
+    return 0;
+}
+
+/*
+ * Utility for the outgoing postcopy code.
+ *   Walks every RAMBlock and calls postcopy_send_discard_bm_ram for it,
+ *   passing bitmap indexes and the block name, bracketed by
+ *   postcopy_discard_send_init/finish.
+ * Returns: 0 on success, or the first non-zero result from
+ *   postcopy_send_discard_bm_ram
+ * (qemu_ram_foreach_block ends up passing unscaled lengths
+ *  which would mean postcopy code would have to deal with target page)
+ */
+static int postcopy_each_ram_send_discard(MigrationState *ms)
+{
+    struct RAMBlock *block;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        unsigned long first_page = block->offset >> TARGET_PAGE_BITS;
+        unsigned long num_pages = block->used_length >> TARGET_PAGE_BITS;
+        PostcopyDiscardState *pds;
+        int ret;
+
+        pds = postcopy_discard_send_init(ms, first_page, block->idstr);
+
+        /*
+         * Postcopy sends chunks of bitmap over the wire, but it
+         * just needs indexes at this point, avoids it having
+         * target page specific code.
+         */
+        ret = postcopy_send_discard_bm_ram(ms, pds, first_page, num_pages);
+
+        /* Always finish: it sends what is queued and frees pds */
+        postcopy_discard_send_finish(ms, pds);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
+ * the two bitmaps, that are similar, but one is inverted.
+ *
+ * We search for runs of target-pages that don't start or end on a
+ * host page boundary;
+ * unsent_pass=true: Cleans up partially unsent host pages by searching
+ * the unsentmap
+ * unsent_pass=false: Cleans up partially dirty host pages by searching
+ * the main migration bitmap
+ *
+ * ms: Current migration state
+ * block: RAMBlock whose range of the bitmaps is scanned
+ * pds: discard state that fixup discards are queued on
+ *
+ * Every host page that gets fixed up ends with all of its target pages set
+ * in both the unsentmap and the migration bitmap, and migration_dirty_pages
+ * is bumped for each page that was not already dirty.
+ *
+ * NOTE(review): host page alignment is tested on the global bitmap index
+ * (run_start % host_ratio), which presumably relies on RAMBlock offsets being
+ * host-page aligned - confirm.
+ */
+static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
+ RAMBlock *block,
+ PostcopyDiscardState *pds)
+{
+ unsigned long *bitmap;
+ unsigned long *unsentmap;
+ /* Number of target pages per host page */
+ unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
+ /* First and last bitmap index belonging to this block */
+ unsigned long first = block->offset >> TARGET_PAGE_BITS;
+ unsigned long len = block->used_length >> TARGET_PAGE_BITS;
+ unsigned long last = first + (len - 1);
+ unsigned long run_start;
+
+ bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+ unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+
+ if (unsent_pass) {
+ /* Find a sent page */
+ run_start = find_next_zero_bit(unsentmap, last + 1, first);
+ } else {
+ /* Find a dirty page */
+ run_start = find_next_bit(bitmap, last + 1, first);
+ }
+
+ while (run_start <= last) {
+ bool do_fixup = false;
+ unsigned long fixup_start_addr;
+ unsigned long host_offset;
+
+ /*
+ * If the start of this run of pages is in the middle of a host
+ * page, then we need to fixup this host page.
+ */
+ host_offset = run_start % host_ratio;
+ if (host_offset) {
+ do_fixup = true;
+ run_start -= host_offset;
+ fixup_start_addr = run_start;
+ /* For the next pass */
+ run_start = run_start + host_ratio;
+ } else {
+ /* Find the end of this run */
+ unsigned long run_end;
+ if (unsent_pass) {
+ run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
+ } else {
+ run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
+ }
+ /*
+ * If the end isn't at the start of a host page, then the
+ * run doesn't finish at the end of a host page
+ * and we need to discard.
+ */
+ host_offset = run_end % host_ratio;
+ if (host_offset) {
+ do_fixup = true;
+ fixup_start_addr = run_end - host_offset;
+ /*
+ * This host page has gone, the next loop iteration starts
+ * from after the fixup
+ */
+ run_start = fixup_start_addr + host_ratio;
+ } else {
+ /*
+ * No discards on this iteration, next loop starts from
+ * next sent/dirty page
+ */
+ run_start = run_end + 1;
+ }
+ }
+
+ /* fixup_start_addr is only read when do_fixup is set, where it is assigned */
+ if (do_fixup) {
+ unsigned long page;
+
+ /* Tell the destination to discard this page */
+ if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
+ /* For the unsent_pass we:
+ * discard partially sent pages
+ * For the !unsent_pass (dirty) we:
+ * discard partially dirty pages that were sent
+ * (any partially sent pages were already discarded
+ * by the previous unsent_pass)
+ */
+ postcopy_discard_send_range(ms, pds, fixup_start_addr,
+ host_ratio);
+ }
+
+ /* Clean up the bitmap */
+ for (page = fixup_start_addr;
+ page < fixup_start_addr + host_ratio; page++) {
+ /* All pages in this host page are now not sent */
+ set_bit(page, unsentmap);
+
+ /*
+ * Remark them as dirty, updating the count for any pages
+ * that weren't previously dirty.
+ */
+ migration_dirty_pages += !test_and_set_bit(page, bitmap);
+ }
+ }
+
+ if (unsent_pass) {
+ /* Find the next sent page for the next iteration */
+ run_start = find_next_zero_bit(unsentmap, last + 1,
+ run_start);
+ } else {
+ /* Find the next dirty page for the next iteration */
+ run_start = find_next_bit(bitmap, last + 1, run_start);
+ }
+ }
+}
+
+/*
+ * Utility for the outgoing postcopy code.
+ *
+ * For every RAMBlock: discard any partially sent host-page size chunks and
+ * mark any partially dirty host-page size chunks as all dirty.  Nothing is
+ * done when the host and target page sizes are equal.
+ *
+ * Also resets the page search position (last_seen_block, last_sent_block,
+ * last_offset).
+ *
+ * Returns: 0 on success
+ */
+static int postcopy_chunk_hostpages(MigrationState *ms)
+{
+    struct RAMBlock *block;
+
+    /* Easy case - TPS==HPS - nothing to be done */
+    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
+        return 0;
+    }
+
+    /* Easiest way to make sure we don't resume in the middle of a host-page */
+    last_seen_block = NULL;
+    last_sent_block = NULL;
+    last_offset = 0;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        PostcopyDiscardState *pds;
+
+        pds = postcopy_discard_send_init(ms,
+                                         block->offset >> TARGET_PAGE_BITS,
+                                         block->idstr);
+
+        /* First pass: Discard all partially sent host pages */
+        postcopy_chunk_hostpages_pass(ms, true, block, pds);
+
+        /*
+         * Second pass: Ensure that all partially dirty host pages are made
+         * fully dirty.
+         */
+        postcopy_chunk_hostpages_pass(ms, false, block, pds);
+
+        postcopy_discard_send_finish(ms, pds);
+    }
+
+    return 0;
+}
+
+/*
+ * Transmit the set of pages to be discarded after precopy to the target
+ * these are pages that:
+ *     a) Have been previously transmitted but are now dirty again
+ *     b) Pages that have never been transmitted, this ensures that
+ *        any pages on the destination that have been mapped by background
+ *        tasks get discarded (transparent huge pages is the specific concern)
+ * Hopefully this is pretty sparse
+ *
+ * Returns: 0 on success, -EINVAL if there is no unsentmap, or the non-zero
+ *          result of one of the helper steps.
+ */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms)
+{
+    unsigned long *dirtymap, *unsentmap;
+    int ret;
+
+    rcu_read_lock();
+
+    /* This should be our last sync, the src is now paused */
+    migration_bitmap_sync();
+
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+    if (!unsentmap) {
+        /* We don't have a safe way to resize the unsentmap, so
+         * if the bitmap was resized it will be NULL at this
+         * point.
+         */
+        error_report("migration ram resized during precopy phase");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* Deal with TPS != HPS */
+    ret = postcopy_chunk_hostpages(ms);
+    if (ret) {
+        goto out;
+    }
+
+    /*
+     * Update the unsentmap to be unsentmap = unsentmap | dirty
+     */
+    dirtymap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    bitmap_or(unsentmap, unsentmap, dirtymap,
+              last_ram_offset() >> TARGET_PAGE_BITS);
+
+    trace_ram_postcopy_send_discard_bitmap();
+#ifdef DEBUG_POSTCOPY
+    ram_debug_dump_bitmap(unsentmap, true);
+#endif
+
+    ret = postcopy_each_ram_send_discard(ms);
+
+out:
+    rcu_read_unlock();
+
+    return ret;
+}
+
+/*
+ * At the start of the postcopy phase of migration, any now-dirty
+ * precopied pages are discarded.
+ *
+ * mis: incoming migration state, passed on to postcopy_ram_discard_range
+ * block_name: name of the RAMBlock the range lies in
+ * start, length describe a byte address range within the RAMBlock
+ *
+ * The range must lie inside the block's used_length and both of its ends
+ * must be host-page aligned in host address space.
+ *
+ * Returns 0 on success, -1 if the range is rejected, otherwise the result
+ * of postcopy_ram_discard_range.
+ */
+int ram_discard_range(MigrationIncomingState *mis,
+                      const char *block_name,
+                      uint64_t start, size_t length)
+{
+    int ret = -1;
+    RAMBlock *rb;
+    uint8_t *host_startaddr;
+    uint8_t *host_endaddr;
+
+    rcu_read_lock();
+    rb = qemu_ram_block_by_name(block_name);
+
+    if (!rb) {
+        error_report("ram_discard_range: Failed to find block '%s'",
+                     block_name);
+        goto err;
+    }
+
+    /*
+     * Validate the range before forming any host pointer from it, and in a
+     * way that cannot be defeated by start + length wrapping around.
+     */
+    if (start > rb->used_length || length > rb->used_length - start) {
+        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
+                     "/%zu/%zu)",
+                     block_name, start, length, (size_t)rb->used_length);
+        goto err;
+    }
+
+    host_startaddr = rb->host + start;
+    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
+        error_report("ram_discard_range: Unaligned start address: %p",
+                     host_startaddr);
+        goto err;
+    }
+
+    host_endaddr = host_startaddr + length;
+    if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
+        error_report("ram_discard_range: Unaligned end address: %p",
+                     host_endaddr);
+        goto err;
+    }
+
+    ret = postcopy_ram_discard_range(mis, host_startaddr, length);
+
+err:
+    rcu_read_unlock();
+
+    return ret;
+}
+
+
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
* long-running RCU critical section. When rcu-reclaims in the code
* start to become numerous it will be necessary to reduce the
@@ -1214,10 +1910,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
reset_ram_globals();
ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
- migration_bitmap_rcu = g_new(struct BitmapRcu, 1);
+ migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
+ if (migrate_postcopy_ram()) {
+ migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
+ bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
+ }
+
/*
* Count the total number of pages used by ram blocks not including any
* gaps due to alignment or unplugs.
@@ -1317,7 +2018,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
{
rcu_read_lock();
- migration_bitmap_sync();
+ if (!migration_in_postcopy(migrate_get_current())) {
+ migration_bitmap_sync();
+ }
ram_control_before_iterate(f, RAM_CONTROL_FINISH);
@@ -1344,13 +2047,16 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return 0;
}
-static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+ uint64_t *non_postcopiable_pending,
+ uint64_t *postcopiable_pending)
{
uint64_t remaining_size;
remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
- if (remaining_size < max_size) {
+ if (!migration_in_postcopy(migrate_get_current()) &&
+ remaining_size < max_size) {
qemu_mutex_lock_iothread();
rcu_read_lock();
migration_bitmap_sync();
@@ -1358,7 +2064,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
qemu_mutex_unlock_iothread();
remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
}
- return remaining_size;
+
+ /* We can do postcopy, and all the data is postcopiable */
+ *postcopiable_pending += remaining_size;
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
@@ -1399,6 +2107,14 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
/* Must be called from within a rcu critical section.
* Returns a pointer from within the RCU-protected ram_list.
*/
+/*
+ * Read a RAMBlock ID from the stream f, find the host address of the
+ * start of that block and add on 'offset'
+ *
+ * f: Stream to read from
+ * offset: Offset within the block
+ * flags: Page flags (mostly to see if it's a continuation of previous block)
+ */
static inline void *host_from_stream_offset(QEMUFile *f,
ram_addr_t offset,
int flags)
@@ -1420,14 +2136,12 @@ static inline void *host_from_stream_offset(QEMUFile *f,
qemu_get_buffer(f, (uint8_t *)id, len);
id[len] = 0;
- QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
- if (!strncmp(id, block->idstr, sizeof(id)) &&
- block->max_length > offset) {
- return block->host + offset;
- }
+ block = qemu_ram_block_by_name(id);
+ if (block && block->max_length > offset) {
+ return block->host + offset;
}
- error_report("Can't find block %s!", id);
+ error_report("Can't find block %s", id);
return NULL;
}
@@ -1535,11 +2249,148 @@ static void decompress_data_with_multi_threads(uint8_t *compbuf,
}
}
+/*
+ * Set up the data structures the incoming side needs for postcopy-ram.
+ * The real work is done by the similarly named postcopy_ram_incoming_init;
+ * this wrapper only supplies the RAM size as a count of target pages.
+ */
+int ram_postcopy_incoming_init(MigrationIncomingState *mis)
+{
+    return postcopy_ram_incoming_init(
+        mis, (size_t)(last_ram_offset() >> TARGET_PAGE_BITS));
+}
+
+/*
+ * Called in postcopy mode by ram_load().
+ * rcu_read_lock is taken prior to this being called.
+ *
+ * Reads page records from f until RAM_SAVE_FLAG_EOS is seen or an error
+ * occurs.  Page data is collected in the temporary page obtained from
+ * postcopy_get_tmp_page() and placed a whole host page at a time.
+ *
+ * Returns 0 on success, -EINVAL for a malformed stream, otherwise the
+ * error reported by the stream or by placing a page.
+ */
+static int ram_load_postcopy(QEMUFile *f)
+{
+    int flags = 0, ret = 0;
+    bool place_needed = false;
+    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    /* Temporary page that is later 'placed' */
+    void *postcopy_host_page = postcopy_get_tmp_page(mis);
+    void *last_host = NULL;
+
+    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+        ram_addr_t addr;
+        void *host = NULL;
+        void *page_buffer = NULL;
+        void *place_source = NULL;
+        uint8_t ch;
+        bool all_zero = false;
+
+        /* The low bits of the address word carry the page flags */
+        addr = qemu_get_be64(f);
+        flags = addr & ~TARGET_PAGE_MASK;
+        addr &= TARGET_PAGE_MASK;
+
+        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
+        place_needed = false;
+        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
+            host = host_from_stream_offset(f, addr, flags);
+            if (!host) {
+                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+                ret = -EINVAL;
+                break;
+            }
+            /*
+             * Postcopy requires that we place whole host pages atomically.
+             * To make it atomic, the data is read into a temporary page
+             * that's moved into place later.
+             * The migration protocol uses, possibly smaller, target-pages
+             * however the source ensures it always sends all the components
+             * of a host page in order.
+             */
+            page_buffer = postcopy_host_page +
+                          ((uintptr_t)host & ~qemu_host_page_mask);
+            /* If all TP are zero then we can optimise the place */
+            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
+                all_zero = true;
+            } else {
+                /* not the 1st TP within the HP */
+                if (host != (last_host + TARGET_PAGE_SIZE)) {
+                    error_report("Non-sequential target page %p/%p",
+                                 host, last_host);
+                    ret = -EINVAL;
+                    break;
+                }
+            }
+
+            /*
+             * If it's the last part of a host page then we place the host
+             * page
+             */
+            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
+                            ~qemu_host_page_mask) == 0;
+            place_source = postcopy_host_page;
+        }
+        last_host = host;
+
+        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
+        case RAM_SAVE_FLAG_COMPRESS:
+            ch = qemu_get_byte(f);
+            memset(page_buffer, ch, TARGET_PAGE_SIZE);
+            if (ch) {
+                all_zero = false;
+            }
+            break;
+
+        case RAM_SAVE_FLAG_PAGE:
+            all_zero = false;
+            if (!place_needed || !matching_page_sizes) {
+                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
+            } else {
+                /* Avoids the qemu_file copy during postcopy, which is
+                 * going to do a copy later; can only do it when we
+                 * do this read in one go (matching page sizes)
+                 */
+                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
+                                         TARGET_PAGE_SIZE);
+            }
+            break;
+        case RAM_SAVE_FLAG_EOS:
+            /* normal exit */
+            break;
+        default:
+            error_report("Unknown combination of migration flags: %#x"
+                         " (postcopy mode)", flags);
+            ret = -EINVAL;
+        }
+
+        /*
+         * Pick up stream errors before placing anything: a page assembled
+         * from a failed read must not be placed.
+         */
+        if (!ret) {
+            ret = qemu_file_get_error(f);
+        }
+
+        /*
+         * Only place when nothing has gone wrong so far, so that an earlier
+         * error is neither acted upon nor overwritten by the place result.
+         */
+        if (!ret && place_needed) {
+            /* This gets called at the last target page in the host page */
+            if (all_zero) {
+                ret = postcopy_place_page_zero(mis,
+                                               host + TARGET_PAGE_SIZE -
+                                               qemu_host_page_size);
+            } else {
+                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
+                                               qemu_host_page_size,
+                                               place_source);
+            }
+        }
+    }
+
+    return ret;
+}
+
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
int flags = 0, ret = 0;
static uint64_t seq_iter;
int len = 0;
+ /*
+ * If system is running in postcopy mode, page inserts to host memory must
+ * be atomic
+ */
+ bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
seq_iter++;
@@ -1553,15 +2404,30 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
* critical section.
*/
rcu_read_lock();
- while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+
+ if (postcopy_running) {
+ ret = ram_load_postcopy(f);
+ }
+
+ while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
ram_addr_t addr, total_ram_bytes;
- void *host;
+ void *host = NULL;
uint8_t ch;
addr = qemu_get_be64(f);
flags = addr & ~TARGET_PAGE_MASK;
addr &= TARGET_PAGE_MASK;
+ if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
+ RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
+ host = host_from_stream_offset(f, addr, flags);
+ if (!host) {
+ error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+ ret = -EINVAL;
+ break;
+ }
+ }
+
switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
case RAM_SAVE_FLAG_MEM_SIZE:
/* Synchronize RAM block list */
@@ -1576,23 +2442,20 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
id[len] = 0;
length = qemu_get_be64(f);
- QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
- if (!strncmp(id, block->idstr, sizeof(id))) {
- if (length != block->used_length) {
- Error *local_err = NULL;
+ block = qemu_ram_block_by_name(id);
+ if (block) {
+ if (length != block->used_length) {
+ Error *local_err = NULL;
- ret = qemu_ram_resize(block->offset, length, &local_err);
- if (local_err) {
- error_report_err(local_err);
- }
+ ret = qemu_ram_resize(block->offset, length,
+ &local_err);
+ if (local_err) {
+ error_report_err(local_err);
}
- ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
- block->idstr);
- break;
}
- }
-
- if (!block) {
+ ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
+ block->idstr);
+ } else {
error_report("Unknown ramblock \"%s\", cannot "
"accept migration", id);
ret = -EINVAL;
@@ -1601,33 +2464,17 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
total_ram_bytes -= length;
}
break;
+
case RAM_SAVE_FLAG_COMPRESS:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
ch = qemu_get_byte(f);
ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
break;
+
case RAM_SAVE_FLAG_PAGE:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
break;
- case RAM_SAVE_FLAG_COMPRESS_PAGE:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
+ case RAM_SAVE_FLAG_COMPRESS_PAGE:
len = qemu_get_be32(f);
if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
error_report("Invalid compressed data length: %d", len);
@@ -1637,13 +2484,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
qemu_get_buffer(f, compressed_data_buf, len);
decompress_data_with_multi_threads(compressed_data_buf, host, len);
break;
+
case RAM_SAVE_FLAG_XBZRLE:
- host = host_from_stream_offset(f, addr, flags);
- if (!host) {
- error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
- ret = -EINVAL;
- break;
- }
if (load_xbzrle(f, addr, host) < 0) {
error_report("Failed to decompress XBZRLE page at "
RAM_ADDR_FMT, addr);
@@ -1677,7 +2519,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
static SaveVMHandlers savevm_ram_handlers = {
.save_live_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
- .save_live_complete = ram_save_complete,
+ .save_live_complete_postcopy = ram_save_complete,
+ .save_live_complete_precopy = ram_save_complete,
.save_live_pending = ram_save_pending,
.load_state = ram_load,
.cleanup = ram_migration_cleanup,
diff --git a/migration/savevm.c b/migration/savevm.c
index e05158d7ba..be52314a12 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -37,6 +37,7 @@
#include "qemu/timer.h"
#include "audio/audio.h"
#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "qemu/sockets.h"
@@ -45,6 +46,7 @@
#include "exec/memory.h"
#include "qmp-commands.h"
#include "trace.h"
+#include "qemu/bitops.h"
#include "qemu/iov.h"
#include "block/snapshot.h"
#include "block/qapi.h"
@@ -57,8 +59,26 @@
#define ARP_PTYPE_IP 0x0800
#define ARP_OP_REQUEST_REV 0x3
+const unsigned int postcopy_ram_discard_version = 0;
+
static bool skip_section_footers;
+static struct mig_cmd_args {
+ ssize_t len; /* expected payload length in bytes; -1 = variable */
+ const char *name; /* command name, used when reporting a bad length */
+} mig_cmd_args[] = {
+ [MIG_CMD_INVALID] = { .len = -1, .name = "INVALID" },
+ [MIG_CMD_OPEN_RETURN_PATH] = { .len = 0, .name = "OPEN_RETURN_PATH" },
+ [MIG_CMD_PING] = { .len = sizeof(uint32_t), .name = "PING" },
+ [MIG_CMD_POSTCOPY_ADVISE] = { .len = 16, .name = "POSTCOPY_ADVISE" },
+ [MIG_CMD_POSTCOPY_LISTEN] = { .len = 0, .name = "POSTCOPY_LISTEN" },
+ [MIG_CMD_POSTCOPY_RUN] = { .len = 0, .name = "POSTCOPY_RUN" },
+ [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
+ .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
+ [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" },
+ [MIG_CMD_MAX] = { .len = -1, .name = "MAX" },
+};
+
static int announce_self_create(uint8_t *buf,
uint8_t *mac_addr)
{
@@ -694,6 +714,156 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
}
}
+/**
+ * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
+ * command and associated data.
+ *
+ * The element is written as the QEMU_VM_COMMAND byte, the command as a
+ * 16-bit value, @len as a 16-bit value and then @len bytes of @data; the
+ * file is flushed afterwards.
+ *
+ * @f: File to send command on
+ * @command: Command type to send
+ * @len: Length of associated data
+ * @data: Data associated with command.
+ */
+void qemu_savevm_command_send(QEMUFile *f,
+ enum qemu_vm_cmd command,
+ uint16_t len,
+ uint8_t *data)
+{
+ trace_savevm_command_send(command, len);
+ qemu_put_byte(f, QEMU_VM_COMMAND);
+ qemu_put_be16(f, (uint16_t)command);
+ qemu_put_be16(f, len);
+ qemu_put_buffer(f, data, len);
+ qemu_fflush(f);
+}
+
+/* Send a MIG_CMD_PING whose payload is 'value' converted with cpu_to_be32 */
+void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
+{
+    uint32_t be_value = cpu_to_be32(value);
+
+    trace_savevm_send_ping(value);
+    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(be_value),
+                             (uint8_t *)&be_value);
+}
+
+void qemu_savevm_send_open_return_path(QEMUFile *f)
+{
+ trace_savevm_send_open_return_path();
+ qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL); /* command carries no payload */
+}
+
+/* Send a buffer of data as a 'packaged' blob: the MIG_CMD_PACKAGED command
+ * itself carries only the 32-bit length of the blob, and the blob's bytes
+ * are written to the stream straight after the command.
+ * TODO: Must be a better way to organise that
+ *
+ * Returns:
+ *    0 on success
+ *    -ve on error
+ */
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb)
+{
+    size_t remaining = qsb_get_length(qsb);
+    uint32_t be_len;
+    size_t i;
+
+    if (remaining > MAX_VM_CMD_PACKAGED_SIZE) {
+        error_report("%s: Unreasonably large packaged state: %zu",
+                     __func__, remaining);
+        return -1;
+    }
+
+    trace_qemu_savevm_send_packaged();
+    be_len = cpu_to_be32(remaining);
+    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&be_len);
+
+    /* all the data follows (concatenating the iov's) */
+    for (i = 0; i < qsb->n_iov; i++) {
+        /* The iov entries are partially filled */
+        size_t chunk = MIN(qsb->iov[i].iov_len, remaining);
+
+        if (chunk == 0) {
+            break;
+        }
+
+        qemu_put_buffer(f, qsb->iov[i].iov_base, chunk);
+        remaining -= chunk;
+    }
+
+    return 0;
+}
+
+/*
+ * Send prior to any postcopy transfer.
+ * The payload is two 64-bit values: the result of getpagesize() followed
+ * by the target page size (1 << qemu_target_page_bits()).
+ */
+void qemu_savevm_send_postcopy_advise(QEMUFile *f)
+{
+    uint64_t page_sizes[2] = {
+        cpu_to_be64(getpagesize()),
+        cpu_to_be64(1ul << qemu_target_page_bits()),
+    };
+
+    trace_qemu_savevm_send_postcopy_advise();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, sizeof(page_sizes),
+                             (uint8_t *)page_sizes);
+}
+
+/* Sent prior to starting the destination running in postcopy, discard pages
+ * that have already been sent but redirtied on the source.
+ * CMD_POSTCOPY_RAM_DISCARD consist of:
+ *      byte   version (0)
+ *      byte   Length of name field (not including 0)
+ *  n x byte   RAM block name
+ *      byte   0 terminator (just for safety)
+ *  n x        Byte ranges within the named RAMBlock
+ *      be64   Start of the range
+ *      be64   Length
+ *
+ *  name:  RAMBlock name that these entries are part of
+ *  len: Number of range entries
+ *  start_list: 'len' start addresses
+ *  length_list: 'len' lengths
+ *
+ */
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len,
+                                           uint64_t *start_list,
+                                           uint64_t *length_list)
+{
+    uint8_t *buf;
+    uint16_t tmplen;
+    uint16_t t;
+    size_t name_len = strlen(name);
+
+    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
+    assert(name_len < 256);
+    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
+    buf[0] = postcopy_ram_discard_version;
+    buf[1] = name_len;
+    memcpy(buf + 2, name, name_len);
+    tmplen = 2 + name_len;
+    buf[tmplen++] = '\0';
+
+    for (t = 0; t < len; t++) {
+        uint64_t be_val;
+
+        /*
+         * The entries start at an offset that depends on the name length,
+         * so they are generally not 8-byte aligned: copy the converted
+         * value in rather than storing through a uint64_t pointer.
+         */
+        be_val = cpu_to_be64(start_list[t]);
+        memcpy(buf + tmplen, &be_val, sizeof(be_val));
+        tmplen += sizeof(be_val);
+
+        be_val = cpu_to_be64(length_list[t]);
+        memcpy(buf + tmplen, &be_val, sizeof(be_val));
+        tmplen += sizeof(be_val);
+    }
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
+    g_free(buf);
+}
+
+/* Get the destination into a state where it can receive postcopy data.
+ * Sends MIG_CMD_POSTCOPY_LISTEN, which has no payload.
+ */
+void qemu_savevm_send_postcopy_listen(QEMUFile *f)
+{
+ trace_savevm_send_postcopy_listen();
+ qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
+}
+
+/* Kick the destination into running.
+ * Sends MIG_CMD_POSTCOPY_RUN, which has no payload.
+ */
+void qemu_savevm_send_postcopy_run(QEMUFile *f)
+{
+ trace_savevm_send_postcopy_run();
+ qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
+}
+
bool qemu_savevm_state_blocked(Error **errp)
{
SaveStateEntry *se;
@@ -713,6 +883,12 @@ void qemu_savevm_state_header(QEMUFile *f)
trace_savevm_state_header();
qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+
+ if (!savevm_state.skip_configuration) {
+ qemu_put_byte(f, QEMU_VM_CONFIGURATION);
+ vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
+ }
+
}
void qemu_savevm_state_begin(QEMUFile *f,
@@ -729,11 +905,6 @@ void qemu_savevm_state_begin(QEMUFile *f,
se->ops->set_params(params, se->opaque);
}
- if (!savevm_state.skip_configuration) {
- qemu_put_byte(f, QEMU_VM_CONFIGURATION);
- vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
- }
-
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->save_live_setup) {
continue;
@@ -760,7 +931,7 @@ void qemu_savevm_state_begin(QEMUFile *f,
* 0 : We haven't finished, caller have to go again
* 1 : We have finished, we can go to complete phase
*/
-int qemu_savevm_state_iterate(QEMUFile *f)
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
SaveStateEntry *se;
int ret = 1;
@@ -775,6 +946,15 @@ int qemu_savevm_state_iterate(QEMUFile *f)
continue;
}
}
+ /*
+ * In the postcopy phase, any device that doesn't know how to
+ * do postcopy should have saved its state in the _complete
+ * call that's already run, it might get confused if we call
+ * iterate afterwards.
+ */
+ if (postcopy && !se->ops->save_live_complete_postcopy) {
+ continue;
+ }
if (qemu_file_rate_limit(f)) {
return 0;
}
@@ -803,22 +983,65 @@ int qemu_savevm_state_iterate(QEMUFile *f)
static bool should_send_vmdesc(void)
{
MachineState *machine = MACHINE(qdev_get_machine());
- return !machine->suppress_vmdesc;
+ bool in_postcopy = migration_in_postcopy(migrate_get_current());
+ return !machine->suppress_vmdesc && !in_postcopy;
}
-void qemu_savevm_state_complete(QEMUFile *f)
+/*
+ * Run the save_live_complete_postcopy method of every active handler that
+ * provides one, so that the last few pages are sent immediately and any
+ * associated cleanup is done, then terminate the stream with QEMU_VM_EOF.
+ * On a handler failure the error is recorded on the file and the function
+ * returns without writing the EOF marker.
+ * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
+ * all the other devices, but that happens at the point we switch to postcopy.
+ */
+void qemu_savevm_state_complete_postcopy(QEMUFile *f)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        int ret;
+
+        /* Only handlers that know how to complete in postcopy take part */
+        if (!se->ops || !se->ops->save_live_complete_postcopy) {
+            continue;
+        }
+        if (se->ops->is_active && !se->ops->is_active(se->opaque)) {
+            continue;
+        }
+
+        trace_savevm_section_start(se->idstr, se->section_id);
+        /* Section type */
+        qemu_put_byte(f, QEMU_VM_SECTION_END);
+        qemu_put_be32(f, se->section_id);
+
+        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
+        trace_savevm_section_end(se->idstr, se->section_id, ret);
+        save_section_footer(f, se);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+            return;
+        }
+    }
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+    qemu_fflush(f);
+}
+
+void qemu_savevm_state_complete_precopy(QEMUFile *f)
{
QJSON *vmdesc;
int vmdesc_len;
SaveStateEntry *se;
int ret;
+ bool in_postcopy = migration_in_postcopy(migrate_get_current());
- trace_savevm_state_complete();
+ trace_savevm_state_complete_precopy();
cpu_synchronize_all_states();
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (!se->ops || !se->ops->save_live_complete) {
+ if (!se->ops ||
+ (in_postcopy && se->ops->save_live_complete_postcopy) ||
+ !se->ops->save_live_complete_precopy) {
continue;
}
if (se->ops && se->ops->is_active) {
@@ -830,7 +1053,7 @@ void qemu_savevm_state_complete(QEMUFile *f)
save_section_header(f, se, QEMU_VM_SECTION_END);
- ret = se->ops->save_live_complete(f, se->opaque);
+ ret = se->ops->save_live_complete_precopy(f, se->opaque);
trace_savevm_section_end(se->idstr, se->section_id, ret);
save_section_footer(f, se);
if (ret < 0) {
@@ -867,7 +1090,10 @@ void qemu_savevm_state_complete(QEMUFile *f)
save_section_footer(f, se);
}
- qemu_put_byte(f, QEMU_VM_EOF);
+ if (!in_postcopy) {
+ /* Postcopy stream will still be going */
+ qemu_put_byte(f, QEMU_VM_EOF);
+ }
json_end_array(vmdesc);
qjson_finish(vmdesc);
@@ -883,10 +1109,19 @@ void qemu_savevm_state_complete(QEMUFile *f)
qemu_fflush(f);
}
-uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+/* Give an estimate of the amount left to be transferred,
+ * the result is split into the amount for units that can and
+ * for units that can't do postcopy.
+ */
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+ uint64_t *res_non_postcopiable,
+ uint64_t *res_postcopiable)
{
SaveStateEntry *se;
- uint64_t ret = 0;
+
+ *res_non_postcopiable = 0;
+ *res_postcopiable = 0;
+
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (!se->ops || !se->ops->save_live_pending) {
@@ -897,9 +1132,9 @@ uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
continue;
}
}
- ret += se->ops->save_live_pending(f, se->opaque, max_size);
+ se->ops->save_live_pending(f, se->opaque, max_size,
+ res_non_postcopiable, res_postcopiable);
}
- return ret;
}
void qemu_savevm_state_cleanup(void)
@@ -921,6 +1156,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
.blk = 0,
.shared = 0
};
+ MigrationState *ms = migrate_init(&params);
+ ms->file = f;
if (qemu_savevm_state_blocked(errp)) {
return -EINVAL;
@@ -932,18 +1169,18 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
qemu_mutex_lock_iothread();
while (qemu_file_get_error(f) == 0) {
- if (qemu_savevm_state_iterate(f) > 0) {
+ if (qemu_savevm_state_iterate(f, false) > 0) {
break;
}
}
ret = qemu_file_get_error(f);
if (ret == 0) {
- qemu_savevm_state_complete(f);
+ qemu_savevm_state_complete_precopy(f);
ret = qemu_file_get_error(f);
}
+ qemu_savevm_state_cleanup();
if (ret != 0) {
- qemu_savevm_state_cleanup();
error_setg_errno(errp, -ret, "Error while writing VM state");
}
return ret;
@@ -1001,6 +1238,420 @@ static SaveStateEntry *find_se(const char *idstr, int instance_id)
return NULL;
}
+enum LoadVMExitCodes {
+ /* Allow a command to quit all layers of nested loadvm loops.
+  * Positive, so it stays distinct from 0 (normal return) and from the
+  * negative values the command handlers return on error.
+  */
+ LOADVM_QUIT = 1,
+};
+
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
+
+/* ------ incoming postcopy messages ------ */
+/* 'advise' arrives before any transfers just to tell us that a postcopy
+ * *might* happen - it might be skipped if precopy transferred everything
+ * quickly.
+ *
+ * Reads the source's host and target page sizes from mis->from_src_file
+ * and requires both to match the local values.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+    uint64_t remote_hps, remote_tps;
+
+    trace_loadvm_postcopy_handle_advise();
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    if (!postcopy_ram_supported_by_host()) {
+        return -1;
+    }
+
+    remote_hps = qemu_get_be64(mis->from_src_file);
+    if (remote_hps != getpagesize())  {
+        /*
+         * Some combinations of mismatch are probably possible but it gets
+         * a bit more complicated.  In particular we need to place whole
+         * host pages on the dest at once, and we need to ensure that we
+         * handle dirtying to make sure we never end up sending part of
+         * a hostpage on its own.
+         */
+        /* Print the full 64-bit value so a bogus size is not truncated */
+        error_report("Postcopy needs matching host page sizes (s=%" PRIu64
+                     " d=%d)",
+                     remote_hps, getpagesize());
+        return -1;
+    }
+
+    remote_tps = qemu_get_be64(mis->from_src_file);
+    if (remote_tps != (1ul << qemu_target_page_bits())) {
+        /*
+         * Again, some differences could be dealt with, but for now keep it
+         * simple.
+         */
+        error_report("Postcopy needs matching target page sizes (s=%" PRIu64
+                     " d=%d)",
+                     remote_tps, 1 << qemu_target_page_bits());
+        return -1;
+    }
+
+    if (ram_postcopy_incoming_init(mis)) {
+        return -1;
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+
+    return 0;
+}
+
+/* After postcopy we will be told to throw some pages away since they're
+ * dirty and will have to be demand fetched.  Must happen before CPU is
+ * started.
+ * There can be 0..many of these messages, each encoding multiple pages.
+ *
+ * mis: incoming state; the command payload is read from mis->from_src_file
+ * len: payload length taken from the command header
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
+                                              uint16_t len)
+{
+    int tmp;
+    size_t header_len;
+    char ramid[256];
+    PostcopyState ps = postcopy_state_get();
+
+    trace_loadvm_postcopy_ram_handle_discard();
+
+    switch (ps) {
+    case POSTCOPY_INCOMING_ADVISE:
+        /* 1st discard */
+        tmp = postcopy_ram_prepare_discard(mis);
+        if (tmp) {
+            return tmp;
+        }
+        break;
+
+    case POSTCOPY_INCOMING_DISCARD:
+        /* Expected state */
+        break;
+
+    default:
+        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
+                     ps);
+        return -1;
+    }
+    /* We're expecting a
+     *    Version (0)
+     *    a RAM ID string (length byte, name, 0 term)
+     *    then at least 1 16 byte chunk
+    */
+    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+
+    tmp = qemu_get_byte(mis->from_src_file);
+    if (tmp != postcopy_ram_discard_version) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
+        return -1;
+    }
+
+    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
+        return -1;
+    }
+    tmp = qemu_get_byte(mis->from_src_file);
+    if (tmp != 0) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
+        return -1;
+    }
+
+    /*
+     * Version byte + name length byte + name + terminator.  The name read
+     * from the stream may be longer than the declared payload allows; in
+     * that case the subtraction below would wrap the 16-bit length and the
+     * loop would consume unrelated stream data, so reject it here.
+     */
+    header_len = 3 + strlen(ramid);
+    if (len < header_len) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+    len -= header_len;
+    if (len % 16) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
+    while (len) {
+        uint64_t start_addr, block_length;
+        int ret;
+
+        /* Each entry is a 64-bit start followed by a 64-bit length */
+        start_addr = qemu_get_be64(mis->from_src_file);
+        block_length = qemu_get_be64(mis->from_src_file);
+
+        len -= 16;
+        ret = ram_discard_range(mis, ramid, start_addr, block_length);
+        if (ret) {
+            return ret;
+        }
+    }
+    trace_loadvm_postcopy_ram_handle_discard_end();
+
+    return 0;
+}
+
+/*
+ * Triggered by a postcopy_listen command; this thread takes over reading
+ * the input stream, leaving the main thread free to carry on loading the rest
+ * of the device state (from RAM).
+ * (TODO:This could do with being in a postcopy file - but there again it's
+ * just another input loop, not that postcopy specific)
+ *
+ * opaque: the QEMUFile to read the stream from
+ *
+ * The thread posts mis->listen_thread_sem as soon as it starts, then runs
+ * qemu_loadvm_state_main() on the file with blocking enabled.  On success
+ * it waits for mis->main_thread_load_event before cleaning up; on failure
+ * it records the error on the file, cleans up and exits the process.
+ *
+ * Returns NULL.
+ */
+static void *postcopy_ram_listen_thread(void *opaque)
+{
+ QEMUFile *f = opaque;
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ int load_res;
+
+ qemu_sem_post(&mis->listen_thread_sem);
+ trace_postcopy_ram_listen_thread_start();
+
+ /*
+ * Because we're a thread and not a coroutine we can't yield
+ * in qemu_file, and thus we must be blocking now.
+ */
+ qemu_file_set_blocking(f, true);
+ load_res = qemu_loadvm_state_main(f, mis);
+ /* And non-blocking again so we don't block in any cleanup */
+ qemu_file_set_blocking(f, false);
+
+ trace_postcopy_ram_listen_thread_exit();
+ if (load_res < 0) {
+ error_report("%s: loadvm failed: %d", __func__, load_res);
+ qemu_file_set_error(f, load_res);
+ } else {
+ /*
+ * This looks good, but it's possible that the device loading in the
+ * main thread hasn't finished yet, and so we might not be in 'RUN'
+ * state yet; wait for the end of the main thread.
+ */
+ qemu_event_wait(&mis->main_thread_load_event);
+ }
+ postcopy_ram_incoming_cleanup(mis);
+ /*
+ * If everything has worked fine, then the main thread has waited
+ * for us to start, and we're the last use of the mis.
+ * (If something broke then qemu will have to exit anyway since it's
+ * got a bad migration state).
+ */
+ migration_incoming_state_destroy();
+
+ if (load_res < 0) {
+ /*
+ * If something went wrong then we have a bad state so exit;
+ * depending how far we got it might be possible at this point
+ * to leave the guest running and fire MCEs for pages that never
+ * arrived as a desperate recovery step.
+ */
+ exit(EXIT_FAILURE);
+ }
+
+ return NULL;
+}
+
+/* After this message we must be able to immediately receive postcopy data.
+ *
+ * Moves the postcopy state to POSTCOPY_INCOMING_LISTENING, enables
+ * notification on RAM and starts the listen thread on mis->from_src_file,
+ * returning once that thread has signalled that it is running.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
+    trace_loadvm_postcopy_handle_listen();
+    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
+        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+    if (ps == POSTCOPY_INCOMING_ADVISE) {
+        /*
+         * A rare case, we entered listen without having to do any discards,
+         * so do the setup that's normally done at the time of the 1st discard.
+         * A failure here is fatal, exactly as it is when the setup is done
+         * from the discard handler.
+         */
+        if (postcopy_ram_prepare_discard(mis)) {
+            return -1;
+        }
+    }
+
+    /*
+     * Sensitise RAM - can now generate requests for blocks that don't exist
+     * However, at this point the CPU shouldn't be running, and the IO
+     * shouldn't be doing anything yet so don't actually expect requests
+     */
+    if (postcopy_ram_enable_notify(mis)) {
+        return -1;
+    }
+
+    if (mis->have_listen_thread) {
+        error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
+        return -1;
+    }
+
+    mis->have_listen_thread = true;
+    /* Start up the listening thread and wait for it to signal ready */
+    qemu_sem_init(&mis->listen_thread_sem, 0);
+    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
+                       postcopy_ram_listen_thread, mis->from_src_file,
+                       QEMU_THREAD_JOINABLE);
+    qemu_sem_wait(&mis->listen_thread_sem);
+    qemu_sem_destroy(&mis->listen_thread_sem);
+
+    return 0;
+}
+
+/* After all discards we can start running and asking for pages.
+ *
+ * Moves the postcopy state to POSTCOPY_INCOMING_RUNNING, then either starts
+ * the VM (when autostart is set) or leaves it in RUN_STATE_PAUSED.
+ *
+ * Returns LOADVM_QUIT on success, -1 if the previous state was not
+ * POSTCOPY_INCOMING_LISTENING or if bdrv_invalidate_cache_all fails.
+ *
+ * NOTE(review): cpu_synchronize_all_post_init() is called twice below, once
+ * before qemu_announce_self() and once after the cache invalidation -
+ * confirm whether both calls are intended.
+ */
+static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
+{
+ PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
+ Error *local_err = NULL;
+
+ trace_loadvm_postcopy_handle_run();
+ if (ps != POSTCOPY_INCOMING_LISTENING) {
+ error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
+ return -1;
+ }
+
+ /* TODO we should move all of this lot into postcopy_ram.c or a shared code
+ * in migration.c
+ */
+ cpu_synchronize_all_post_init();
+
+ qemu_announce_self();
+
+ /* Make sure all file formats flush their mutable metadata */
+ bdrv_invalidate_cache_all(&local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ return -1;
+ }
+
+ trace_loadvm_postcopy_handle_run_cpu_sync();
+ cpu_synchronize_all_post_init();
+
+ trace_loadvm_postcopy_handle_run_vmstart();
+
+ if (autostart) {
+ /* Hold onto your hats, starting the CPU */
+ vm_start();
+ } else {
+ /* leave it paused and let management decide when to start the CPU */
+ runstate_set(RUN_STATE_PAUSED);
+ }
+
+ /* We need to finish reading the stream from the package
+ * and also stop reading anything more from the stream that loaded the
+ * package (since it's now being read by the listener thread).
+ * LOADVM_QUIT will quit all the layers of nested loadvm loops.
+ */
+ return LOADVM_QUIT;
+}
+
+/**
+ * Immediately following this command is a blob of data containing an embedded
+ * chunk of migration stream; read it and load it.
+ *
+ * The 32-bit length of the blob is read from mis->from_src_file, followed
+ * by the blob itself, which is then loaded through a buffer-backed QEMUFile.
+ *
+ * @mis: Incoming state
+ *
+ * Returns: Negative values on error, otherwise the result of
+ *          qemu_loadvm_state_main() on the embedded stream.
+ *
+ */
+static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
+{
+    int ret;
+    uint8_t *buffer;
+    uint32_t length;
+    QEMUSizedBuffer *qsb;
+    QEMUFile *packf;
+
+    length = qemu_get_be32(mis->from_src_file);
+    trace_loadvm_handle_cmd_packaged(length);
+
+    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
+        error_report("Unreasonably large packaged state: %u", length);
+        return -1;
+    }
+    buffer = g_malloc0(length);
+    ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length);
+    if (ret < 0 || (uint32_t)ret != length) {
+        g_free(buffer);
+        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%u",
+                     ret, length);
+        return (ret < 0) ? ret : -EAGAIN;
+    }
+    trace_loadvm_handle_cmd_packaged_received(ret);
+
+    /* Setup a dummy QEMUFile that actually reads from the buffer */
+    qsb = qsb_create(buffer, length);
+    g_free(buffer); /* Because qsb_create copies */
+    if (!qsb) {
+        /* Nothing to read from; don't hand a NULL buffer to qemu_bufopen */
+        error_report("Unable to create qsb");
+        return -ENOMEM;
+    }
+    packf = qemu_bufopen("r", qsb);
+
+    ret = qemu_loadvm_state_main(packf, mis);
+    trace_loadvm_handle_cmd_packaged_main(ret);
+    qemu_fclose(packf);
+    qsb_free(qsb);
+
+    return ret;
+}
+
+/*
+ * Read one 'QEMU_VM_COMMAND' header (command and payload length) from f,
+ * validate it against mig_cmd_args and dispatch to the matching handler.
+ *
+ * Returns:
+ *   0           just a normal return
+ *   LOADVM_QUIT All good, but exit the loop
+ *   <0          Error
+ */
+static int loadvm_process_command(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    uint16_t cmd = qemu_get_be16(f);
+    uint16_t len = qemu_get_be16(f);
+    const struct mig_cmd_args *expected;
+
+    trace_loadvm_process_command(cmd, len);
+    if (cmd == MIG_CMD_INVALID || cmd >= MIG_CMD_MAX) {
+        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
+        return -EINVAL;
+    }
+
+    /* Fixed-size commands must arrive with exactly the tabulated length */
+    expected = &mig_cmd_args[cmd];
+    if (expected->len != -1 && expected->len != len) {
+        error_report("%s received with bad length - expecting %zu, got %d",
+                     expected->name,
+                     (size_t)expected->len, len);
+        return -ERANGE;
+    }
+
+    switch (cmd) {
+    case MIG_CMD_OPEN_RETURN_PATH:
+        if (mis->to_src_file) {
+            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
+            /* Not really a problem, so don't give up */
+            return 0;
+        }
+        mis->to_src_file = qemu_file_get_return_path(f);
+        if (!mis->to_src_file) {
+            error_report("CMD_OPEN_RETURN_PATH failed");
+            return -1;
+        }
+        return 0;
+
+    case MIG_CMD_PING: {
+        uint32_t ping_value = qemu_get_be32(f);
+
+        trace_loadvm_process_command_ping(ping_value);
+        if (!mis->to_src_file) {
+            error_report("CMD_PING (0x%x) received with no return path",
+                         ping_value);
+            return -1;
+        }
+        migrate_send_rp_pong(mis, ping_value);
+        return 0;
+    }
+
+    case MIG_CMD_PACKAGED:
+        return loadvm_handle_cmd_packaged(mis);
+
+    case MIG_CMD_POSTCOPY_ADVISE:
+        return loadvm_postcopy_handle_advise(mis);
+
+    case MIG_CMD_POSTCOPY_LISTEN:
+        return loadvm_postcopy_handle_listen(mis);
+
+    case MIG_CMD_POSTCOPY_RUN:
+        return loadvm_postcopy_handle_run(mis);
+
+    case MIG_CMD_POSTCOPY_RAM_DISCARD:
+        return loadvm_postcopy_ram_handle_discard(mis, len);
+    }
+
+    return 0;
+}
+
struct LoadStateEntry {
QLIST_ENTRY(LoadStateEntry) entry;
SaveStateEntry *se;
@@ -1053,47 +1704,10 @@ void loadvm_free_handlers(MigrationIncomingState *mis)
}
}
-int qemu_loadvm_state(QEMUFile *f)
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
{
- MigrationIncomingState *mis = migration_incoming_get_current();
- Error *local_err = NULL;
uint8_t section_type;
- unsigned int v;
int ret;
- int file_error_after_eof = -1;
-
- if (qemu_savevm_state_blocked(&local_err)) {
- error_report_err(local_err);
- return -EINVAL;
- }
-
- v = qemu_get_be32(f);
- if (v != QEMU_VM_FILE_MAGIC) {
- error_report("Not a migration stream");
- return -EINVAL;
- }
-
- v = qemu_get_be32(f);
- if (v == QEMU_VM_FILE_VERSION_COMPAT) {
- error_report("SaveVM v2 format is obsolete and don't work anymore");
- return -ENOTSUP;
- }
- if (v != QEMU_VM_FILE_VERSION) {
- error_report("Unsupported migration stream version");
- return -ENOTSUP;
- }
-
- if (!savevm_state.skip_configuration) {
- if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
- error_report("Configuration section missing");
- return -EINVAL;
- }
- ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
-
- if (ret) {
- return ret;
- }
- }
while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
uint32_t instance_id, version_id, section_id;
@@ -1122,16 +1736,14 @@ int qemu_loadvm_state(QEMUFile *f)
if (se == NULL) {
error_report("Unknown savevm section or instance '%s' %d",
idstr, instance_id);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* Validate version */
if (version_id > se->version_id) {
error_report("savevm: unsupported version %d for '%s' v%d",
version_id, idstr, se->version_id);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* Add entry */
@@ -1146,11 +1758,10 @@ int qemu_loadvm_state(QEMUFile *f)
if (ret < 0) {
error_report("error while loading state for instance 0x%x of"
" device '%s'", instance_id, idstr);
- goto out;
+ return ret;
}
if (!check_section_footer(f, le)) {
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
break;
case QEMU_VM_SECTION_PART:
@@ -1165,29 +1776,88 @@ int qemu_loadvm_state(QEMUFile *f)
}
if (le == NULL) {
error_report("Unknown savevm section %d", section_id);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
ret = vmstate_load(f, le->se, le->version_id);
if (ret < 0) {
error_report("error while loading state section id %d(%s)",
section_id, le->se->idstr);
- goto out;
+ return ret;
}
if (!check_section_footer(f, le)) {
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
+ }
+ break;
+ case QEMU_VM_COMMAND:
+ ret = loadvm_process_command(f);
+ trace_qemu_loadvm_state_section_command(ret);
+ if ((ret < 0) || (ret & LOADVM_QUIT)) {
+ return ret;
}
break;
default:
error_report("Unknown savevm section type %d", section_type);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
}
- file_error_after_eof = qemu_file_get_error(f);
+ return 0;
+}
+
+int qemu_loadvm_state(QEMUFile *f)
+{
+ MigrationIncomingState *mis = migration_incoming_get_current();
+ Error *local_err = NULL;
+ unsigned int v;
+ int ret;
+
+ if (qemu_savevm_state_blocked(&local_err)) {
+ error_report_err(local_err);
+ return -EINVAL;
+ }
+
+ v = qemu_get_be32(f);
+ if (v != QEMU_VM_FILE_MAGIC) {
+ error_report("Not a migration stream");
+ return -EINVAL;
+ }
+
+ v = qemu_get_be32(f);
+ if (v == QEMU_VM_FILE_VERSION_COMPAT) {
+ error_report("SaveVM v2 format is obsolete and don't work anymore");
+ return -ENOTSUP;
+ }
+ if (v != QEMU_VM_FILE_VERSION) {
+ error_report("Unsupported migration stream version");
+ return -ENOTSUP;
+ }
+
+ if (!savevm_state.skip_configuration) {
+ if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
+ error_report("Configuration section missing");
+ return -EINVAL;
+ }
+ ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
+
+ if (ret) {
+ return ret;
+ }
+ }
+
+ ret = qemu_loadvm_state_main(f, mis);
+ qemu_event_set(&mis->main_thread_load_event);
+
+ trace_qemu_loadvm_state_post_main(ret);
+
+ if (mis->have_listen_thread) {
+ /* Listen thread still going, can't clean up yet */
+ return ret;
+ }
+
+ if (ret == 0) {
+ ret = qemu_file_get_error(f);
+ }
/*
* Try to read in the VMDESC section as well, so that dumping tools that
@@ -1199,10 +1869,10 @@ int qemu_loadvm_state(QEMUFile *f)
* We also mustn't read data that isn't there; some transports (RDMA)
* will stall waiting for that data when the source has already closed.
*/
- if (should_send_vmdesc()) {
+ if (ret == 0 && should_send_vmdesc()) {
uint8_t *buf;
uint32_t size;
- section_type = qemu_get_byte(f);
+ uint8_t section_type = qemu_get_byte(f);
if (section_type != QEMU_VM_VMDESCRIPTION) {
error_report("Expected vmdescription section, but got %d",
@@ -1226,14 +1896,6 @@ int qemu_loadvm_state(QEMUFile *f)
cpu_synchronize_all_post_init();
- ret = 0;
-
-out:
- if (ret == 0) {
- /* We may not have a VMDESC section, so ignore relative errors */
- ret = file_error_after_eof;
- }
-
return ret;
}
diff --git a/qapi-schema.json b/qapi-schema.json
index e18f14c88e..8c3a42a1ac 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -430,6 +430,8 @@
#
# @active: in the process of doing migration.
#
+# @postcopy-active: like active, but now in postcopy mode. (since 2.5)
+#
# @completed: migration is finished.
#
# @failed: some error occurred during migration process.
@@ -439,7 +441,7 @@
##
{ 'enum': 'MigrationStatus',
'data': [ 'none', 'setup', 'cancelling', 'cancelled',
- 'active', 'completed', 'failed' ] }
+ 'active', 'postcopy-active', 'completed', 'failed' ] }
##
# @MigrationInfo
@@ -540,11 +542,15 @@
# @auto-converge: If enabled, QEMU will automatically throttle down the guest
# to speed up convergence of RAM migration. (since 1.6)
#
+# @x-postcopy-ram: Start executing on the migration target before all of RAM has
+# been migrated, pulling the remaining pages along as needed. NOTE: If
+# the migration fails during postcopy the VM will fail. (since 2.5)
+#
# Since: 1.2
##
{ 'enum': 'MigrationCapability',
'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
- 'compress', 'events'] }
+ 'compress', 'events', 'x-postcopy-ram'] }
##
# @MigrationCapabilityStatus
@@ -698,6 +704,14 @@
'*tls-port': 'int', '*cert-subject': 'str' } }
##
+# @migrate-start-postcopy
+#
+# Switch migration to postcopy mode
+#
+# Since: 2.5
+{ 'command': 'migrate-start-postcopy' }
+
+##
# @MouseInfo:
#
# Information about a mouse device.
diff --git a/qmp-commands.hx b/qmp-commands.hx
index d7cf0ff264..7f85d4046c 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -718,6 +718,25 @@ Example:
EQMP
{
+ .name = "migrate-start-postcopy",
+ .args_type = "",
+ .mhandler.cmd_new = qmp_marshal_migrate_start_postcopy,
+ },
+
+SQMP
+migrate-start-postcopy
+----------------------
+
+Switch an in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
+
+Example:
+-> { "execute": "migrate-start-postcopy" }
+<- { "return": {} }
+
+EQMP
+
+ {
.name = "query-migrate-cache-size",
.args_type = "",
.mhandler.cmd_new = qmp_marshal_query_migrate_cache_size,
diff --git a/qtest.c b/qtest.c
index 8e10340c7e..05cefd2800 100644
--- a/qtest.c
+++ b/qtest.c
@@ -657,7 +657,6 @@ void qtest_init(const char *qtest_chrdev, const char *qtest_log, Error **errp)
inbuf = g_string_new("");
qtest_chr = chr;
- page_size_init();
}
bool qtest_driver(void)
diff --git a/trace-events b/trace-events
index ea2d32e362..ef6bc41a56 100644
--- a/trace-events
+++ b/trace-events
@@ -1202,16 +1202,43 @@ virtio_gpu_fence_resp(uint64_t fence) "fence 0x%" PRIx64
# migration/savevm.c
qemu_loadvm_state_section(unsigned int section_type) "%d"
+qemu_loadvm_state_section_command(int ret) "%d"
qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
+qemu_loadvm_state_main(void) ""
+qemu_loadvm_state_main_quit_parent(void) ""
+qemu_loadvm_state_post_main(int ret) "%d"
qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
+qemu_savevm_send_packaged(void) ""
+loadvm_handle_cmd_packaged(unsigned int length) "%u"
+loadvm_handle_cmd_packaged_main(int ret) "%d"
+loadvm_handle_cmd_packaged_received(int ret) "%d"
+loadvm_postcopy_handle_advise(void) ""
+loadvm_postcopy_handle_listen(void) ""
+loadvm_postcopy_handle_run(void) ""
+loadvm_postcopy_handle_run_cpu_sync(void) ""
+loadvm_postcopy_handle_run_vmstart(void) ""
+loadvm_postcopy_ram_handle_discard(void) ""
+loadvm_postcopy_ram_handle_discard_end(void) ""
+loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %u"
+loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
+loadvm_process_command_ping(uint32_t val) "%x"
+postcopy_ram_listen_thread_exit(void) ""
+postcopy_ram_listen_thread_start(void) ""
+qemu_savevm_send_postcopy_advise(void) ""
+qemu_savevm_send_postcopy_ram_discard(const char *id, uint16_t len) "%s: %u"
+savevm_command_send(uint16_t command, uint16_t len) "com=0x%x len=%d"
savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u"
+savevm_send_open_return_path(void) ""
+savevm_send_ping(uint32_t val) "%x"
+savevm_send_postcopy_listen(void) ""
+savevm_send_postcopy_run(void) ""
savevm_state_begin(void) ""
savevm_state_header(void) ""
savevm_state_iterate(void) ""
-savevm_state_complete(void) ""
savevm_state_cleanup(void) ""
+savevm_state_complete_precopy(void) ""
vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
qemu_announce_self_iter(const char *mac) "%s"
@@ -1229,9 +1256,14 @@ vmstate_subsection_load_good(const char *parent) "%s"
qemu_file_fclose(void) ""
# migration/ram.c
+get_queued_page(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr) "%s/%" PRIx64 " ram_addr=%" PRIx64
+get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr, int sent) "%s/%" PRIx64 " ram_addr=%" PRIx64 " (sent=%d)"
migration_bitmap_sync_start(void) ""
migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
migration_throttle(void) ""
+ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x"
+ram_postcopy_send_discard_bitmap(void) ""
+ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %zx len: %zx"
# hw/display/qxl.c
disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
@@ -1421,17 +1453,40 @@ flic_no_device_api(int err) "flic: no Device Contral API support %d"
flic_reset_failed(int err) "flic: reset failed %d"
# migration.c
+await_return_path_close_on_source_close(void) ""
+await_return_path_close_on_source_joining(void) ""
migrate_set_state(int new_state) "new state %d"
migrate_fd_cleanup(void) ""
migrate_fd_error(void) ""
migrate_fd_cancel(void) ""
-migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
-migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
-migrate_state_too_big(void) ""
+migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at %zx len %zx"
+migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")"
+migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
+migration_completion_file_err(void) ""
+migration_completion_postcopy_end(void) ""
+migration_completion_postcopy_end_after_complete(void) ""
+migration_completion_postcopy_end_before_rp(void) ""
+migration_completion_postcopy_end_after_rp(int rp_error) "%d"
+migration_thread_after_loop(void) ""
+migration_thread_file_err(void) ""
+migration_thread_setup_complete(void) ""
+open_return_path_on_source(void) ""
+open_return_path_on_source_continue(void) ""
+postcopy_start(void) ""
+postcopy_start_set_run(void) ""
+source_return_path_thread_bad_end(void) ""
+source_return_path_thread_end(void) ""
+source_return_path_thread_entry(void) ""
+source_return_path_thread_loop_top(void) ""
+source_return_path_thread_pong(uint32_t val) "%x"
+source_return_path_thread_shut(uint32_t val) "%x"
migrate_global_state_post_load(const char *state) "loaded state: %s"
migrate_global_state_pre_save(const char *state) "saved state: %s"
-migration_completion_file_err(void) ""
migration_thread_low_pending(uint64_t pending) "%" PRIu64
+migrate_state_too_big(void) ""
+migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
+process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
+process_incoming_migration_co_postcopy_end_main(void) ""
# migration/rdma.c
qemu_rdma_accept_incoming_migration(void) ""
@@ -1497,6 +1552,25 @@ rdma_start_incoming_migration_after_rdma_listen(void) ""
rdma_start_outgoing_migration_after_rdma_connect(void) ""
rdma_start_outgoing_migration_after_rdma_source_init(void) ""
+# migration/postcopy-ram.c
+postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
+postcopy_discard_send_range(const char *ramblock, unsigned long start, unsigned long length) "%s:%lx/%lx"
+postcopy_ram_discard_range(void *start, size_t length) "%p,+%zx"
+postcopy_cleanup_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_init_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_place_page(void *host_addr) "host=%p"
+postcopy_place_page_zero(void *host_addr) "host=%p"
+postcopy_ram_enable_notify(void) ""
+postcopy_ram_fault_thread_entry(void) ""
+postcopy_ram_fault_thread_exit(void) ""
+postcopy_ram_fault_thread_quit(void) ""
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
+postcopy_ram_incoming_cleanup_closeuf(void) ""
+postcopy_ram_incoming_cleanup_entry(void) ""
+postcopy_ram_incoming_cleanup_exit(void) ""
+postcopy_ram_incoming_cleanup_join(void) ""
+
# kvm-all.c
kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
diff --git a/vl.c b/vl.c
index 21e8876a57..7d993a5243 100644
--- a/vl.c
+++ b/vl.c
@@ -4285,6 +4285,7 @@ int main(int argc, char **argv, char **envp)
exit(1);
}
+ page_size_init();
socket_init();
if (qemu_opts_foreach(qemu_find_opts("object"),