From 5d6c599fe1d69a1bf8c5c4d3c58be2b31cd625ad Mon Sep 17 00:00:00 2001 From: Anthony PERARD Date: Thu, 16 Nov 2017 15:14:19 +0000 Subject: migration, xen: Fix block image lock issue on live migration When doing a live migration of a Xen guest with libxl, the images for block devices are locked by the original QEMU process, and this prevents the QEMU at the destination from taking the lock, so the migration fails. From QEMU's point of view, once the RAM of a domain is migrated, there are two QMP commands, "stop" then "xen-save-devices-state", at which point a new QEMU is spawned at the destination. Release locks in "xen-save-devices-state" so the destination can take them, if it's a live migration. This patch adds the "live" parameter to "xen-save-devices-state", which defaults to true so older versions of libxenlight can work with newer versions of QEMU. Signed-off-by: Anthony PERARD Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/savevm.c | 23 ++++++++++++++++++++++- qapi/migration.json | 6 +++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index 192f2d82cd..b7908f62be 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2242,13 +2242,20 @@ int save_snapshot(const char *name, Error **errp) return ret; } -void qmp_xen_save_devices_state(const char *filename, Error **errp) +void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, + Error **errp) { QEMUFile *f; QIOChannelFile *ioc; int saved_vm_running; int ret; + if (!has_live) { + /* live default to true so old version of Xen tool stack can have a + * successfull live migration */ + live = true; + } + saved_vm_running = runstate_is_running(); vm_stop(RUN_STATE_SAVE_VM); global_state_store_running(); @@ -2263,6 +2270,20 @@ void qmp_xen_save_devices_state(const char *filename, Error **errp) qemu_fclose(f); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); + } else { + /* libxl calls
the QMP command "stop" before calling + * "xen-save-devices-state" and in case of migration failure, libxl + * would call "cont". + * So call bdrv_inactivate_all (release locks) here to let the other + * side of the migration take controle of the images. + */ + if (live && !saved_vm_running) { + ret = bdrv_inactivate_all(); + if (ret) { + error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)", + __func__, ret); + } + } } the_end: diff --git a/qapi/migration.json b/qapi/migration.json index bbc4671ded..03f57c9616 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1075,6 +1075,9 @@ # data. See xen-save-devices-state.txt for a description of the binary # format. # +# @live: Optional argument to ask QEMU to treat this command as part of a live +# migration. Default to true. (since 2.11) +# # Returns: Nothing on success # # Since: 1.1 @@ -1086,7 +1089,8 @@ # <- { "return": {} } # ## -{ 'command': 'xen-save-devices-state', 'data': {'filename': 'str'} } +{ 'command': 'xen-save-devices-state', + 'data': {'filename': 'str', '*live':'bool' } } ## # @xen-set-replication: -- cgit v1.2.1 From acab30b85db0885ab161aff4c83c550628f6d8ca Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Thu, 16 Nov 2017 20:35:26 -0200 Subject: migration/ram.c: do not set 'postcopy_running' in POSTCOPY_INCOMING_END When migrating a VM with 'migrate_set_capability postcopy-ram on', a postcopy_state is set during the process, ending up with the state POSTCOPY_INCOMING_END when the migration is over. This postcopy_state is taken into account inside ram_load to check how it will load the memory pages. This same ram_load is also called when executing a loadvm command.
Inside ram_load, the logic to see if we're at postcopy_running state is: postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING postcopy_state_get() returns this enum type: typedef enum { POSTCOPY_INCOMING_NONE = 0, POSTCOPY_INCOMING_ADVISE, POSTCOPY_INCOMING_DISCARD, POSTCOPY_INCOMING_LISTENING, POSTCOPY_INCOMING_RUNNING, POSTCOPY_INCOMING_END } PostcopyState; In the case where ram_load is executed and postcopy_state is POSTCOPY_INCOMING_END, postcopy_running will be set to 'true' and ram_load will behave like a postcopy is in progress. This scenario isn't achievable in a migration but it is reproducible when executing savevm/loadvm after migrating with 'postcopy-ram on', causing loadvm to fail with Error -22: Source: (qemu) migrate_set_capability postcopy-ram on (qemu) migrate tcp:127.0.0.1:4444 Dest: (qemu) migrate_set_capability postcopy-ram on (qemu) ubuntu1704-intel login: Ubuntu 17.04 ubuntu1704-intel ttyS0 ubuntu1704-intel login: (qemu) (qemu) savevm test1 (qemu) loadvm test1 Unknown combination of migration flags: 0x4 (postcopy mode) error while loading state for instance 0x0 of device 'ram' Error -22 while loading VM state (qemu) This patch fixes this problem by changing the existing logic for postcopy_advised and postcopy_running in ram_load, making them 'false' if we're at POSTCOPY_INCOMING_END state. Signed-off-by: Daniel Henrique Barboza CC: Juan Quintela CC: Dr. 
David Alan Gilbert Reviewed-by: Peter Xu Reviewed-by: Juan Quintela Reported-by: Balamuruhan S Signed-off-by: Juan Quintela --- migration/ram.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 8620aa400a..021d583b9b 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2798,6 +2798,18 @@ static int ram_load_postcopy(QEMUFile *f) return ret; } +static bool postcopy_is_advised(void) +{ + PostcopyState ps = postcopy_state_get(); + return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; +} + +static bool postcopy_is_running(void) +{ + PostcopyState ps = postcopy_state_get(); + return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; +} + static int ram_load(QEMUFile *f, void *opaque, int version_id) { int flags = 0, ret = 0, invalid_flags = 0; @@ -2807,9 +2819,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) * If system is running in postcopy mode, page inserts to host memory must * be atomic */ - bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING; + bool postcopy_running = postcopy_is_running(); /* ADVISE is earlier, it shows the source has the postcopy capability on */ - bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE; + bool postcopy_advised = postcopy_is_advised(); seq_iter++; -- cgit v1.2.1