From 46e13601a521ccc5883226b989ed0705fc404937 Mon Sep 17 00:00:00 2001
From: Tomi Juvonen
Date: Mon, 6 May 2019 17:24:59 +0300
Subject: [PATCH] Add live migration support

story: 2005585
Task: #30774

Change-Id: I5dc6db643900a6bfcc427b4b9ee23b5557b091a4
Signed-off-by: Tomi Juvonen
---
 fenix/db/sqlalchemy/api.py          |  2 +-
 fenix/utils/service.py              |  6 +++
 fenix/workflow/workflows/default.py | 81 +++++++++++++++++++++++++++--
 3 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/fenix/db/sqlalchemy/api.py b/fenix/db/sqlalchemy/api.py
index a3765f3..84df43d 100644
--- a/fenix/db/sqlalchemy/api.py
+++ b/fenix/db/sqlalchemy/api.py
@@ -164,7 +164,7 @@ def remove_session(session_id):
         downloads = _download_get_all(session, session_id)
         if downloads:
             for download in downloads:
-                download.delete(download)
+                session.delete(download)
 
         hosts = _hosts_get(session, session_id)
         if hosts:
diff --git a/fenix/utils/service.py b/fenix/utils/service.py
index 2d434c0..08d37ad 100644
--- a/fenix/utils/service.py
+++ b/fenix/utils/service.py
@@ -69,6 +69,12 @@ opts = [
     cfg.StrOpt('local_cache_dir',
                default="/tmp",
                help="Local cache directory"),
+    cfg.IntOpt('live_migration_retries',
+               default=5,
+               help="Number of live migration retries"),
+    cfg.IntOpt('live_migration_wait_time',
+               default=600,
+               help="How long to wait live migration to be done"),
 ]
 
 CONF.register_opts(opts)
diff --git a/fenix/workflow/workflows/default.py b/fenix/workflow/workflows/default.py
index 9ade496..7c3126a 100644
--- a/fenix/workflow/workflows/default.py
+++ b/fenix/workflow/workflows/default.py
@@ -565,8 +565,11 @@ class Workflow(BaseWorkflow):
                     self.notify_action_done(project, instance)
                 elif instance.action == 'OWN_ACTION':
                     pass
+                elif instance.action == 'LIVE_MIGRATE':
+                    if not self.live_migrate_server(instance):
+                        return False
+                    self.notify_action_done(project, instance)
                 else:
-                    # TBD LIVE_MIGRATE not supported
                     raise Exception('%s: instance %s action '
                                     '%s not supported' %
                                     (self.session_id, instance.instance_id,
@@ -576,7 +579,7 @@ class Workflow(BaseWorkflow):
     def _wait_host_empty(self, host):
         hid = self.nova.hypervisors.search(host)[0].id
         vcpus_used_last = 0
-        # wait 4min to get host empty
+        # wait up to 4min for the host to become empty
         for j in range(48):
             hvisor = self.nova.hypervisors.get(hid)
             vcpus_used = hvisor.__getattr__('vcpus_used')
@@ -592,6 +595,75 @@ class Workflow(BaseWorkflow):
         LOG.info('%s host still not empty' % host)
         return False
 
+    def live_migrate_server(self, instance):
+        """Live migrate the instance and poll Nova until it has moved.
+
+        Retries a migration in 'error' status up to live_migration_retries
+        times; returns True once the server is on a new host, else False.
+        """
+        server_id = instance.instance_id
+        server = self.nova.servers.get(server_id)
+        instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
+        orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
+        LOG.info('live_migrate_server %s state %s host %s' %
+                 (server_id, instance.state, orig_host))
+        orig_vm_state = instance.state
+        last_vm_status = str(server.__dict__.get('status'))
+        last_migration_status = "active"
+        # int() keeps the limits numeric even if the options parse as strings
+        wait_time = int(self.conf.live_migration_wait_time)
+        max_retries = int(self.conf.live_migration_retries)
+        try:
+            server.live_migrate()
+            waited = migrate_retries = 0
+            while waited < wait_time:
+                time.sleep(1)
+                server = self.nova.servers.get(server_id)
+                host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
+                vm_status = str(server.__dict__.get('status'))
+                instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
+                instance.host = host
+                if vm_status != last_vm_status:
+                    LOG.info('instance %s status changed: %s' % (server_id,
+                             vm_status))
+                if instance.state == 'error':
+                    LOG.error('instance %s live migration failed' % server_id)
+                    return False
+                elif orig_vm_state != instance.state:
+                    LOG.info('instance %s state changed: %s' % (server_id,
+                             instance.state))
+                elif host != orig_host:
+                    LOG.info('instance %s live migrated to host %s' %
+                             (server_id, host))
+                    return True
+                migration = (
+                    self.nova.migrations.list(instance_uuid=server_id)[0])
+                if migration.status == 'error':
+                    if migrate_retries >= max_retries:
+                        LOG.error('instance %s live migration failed after '
+                                  '%d retries' % (server_id, max_retries))
+                        return False
+                    # Nova needs time to recover after a fast failure, so wait
+                    # before retrying and wait longer on every further retry
+                    time.sleep(2 * (migrate_retries + 5))
+                    LOG.info('instance %s live migration failed, retry'
+                             % server_id)
+                    server.live_migrate()
+                    waited = 0
+                    migrate_retries += 1
+                elif migration.status != last_migration_status:
+                    LOG.info('instance %s live migration status changed: %s'
+                             % (server_id, migration.status))
+                waited += 1
+                last_migration_status = migration.status
+                last_vm_status = vm_status
+            LOG.error('instance %s live migration did not finish in %ss, '
+                      'state: %s' % (server_id, waited, instance.state))
+        except Exception as e:
+            LOG.error('server %s live migration failed, Exception=%s' %
+                      (server_id, e))
+        return False
+
     def migrate_server(self, instance):
         # TBD this method should be enhanced for errors and to have failed
         # instance back to state active instead of error
@@ -599,8 +671,9 @@ class Workflow(BaseWorkflow):
         server = self.nova.servers.get(server_id)
         instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
         orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
-        LOG.info('server %s state %s host %s' % (server_id, instance.state,
-                                                 orig_host))
+        LOG.info('migrate_server %s state %s host %s' % (server_id,
+                 instance.state,
+                 orig_host))
         last_vm_state = instance.state
         retry_migrate = 2
         while True: