Add live migration support

story: 2005585
Task: #30774

Change-Id: I5dc6db643900a6bfcc427b4b9ee23b5557b091a4
Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
This commit is contained in:
Tomi Juvonen 2019-05-06 17:24:59 +03:00
parent a232dda364
commit 46e13601a5
3 changed files with 84 additions and 5 deletions

View File

@ -164,7 +164,7 @@ def remove_session(session_id):
downloads = _download_get_all(session, session_id) downloads = _download_get_all(session, session_id)
if downloads: if downloads:
for download in downloads: for download in downloads:
download.delete(download) session.delete(download)
hosts = _hosts_get(session, session_id) hosts = _hosts_get(session, session_id)
if hosts: if hosts:

View File

@ -69,6 +69,12 @@ opts = [
cfg.StrOpt('local_cache_dir', cfg.StrOpt('local_cache_dir',
default="/tmp", default="/tmp",
help="Local cache directory"), help="Local cache directory"),
cfg.StrOpt('live_migration_retries',
default=5,
help="Number of live migration retries"),
cfg.StrOpt('live_migration_wait_time',
default=600,
help="How long to wait live migration to be done"),
] ]
CONF.register_opts(opts) CONF.register_opts(opts)

View File

@ -565,8 +565,11 @@ class Workflow(BaseWorkflow):
self.notify_action_done(project, instance) self.notify_action_done(project, instance)
elif instance.action == 'OWN_ACTION': elif instance.action == 'OWN_ACTION':
pass pass
elif instance.action == 'LIVE_MIGRATE':
if not self.live_migrate_server(instance):
return False
self.notify_action_done(project, instance)
else: else:
# TBD LIVE_MIGRATE not supported
raise Exception('%s: instance %s action ' raise Exception('%s: instance %s action '
'%s not supported' % '%s not supported' %
(self.session_id, instance.instance_id, (self.session_id, instance.instance_id,
@ -576,7 +579,7 @@ class Workflow(BaseWorkflow):
def _wait_host_empty(self, host): def _wait_host_empty(self, host):
hid = self.nova.hypervisors.search(host)[0].id hid = self.nova.hypervisors.search(host)[0].id
vcpus_used_last = 0 vcpus_used_last = 0
# wait 4min to get host empty # wait 4min to get host empty
for j in range(48): for j in range(48):
hvisor = self.nova.hypervisors.get(hid) hvisor = self.nova.hypervisors.get(hid)
vcpus_used = hvisor.__getattr__('vcpus_used') vcpus_used = hvisor.__getattr__('vcpus_used')
@ -592,6 +595,75 @@ class Workflow(BaseWorkflow):
LOG.info('%s host still not empty' % host) LOG.info('%s host still not empty' % host)
return False return False
def live_migrate_server(self, instance):
    """Live migrate one instance and wait for it to land on another host.

    Triggers a Nova live migration for ``instance.instance_id`` and then
    polls the server once per second. ``instance.state`` and
    ``instance.host`` are updated from every poll. If the newest migration
    record reports ``error`` the live migration is retried with a growing
    back-off, up to ``conf.live_migration_retries`` times.

    :param instance: object carrying ``instance_id``; its ``state`` and
                     ``host`` attributes are written by this method.
    :returns: True when the server is reported on a host different from
              the one it started on; False when the VM goes to ``error``
              state, the retries are exhausted, the wait time
              ``conf.live_migration_wait_time`` elapses, or any exception
              is raised while migrating/polling.
    """
    server_id = instance.instance_id
    server = self.nova.servers.get(server_id)
    # Read from __dict__ as the attribute names contain characters that
    # are not valid in Python identifiers.
    instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
    orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
    LOG.info('live_migrate_server %s state %s host %s',
             server_id, instance.state, orig_host)
    orig_vm_state = instance.state
    last_vm_status = str(server.__dict__.get('status'))
    last_migration_status = "active"
    try:
        # Both options are registered as StrOpt, so the configured values
        # can arrive as strings. Coerce them once; comparing the int
        # counters below against a string would never match and the loop
        # would neither time out nor stop retrying.
        wait_time = int(self.conf.live_migration_wait_time)
        max_retries = int(self.conf.live_migration_retries)

        server.live_migrate()
        waited = 0
        migrate_retries = 0
        # '<' instead of '!=' so the loop also ends for a zero/negative
        # wait time instead of spinning forever.
        while waited < wait_time:
            time.sleep(1)
            server = self.nova.servers.get(server_id)
            host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
            vm_status = str(server.__dict__.get('status'))
            instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
            instance.host = host
            if vm_status != last_vm_status:
                LOG.info('instance %s status changed: %s',
                         server_id, vm_status)
            if instance.state == 'error':
                LOG.error('instance %s live migration failed', server_id)
                return False
            elif orig_vm_state != instance.state:
                LOG.info('instance %s state changed: %s',
                         server_id, instance.state)
            elif host != orig_host:
                LOG.info('instance %s live migrated to host %s',
                         server_id, host)
                return True

            migrations = self.nova.migrations.list(instance_uuid=server_id)
            if not migrations:
                # No migration record visible yet; keep polling instead
                # of failing the whole operation on an IndexError.
                waited += 1
                last_vm_status = vm_status
                continue
            # NOTE(review): assumes the first entry is the most recent
            # migration of this instance - confirm against the Nova API
            # version in use.
            migration = migrations[0]
            if migration.status == 'error':
                if migrate_retries >= max_retries:
                    LOG.error('instance %s live migration failed after '
                              '%d retries', server_id, max_retries)
                    return False
                # A live migration can fail fast right after it was
                # requested. Give Nova time to get ready for the next
                # attempt, and wait longer on every retry to improve the
                # chance of finally getting the migration through.
                time.sleep(2 * (migrate_retries + 5))
                LOG.info('instance %s live migration failed, retry',
                         server_id)
                server.live_migrate()
                # A new attempt gets a full wait period of its own.
                waited = 0
                migrate_retries += 1
            elif migration.status != last_migration_status:
                LOG.info('instance %s live migration status changed: %s',
                         server_id, migration.status)
            waited += 1
            last_migration_status = migration.status
            last_vm_status = vm_status
        LOG.error('instance %s live migration did not finish in %ss, '
                  'state: %s', server_id, waited, instance.state)
    except Exception as e:
        LOG.error('server %s live migration failed, Exception=%s',
                  server_id, e)
    # Reached on timeout and on any exception.
    return False
def migrate_server(self, instance): def migrate_server(self, instance):
# TBD this method should be enhanced for errors and to have failed # TBD this method should be enhanced for errors and to have failed
# instance back to state active instead of error # instance back to state active instead of error
@ -599,8 +671,9 @@ class Workflow(BaseWorkflow):
server = self.nova.servers.get(server_id) server = self.nova.servers.get(server_id)
instance.state = server.__dict__.get('OS-EXT-STS:vm_state') instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host')) orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
LOG.info('server %s state %s host %s' % (server_id, instance.state, LOG.info('migrate_server %s state %s host %s' % (server_id,
orig_host)) instance.state,
orig_host))
last_vm_state = instance.state last_vm_state = instance.state
retry_migrate = 2 retry_migrate = 2
while True: while True: