Add live migration support
story: 2005585 Task: #30774 Change-Id: I5dc6db643900a6bfcc427b4b9ee23b5557b091a4 Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
This commit is contained in:
parent
a232dda364
commit
46e13601a5
@ -164,7 +164,7 @@ def remove_session(session_id):
|
|||||||
downloads = _download_get_all(session, session_id)
|
downloads = _download_get_all(session, session_id)
|
||||||
if downloads:
|
if downloads:
|
||||||
for download in downloads:
|
for download in downloads:
|
||||||
download.delete(download)
|
session.delete(download)
|
||||||
|
|
||||||
hosts = _hosts_get(session, session_id)
|
hosts = _hosts_get(session, session_id)
|
||||||
if hosts:
|
if hosts:
|
||||||
|
@ -69,6 +69,12 @@ opts = [
|
|||||||
cfg.StrOpt('local_cache_dir',
|
cfg.StrOpt('local_cache_dir',
|
||||||
default="/tmp",
|
default="/tmp",
|
||||||
help="Local cache directory"),
|
help="Local cache directory"),
|
||||||
|
cfg.StrOpt('live_migration_retries',
|
||||||
|
default=5,
|
||||||
|
help="Number of live migration retries"),
|
||||||
|
cfg.StrOpt('live_migration_wait_time',
|
||||||
|
default=600,
|
||||||
|
help="How long to wait live migration to be done"),
|
||||||
]
|
]
|
||||||
|
|
||||||
CONF.register_opts(opts)
|
CONF.register_opts(opts)
|
||||||
|
@ -565,8 +565,11 @@ class Workflow(BaseWorkflow):
|
|||||||
self.notify_action_done(project, instance)
|
self.notify_action_done(project, instance)
|
||||||
elif instance.action == 'OWN_ACTION':
|
elif instance.action == 'OWN_ACTION':
|
||||||
pass
|
pass
|
||||||
|
elif instance.action == 'LIVE_MIGRATE':
|
||||||
|
if not self.live_migrate_server(instance):
|
||||||
|
return False
|
||||||
|
self.notify_action_done(project, instance)
|
||||||
else:
|
else:
|
||||||
# TBD LIVE_MIGRATE not supported
|
|
||||||
raise Exception('%s: instance %s action '
|
raise Exception('%s: instance %s action '
|
||||||
'%s not supported' %
|
'%s not supported' %
|
||||||
(self.session_id, instance.instance_id,
|
(self.session_id, instance.instance_id,
|
||||||
@ -576,7 +579,7 @@ class Workflow(BaseWorkflow):
|
|||||||
def _wait_host_empty(self, host):
|
def _wait_host_empty(self, host):
|
||||||
hid = self.nova.hypervisors.search(host)[0].id
|
hid = self.nova.hypervisors.search(host)[0].id
|
||||||
vcpus_used_last = 0
|
vcpus_used_last = 0
|
||||||
# wait 4min to get host empty
|
# wait 4min to get host empty
|
||||||
for j in range(48):
|
for j in range(48):
|
||||||
hvisor = self.nova.hypervisors.get(hid)
|
hvisor = self.nova.hypervisors.get(hid)
|
||||||
vcpus_used = hvisor.__getattr__('vcpus_used')
|
vcpus_used = hvisor.__getattr__('vcpus_used')
|
||||||
@ -592,6 +595,75 @@ class Workflow(BaseWorkflow):
|
|||||||
LOG.info('%s host still not empty' % host)
|
LOG.info('%s host still not empty' % host)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def live_migrate_server(self, instance):
    """Live migrate one instance and poll until it lands on another host.

    Starts a live migration through Nova and then polls the server once
    per second. While polling, ``instance.state`` and ``instance.host``
    are kept up to date with what Nova reports. If the newest migration
    record reports ``error`` the live migration is re-issued, up to
    ``live_migration_retries`` times, with a pause that grows on each
    retry.

    :param instance: instance record; ``instance_id`` is read, ``state``
                     and ``host`` are updated during polling
    :returns: True when the server is reported on a host different from
              the original one; False on error state, exhausted retries,
              timeout (``live_migration_wait_time`` polls) or any
              exception raised during the procedure
    """
    server_id = instance.instance_id
    server = self.nova.servers.get(server_id)
    instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
    orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
    LOG.info('live_migrate_server %s state %s host %s',
             server_id, instance.state, orig_host)
    orig_vm_state = instance.state
    last_vm_status = str(server.__dict__.get('status'))
    last_migration_status = "active"
    try:
        # Both options are registered as string options, so coerce them
        # once here. Comparing an int counter against a string would
        # never match and the polling loop would not terminate.
        wait_time = int(self.conf.live_migration_wait_time)
        max_retries = int(self.conf.live_migration_retries)

        server.live_migrate()
        waited = 0
        migrate_retries = 0
        # '<' instead of '!=' so the loop is bounded even if the counter
        # could ever step past the limit.
        while waited < wait_time:
            time.sleep(1)
            server = self.nova.servers.get(server_id)
            host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
            vm_status = str(server.__dict__.get('status'))
            instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
            instance.host = host

            if vm_status != last_vm_status:
                LOG.info('instance %s status changed: %s',
                         server_id, vm_status)

            if instance.state == 'error':
                LOG.error('instance %s live migration failed', server_id)
                return False
            elif orig_vm_state != instance.state:
                LOG.info('instance %s state changed: %s',
                         server_id, instance.state)
            elif host != orig_host:
                LOG.info('instance %s live migrated to host %s',
                         server_id, host)
                return True

            # Only the first migration record returned is inspected. An
            # empty list raises IndexError, which the handler below
            # reports as a failed migration.
            migration = (
                self.nova.migrations.list(instance_uuid=server_id)[0])
            if migration.status == 'error':
                if migrate_retries >= max_retries:
                    LOG.error('instance %s live migration failed after '
                              '%d retries', server_id, max_retries)
                    return False
                # A live migration can fail fast right after it was
                # requested. Nova needs time to be ready for the next
                # attempt, so wait before retrying, and wait longer on
                # each retry to improve the chance of getting through.
                time.sleep(2 * (migrate_retries + 5))
                LOG.info('instance %s live migration failed, retry',
                         server_id)
                server.live_migrate()
                # Each retry gets a full waiting period of its own.
                waited = 0
                migrate_retries = migrate_retries + 1
            elif migration.status != last_migration_status:
                LOG.info('instance %s live migration status changed: %s',
                         server_id, migration.status)

            waited = waited + 1
            last_migration_status = migration.status
            last_vm_status = vm_status

        LOG.error('instance %s live migration did not finish in %ss, '
                  'state: %s', server_id, waited, instance.state)
    except Exception as e:
        LOG.error('server %s live migration failed, Exception=%s',
                  server_id, e)
    # Reached on timeout and on exception alike.
    return False
|
||||||
|
|
||||||
def migrate_server(self, instance):
|
def migrate_server(self, instance):
|
||||||
# TBD this method should be enhanced for errors and to have failed
|
# TBD this method should be enhanced for errors and to have failed
|
||||||
# instance back to state active instead of error
|
# instance back to state active instead of error
|
||||||
@ -599,8 +671,9 @@ class Workflow(BaseWorkflow):
|
|||||||
server = self.nova.servers.get(server_id)
|
server = self.nova.servers.get(server_id)
|
||||||
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
|
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
|
||||||
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
|
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
|
||||||
LOG.info('server %s state %s host %s' % (server_id, instance.state,
|
LOG.info('migrate_server %s state %s host %s' % (server_id,
|
||||||
orig_host))
|
instance.state,
|
||||||
|
orig_host))
|
||||||
last_vm_state = instance.state
|
last_vm_state = instance.state
|
||||||
retry_migrate = 2
|
retry_migrate = 2
|
||||||
while True:
|
while True:
|
||||||
|
Loading…
Reference in New Issue
Block a user