Make VM migrations timeout configurable and apply reasonable defaults
This change adds new 'migration_max_retries' and 'migrate_interval' configuration parameters to the [nova] section to control timeout behavior for VM migration operations. Changes: - Add a new [nova] section to the configuration file to store parameters related to the integration with nova. - Add migration_max_retries config option (default: 180) to define the max retries to check the result of VM migrations before giving up. - Add migration_interval config option (default: 5 seconds) to define the polling interval to check the VM status in migrate actions. - Update live_migrate_instance() and watcher_non_live_migrate_instance() to use configured migration_max_retries when retry parameter is None. - Add new parameter interval to the live_migrate_instance() and watcher_non_live_migrate_instance() methods. - Add comprehensive unit tests for timeout and retry functionality - Set migrate_max_retries to 120 and migration_interval to 1s in CI jobs. Closes-Bug: #2131663 Assisted-By: claude-code (claude-sonnet-4.5) Change-Id: Ifed3c058d821ce3b0741627dcc414fe054eb9dca Signed-off-by: Alfredo Moralejo <amoralej@redhat.com>
This commit is contained in:
@@ -71,6 +71,9 @@
|
||||
enable_extended_attributes: true
|
||||
nova_client:
|
||||
api_version: "2.96"
|
||||
nova:
|
||||
migration_max_retries: 120
|
||||
migration_interval: 1
|
||||
$CINDER_CONF:
|
||||
oslo_messaging_notifications:
|
||||
driver: messagingv2
|
||||
|
||||
11
releasenotes/notes/bug-2131663-b486d36f77d13a5d.yaml
Normal file
11
releasenotes/notes/bug-2131663-b486d36f77d13a5d.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
New parameters `migration_max_retries` and `migration_interval` have been
|
||||
added to the `nova` section to define the maximum retries and polling
|
||||
interval in VM migrations. Default values are 180 and 5 seconds.
|
||||
fixes:
|
||||
- |
|
||||
Fixed the issue when migrate actions Failed when the migration took more that
|
||||
120 seconds. After this patch, the default timeout is 900 seconds (15 minutes)
|
||||
which should be a reasonable value for most OpenStack installations.
|
||||
@@ -181,7 +181,7 @@ class NovaHelper:
|
||||
return volume.status == status
|
||||
|
||||
def watcher_non_live_migrate_instance(self, instance_id, dest_hostname,
|
||||
retry=120):
|
||||
retry=None, interval=None):
|
||||
"""This method migrates a given instance
|
||||
|
||||
This method uses the Nova built-in migrate()
|
||||
@@ -196,10 +196,15 @@ class NovaHelper:
|
||||
:param dest_hostname: the name of the destination compute node, if
|
||||
destination_node is None, nova scheduler choose
|
||||
the destination host
|
||||
:param retry: maximum number of retries before giving up
|
||||
:param interval: interval in seconds between retries
|
||||
"""
|
||||
LOG.debug(
|
||||
"Trying a cold migrate of instance '%s' ", instance_id)
|
||||
|
||||
# Use config defaults if not provided in method parameters
|
||||
retry = retry or CONF.nova.migration_max_retries
|
||||
interval = interval or CONF.nova.migration_interval
|
||||
# Looking for the instance to migrate
|
||||
instance = self.find_instance(instance_id)
|
||||
if not instance:
|
||||
@@ -218,7 +223,7 @@ class NovaHelper:
|
||||
while (getattr(instance, 'status') not in
|
||||
["VERIFY_RESIZE", "ERROR"] and retry):
|
||||
instance = self.nova.servers.get(instance.id)
|
||||
time.sleep(2)
|
||||
time.sleep(interval)
|
||||
retry -= 1
|
||||
new_hostname = getattr(instance, 'OS-EXT-SRV-ATTR:host')
|
||||
|
||||
@@ -300,7 +305,8 @@ class NovaHelper:
|
||||
|
||||
return True
|
||||
|
||||
def live_migrate_instance(self, instance_id, dest_hostname, retry=120):
|
||||
def live_migrate_instance(self, instance_id, dest_hostname, retry=None,
|
||||
interval=None):
|
||||
"""This method does a live migration of a given instance
|
||||
|
||||
This method uses the Nova built-in live_migrate()
|
||||
@@ -313,11 +319,16 @@ class NovaHelper:
|
||||
:param dest_hostname: the name of the destination compute node, if
|
||||
destination_node is None, nova scheduler choose
|
||||
the destination host
|
||||
:param retry: maximum number of retries before giving up
|
||||
:param interval: interval in seconds between retries
|
||||
"""
|
||||
LOG.debug(
|
||||
"Trying a live migrate instance %(instance)s ",
|
||||
{'instance': instance_id})
|
||||
|
||||
# Use config defaults if not provided in method parameters
|
||||
retry = retry or CONF.nova.migration_max_retries
|
||||
interval = interval or CONF.nova.migration_interval
|
||||
# Looking for the instance to migrate
|
||||
instance = self.find_instance(instance_id)
|
||||
if not instance:
|
||||
@@ -341,7 +352,7 @@ class NovaHelper:
|
||||
while (instance.status not in ['ACTIVE', 'ERROR'] and retry):
|
||||
instance = self.nova.servers.get(instance.id)
|
||||
LOG.debug('Waiting the migration of %s', instance.id)
|
||||
time.sleep(1)
|
||||
time.sleep(interval)
|
||||
retry -= 1
|
||||
new_hostname = getattr(instance, 'OS-EXT-SRV-ATTR:host')
|
||||
|
||||
@@ -364,7 +375,7 @@ class NovaHelper:
|
||||
LOG.debug('Waiting the migration of %s to %s',
|
||||
instance,
|
||||
getattr(instance, 'OS-EXT-SRV-ATTR:host'))
|
||||
time.sleep(1)
|
||||
time.sleep(interval)
|
||||
retry -= 1
|
||||
|
||||
host_name = getattr(instance, 'OS-EXT-SRV-ATTR:host')
|
||||
|
||||
@@ -36,6 +36,7 @@ from watcher.conf import keystone_client
|
||||
from watcher.conf import maas_client
|
||||
from watcher.conf import models
|
||||
from watcher.conf import monasca_client
|
||||
from watcher.conf import nova
|
||||
from watcher.conf import nova_client
|
||||
from watcher.conf import paths
|
||||
from watcher.conf import placement_client
|
||||
@@ -69,3 +70,4 @@ ironic_client.register_opts(CONF)
|
||||
collector.register_opts(CONF)
|
||||
placement_client.register_opts(CONF)
|
||||
prometheus_client.register_opts(CONF)
|
||||
nova.register_opts(CONF)
|
||||
|
||||
50
watcher/conf/nova.py
Normal file
50
watcher/conf/nova.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# Copyright (c) 2025 OpenStack Foundation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from oslo_config import cfg
|
||||
|
||||
nova = cfg.OptGroup(name='nova',
|
||||
title='Options for the Nova integration '
|
||||
'configuration')
|
||||
|
||||
NOVA_OPTS = [
|
||||
cfg.IntOpt('migration_max_retries',
|
||||
default=180,
|
||||
min=1,
|
||||
help='Maximum number of retries for Nova migrations '
|
||||
'before giving up and considering the operation failed. '
|
||||
'Default value is 180, which for the default '
|
||||
'migration_interval (5 seconds), makes a default '
|
||||
'migration timeout of 900 seconds (15 minutes).'
|
||||
'This is an upper bound on the maximum expected. This '
|
||||
'should not be decreased or increased over 3600s '
|
||||
'(one hour). Shorter values may cause the actions to '
|
||||
'fail and higher values may hide infrastructure issues.'),
|
||||
cfg.FloatOpt('migration_interval',
|
||||
default=5.0,
|
||||
min=0.1,
|
||||
help='Interval in seconds to check the status in Nova VM '
|
||||
'migrations (value is float). Default value is 5.0 '
|
||||
'seconds.'),
|
||||
]
|
||||
|
||||
|
||||
def register_opts(conf):
|
||||
conf.register_group(nova)
|
||||
conf.register_opts(NOVA_OPTS, group=nova)
|
||||
|
||||
|
||||
def list_opts():
|
||||
return [(nova, NOVA_OPTS)]
|
||||
@@ -144,3 +144,15 @@ class TestCase(BaseTestCase):
|
||||
return os.path.join(root, project_file)
|
||||
else:
|
||||
return root
|
||||
|
||||
def flags(self, **kw):
|
||||
"""Override flag variables for a test.
|
||||
|
||||
Example:
|
||||
self.flags(periodic_interval=10,
|
||||
group='watcher_decision_engine')
|
||||
"""
|
||||
group = kw.pop('group', None)
|
||||
for k, v in kw.items():
|
||||
CONF.set_override(k, v, group)
|
||||
self.addCleanup(CONF.clear_override, k, group)
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
import fixtures
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
@@ -43,6 +45,8 @@ class TestNovaHelper(base.TestCase):
|
||||
self.source_node = "ldev-indeedsrv005"
|
||||
self.destination_node = "ldev-indeedsrv006"
|
||||
self.flavor_name = "x1"
|
||||
self.mock_sleep = self.useFixture(
|
||||
fixtures.MockPatchObject(time, 'sleep')).mock
|
||||
|
||||
@staticmethod
|
||||
def fake_server(*args, **kwargs):
|
||||
@@ -360,7 +364,7 @@ class TestNovaHelper(base.TestCase):
|
||||
nova_util.live_migrate_instance(
|
||||
self.instance_uuid, self.destination_node
|
||||
)
|
||||
time.sleep.assert_called_with(1)
|
||||
time.sleep.assert_called_with(5)
|
||||
|
||||
@mock.patch.object(time, 'sleep', mock.Mock())
|
||||
def test_live_migrate_instance_no_destination_node(
|
||||
@@ -443,6 +447,229 @@ class TestNovaHelper(base.TestCase):
|
||||
)
|
||||
self.assertTrue(is_success)
|
||||
|
||||
@mock.patch.object(nova_helper.NovaHelper, 'confirm_resize')
|
||||
def test_watcher_non_live_migrate_instance_retry_success(
|
||||
self, mock_confirm_resize, mock_cinder, mock_nova):
|
||||
"""Test that watcher_non_live_migrate uses config timeout by default"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
server.status = 'MIGRATING' # Never reaches ACTIVE
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
|
||||
verify_server = copy.deepcopy(server)
|
||||
verify_server.status = 'VERIFY_RESIZE'
|
||||
setattr(verify_server, 'OS-EXT-SRV-ATTR:host', self.destination_node)
|
||||
|
||||
# This means instance will be found as VERIFY_RESIZE in second retry
|
||||
nova_util.nova.servers.get.side_effect = (server, server, server,
|
||||
verify_server)
|
||||
|
||||
mock_confirm_resize.return_value = True
|
||||
|
||||
self.flags(migration_max_retries=20, migration_interval=4,
|
||||
group='nova')
|
||||
# Migration will success as will transition from MIGRATING to
|
||||
# VERIFY_RESIZE
|
||||
is_success = nova_util.watcher_non_live_migrate_instance(
|
||||
self.instance_uuid, self.destination_node
|
||||
)
|
||||
|
||||
# Should succeed
|
||||
self.assertTrue(is_success)
|
||||
# It will sleep 2 times because it will be found as VERIFY_RESIZE in
|
||||
# the second retry
|
||||
self.assertEqual(2, self.mock_sleep.call_count)
|
||||
# Verify all sleep calls used 4 second interval
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 4)
|
||||
|
||||
def test_watcher_non_live_migrate_instance_retry_default(
|
||||
self, mock_cinder, mock_nova):
|
||||
"""Test that watcher_non_live_migrate uses config timeout by default"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
server.status = 'MIGRATING' # Never reaches ACTIVE
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
|
||||
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
|
||||
|
||||
# Migration will timeout because status never changes
|
||||
is_success = nova_util.watcher_non_live_migrate_instance(
|
||||
self.instance_uuid, self.destination_node
|
||||
)
|
||||
|
||||
# Should fail due to timeout
|
||||
self.assertFalse(is_success)
|
||||
# With default migration_max_retries and migration_interval, should
|
||||
# sleep 180 times for 5 seconds
|
||||
self.assertEqual(180, self.mock_sleep.call_count)
|
||||
# Verify sleep calls used 5 second
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 5)
|
||||
|
||||
def test_watcher_non_live_migrate_instance_retry_custom(
|
||||
self, mock_cinder, mock_nova):
|
||||
"""Test that watcher_non_live_migrate respects explicit retry value"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
server.status = 'MIGRATING' # Never reaches ACTIVE
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
|
||||
# Set config to a custom values to ensure custom values are used
|
||||
self.flags(migration_max_retries=10, migration_interval=3,
|
||||
group='nova')
|
||||
|
||||
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
|
||||
|
||||
is_success = nova_util.watcher_non_live_migrate_instance(
|
||||
self.instance_uuid, self.destination_node
|
||||
)
|
||||
|
||||
# Should fail due to timeout
|
||||
self.assertFalse(is_success)
|
||||
# It should sleep migration_max_retries times with migration_interval
|
||||
# seconds
|
||||
self.assertEqual(10, self.mock_sleep.call_count)
|
||||
# Verify all sleep calls used migration_interval
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 3)
|
||||
|
||||
def test_live_migrate_instance_retry_default_success(
|
||||
self, mock_cinder, mock_nova):
|
||||
"""Test that live_migrate_instance uses config timeout by default"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
setattr(server, 'OS-EXT-STS:task_state', 'migrating')
|
||||
|
||||
migrated_server = copy.deepcopy(server)
|
||||
migrated_server.status = 'ACTIVE'
|
||||
setattr(migrated_server, 'OS-EXT-SRV-ATTR:host', self.destination_node)
|
||||
|
||||
nova_util.nova.servers.get.side_effect = (
|
||||
server, server, server, migrated_server)
|
||||
|
||||
# Migration will success as will transition from migrating to ACTIVE
|
||||
is_success = nova_util.live_migrate_instance(
|
||||
self.instance_uuid, self.destination_node
|
||||
)
|
||||
|
||||
# Should succeed
|
||||
self.assertTrue(is_success)
|
||||
# It will sleep 2 times because it will be found as ACTIVE in
|
||||
# the second retry
|
||||
self.assertEqual(2, self.mock_sleep.call_count)
|
||||
# Verify all sleep calls used 5 second interval
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 5)
|
||||
|
||||
def test_live_migrate_instance_retry_default(
|
||||
self, mock_cinder, mock_nova):
|
||||
"""Test that live_migrate_instance uses config timeout by default"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
setattr(server, 'OS-EXT-STS:task_state', 'migrating')
|
||||
|
||||
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
|
||||
|
||||
# Migration will timeout because host never changes
|
||||
is_success = nova_util.live_migrate_instance(
|
||||
self.instance_uuid, self.destination_node
|
||||
)
|
||||
|
||||
# Should fail due to timeout
|
||||
self.assertFalse(is_success)
|
||||
# With default migration_max_retries and migration_interval, should
|
||||
# sleep 5 seconds 180 times
|
||||
self.assertEqual(180, self.mock_sleep.call_count)
|
||||
# Verify all sleep calls used 5 second interval
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 5)
|
||||
|
||||
def test_live_migrate_instance_retry_custom(
|
||||
self, mock_cinder, mock_nova):
|
||||
"""Test that live_migrate_instance uses config timeout by default"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
setattr(server, 'OS-EXT-STS:task_state', 'migrating')
|
||||
|
||||
# Set config value
|
||||
self.flags(migration_max_retries=20, migration_interval=1.5,
|
||||
group='nova')
|
||||
|
||||
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
|
||||
|
||||
# Migration will timeout because host never changes
|
||||
is_success = nova_util.live_migrate_instance(
|
||||
self.instance_uuid, self.destination_node
|
||||
)
|
||||
|
||||
# Should fail due to timeout
|
||||
self.assertFalse(is_success)
|
||||
# With migration_max_retries and migration_interval, should sleep 20
|
||||
# times
|
||||
self.assertEqual(20, self.mock_sleep.call_count)
|
||||
|
||||
# Verify sleep calls used 1.5 second
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 1.5)
|
||||
|
||||
def test_live_migrate_instance_no_dest_retry_default(
|
||||
self, mock_cinder, mock_nova):
|
||||
"""Test live_migrate with no destination uses config timeout"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
server.status = 'MIGRATING' # Never reaches ACTIVE
|
||||
|
||||
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
|
||||
|
||||
# Migration with no destination will timeout
|
||||
is_success = nova_util.live_migrate_instance(
|
||||
self.instance_uuid, None
|
||||
)
|
||||
|
||||
# Should fail due to timeout
|
||||
self.assertFalse(is_success)
|
||||
# With default migration_max_retries and migration_interval, should
|
||||
# sleep 180 times
|
||||
self.assertEqual(180, self.mock_sleep.call_count)
|
||||
|
||||
# Verify all sleep calls used 5 second interval
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 5)
|
||||
|
||||
def test_live_migrate_instance_no_dest_retry_custom(
|
||||
self, mock_cinder, mock_nova):
|
||||
"""Test live_migrate with no destination uses config timeout"""
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
server = self.fake_server(self.instance_uuid)
|
||||
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
|
||||
server.status = 'MIGRATING' # Never reaches ACTIVE
|
||||
|
||||
# Set config value
|
||||
self.flags(migration_max_retries=10, migration_interval=3,
|
||||
group='nova')
|
||||
|
||||
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
|
||||
|
||||
# Migration with no destination will timeout
|
||||
is_success = nova_util.live_migrate_instance(
|
||||
self.instance_uuid, None
|
||||
)
|
||||
|
||||
# Should fail due to timeout
|
||||
self.assertFalse(is_success)
|
||||
# With migration_max_retries and migration_interval, should sleep 10
|
||||
# times for 3 seconds
|
||||
self.assertEqual(10, self.mock_sleep.call_count)
|
||||
|
||||
# Verify sleep calls used 3 second
|
||||
for call in self.mock_sleep.call_args_list:
|
||||
self.assertEqual(call[0][0], 3)
|
||||
|
||||
def test_enable_service_nova_compute(self, mock_cinder, mock_nova):
|
||||
nova_util = nova_helper.NovaHelper()
|
||||
nova_services = nova_util.nova.services
|
||||
|
||||
Reference in New Issue
Block a user