diff --git a/.zuul.yaml b/.zuul.yaml index 8118d1cfc..b9e58fca4 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -71,6 +71,9 @@ enable_extended_attributes: true nova_client: api_version: "2.96" + nova: + migration_max_retries: 120 + migration_interval: 1 $CINDER_CONF: oslo_messaging_notifications: driver: messagingv2 diff --git a/releasenotes/notes/bug-2131663-b486d36f77d13a5d.yaml b/releasenotes/notes/bug-2131663-b486d36f77d13a5d.yaml new file mode 100644 index 000000000..8a00566a1 --- /dev/null +++ b/releasenotes/notes/bug-2131663-b486d36f77d13a5d.yaml @@ -0,0 +1,11 @@ +--- +features: + - | + New parameters `migration_max_retries` and `migration_interval` have been + added to the `nova` section to define the maximum retries and polling + interval in VM migrations. Default values are 180 and 5 seconds. +fixes: + - | + Fixed the issue when migrate actions Failed when the migration took more that + 120 seconds. After this patch, the default timeout is 900 seconds (15 minutes) + which should be a reasonable value for most OpenStack installations. diff --git a/watcher/common/nova_helper.py b/watcher/common/nova_helper.py index 7569e822a..45122d3e3 100644 --- a/watcher/common/nova_helper.py +++ b/watcher/common/nova_helper.py @@ -181,7 +181,7 @@ class NovaHelper: return volume.status == status def watcher_non_live_migrate_instance(self, instance_id, dest_hostname, - retry=120): + retry=None, interval=None): """This method migrates a given instance This method uses the Nova built-in migrate() @@ -196,10 +196,15 @@ class NovaHelper: :param dest_hostname: the name of the destination compute node, if destination_node is None, nova scheduler choose the destination host + :param retry: maximum number of retries before giving up + :param interval: interval in seconds between retries """ LOG.debug( "Trying a cold migrate of instance '%s' ", instance_id) + # Use config defaults if not provided in method parameters + retry = retry or CONF.nova.migration_max_retries + interval = interval or CONF.nova.migration_interval # Looking for the instance to migrate instance = self.find_instance(instance_id) if not instance: @@ -218,7 +223,7 @@ class NovaHelper: while (getattr(instance, 'status') not in ["VERIFY_RESIZE", "ERROR"] and retry): instance = self.nova.servers.get(instance.id) - time.sleep(2) + time.sleep(interval) retry -= 1 new_hostname = getattr(instance, 'OS-EXT-SRV-ATTR:host') @@ -300,7 +305,8 @@ class NovaHelper: return True - def live_migrate_instance(self, instance_id, dest_hostname, retry=120): + def live_migrate_instance(self, instance_id, dest_hostname, retry=None, + interval=None): """This method does a live migration of a given instance This method uses the Nova built-in live_migrate() @@ -313,11 +319,16 @@ class NovaHelper: :param dest_hostname: the name of the destination compute node, if destination_node is None, nova scheduler choose the destination host + :param retry: maximum number of retries before giving up + :param interval: interval in seconds between retries """ LOG.debug( "Trying a live migrate instance %(instance)s ", {'instance': instance_id}) + # Use config defaults if not provided in method parameters + retry = retry or CONF.nova.migration_max_retries + interval = interval or CONF.nova.migration_interval # Looking for the instance to migrate instance = self.find_instance(instance_id) if not instance: @@ -341,7 +352,7 @@ class NovaHelper: while (instance.status not in ['ACTIVE', 'ERROR'] and retry): instance = self.nova.servers.get(instance.id) LOG.debug('Waiting the migration of %s', instance.id) - time.sleep(1) + time.sleep(interval) retry -= 1 new_hostname = getattr(instance, 'OS-EXT-SRV-ATTR:host') @@ -364,7 +375,7 @@ class NovaHelper: LOG.debug('Waiting the migration of %s to %s', instance, getattr(instance, 'OS-EXT-SRV-ATTR:host')) - time.sleep(1) + time.sleep(interval) retry -= 1 host_name = getattr(instance, 'OS-EXT-SRV-ATTR:host') diff --git a/watcher/conf/__init__.py b/watcher/conf/__init__.py index 2cb566a52..4f365c66c 100644 --- a/watcher/conf/__init__.py +++ b/watcher/conf/__init__.py @@ -36,6 +36,7 @@ from watcher.conf import keystone_client from watcher.conf import maas_client from watcher.conf import models from watcher.conf import monasca_client +from watcher.conf import nova from watcher.conf import nova_client from watcher.conf import paths from watcher.conf import placement_client @@ -69,3 +70,4 @@ ironic_client.register_opts(CONF) collector.register_opts(CONF) placement_client.register_opts(CONF) prometheus_client.register_opts(CONF) +nova.register_opts(CONF) diff --git a/watcher/conf/nova.py b/watcher/conf/nova.py new file mode 100644 index 000000000..30241104d --- /dev/null +++ b/watcher/conf/nova.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from oslo_config import cfg + +nova = cfg.OptGroup(name='nova', + title='Options for the Nova integration ' + 'configuration') + +NOVA_OPTS = [ + cfg.IntOpt('migration_max_retries', + default=180, + min=1, + help='Maximum number of retries for Nova migrations ' + 'before giving up and considering the operation failed. ' + 'Default value is 180, which for the default ' + 'migration_interval (5 seconds), makes a default ' + 'migration timeout of 900 seconds (15 minutes).' + 'This is an upper bound on the maximum expected. This ' + 'should not be decreased or increased over 3600s ' + '(one hour). Shorter values may cause the actions to ' + 'fail and higher values may hide infrastructure issues.'), + cfg.FloatOpt('migration_interval', + default=5.0, + min=0.1, + help='Interval in seconds to check the status in Nova VM ' + 'migrations (value is float). Default value is 5.0 ' + 'seconds.'), +] + + +def register_opts(conf): + conf.register_group(nova) + conf.register_opts(NOVA_OPTS, group=nova) + + +def list_opts(): + return [(nova, NOVA_OPTS)] diff --git a/watcher/tests/base.py b/watcher/tests/base.py index 25c38a199..d1727436c 100644 --- a/watcher/tests/base.py +++ b/watcher/tests/base.py @@ -144,3 +144,15 @@ class TestCase(BaseTestCase): return os.path.join(root, project_file) else: return root + + def flags(self, **kw): + """Override flag variables for a test. + + Example: + self.flags(periodic_interval=10, + group='watcher_decision_engine') + """ + group = kw.pop('group', None) + for k, v in kw.items(): + CONF.set_override(k, v, group) + self.addCleanup(CONF.clear_override, k, group) diff --git a/watcher/tests/common/test_nova_helper.py b/watcher/tests/common/test_nova_helper.py index 126f87d76..aca4f1cfd 100644 --- a/watcher/tests/common/test_nova_helper.py +++ b/watcher/tests/common/test_nova_helper.py @@ -16,6 +16,8 @@ # limitations under the License. # +import copy +import fixtures import time from unittest import mock @@ -43,6 +45,8 @@ class TestNovaHelper(base.TestCase): self.source_node = "ldev-indeedsrv005" self.destination_node = "ldev-indeedsrv006" self.flavor_name = "x1" + self.mock_sleep = self.useFixture( + fixtures.MockPatchObject(time, 'sleep')).mock @staticmethod def fake_server(*args, **kwargs): @@ -360,7 +364,7 @@ class TestNovaHelper(base.TestCase): nova_util.live_migrate_instance( self.instance_uuid, self.destination_node ) - time.sleep.assert_called_with(1) + time.sleep.assert_called_with(5) @mock.patch.object(time, 'sleep', mock.Mock()) def test_live_migrate_instance_no_destination_node( @@ -443,6 +447,229 @@ class TestNovaHelper(base.TestCase): ) self.assertTrue(is_success) + @mock.patch.object(nova_helper.NovaHelper, 'confirm_resize') + def test_watcher_non_live_migrate_instance_retry_success( + self, mock_confirm_resize, mock_cinder, mock_nova): + """Test that watcher_non_live_migrate uses config timeout by default""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + server.status = 'MIGRATING' # Never reaches ACTIVE + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + + verify_server = copy.deepcopy(server) + verify_server.status = 'VERIFY_RESIZE' + setattr(verify_server, 'OS-EXT-SRV-ATTR:host', self.destination_node) + + # This means instance will be found as VERIFY_RESIZE in second retry + nova_util.nova.servers.get.side_effect = (server, server, server, + verify_server) + + mock_confirm_resize.return_value = True + + self.flags(migration_max_retries=20, migration_interval=4, + group='nova') + # Migration will success as will transition from MIGRATING to + # VERIFY_RESIZE + is_success = nova_util.watcher_non_live_migrate_instance( + self.instance_uuid, self.destination_node + ) + + # Should succeed + self.assertTrue(is_success) + # It will sleep 2 times because it will be found as VERIFY_RESIZE in + # the second retry + self.assertEqual(2, self.mock_sleep.call_count) + # Verify all sleep calls used 4 second interval + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 4) + + def test_watcher_non_live_migrate_instance_retry_default( + self, mock_cinder, mock_nova): + """Test that watcher_non_live_migrate uses config timeout by default""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + server.status = 'MIGRATING' # Never reaches ACTIVE + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + + self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server) + + # Migration will timeout because status never changes + is_success = nova_util.watcher_non_live_migrate_instance( + self.instance_uuid, self.destination_node + ) + + # Should fail due to timeout + self.assertFalse(is_success) + # With default migration_max_retries and migration_interval, should + # sleep 180 times for 5 seconds + self.assertEqual(180, self.mock_sleep.call_count) + # Verify sleep calls used 5 second + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 5) + + def test_watcher_non_live_migrate_instance_retry_custom( + self, mock_cinder, mock_nova): + """Test that watcher_non_live_migrate respects explicit retry value""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + server.status = 'MIGRATING' # Never reaches ACTIVE + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + + # Set config to a custom values to ensure custom values are used + self.flags(migration_max_retries=10, migration_interval=3, + group='nova') + + self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server) + + is_success = nova_util.watcher_non_live_migrate_instance( + self.instance_uuid, self.destination_node + ) + + # Should fail due to timeout + self.assertFalse(is_success) + # It should sleep migration_max_retries times with migration_interval + # seconds + self.assertEqual(10, self.mock_sleep.call_count) + # Verify all sleep calls used migration_interval + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 3) + + def test_live_migrate_instance_retry_default_success( + self, mock_cinder, mock_nova): + """Test that live_migrate_instance uses config timeout by default""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + setattr(server, 'OS-EXT-STS:task_state', 'migrating') + + migrated_server = copy.deepcopy(server) + migrated_server.status = 'ACTIVE' + setattr(migrated_server, 'OS-EXT-SRV-ATTR:host', self.destination_node) + + nova_util.nova.servers.get.side_effect = ( + server, server, server, migrated_server) + + # Migration will success as will transition from migrating to ACTIVE + is_success = nova_util.live_migrate_instance( + self.instance_uuid, self.destination_node + ) + + # Should succeed + self.assertTrue(is_success) + # It will sleep 2 times because it will be found as ACTIVE in + # the second retry + self.assertEqual(2, self.mock_sleep.call_count) + # Verify all sleep calls used 5 second interval + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 5) + + def test_live_migrate_instance_retry_default( + self, mock_cinder, mock_nova): + """Test that live_migrate_instance uses config timeout by default""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + setattr(server, 'OS-EXT-STS:task_state', 'migrating') + + self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server) + + # Migration will timeout because host never changes + is_success = nova_util.live_migrate_instance( + self.instance_uuid, self.destination_node + ) + + # Should fail due to timeout + self.assertFalse(is_success) + # With default migration_max_retries and migration_interval, should + # sleep 5 seconds 180 times + self.assertEqual(180, self.mock_sleep.call_count) + # Verify all sleep calls used 5 second interval + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 5) + + def test_live_migrate_instance_retry_custom( + self, mock_cinder, mock_nova): + """Test that live_migrate_instance uses config timeout by default""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + setattr(server, 'OS-EXT-STS:task_state', 'migrating') + + # Set config value + self.flags(migration_max_retries=20, migration_interval=1.5, + group='nova') + + self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server) + + # Migration will timeout because host never changes + is_success = nova_util.live_migrate_instance( + self.instance_uuid, self.destination_node + ) + + # Should fail due to timeout + self.assertFalse(is_success) + # With migration_max_retries and migration_interval, should sleep 20 + # times + self.assertEqual(20, self.mock_sleep.call_count) + + # Verify sleep calls used 1.5 second + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 1.5) + + def test_live_migrate_instance_no_dest_retry_default( + self, mock_cinder, mock_nova): + """Test live_migrate with no destination uses config timeout""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + server.status = 'MIGRATING' # Never reaches ACTIVE + + self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server) + + # Migration with no destination will timeout + is_success = nova_util.live_migrate_instance( + self.instance_uuid, None + ) + + # Should fail due to timeout + self.assertFalse(is_success) + # With default migration_max_retries and migration_interval, should + # sleep 180 times + self.assertEqual(180, self.mock_sleep.call_count) + + # Verify all sleep calls used 5 second interval + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 5) + + def test_live_migrate_instance_no_dest_retry_custom( + self, mock_cinder, mock_nova): + """Test live_migrate with no destination uses config timeout""" + nova_util = nova_helper.NovaHelper() + server = self.fake_server(self.instance_uuid) + setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node) + server.status = 'MIGRATING' # Never reaches ACTIVE + + # Set config value + self.flags(migration_max_retries=10, migration_interval=3, + group='nova') + + self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server) + + # Migration with no destination will timeout + is_success = nova_util.live_migrate_instance( + self.instance_uuid, None + ) + + # Should fail due to timeout + self.assertFalse(is_success) + # With migration_max_retries and migration_interval, should sleep 10 + # times for 3 seconds + self.assertEqual(10, self.mock_sleep.call_count) + + # Verify sleep calls used 3 second + for call in self.mock_sleep.call_args_list: + self.assertEqual(call[0][0], 3) + def test_enable_service_nova_compute(self, mock_cinder, mock_nova): nova_util = nova_helper.NovaHelper() nova_services = nova_util.nova.services