Make VM migrations timeout configurable and apply reasonable defaults

This change adds new 'migration_max_retries' and 'migrate_interval' configuration
parameters to the [nova] section to control timeout behavior for VM
migration operations.

Changes:
- Add a new [nova] section to the configuration file to store parameters
  related to the integration with nova.
- Add migration_max_retries config option (default: 180) to define the max
  retries to check the result of VM migrations before giving up.
- Add migration_interval config option (default: 5 seconds) to define the
  polling interval to check the VM status in migrate actions.
- Update live_migrate_instance() and watcher_non_live_migrate_instance()
  to use configured migration_max_retries when retry parameter is None.
- Add new parameter interval to the live_migrate_instance() and
  watcher_non_live_migrate_instance() methods.
- Add comprehensive unit tests for timeout and retry functionality
- Set migrate_max_retries to 120 and migration_interval to 1s in CI jobs.

Closes-Bug: #2131663

Assisted-By: claude-code (claude-sonnet-4.5)

Change-Id: Ifed3c058d821ce3b0741627dcc414fe054eb9dca
Signed-off-by: Alfredo Moralejo <amoralej@redhat.com>
This commit is contained in:
Alfredo Moralejo
2025-11-19 12:37:06 +01:00
parent 8d6f4c954b
commit 13d73e9b4e
7 changed files with 322 additions and 6 deletions

View File

@@ -71,6 +71,9 @@
enable_extended_attributes: true
nova_client:
api_version: "2.96"
nova:
migration_max_retries: 120
migration_interval: 1
$CINDER_CONF:
oslo_messaging_notifications:
driver: messagingv2

View File

@@ -0,0 +1,11 @@
---
features:
- |
New parameters `migration_max_retries` and `migration_interval` have been
added to the `nova` section to define the maximum retries and polling
interval in VM migrations. Default values are 180 and 5 seconds.
fixes:
- |
Fixed the issue when migrate actions Failed when the migration took more that
120 seconds. After this patch, the default timeout is 900 seconds (15 minutes)
which should be a reasonable value for most OpenStack installations.

View File

@@ -181,7 +181,7 @@ class NovaHelper:
return volume.status == status
def watcher_non_live_migrate_instance(self, instance_id, dest_hostname,
retry=120):
retry=None, interval=None):
"""This method migrates a given instance
This method uses the Nova built-in migrate()
@@ -196,10 +196,15 @@ class NovaHelper:
:param dest_hostname: the name of the destination compute node, if
destination_node is None, nova scheduler choose
the destination host
:param retry: maximum number of retries before giving up
:param interval: interval in seconds between retries
"""
LOG.debug(
"Trying a cold migrate of instance '%s' ", instance_id)
# Use config defaults if not provided in method parameters
retry = retry or CONF.nova.migration_max_retries
interval = interval or CONF.nova.migration_interval
# Looking for the instance to migrate
instance = self.find_instance(instance_id)
if not instance:
@@ -218,7 +223,7 @@ class NovaHelper:
while (getattr(instance, 'status') not in
["VERIFY_RESIZE", "ERROR"] and retry):
instance = self.nova.servers.get(instance.id)
time.sleep(2)
time.sleep(interval)
retry -= 1
new_hostname = getattr(instance, 'OS-EXT-SRV-ATTR:host')
@@ -300,7 +305,8 @@ class NovaHelper:
return True
def live_migrate_instance(self, instance_id, dest_hostname, retry=120):
def live_migrate_instance(self, instance_id, dest_hostname, retry=None,
interval=None):
"""This method does a live migration of a given instance
This method uses the Nova built-in live_migrate()
@@ -313,11 +319,16 @@ class NovaHelper:
:param dest_hostname: the name of the destination compute node, if
destination_node is None, nova scheduler choose
the destination host
:param retry: maximum number of retries before giving up
:param interval: interval in seconds between retries
"""
LOG.debug(
"Trying a live migrate instance %(instance)s ",
{'instance': instance_id})
# Use config defaults if not provided in method parameters
retry = retry or CONF.nova.migration_max_retries
interval = interval or CONF.nova.migration_interval
# Looking for the instance to migrate
instance = self.find_instance(instance_id)
if not instance:
@@ -341,7 +352,7 @@ class NovaHelper:
while (instance.status not in ['ACTIVE', 'ERROR'] and retry):
instance = self.nova.servers.get(instance.id)
LOG.debug('Waiting the migration of %s', instance.id)
time.sleep(1)
time.sleep(interval)
retry -= 1
new_hostname = getattr(instance, 'OS-EXT-SRV-ATTR:host')
@@ -364,7 +375,7 @@ class NovaHelper:
LOG.debug('Waiting the migration of %s to %s',
instance,
getattr(instance, 'OS-EXT-SRV-ATTR:host'))
time.sleep(1)
time.sleep(interval)
retry -= 1
host_name = getattr(instance, 'OS-EXT-SRV-ATTR:host')

View File

@@ -36,6 +36,7 @@ from watcher.conf import keystone_client
from watcher.conf import maas_client
from watcher.conf import models
from watcher.conf import monasca_client
from watcher.conf import nova
from watcher.conf import nova_client
from watcher.conf import paths
from watcher.conf import placement_client
@@ -69,3 +70,4 @@ ironic_client.register_opts(CONF)
collector.register_opts(CONF)
placement_client.register_opts(CONF)
prometheus_client.register_opts(CONF)
nova.register_opts(CONF)

50
watcher/conf/nova.py Normal file
View File

@@ -0,0 +1,50 @@
# Copyright (c) 2025 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
nova = cfg.OptGroup(name='nova',
title='Options for the Nova integration '
'configuration')
NOVA_OPTS = [
cfg.IntOpt('migration_max_retries',
default=180,
min=1,
help='Maximum number of retries for Nova migrations '
'before giving up and considering the operation failed. '
'Default value is 180, which for the default '
'migration_interval (5 seconds), makes a default '
'migration timeout of 900 seconds (15 minutes).'
'This is an upper bound on the maximum expected. This '
'should not be decreased or increased over 3600s '
'(one hour). Shorter values may cause the actions to '
'fail and higher values may hide infrastructure issues.'),
cfg.FloatOpt('migration_interval',
default=5.0,
min=0.1,
help='Interval in seconds to check the status in Nova VM '
'migrations (value is float). Default value is 5.0 '
'seconds.'),
]
def register_opts(conf):
conf.register_group(nova)
conf.register_opts(NOVA_OPTS, group=nova)
def list_opts():
return [(nova, NOVA_OPTS)]

View File

@@ -144,3 +144,15 @@ class TestCase(BaseTestCase):
return os.path.join(root, project_file)
else:
return root
def flags(self, **kw):
"""Override flag variables for a test.
Example:
self.flags(periodic_interval=10,
group='watcher_decision_engine')
"""
group = kw.pop('group', None)
for k, v in kw.items():
CONF.set_override(k, v, group)
self.addCleanup(CONF.clear_override, k, group)

View File

@@ -16,6 +16,8 @@
# limitations under the License.
#
import copy
import fixtures
import time
from unittest import mock
@@ -43,6 +45,8 @@ class TestNovaHelper(base.TestCase):
self.source_node = "ldev-indeedsrv005"
self.destination_node = "ldev-indeedsrv006"
self.flavor_name = "x1"
self.mock_sleep = self.useFixture(
fixtures.MockPatchObject(time, 'sleep')).mock
@staticmethod
def fake_server(*args, **kwargs):
@@ -360,7 +364,7 @@ class TestNovaHelper(base.TestCase):
nova_util.live_migrate_instance(
self.instance_uuid, self.destination_node
)
time.sleep.assert_called_with(1)
time.sleep.assert_called_with(5)
@mock.patch.object(time, 'sleep', mock.Mock())
def test_live_migrate_instance_no_destination_node(
@@ -443,6 +447,229 @@ class TestNovaHelper(base.TestCase):
)
self.assertTrue(is_success)
@mock.patch.object(nova_helper.NovaHelper, 'confirm_resize')
def test_watcher_non_live_migrate_instance_retry_success(
self, mock_confirm_resize, mock_cinder, mock_nova):
"""Test that watcher_non_live_migrate uses config timeout by default"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
server.status = 'MIGRATING' # Never reaches ACTIVE
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
verify_server = copy.deepcopy(server)
verify_server.status = 'VERIFY_RESIZE'
setattr(verify_server, 'OS-EXT-SRV-ATTR:host', self.destination_node)
# This means instance will be found as VERIFY_RESIZE in second retry
nova_util.nova.servers.get.side_effect = (server, server, server,
verify_server)
mock_confirm_resize.return_value = True
self.flags(migration_max_retries=20, migration_interval=4,
group='nova')
# Migration will success as will transition from MIGRATING to
# VERIFY_RESIZE
is_success = nova_util.watcher_non_live_migrate_instance(
self.instance_uuid, self.destination_node
)
# Should succeed
self.assertTrue(is_success)
# It will sleep 2 times because it will be found as VERIFY_RESIZE in
# the second retry
self.assertEqual(2, self.mock_sleep.call_count)
# Verify all sleep calls used 4 second interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 4)
def test_watcher_non_live_migrate_instance_retry_default(
self, mock_cinder, mock_nova):
"""Test that watcher_non_live_migrate uses config timeout by default"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
server.status = 'MIGRATING' # Never reaches ACTIVE
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
# Migration will timeout because status never changes
is_success = nova_util.watcher_non_live_migrate_instance(
self.instance_uuid, self.destination_node
)
# Should fail due to timeout
self.assertFalse(is_success)
# With default migration_max_retries and migration_interval, should
# sleep 180 times for 5 seconds
self.assertEqual(180, self.mock_sleep.call_count)
# Verify sleep calls used 5 second
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 5)
def test_watcher_non_live_migrate_instance_retry_custom(
self, mock_cinder, mock_nova):
"""Test that watcher_non_live_migrate respects explicit retry value"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
server.status = 'MIGRATING' # Never reaches ACTIVE
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
# Set config to a custom values to ensure custom values are used
self.flags(migration_max_retries=10, migration_interval=3,
group='nova')
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
is_success = nova_util.watcher_non_live_migrate_instance(
self.instance_uuid, self.destination_node
)
# Should fail due to timeout
self.assertFalse(is_success)
# It should sleep migration_max_retries times with migration_interval
# seconds
self.assertEqual(10, self.mock_sleep.call_count)
# Verify all sleep calls used migration_interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 3)
def test_live_migrate_instance_retry_default_success(
self, mock_cinder, mock_nova):
"""Test that live_migrate_instance uses config timeout by default"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
setattr(server, 'OS-EXT-STS:task_state', 'migrating')
migrated_server = copy.deepcopy(server)
migrated_server.status = 'ACTIVE'
setattr(migrated_server, 'OS-EXT-SRV-ATTR:host', self.destination_node)
nova_util.nova.servers.get.side_effect = (
server, server, server, migrated_server)
# Migration will success as will transition from migrating to ACTIVE
is_success = nova_util.live_migrate_instance(
self.instance_uuid, self.destination_node
)
# Should succeed
self.assertTrue(is_success)
# It will sleep 2 times because it will be found as ACTIVE in
# the second retry
self.assertEqual(2, self.mock_sleep.call_count)
# Verify all sleep calls used 5 second interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 5)
def test_live_migrate_instance_retry_default(
self, mock_cinder, mock_nova):
"""Test that live_migrate_instance uses config timeout by default"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
setattr(server, 'OS-EXT-STS:task_state', 'migrating')
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
# Migration will timeout because host never changes
is_success = nova_util.live_migrate_instance(
self.instance_uuid, self.destination_node
)
# Should fail due to timeout
self.assertFalse(is_success)
# With default migration_max_retries and migration_interval, should
# sleep 5 seconds 180 times
self.assertEqual(180, self.mock_sleep.call_count)
# Verify all sleep calls used 5 second interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 5)
def test_live_migrate_instance_retry_custom(
self, mock_cinder, mock_nova):
"""Test that live_migrate_instance uses config timeout by default"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
setattr(server, 'OS-EXT-STS:task_state', 'migrating')
# Set config value
self.flags(migration_max_retries=20, migration_interval=1.5,
group='nova')
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
# Migration will timeout because host never changes
is_success = nova_util.live_migrate_instance(
self.instance_uuid, self.destination_node
)
# Should fail due to timeout
self.assertFalse(is_success)
# With migration_max_retries and migration_interval, should sleep 20
# times
self.assertEqual(20, self.mock_sleep.call_count)
# Verify sleep calls used 1.5 second
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 1.5)
def test_live_migrate_instance_no_dest_retry_default(
self, mock_cinder, mock_nova):
"""Test live_migrate with no destination uses config timeout"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
server.status = 'MIGRATING' # Never reaches ACTIVE
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
# Migration with no destination will timeout
is_success = nova_util.live_migrate_instance(
self.instance_uuid, None
)
# Should fail due to timeout
self.assertFalse(is_success)
# With default migration_max_retries and migration_interval, should
# sleep 180 times
self.assertEqual(180, self.mock_sleep.call_count)
# Verify all sleep calls used 5 second interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 5)
def test_live_migrate_instance_no_dest_retry_custom(
self, mock_cinder, mock_nova):
"""Test live_migrate with no destination uses config timeout"""
nova_util = nova_helper.NovaHelper()
server = self.fake_server(self.instance_uuid)
setattr(server, 'OS-EXT-SRV-ATTR:host', self.source_node)
server.status = 'MIGRATING' # Never reaches ACTIVE
# Set config value
self.flags(migration_max_retries=10, migration_interval=3,
group='nova')
self.fake_nova_find_list(nova_util, fake_find=server, fake_list=server)
# Migration with no destination will timeout
is_success = nova_util.live_migrate_instance(
self.instance_uuid, None
)
# Should fail due to timeout
self.assertFalse(is_success)
# With migration_max_retries and migration_interval, should sleep 10
# times for 3 seconds
self.assertEqual(10, self.mock_sleep.call_count)
# Verify sleep calls used 3 second
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 3)
def test_enable_service_nova_compute(self, mock_cinder, mock_nova):
nova_util = nova_helper.NovaHelper()
nova_services = nova_util.nova.services