Merge "Exclude current conductor from offline_conductors" into stable/yoga

This commit is contained in:
Zuul
2022-05-10 18:47:33 +00:00
committed by Gerrit Code Review
4 changed files with 47 additions and 2 deletions

View File

@@ -1604,7 +1604,8 @@ class ConductorManager(base_manager.BaseConductorManager):
:param context: request context.
"""
offline_conductors = self.dbapi.get_offline_conductors()
offline_conductors = utils.exclude_current_conductor(
self.host, self.dbapi.get_offline_conductors())
if not offline_conductors:
return
@@ -3436,7 +3437,8 @@ class ConductorManager(base_manager.BaseConductorManager):
:param context: request context.
"""
offline_conductors = self.dbapi.get_offline_conductors(field='id')
offline_conductors = utils.exclude_current_conductor(
self.conductor.id, self.dbapi.get_offline_conductors(field='id'))
for conductor_id in offline_conductors:
filters = {'state': states.ALLOCATING,
'conductor_affinity': conductor_id}

View File

@@ -1671,3 +1671,24 @@ def update_image_type(context, node):
'image_type',
images.IMAGE_TYPE_WHOLE_DISK if iwdi else images.IMAGE_TYPE_PARTITION)
return True
def exclude_current_conductor(current_conductor, offline_conductors):
    """Return the offline conductors with the current conductor removed.

    The current conductor may appear in the offline list when it has
    failed to refresh its heartbeat timestamp, for example because of a
    failure or resource starvation. When that is the case a warning is
    logged and every matching entry is dropped from the result.

    :param current_conductor: id or hostname of the current conductor
    :param offline_conductors: List of offline conductors.
    :return: New list of offline conductors, excluding the current
        conductor. The input list is not modified.
    """
    listed_as_offline = current_conductor in offline_conductors
    if listed_as_offline:
        # Seeing ourselves in the offline list means our own heartbeat
        # did not reach the database in time; make that visible.
        LOG.warning('Current conductor %s will be excluded from offline '
                    'conductors. Conductor heartbeat has failed to update the '
                    'database timestamp. This is sign of resource starvation.',
                    current_conductor)

    remaining = []
    for conductor in offline_conductors:
        if conductor != current_conductor:
            remaining.append(conductor)
    return remaining

View File

@@ -1921,6 +1921,16 @@ class MiscTestCase(db_base.DbTestCase):
conductor_utils.restore_power_state_if_needed(task, power_state)
self.assertEqual(0, power_action_mock.call_count)
@mock.patch.object(conductor_utils.LOG, 'warning', autospec=True)
def test_exclude_current_conductor(self, mock_log):
    """The current conductor is filtered out and a warning is logged."""
    this_conductor = 'foo'
    reported_offline = ['foo', 'bar']

    remaining = conductor_utils.exclude_current_conductor(
        this_conductor, reported_offline)

    # The patched LOG.warning must have been invoked at least once.
    self.assertTrue(mock_log.called)
    # Other conductors are kept; the current one is gone.
    self.assertIn('bar', remaining)
    self.assertNotIn('foo', remaining)
class ValidateInstanceInfoTraitsTestCase(tests_base.TestCase):

View File

@@ -0,0 +1,12 @@
---
fixes:
- |
Fixes an issue where a conductor would attempt local takeover. In case of
heartbeat failure due to resource starvation, the current conductor was
detected as offline when querying the database. In this scenario the
conductor would forcibly remove reservations of its own and initiate
takeover. Current conductor is now excluded from the list of offline
conductors, so that local takeover does not occur for this case. A warning
is logged to highlight the potential resource starvation issue.
See bug: `2010016 <https://storyboard.openstack.org/#!/story/2010016>`_.