Support host recovery
This patch adds a host recovery feature to the host monitor plugin. When the plugin detects that a failed host has recovered, it sets the reservable field of that host back to True. Note that this status change does not trigger healing of degraded leases. Recovered hosts can be reserved again by issuing a Create or Update lease request. Partially Implements: blueprint resource-monitoring. Change-Id: Ie6767902a9d7dc60b3fa6c525a9a1270578ea49b
This commit is contained in:
parent
2d55a09675
commit
9fa6a13524
|
@ -663,6 +663,15 @@ class PhysicalHostMonitorPlugin(base.BaseMonitorPlugin,
|
|||
LOG.warn('%s failed.',
|
||||
failed_hosts[0]['hypervisor_hostname'])
|
||||
reservation_flags = self._handle_failures(failed_hosts)
|
||||
else:
|
||||
recovered_hosts = db_api.host_get_all_by_queries(
|
||||
['reservable == 0',
|
||||
'hypervisor_hostname == ' + data['host']])
|
||||
if recovered_hosts:
|
||||
db_api.host_update(recovered_hosts[0]['id'],
|
||||
{'reservable': True})
|
||||
LOG.warn('%s recovered.',
|
||||
recovered_hosts[0]['hypervisor_hostname'])
|
||||
|
||||
return reservation_flags
|
||||
|
||||
|
@ -684,32 +693,44 @@ class PhysicalHostMonitorPlugin(base.BaseMonitorPlugin,
|
|||
LOG.trace('Poll...')
|
||||
reservation_flags = {}
|
||||
|
||||
failed_hosts = self._poll_resource_failures()
|
||||
failed_hosts, recovered_hosts = self._poll_resource_failures()
|
||||
if failed_hosts:
|
||||
for host in failed_hosts:
|
||||
LOG.warn('%s failed.', host['hypervisor_hostname'])
|
||||
reservation_flags = self._handle_failures(failed_hosts)
|
||||
if recovered_hosts:
|
||||
for host in recovered_hosts:
|
||||
db_api.host_update(host['id'], {'reservable': True})
|
||||
LOG.warn('%s recovered.', host['hypervisor_hostname'])
|
||||
|
||||
return reservation_flags
|
||||
|
||||
def _poll_resource_failures(self):
    """Check health of hosts by calling Nova Hypervisors API.

    Hosts stored in the database are split by their ``reservable`` flag
    and compared against the hypervisor list reported by Nova:

    * a reservable host whose hypervisor is ``down`` or ``disabled`` is
      reported as failed;
    * an unreservable host whose hypervisor is ``up`` and ``enabled`` is
      reported as recovered.

    If the health check itself raises, the error is logged and two empty
    lists are returned so that this polling round is skipped.

    :return: a list of failed hosts, a list of recovered hosts.
    """
    hosts = db_api.host_get_all_by_filters({})
    reservable_hosts = [h for h in hosts if h['reservable'] is True]
    unreservable_hosts = [h for h in hosts if h['reservable'] is False]

    # Bind the results up front: if the Nova call below raises, the return
    # statement must still have something well-defined to hand back.
    failed_hosts = []
    recovered_hosts = []

    try:
        hvs = self.nova.hypervisors.list()

        # Hypervisor ids are compared as strings against host['id'].
        failed_hv_ids = {str(hv.id) for hv in hvs
                         if hv.state == 'down' or hv.status == 'disabled'}
        active_hv_ids = {str(hv.id) for hv in hvs
                         if hv.state == 'up' and hv.status == 'enabled'}

        failed_hosts = [host for host in reservable_hosts
                        if host['id'] in failed_hv_ids]
        recovered_hosts = [host for host in unreservable_hosts
                           if host['id'] in active_hv_ids]
    except Exception as e:
        # No single host is in scope here (the failure concerns the whole
        # check), so the message must not reference a per-host variable.
        LOG.exception('Skipping health check. %s', str(e))

    return failed_hosts, recovered_hosts
|
||||
|
||||
def _handle_failures(self, failed_hosts):
|
||||
"""Handle resource failures.
|
||||
|
|
|
@ -1702,6 +1702,8 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
|
|||
|
||||
result = self.host_monitor_plugin.notification_callback(event_type,
|
||||
payload)
|
||||
host_get_all.assert_called_once_with(
|
||||
['hypervisor_hostname == ' + payload['nova_object.data']['host']])
|
||||
self.assertEqual({'rsrv-1': {'missing_resources': True}}, result)
|
||||
|
||||
def test_notification_callback_no_failure(self):
|
||||
|
@ -1724,14 +1726,53 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
|
|||
'uuid': 'fa69c544-906b-4a6a-a9c6-c1f7a8078c73'
|
||||
}
|
||||
}
|
||||
host_get_all = self.patch(db_api,
|
||||
'reservable_host_get_all_by_queries')
|
||||
host_get_all = self.patch(db_api, 'host_get_all_by_queries')
|
||||
host_get_all.return_value = []
|
||||
handle_failures = self.patch(self.host_monitor_plugin,
|
||||
'_handle_failures')
|
||||
|
||||
result = self.host_monitor_plugin.notification_callback(event_type,
|
||||
payload)
|
||||
host_get_all.assert_not_called()
|
||||
host_get_all.assert_called_once_with(
|
||||
['reservable == 0',
|
||||
'hypervisor_hostname == ' + payload['nova_object.data']['host']])
|
||||
handle_failures.assert_not_called()
|
||||
self.assertEqual({}, result)
|
||||
|
||||
def test_notification_callback_recover(self):
    """Check that a healthy service notification re-enables a host.

    The notification describes a service that is neither disabled nor
    forced down, and the database lookup is stubbed to return one
    unreservable host. The callback is expected to query for that host,
    flip its ``reservable`` field to True, leave ``_handle_failures``
    untouched and return empty reservation flags.
    """
    host = {'hypervisor_hostname': 'compute-1', 'id': 1}
    service_data = {
        'host': 'compute-1',
        'disabled': False,
        'last_seen_up': '2012-10-29T13:42:05Z',
        'binary': 'nova-compute',
        'topic': 'compute',
        'disabled_reason': None,
        'report_count': 1,
        'forced_down': False,
        'version': 22,
        'availability_zone': None,
        'uuid': 'fa69c544-906b-4a6a-a9c6-c1f7a8078c73'
    }
    notification = {
        'nova_object.namespace': 'nova',
        'nova_object.name': 'ServiceStatusPayload',
        'nova_object.version': '1.1',
        'nova_object.data': service_data
    }
    expected_queries = [
        'reservable == 0',
        'hypervisor_hostname == ' + service_data['host']
    ]

    # Stub out every collaborator the callback may touch.
    query_hosts = self.patch(db_api, 'host_get_all_by_queries')
    query_hosts.return_value = [host]
    failure_handler = self.patch(self.host_monitor_plugin,
                                 '_handle_failures')
    update_host = self.patch(db_api, 'host_update')

    flags = self.host_monitor_plugin.notification_callback(
        'service.update', notification)

    query_hosts.assert_called_once_with(expected_queries)
    update_host.assert_called_once_with(host['id'], {'reservable': True})
    failure_handler.assert_not_called()
    self.assertEqual({}, flags)
|
||||
|
||||
|
@ -1739,14 +1780,14 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
|
|||
hosts = [
|
||||
{'id': '1',
|
||||
'hypervisor_hostname': 'compute-1',
|
||||
'trust_id': 'trust-1'},
|
||||
'reservable': True},
|
||||
{'id': '2',
|
||||
'hypervisor_hostname': 'compute-2',
|
||||
'trust_id': 'trust-2'},
|
||||
'reservable': True},
|
||||
]
|
||||
|
||||
host_get_all = self.patch(db_api,
|
||||
'reservable_host_get_all_by_queries')
|
||||
'host_get_all_by_filters')
|
||||
host_get_all.return_value = hosts
|
||||
hypervisors_list = self.patch(
|
||||
self.host_monitor_plugin.nova.hypervisors, 'list')
|
||||
|
@ -1755,20 +1796,20 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
|
|||
mock.MagicMock(id=2, state='down', status='enabled')]
|
||||
|
||||
result = self.host_monitor_plugin._poll_resource_failures()
|
||||
self.assertEqual(hosts, result)
|
||||
self.assertEqual((hosts, []), result)
|
||||
|
||||
def test_poll_resource_failures_status_disabled(self):
|
||||
hosts = [
|
||||
{'id': '1',
|
||||
'hypervisor_hostname': 'compute-1',
|
||||
'trust_id': 'trust-1'},
|
||||
'reservable': True},
|
||||
{'id': '2',
|
||||
'hypervisor_hostname': 'compute-2',
|
||||
'trust_id': 'trust-2'},
|
||||
'reservable': True},
|
||||
]
|
||||
|
||||
host_get_all = self.patch(db_api,
|
||||
'reservable_host_get_all_by_queries')
|
||||
'host_get_all_by_filters')
|
||||
host_get_all.return_value = hosts
|
||||
hypervisors_list = self.patch(
|
||||
self.host_monitor_plugin.nova.hypervisors, 'list')
|
||||
|
@ -1777,20 +1818,20 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
|
|||
mock.MagicMock(id=2, state='up', status='disabled')]
|
||||
|
||||
result = self.host_monitor_plugin._poll_resource_failures()
|
||||
self.assertEqual(hosts, result)
|
||||
self.assertEqual((hosts, []), result)
|
||||
|
||||
def test_poll_resource_failures_nothing(self):
|
||||
hosts = [
|
||||
{'id': '1',
|
||||
'hypervisor_hostname': 'compute-1',
|
||||
'trust_id': 'trust-1'},
|
||||
'reservable': True},
|
||||
{'id': '2',
|
||||
'hypervisor_hostname': 'compute-2',
|
||||
'trust_id': 'trust-2'},
|
||||
'reservable': True},
|
||||
]
|
||||
|
||||
host_get_all = self.patch(db_api,
|
||||
'reservable_host_get_all_by_queries')
|
||||
'host_get_all_by_filters')
|
||||
host_get_all.return_value = hosts
|
||||
hypervisors_list = self.patch(
|
||||
self.host_monitor_plugin.nova.hypervisors, 'list')
|
||||
|
@ -1799,7 +1840,29 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
|
|||
mock.MagicMock(id=2, state='up', status='enabled')]
|
||||
|
||||
result = self.host_monitor_plugin._poll_resource_failures()
|
||||
self.assertEqual([], result)
|
||||
self.assertEqual(([], []), result)
|
||||
|
||||
def test_poll_resource_failures_recover(self):
    """Check that unreservable hosts with healthy hypervisors recover.

    Both stored hosts are flagged unreservable while Nova reports the
    matching hypervisors as up and enabled, so polling should report no
    failed hosts and both hosts as recovered.
    """
    db_hosts = [
        {'id': '1', 'hypervisor_hostname': 'compute-1', 'reservable': False},
        {'id': '2', 'hypervisor_hostname': 'compute-2', 'reservable': False},
    ]
    stored_hosts = self.patch(db_api, 'host_get_all_by_filters')
    stored_hosts.return_value = db_hosts

    nova_list = self.patch(
        self.host_monitor_plugin.nova.hypervisors, 'list')
    # Hypervisor ids are ints here; the plugin stringifies them before
    # matching against the string host ids above.
    nova_list.return_value = [
        mock.MagicMock(id=hv_id, state='up', status='enabled')
        for hv_id in (1, 2)
    ]

    outcome = self.host_monitor_plugin._poll_resource_failures()

    self.assertEqual(([], db_hosts), outcome)
|
||||
|
||||
def test_handle_failures(self):
|
||||
hosts = [
|
||||
|
|
Loading…
Reference in New Issue