Support host recovery

This patch adds a host recovery feature to the host monitor plugin.
When recovery of a failed host is detected, it updates the reservable
field of the host to True.

Note that this status change does not trigger healing of degraded
leases. Recovered hosts can be reserved again by sending a create-lease
or update-lease request.

Partially Implements: blueprint resource-monitoring
Change-Id: Ie6767902a9d7dc60b3fa6c525a9a1270578ea49b
This commit is contained in:
Hiroaki Kobayashi 2017-12-14 13:25:05 +09:00
parent 2d55a09675
commit 9fa6a13524
2 changed files with 103 additions and 19 deletions

View File

@ -663,6 +663,15 @@ class PhysicalHostMonitorPlugin(base.BaseMonitorPlugin,
LOG.warn('%s failed.',
failed_hosts[0]['hypervisor_hostname'])
reservation_flags = self._handle_failures(failed_hosts)
else:
recovered_hosts = db_api.host_get_all_by_queries(
['reservable == 0',
'hypervisor_hostname == ' + data['host']])
if recovered_hosts:
db_api.host_update(recovered_hosts[0]['id'],
{'reservable': True})
LOG.warn('%s recovered.',
recovered_hosts[0]['hypervisor_hostname'])
return reservation_flags
@ -684,32 +693,44 @@ class PhysicalHostMonitorPlugin(base.BaseMonitorPlugin,
LOG.trace('Poll...')
reservation_flags = {}
failed_hosts = self._poll_resource_failures()
failed_hosts, recovered_hosts = self._poll_resource_failures()
if failed_hosts:
for host in failed_hosts:
LOG.warn('%s failed.', host['hypervisor_hostname'])
reservation_flags = self._handle_failures(failed_hosts)
if recovered_hosts:
for host in recovered_hosts:
db_api.host_update(host['id'], {'reservable': True})
LOG.warn('%s recovered.', host['hypervisor_hostname'])
return reservation_flags
def _poll_resource_failures(self):
    """Check health of hosts by calling Nova Hypervisors API.

    Hosts are loaded from the database and split by their ``reservable``
    flag. A reservable host whose hypervisor is reported as ``down`` or
    ``disabled`` is treated as failed; an unreservable host whose
    hypervisor is reported as ``up`` and ``enabled`` is treated as
    recovered.

    If the hypervisor listing raises, the error is logged and whatever
    has been computed so far (empty lists by default) is returned, so the
    caller can always unpack two lists.

    :return: a list of failed hosts, a list of recovered hosts.
    """
    hosts = db_api.host_get_all_by_filters({})
    reservable_hosts = [h for h in hosts if h['reservable'] is True]
    unreservable_hosts = [h for h in hosts if h['reservable'] is False]

    # Bind the results up front: if the Nova call below raises, the
    # return statement must still have something to return.
    failed_hosts = []
    recovered_hosts = []

    try:
        hvs = self.nova.hypervisors.list()

        # Hypervisor IDs are compared as strings against host['id'].
        failed_hv_ids = {str(hv.id) for hv in hvs
                         if hv.state == 'down' or hv.status == 'disabled'}
        active_hv_ids = {str(hv.id) for hv in hvs
                         if hv.state == 'up' and hv.status == 'enabled'}

        failed_hosts = [host for host in reservable_hosts
                        if host['id'] in failed_hv_ids]
        recovered_hosts = [host for host in unreservable_hosts
                           if host['id'] in active_hv_ids]
    except Exception as e:
        # No single host is in scope here (the check covers all hosts at
        # once), so the message must not reference a per-host variable.
        LOG.exception('Skipping health check. %s', str(e))

    return failed_hosts, recovered_hosts
def _handle_failures(self, failed_hosts):
"""Handle resource failures.

View File

@ -1702,6 +1702,8 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
result = self.host_monitor_plugin.notification_callback(event_type,
payload)
host_get_all.assert_called_once_with(
['hypervisor_hostname == ' + payload['nova_object.data']['host']])
self.assertEqual({'rsrv-1': {'missing_resources': True}}, result)
def test_notification_callback_no_failure(self):
@ -1724,14 +1726,53 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
'uuid': 'fa69c544-906b-4a6a-a9c6-c1f7a8078c73'
}
}
host_get_all = self.patch(db_api,
'reservable_host_get_all_by_queries')
host_get_all = self.patch(db_api, 'host_get_all_by_queries')
host_get_all.return_value = []
handle_failures = self.patch(self.host_monitor_plugin,
'_handle_failures')
result = self.host_monitor_plugin.notification_callback(event_type,
payload)
host_get_all.assert_not_called()
host_get_all.assert_called_once_with(
['reservable == 0',
'hypervisor_hostname == ' + payload['nova_object.data']['host']])
handle_failures.assert_not_called()
self.assertEqual({}, result)
def test_notification_callback_recover(self):
    """Verify that notification_callback re-enables a recovered host.

    ``host_get_all_by_queries`` is patched to return a single host, so
    the callback is expected to query for an unreservable host with the
    notified hostname, set that host's ``reservable`` field to True,
    leave ``_handle_failures`` uncalled and return an empty dict.
    """
    plugin = self.host_monitor_plugin
    recovered_host = {'hypervisor_hostname': 'compute-1', 'id': 1}

    # Body of the service status notification used to drive the callback.
    service_data = {
        'host': 'compute-1',
        'disabled': False,
        'last_seen_up': '2012-10-29T13:42:05Z',
        'binary': 'nova-compute',
        'topic': 'compute',
        'disabled_reason': None,
        'report_count': 1,
        'forced_down': False,
        'version': 22,
        'availability_zone': None,
        'uuid': 'fa69c544-906b-4a6a-a9c6-c1f7a8078c73'
    }
    payload = {
        'nova_object.namespace': 'nova',
        'nova_object.name': 'ServiceStatusPayload',
        'nova_object.version': '1.1',
        'nova_object.data': service_data
    }
    event_type = 'service.update'

    host_update = self.patch(db_api, 'host_update')
    handle_failures = self.patch(plugin, '_handle_failures')
    host_get_all = self.patch(db_api, 'host_get_all_by_queries')
    host_get_all.return_value = [recovered_host]

    result = plugin.notification_callback(event_type, payload)

    expected_queries = ['reservable == 0',
                        'hypervisor_hostname == ' + service_data['host']]
    host_get_all.assert_called_once_with(expected_queries)
    host_update.assert_called_once_with(recovered_host['id'],
                                        {'reservable': True})
    handle_failures.assert_not_called()
    self.assertEqual({}, result)
@ -1739,14 +1780,14 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
hosts = [
{'id': '1',
'hypervisor_hostname': 'compute-1',
'trust_id': 'trust-1'},
'reservable': True},
{'id': '2',
'hypervisor_hostname': 'compute-2',
'trust_id': 'trust-2'},
'reservable': True},
]
host_get_all = self.patch(db_api,
'reservable_host_get_all_by_queries')
'host_get_all_by_filters')
host_get_all.return_value = hosts
hypervisors_list = self.patch(
self.host_monitor_plugin.nova.hypervisors, 'list')
@ -1755,20 +1796,20 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
mock.MagicMock(id=2, state='down', status='enabled')]
result = self.host_monitor_plugin._poll_resource_failures()
self.assertEqual(hosts, result)
self.assertEqual((hosts, []), result)
def test_poll_resource_failures_status_disabled(self):
hosts = [
{'id': '1',
'hypervisor_hostname': 'compute-1',
'trust_id': 'trust-1'},
'reservable': True},
{'id': '2',
'hypervisor_hostname': 'compute-2',
'trust_id': 'trust-2'},
'reservable': True},
]
host_get_all = self.patch(db_api,
'reservable_host_get_all_by_queries')
'host_get_all_by_filters')
host_get_all.return_value = hosts
hypervisors_list = self.patch(
self.host_monitor_plugin.nova.hypervisors, 'list')
@ -1777,20 +1818,20 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
mock.MagicMock(id=2, state='up', status='disabled')]
result = self.host_monitor_plugin._poll_resource_failures()
self.assertEqual(hosts, result)
self.assertEqual((hosts, []), result)
def test_poll_resource_failures_nothing(self):
hosts = [
{'id': '1',
'hypervisor_hostname': 'compute-1',
'trust_id': 'trust-1'},
'reservable': True},
{'id': '2',
'hypervisor_hostname': 'compute-2',
'trust_id': 'trust-2'},
'reservable': True},
]
host_get_all = self.patch(db_api,
'reservable_host_get_all_by_queries')
'host_get_all_by_filters')
host_get_all.return_value = hosts
hypervisors_list = self.patch(
self.host_monitor_plugin.nova.hypervisors, 'list')
@ -1799,7 +1840,29 @@ class PhysicalHostMonitorPluginTestCase(tests.TestCase):
mock.MagicMock(id=2, state='up', status='enabled')]
result = self.host_monitor_plugin._poll_resource_failures()
self.assertEqual([], result)
self.assertEqual(([], []), result)
def test_poll_resource_failures_recover(self):
    """Verify that unreservable hosts with healthy hypervisors recover.

    Both hosts returned by the patched ``host_get_all_by_filters`` are
    unreservable, and both patched hypervisors are ``up`` and
    ``enabled``, so the poll is expected to report no failed hosts and
    both hosts as recovered.
    """
    plugin = self.host_monitor_plugin
    unreservable_hosts = [
        {'id': '1', 'hypervisor_hostname': 'compute-1',
         'reservable': False},
        {'id': '2', 'hypervisor_hostname': 'compute-2',
         'reservable': False},
    ]
    healthy_hypervisors = [
        mock.MagicMock(id=1, state='up', status='enabled'),
        mock.MagicMock(id=2, state='up', status='enabled')]

    self.patch(
        db_api, 'host_get_all_by_filters').return_value = unreservable_hosts
    self.patch(
        plugin.nova.hypervisors, 'list').return_value = healthy_hypervisors

    result = plugin._poll_resource_failures()

    self.assertEqual(([], unreservable_hosts), result)
def test_handle_failures(self):
hosts = [