Fix health manager edge case with zombie amphora
There is an edge case where an amphora may have been deleted, and marked deleted in our database, but still be running in nova. One example is an instance stuck in nova "deleting" status. These can still report health heartbeats, but we will see them as failed as they are reporting load balancing configuration that we do not have a record for. This will lead to a failover to repair the amphora, which will re-attempt the delete and reset the amphora health record, which can in turn lead to another failover attempt. This patch will log and leave the amphora as "busy" if a failover is attempted on an amphora we show as "DELETED". This will prevent repeated failover attempts on "DELETED" amphora that are still alive in the compute system. It also fixes a mis-named mock object in a failover test. Change-Id: I3397ffacf8e08964ecd4b47f2353542b6bc57645
This commit is contained in:
parent
df9770a773
commit
1a35d6dc81
|
@ -50,6 +50,7 @@ class ControllerWorker(base_taskflow.BaseTaskFlowEngine):
|
|||
self._l7rule_flows = l7rule_flows.L7RuleFlows()
|
||||
|
||||
self._amphora_repo = repo.AmphoraRepository()
|
||||
self._amphora_health_repo = repo.AmphoraHealthRepository()
|
||||
self._health_mon_repo = repo.HealthMonitorRepository()
|
||||
self._lb_repo = repo.LoadBalancerRepository()
|
||||
self._listener_repo = repo.ListenerRepository()
|
||||
|
@ -656,6 +657,15 @@ class ControllerWorker(base_taskflow.BaseTaskFlowEngine):
|
|||
constants.LOADBALANCER_ID: amp.load_balancer_id,
|
||||
constants.BUILD_TYPE_PRIORITY: priority, }
|
||||
|
||||
if amp.status == constants.DELETED:
|
||||
LOG.warning('Amphora %s is marked DELETED in the database but '
|
||||
'was submitted for failover. Marking it busy in the '
|
||||
'amphora health table to exclude it from health '
|
||||
'checks and skipping the failover.', amp.id)
|
||||
self._amphora_health_repo.update(db_apis.get_session(), amp.id,
|
||||
busy=True)
|
||||
return
|
||||
|
||||
if (CONF.house_keeping.spare_amphora_pool_size == 0) and (
|
||||
CONF.nova.enable_anti_affinity is False):
|
||||
LOG.warning("Failing over amphora with no spares pool may "
|
||||
|
|
|
@ -1108,7 +1108,7 @@ class TestControllerWorker(base.TestCase):
|
|||
@mock.patch('octavia.db.repositories.LoadBalancerRepository.update')
|
||||
def test_failover_amphora(self,
|
||||
mock_update,
|
||||
mock_get_update_listener_flow,
|
||||
mock_get_failover_flow,
|
||||
mock_api_get_session,
|
||||
mock_dyn_log_listener,
|
||||
mock_taskflow_load,
|
||||
|
@ -1140,6 +1140,32 @@ class TestControllerWorker(base.TestCase):
|
|||
mock_update.assert_called_with('TEST', LB_ID,
|
||||
provisioning_status=constants.ACTIVE)
|
||||
|
||||
@mock.patch('octavia.db.repositories.AmphoraHealthRepository.update')
|
||||
def test_failover_deleted_amphora(self,
|
||||
mock_update,
|
||||
mock_api_get_session,
|
||||
mock_dyn_log_listener,
|
||||
mock_taskflow_load,
|
||||
mock_pool_repo_get,
|
||||
mock_member_repo_get,
|
||||
mock_l7rule_repo_get,
|
||||
mock_l7policy_repo_get,
|
||||
mock_listener_repo_get,
|
||||
mock_lb_repo_get,
|
||||
mock_health_mon_repo_get,
|
||||
mock_amp_repo_get):
|
||||
|
||||
mock_taskflow_load.reset_mock()
|
||||
mock_amphora = mock.MagicMock()
|
||||
mock_amphora.id = AMP_ID
|
||||
mock_amphora.status = constants.DELETED
|
||||
|
||||
cw = controller_worker.ControllerWorker()
|
||||
cw._perform_amphora_failover(mock_amphora, 10)
|
||||
|
||||
mock_update.assert_called_with('TEST', AMP_ID, busy=True)
|
||||
mock_taskflow_load.assert_not_called()
|
||||
|
||||
@mock.patch('octavia.controller.worker.'
|
||||
'controller_worker.ControllerWorker._perform_amphora_failover')
|
||||
@mock.patch('octavia.db.repositories.LoadBalancerRepository.update')
|
||||
|
|
Loading…
Reference in New Issue