Fix health manager edge case with zombie amphora

There is an edge case where an amphora may have been deleted, marked deleted in
our database, but still be running in nova. One example is an instance stuck
in nova "deleting" status. These can still report health heartbeats, but
we will see them as failed as they are reporting load balancing configuration
that we do not have a record for. This will lead to a failover to repair the
amphora, which will re-attempt the delete and reset the amphora health record,
which in turn can lead to another failover attempt.

This patch will log and leave the amphora as "busy" if a failover is attempted
on an amphora we show as "DELETED". This will prevent repeated failover
attempts on "DELETED" amphora that are still alive in the compute system.

It also fixes a mis-named mock object in a failover test.

Change-Id: I3397ffacf8e08964ecd4b47f2353542b6bc57645
This commit is contained in:
Michael Johnson 2018-03-26 16:40:00 -07:00 committed by German Eichberger
parent df9770a773
commit 1a35d6dc81
2 changed files with 37 additions and 1 deletions

View File

@ -50,6 +50,7 @@ class ControllerWorker(base_taskflow.BaseTaskFlowEngine):
self._l7rule_flows = l7rule_flows.L7RuleFlows()
self._amphora_repo = repo.AmphoraRepository()
self._amphora_health_repo = repo.AmphoraHealthRepository()
self._health_mon_repo = repo.HealthMonitorRepository()
self._lb_repo = repo.LoadBalancerRepository()
self._listener_repo = repo.ListenerRepository()
@ -656,6 +657,15 @@ class ControllerWorker(base_taskflow.BaseTaskFlowEngine):
constants.LOADBALANCER_ID: amp.load_balancer_id,
constants.BUILD_TYPE_PRIORITY: priority, }
if amp.status == constants.DELETED:
LOG.warning('Amphora %s is marked DELETED in the database but '
'was submitted for failover. Marking it busy in the '
'amphora health table to exclude it from health '
'checks and skipping the failover.', amp.id)
self._amphora_health_repo.update(db_apis.get_session(), amp.id,
busy=True)
return
if (CONF.house_keeping.spare_amphora_pool_size == 0) and (
CONF.nova.enable_anti_affinity is False):
LOG.warning("Failing over amphora with no spares pool may "

View File

@ -1108,7 +1108,7 @@ class TestControllerWorker(base.TestCase):
@mock.patch('octavia.db.repositories.LoadBalancerRepository.update')
def test_failover_amphora(self,
mock_update,
mock_get_update_listener_flow,
mock_get_failover_flow,
mock_api_get_session,
mock_dyn_log_listener,
mock_taskflow_load,
@ -1140,6 +1140,32 @@ class TestControllerWorker(base.TestCase):
mock_update.assert_called_with('TEST', LB_ID,
provisioning_status=constants.ACTIVE)
@mock.patch('octavia.db.repositories.AmphoraHealthRepository.update')
def test_failover_deleted_amphora(self,
                                  mock_update,
                                  mock_api_get_session,
                                  mock_dyn_log_listener,
                                  mock_taskflow_load,
                                  mock_pool_repo_get,
                                  mock_member_repo_get,
                                  mock_l7rule_repo_get,
                                  mock_l7policy_repo_get,
                                  mock_listener_repo_get,
                                  mock_lb_repo_get,
                                  mock_health_mon_repo_get,
                                  mock_amp_repo_get):
    """Failover of an amphora whose status is DELETED must be skipped.

    Verifies that _perform_amphora_failover, when handed an amphora with
    status constants.DELETED, updates the amphora health record with
    busy=True and never loads a taskflow.
    """
    # Start from a clean call history so the final assert_not_called()
    # only reflects what this test triggers.
    mock_taskflow_load.reset_mock()

    # Stand-in amphora carrying only the attributes set by this test.
    deleted_amp = mock.MagicMock(id=AMP_ID, status=constants.DELETED)

    worker = controller_worker.ControllerWorker()
    worker._perform_amphora_failover(deleted_amp, 10)

    # 'TEST' is presumably the value returned by the patched session
    # getter supplied by the enclosing test class -- confirm there.
    mock_update.assert_called_with('TEST', AMP_ID, busy=True)
    mock_taskflow_load.assert_not_called()
@mock.patch('octavia.controller.worker.'
'controller_worker.ControllerWorker._perform_amphora_failover')
@mock.patch('octavia.db.repositories.LoadBalancerRepository.update')