From 66f3a63f442bb790dea5c529aa56a462d9515363 Mon Sep 17 00:00:00 2001 From: Michael Johnson Date: Thu, 10 Sep 2020 14:54:09 -0700 Subject: [PATCH] Fix amphora failover when VRRP port is missing In an amphora failover situation, if the VRRP port of the other amphora in an active/standby pair is missing from neutron, the amphora failover may fail with a "port not found" error. This patch corrects this issue by allowing the amphora failover in progress to complete even if the other amphora is also in a failed state. Story: 2008127 Task: 40851 Change-Id: I806d220236ad741b638ad9315537334f2d923031 --- .../worker/v1/tasks/amphora_driver_tasks.py | 34 ++++++++++------ .../worker/v2/tasks/amphora_driver_tasks.py | 37 ++++++++++-------- .../drivers/neutron/allowed_address_pairs.py | 9 ++++- .../v1/tasks/test_amphora_driver_tasks.py | 26 +++++++++++-- .../v2/tasks/test_amphora_driver_tasks.py | 26 +++++++++++-- .../neutron/test_allowed_address_pairs.py | 39 ++++++++++++++++++- ...er-missing-vrrp-port-9b5f13b9951b7edb.yaml | 5 +++ 7 files changed, 136 insertions(+), 40 deletions(-) create mode 100644 releasenotes/notes/fix-amp-failover-missing-vrrp-port-9b5f13b9951b7edb.yaml diff --git a/octavia/controller/worker/v1/tasks/amphora_driver_tasks.py b/octavia/controller/worker/v1/tasks/amphora_driver_tasks.py index 5ca16c96b2..bad1ffa250 100644 --- a/octavia/controller/worker/v1/tasks/amphora_driver_tasks.py +++ b/octavia/controller/worker/v1/tasks/amphora_driver_tasks.py @@ -126,15 +126,16 @@ class AmphoraIndexListenersReload(BaseAmphoraTask): timeout_dict=None): """Execute listener reload routines for listeners on an amphora.""" if loadbalancer.listeners: - self.amphora_driver.reload( - loadbalancer, amphorae[amphora_index], timeout_dict) - - def revert(self, loadbalancer, *args, **kwargs): - """Handle failed listeners reloads.""" - - LOG.warning("Reverting listener reload.") - for listener in loadbalancer.listeners: - self.task_utils.mark_listener_prov_status_error(listener.id) + try: + self.amphora_driver.reload( + loadbalancer, amphorae[amphora_index], timeout_dict) + except Exception as e: + amphora_id = amphorae[amphora_index].id + LOG.warning('Failed to reload listeners on amphora %s. ' + 'Skipping this amphora as it is failing to ' + 'reload due to: %s', amphora_id, str(e)) + self.amphora_repo.update(db_apis.get_session(), amphora_id, + status=constants.ERROR) class ListenerDelete(BaseAmphoraTask): @@ -372,7 +373,7 @@ class AmphoraIndexVRRPUpdate(BaseAmphoraTask): 'to: %s', amphora_id, str(e)) self.amphora_repo.update(db_apis.get_session(), amphora_id, status=constants.ERROR) - + return LOG.debug("Uploaded VRRP configuration of amphora %s.", amphora_id) @@ -394,8 +395,17 @@ class AmphoraIndexVRRPStart(BaseAmphoraTask): """ def execute(self, amphora_index, amphorae, timeout_dict=None): - self.amphora_driver.start_vrrp_service(amphorae[amphora_index], - timeout_dict) + amphora_id = amphorae[amphora_index].id + try: + self.amphora_driver.start_vrrp_service(amphorae[amphora_index], + timeout_dict) + except Exception as e: + LOG.error('Failed to start VRRP on amphora %s. 
' + 'Skipping this amphora as it is failing to start due ' + 'to: %s', amphora_id, str(e)) + self.amphora_repo.update(db_apis.get_session(), amphora_id, + status=constants.ERROR) + return LOG.debug("Started VRRP on amphora %s.", amphorae[amphora_index].id) diff --git a/octavia/controller/worker/v2/tasks/amphora_driver_tasks.py b/octavia/controller/worker/v2/tasks/amphora_driver_tasks.py index f80cc6d13e..cdebfbc7ec 100644 --- a/octavia/controller/worker/v2/tasks/amphora_driver_tasks.py +++ b/octavia/controller/worker/v2/tasks/amphora_driver_tasks.py @@ -188,17 +188,15 @@ class AmphoraIndexListenersReload(BaseAmphoraTask): db_apis.get_session(), id=loadbalancer[constants.LOADBALANCER_ID]) if db_lb.listeners: - self.amphora_driver.reload(db_lb, db_amp, timeout_dict) - - def revert(self, loadbalancer, *args, **kwargs): - """Handle failed listeners reloads.""" - - LOG.warning("Reverting listener reload.") - db_lb = self.loadbalancer_repo.get( - db_apis.get_session(), - id=loadbalancer[constants.LOADBALANCER_ID]) - for listener in db_lb.listeners: - self.task_utils.mark_listener_prov_status_error(listener.id) + try: + self.amphora_driver.reload(db_lb, db_amp, timeout_dict) + except Exception as e: + amphora_id = amphorae[amphora_index].id + LOG.warning('Failed to reload listeners on amphora %s. ' + 'Skipping this amphora as it is failing to ' + 'reload due to: %s', amphora_id, str(e)) + self.amphora_repo.update(db_apis.get_session(), amphora_id, + status=constants.ERROR) class ListenerDelete(BaseAmphoraTask): @@ -494,7 +492,7 @@ class AmphoraIndexVRRPUpdate(BaseAmphoraTask): 'to: %s', amphora_id, str(e)) self.amphora_repo.update(db_apis.get_session(), amphora_id, status=constants.ERROR) - + return LOG.debug("Uploaded VRRP configuration of amphora %s.", amphora_id) @@ -522,10 +520,17 @@ class AmphoraIndexVRRPStart(BaseAmphoraTask): def execute(self, amphora_index, amphorae, timeout_dict=None): # TODO(johnsom) Optimize this to use the dicts and not need the # DB lookups - db_amp = self.amphora_repo.get( - db_apis.get_session(), id=amphorae[amphora_index][constants.ID]) - - self.amphora_driver.start_vrrp_service(db_amp, timeout_dict) + amphora_id = amphorae[amphora_index][constants.ID] + db_amp = self.amphora_repo.get(db_apis.get_session(), id=amphora_id) + try: + self.amphora_driver.start_vrrp_service(db_amp, timeout_dict) + except Exception as e: + LOG.error('Failed to start VRRP on amphora %s. ' + 'Skipping this amphora as it is failing to start due ' + 'to: %s', amphora_id, str(e)) + self.amphora_repo.update(db_apis.get_session(), amphora_id, + status=constants.ERROR) + return LOG.debug("Started VRRP on amphora %s.", amphorae[amphora_index][constants.ID]) diff --git a/octavia/network/drivers/neutron/allowed_address_pairs.py b/octavia/network/drivers/neutron/allowed_address_pairs.py index 0199816348..97320decfe 100644 --- a/octavia/network/drivers/neutron/allowed_address_pairs.py +++ b/octavia/network/drivers/neutron/allowed_address_pairs.py @@ -712,8 +712,13 @@ class AllowedAddressPairsDriver(neutron_base.BaseNeutronDriver): vip_subnet, vip_port) else: for amp in loadbalancer.amphorae: - self._get_amp_net_configs(amp, amp_configs, - vip_subnet, vip_port) + try: + self._get_amp_net_configs(amp, amp_configs, + vip_subnet, vip_port) + except Exception as e: + LOG.warning('Getting network configurations for amphora ' + '%(amp)s failed due to %(err)s.', + {'amp': amp.id, 'err': str(e)}) return amp_configs # TODO(johnsom) This may be dead code now. 
Remove in failover for v2 patch diff --git a/octavia/tests/unit/controller/worker/v1/tasks/test_amphora_driver_tasks.py b/octavia/tests/unit/controller/worker/v1/tasks/test_amphora_driver_tasks.py index c614eb5921..c1ded1ae5a 100644 --- a/octavia/tests/unit/controller/worker/v1/tasks/test_amphora_driver_tasks.py +++ b/octavia/tests/unit/controller/worker/v1/tasks/test_amphora_driver_tasks.py @@ -200,6 +200,7 @@ class TestAmphoraDriverTasks(base.TestCase): mock_lb = mock.MagicMock() mock_listener = mock.MagicMock() mock_listener.id = '12345' + mock_driver.reload.side_effect = [mock.DEFAULT, Exception('boom')] # Test no listeners mock_lb.listeners = None @@ -213,10 +214,15 @@ class TestAmphoraDriverTasks(base.TestCase): timeout_dict=self.timeout_dict) mock_driver.reload.assert_called_once_with(mock_lb, amphora_mock, self.timeout_dict) - # Test revert - mock_lb.listeners = [mock_listener] - listeners_reload_obj.revert(mock_lb) - mock_prov_status_error.assert_called_once_with('12345') + + # Test with reload exception + mock_driver.reload.reset_mock() + listeners_reload_obj.execute(mock_lb, 0, [amphora_mock], + timeout_dict=self.timeout_dict) + mock_driver.reload.assert_called_once_with(mock_lb, amphora_mock, + self.timeout_dict) + mock_amphora_repo_update.assert_called_once_with( + _session_mock, amphora_mock.id, status=constants.ERROR) @mock.patch('octavia.controller.worker.task_utils.TaskUtils.' 'mark_listener_prov_status_error') @@ -717,11 +723,23 @@ class TestAmphoraDriverTasks(base.TestCase): mock_amphora_repo_update): amphora_vrrp_start_obj = ( amphora_driver_tasks.AmphoraIndexVRRPStart()) + mock_driver.start_vrrp_service.side_effect = [mock.DEFAULT, + Exception('boom')] + amphora_vrrp_start_obj.execute(0, [_amphora_mock], timeout_dict=self.timeout_dict) mock_driver.start_vrrp_service.assert_called_once_with( _amphora_mock, self.timeout_dict) + # Test with a start exception + mock_driver.start_vrrp_service.reset_mock() + amphora_vrrp_start_obj.execute(0, [_amphora_mock], + timeout_dict=self.timeout_dict) + mock_driver.start_vrrp_service.assert_called_once_with( + _amphora_mock, self.timeout_dict) + mock_amphora_repo_update.assert_called_once_with( + _session_mock, _amphora_mock.id, status=constants.ERROR) + def test_amphora_compute_connectivity_wait(self, mock_driver, mock_generate_uuid, diff --git a/octavia/tests/unit/controller/worker/v2/tasks/test_amphora_driver_tasks.py b/octavia/tests/unit/controller/worker/v2/tasks/test_amphora_driver_tasks.py index 058e7fc4b2..e1b6c1ac16 100644 --- a/octavia/tests/unit/controller/worker/v2/tasks/test_amphora_driver_tasks.py +++ b/octavia/tests/unit/controller/worker/v2/tasks/test_amphora_driver_tasks.py @@ -199,6 +199,7 @@ class TestAmphoraDriverTasks(base.TestCase): mock_listener.id = '12345' mock_amphora_repo_get.return_value = amphora_mock mock_lb_repo_get.return_value = mock_lb + mock_driver.reload.side_effect = [mock.DEFAULT, Exception('boom')] # Test no listeners mock_lb.listeners = None @@ -212,10 +213,15 @@ class TestAmphoraDriverTasks(base.TestCase): timeout_dict=self.timeout_dict) mock_driver.reload.assert_called_once_with(mock_lb, amphora_mock, self.timeout_dict) - # Test revert - mock_lb.listeners = [mock_listener] - listeners_reload_obj.revert(mock_lb) - mock_prov_status_error.assert_called_once_with('12345') + + # Test with reload exception + mock_driver.reload.reset_mock() + listeners_reload_obj.execute(mock_lb, 0, [amphora_mock], + timeout_dict=self.timeout_dict) + mock_driver.reload.assert_called_once_with(mock_lb, amphora_mock, + 
self.timeout_dict) + mock_amphora_repo_update.assert_called_once_with( + _session_mock, amphora_mock.id, status=constants.ERROR) @mock.patch('octavia.controller.worker.task_utils.TaskUtils.' 'mark_listener_prov_status_error') @@ -772,11 +778,23 @@ class TestAmphoraDriverTasks(base.TestCase): mock_amphora_repo_get.return_value = _db_amphora_mock amphora_vrrp_start_obj = ( amphora_driver_tasks.AmphoraIndexVRRPStart()) + mock_driver.start_vrrp_service.side_effect = [mock.DEFAULT, + Exception('boom')] + amphora_vrrp_start_obj.execute(0, [_amphora_mock], timeout_dict=self.timeout_dict) mock_driver.start_vrrp_service.assert_called_once_with( _db_amphora_mock, self.timeout_dict) + # Test with a start exception + mock_driver.start_vrrp_service.reset_mock() + amphora_vrrp_start_obj.execute(0, [_amphora_mock], + timeout_dict=self.timeout_dict) + mock_driver.start_vrrp_service.assert_called_once_with( + _db_amphora_mock, self.timeout_dict) + mock_amphora_repo_update.assert_called_once_with( + _session_mock, _db_amphora_mock.id, status=constants.ERROR) + def test_amphora_compute_connectivity_wait(self, mock_driver, mock_generate_uuid, diff --git a/octavia/tests/unit/network/drivers/neutron/test_allowed_address_pairs.py b/octavia/tests/unit/network/drivers/neutron/test_allowed_address_pairs.py index 05a93381fa..020f6fe997 100644 --- a/octavia/tests/unit/network/drivers/neutron/test_allowed_address_pairs.py +++ b/octavia/tests/unit/network/drivers/neutron/test_allowed_address_pairs.py @@ -1243,12 +1243,19 @@ class TestAllowedAddressPairsDriver(base.TestCase): def test_get_network_configs(self): amphora_mock = mock.MagicMock() + amphora2_mock = mock.MagicMock() load_balancer_mock = mock.MagicMock() vip_mock = mock.MagicMock() amphora_mock.status = constants.DELETED load_balancer_mock.amphorae = [amphora_mock] show_port = self.driver.neutron_client.show_port - show_port.return_value = t_constants.MOCK_NEUTRON_PORT + show_port.side_effect = [ + t_constants.MOCK_NEUTRON_PORT, t_constants.MOCK_NEUTRON_PORT, + t_constants.MOCK_NEUTRON_PORT, t_constants.MOCK_NEUTRON_PORT, + t_constants.MOCK_NEUTRON_PORT, t_constants.MOCK_NEUTRON_PORT, + t_constants.MOCK_NEUTRON_PORT, t_constants.MOCK_NEUTRON_PORT, + t_constants.MOCK_NEUTRON_PORT, t_constants.MOCK_NEUTRON_PORT, + Exception('boom')] fake_subnet = {'subnet': { 'id': t_constants.MOCK_SUBNET_ID, 'gateway_ip': t_constants.MOCK_IP_ADDRESS, @@ -1265,7 +1272,12 @@ class TestAllowedAddressPairsDriver(base.TestCase): amphora_mock.vrrp_ip = "10.0.0.1" amphora_mock.ha_port_id = 3 amphora_mock.ha_ip = "10.0.0.2" - load_balancer_mock.amphorae = [amphora_mock] + amphora2_mock.id = 333 + amphora2_mock.status = constants.ACTIVE + amphora2_mock.vrrp_port_id = 3 + amphora2_mock.vrrp_ip = "10.0.0.2" + amphora2_mock.ha_port_id = 4 + amphora2_mock.ha_ip = "10.0.0.3" configs = self.driver.get_network_configs(load_balancer_mock) self.assertEqual(1, len(configs)) @@ -1282,6 +1294,29 @@ class TestAllowedAddressPairsDriver(base.TestCase): self.assertEqual(expected_subnet_id, config.ha_subnet.id) self.assertEqual(expected_subnet_id, config.vrrp_subnet.id) + # Test with a specific amphora + configs = self.driver.get_network_configs(load_balancer_mock, + amphora_mock) + self.assertEqual(1, len(configs)) + config = configs[222] + # TODO(ptoohill): find a way to return different items for multiple + # calls to the same method, right now each call to show subnet + # will return the same values if a method happens to call it + # multiple times for different subnets. 
We should be able to verify + # different requests get different expected data. + expected_port_id = t_constants.MOCK_NEUTRON_PORT['port']['id'] + self.assertEqual(expected_port_id, config.ha_port.id) + self.assertEqual(expected_port_id, config.vrrp_port.id) + expected_subnet_id = fake_subnet['subnet']['id'] + self.assertEqual(expected_subnet_id, config.ha_subnet.id) + self.assertEqual(expected_subnet_id, config.vrrp_subnet.id) + + # Test with a load balancer with two amphora, one that has a + # neutron problem. + load_balancer_mock.amphorae = [amphora_mock, amphora2_mock] + configs = self.driver.get_network_configs(load_balancer_mock) + self.assertEqual(1, len(configs)) + @mock.patch('time.sleep') def test_wait_for_port_detach(self, mock_sleep): amphora = data_models.Amphora( diff --git a/releasenotes/notes/fix-amp-failover-missing-vrrp-port-9b5f13b9951b7edb.yaml b/releasenotes/notes/fix-amp-failover-missing-vrrp-port-9b5f13b9951b7edb.yaml new file mode 100644 index 0000000000..9605ac0286 --- /dev/null +++ b/releasenotes/notes/fix-amp-failover-missing-vrrp-port-9b5f13b9951b7edb.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixed an issue with failing over an amphora if the pair amphora in an + active/standby pair had a missing VRRP port in neutron.
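
A standalone sketch of the pattern this patch applies in the tasks above: each
per-amphora driver call (reload, start_vrrp_service, _get_amp_net_configs) is
wrapped in a try/except, the failing amphora is marked ERROR in the database,
and the task returns or skips that amphora instead of failing the whole
failover flow. The names below (apply_to_amphorae, mark_error, fake_reload)
are hypothetical stand-ins for the driver and repository calls, not Octavia
APIs.

    import logging

    LOG = logging.getLogger(__name__)
    ERROR = 'ERROR'


    def apply_to_amphorae(amphorae, operation, mark_error):
        """Run ``operation`` on each amphora, tolerating per-amphora failures."""
        results = {}
        for amp in amphorae:
            try:
                results[amp['id']] = operation(amp)
            except Exception as e:
                # As in the patch: log, flag only this amphora as ERROR, and
                # keep going so the peer's failover can still complete.
                LOG.warning('Operation failed on amphora %s, skipping it: %s',
                            amp['id'], str(e))
                mark_error(amp['id'], ERROR)
        return results


    if __name__ == '__main__':
        amps = [{'id': 'amp-1'}, {'id': 'amp-2'}]

        def fake_reload(amp):
            # Simulate the peer whose VRRP port is missing from neutron.
            if amp['id'] == 'amp-2':
                raise RuntimeError('port not found')
            return 'reloaded'

        statuses = {}
        print(apply_to_amphorae(amps, fake_reload, statuses.__setitem__))
        print(statuses)  # {'amp-2': 'ERROR'}

The effect, as the commit message describes, is that failing over one amphora
no longer aborts with "port not found" when its active/standby peer is also
broken; the broken peer is simply marked ERROR so a later failover can rebuild
it.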