diff --git a/doc/source/admin/guides/operator-maintenance.rst b/doc/source/admin/guides/operator-maintenance.rst index ece39af73b..c3fbf79ecb 100644 --- a/doc/source/admin/guides/operator-maintenance.rst +++ b/doc/source/admin/guides/operator-maintenance.rst @@ -73,6 +73,14 @@ configuration from successfully completing. In this case the load balancer is continuing to process traffic through the load balancer, but might not have applied the latest configuration updates yet. +A load balancer in a PENDING provisioning status is immutable: it cannot be +updated or deleted by another process, because the PENDING status acts as a +lock on the resource. +If a database outage occurs while a load balancer is being created, updated +or deleted, the Octavia control plane will keep retrying for an extended +period of time (around 2h45min with the default settings) to replace the +PENDING status with ERROR, so that the resource does not remain immutable. + Monitoring load balancer functionality -------------------------------------- diff --git a/octavia/common/config.py b/octavia/common/config.py index 7ac1851cac..a65176d4bf 100644 --- a/octavia/common/config.py +++ b/octavia/common/config.py @@ -534,7 +534,18 @@ controller_worker_opts = [ cfg.BoolOpt('event_notifications', default=True, help=_('Enable octavia event notifications. See ' 'oslo_messaging_notifications section for additional ' - 'requirements.')) + 'requirements.')), + # 2000 attempts is around 2h45 with the default settings + cfg.IntOpt('db_commit_retry_attempts', default=2000, + help=_('The number of times the database action will be ' + 'attempted.')), + cfg.IntOpt('db_commit_retry_initial_delay', default=1, + help=_('The initial delay before a retry attempt.')), + cfg.IntOpt('db_commit_retry_backoff', default=1, + help=_('The time to backoff retry attempts.')), + cfg.IntOpt('db_commit_retry_max', default=5, + help=_('The maximum amount of time to wait between retry ' + 'attempts.')), ] task_flow_opts = [ diff --git a/octavia/controller/worker/task_utils.py b/octavia/controller/worker/task_utils.py index 9b44f93acd..d47c6d25fc 100644 --- a/octavia/controller/worker/task_utils.py +++ b/octavia/controller/worker/task_utils.py @@ -14,18 +14,32 @@ """ Methods common to the controller work tasks.""" +from oslo_config import cfg from oslo_log import log as logging +from oslo_utils import excutils +import tenacity from octavia.common import constants from octavia.db import api as db_apis from octavia.db import repositories as repo +CONF = cfg.CONF LOG = logging.getLogger(__name__) class TaskUtils(object): """Class of helper/utility methods used by tasks.""" + status_update_retry = tenacity.retry( + retry=tenacity.retry_if_exception_type(Exception), + wait=tenacity.wait_incrementing( + CONF.controller_worker.db_commit_retry_initial_delay, + CONF.controller_worker.db_commit_retry_backoff, + CONF.controller_worker.db_commit_retry_max), + stop=tenacity.stop_after_attempt( + CONF.controller_worker.db_commit_retry_attempts), + after=tenacity.after_log(LOG, logging.DEBUG)) + def __init__(self, **kwargs): self.amphora_repo = repo.AmphoraRepository() self.health_mon_repo = repo.HealthMonitorRepository() @@ -160,6 +174,7 @@ class TaskUtils(object): "provisioning status to ERROR due to: " "%(except)s", {'list': listener_id, 'except': str(e)}) + @status_update_retry def mark_loadbalancer_prov_status_error(self, loadbalancer_id): """Sets a load balancer provisioning status to ERROR.
@@ -175,9 +190,12 @@ class TaskUtils(object): id=loadbalancer_id, provisioning_status=constants.ERROR) except Exception as e: - LOG.error("Failed to update load balancer %(lb)s " - "provisioning status to ERROR due to: " - "%(except)s", {'lb': loadbalancer_id, 'except': str(e)}) + # Reraise for tenacity + with excutils.save_and_reraise_exception(): + LOG.error("Failed to update load balancer %(lb)s " + "provisioning status to ERROR due to: " + "%(except)s", {'lb': loadbalancer_id, + 'except': str(e)}) def mark_listener_prov_status_active(self, listener_id): """Sets a listener provisioning status to ACTIVE. @@ -214,6 +232,7 @@ class TaskUtils(object): "to ACTIVE due to: %(except)s", {'pool': pool_id, 'except': str(e)}) + @status_update_retry def mark_loadbalancer_prov_status_active(self, loadbalancer_id): """Sets a load balancer provisioning status to ACTIVE. @@ -229,9 +248,12 @@ class TaskUtils(object): id=loadbalancer_id, provisioning_status=constants.ACTIVE) except Exception as e: - LOG.error("Failed to update load balancer %(lb)s " - "provisioning status to ACTIVE due to: " - "%(except)s", {'lb': loadbalancer_id, 'except': str(e)}) + # Reraise for tenacity + with excutils.save_and_reraise_exception(): + LOG.error("Failed to update load balancer %(lb)s " + "provisioning status to ACTIVE due to: " + "%(except)s", {'lb': loadbalancer_id, + 'except': str(e)}) def mark_member_prov_status_error(self, member_id): """Sets a member provisioning status to ERROR. diff --git a/octavia/controller/worker/v2/tasks/lifecycle_tasks.py b/octavia/controller/worker/v2/tasks/lifecycle_tasks.py index 8e1ad46ea0..1d05422eed 100644 --- a/octavia/controller/worker/v2/tasks/lifecycle_tasks.py +++ b/octavia/controller/worker/v2/tasks/lifecycle_tasks.py @@ -54,15 +54,22 @@ class HealthMonitorToErrorOnRevertTask(BaseLifecycleTask): pass def revert(self, health_mon, listeners, loadbalancer, *args, **kwargs): - self.task_utils.mark_health_mon_prov_status_error( - health_mon[constants.HEALTHMONITOR_ID]) - self.task_utils.mark_pool_prov_status_active( - health_mon[constants.POOL_ID]) + try: + self.task_utils.mark_health_mon_prov_status_error( + health_mon[constants.HEALTHMONITOR_ID]) + self.task_utils.mark_pool_prov_status_active( + health_mon[constants.POOL_ID]) + for listener in listeners: + self.task_utils.mark_listener_prov_status_active( + listener[constants.LISTENER_ID]) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is called to unlock the LB (it will pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active( loadbalancer[constants.LOADBALANCER_ID]) - for listener in listeners: - self.task_utils.mark_listener_prov_status_active( - listener[constants.LISTENER_ID]) class L7PolicyToErrorOnRevertTask(BaseLifecycleTask): @@ -72,12 +79,19 @@ class L7PolicyToErrorOnRevertTask(BaseLifecycleTask): pass def revert(self, l7policy, listeners, loadbalancer_id, *args, **kwargs): - self.task_utils.mark_l7policy_prov_status_error( - l7policy[constants.L7POLICY_ID]) + try: + self.task_utils.mark_l7policy_prov_status_error( + l7policy[constants.L7POLICY_ID]) + for listener in listeners: + self.task_utils.mark_listener_prov_status_active( + listener[constants.LISTENER_ID]) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is called to unlock the LB (it will 
pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active(loadbalancer_id) - for listener in listeners: - self.task_utils.mark_listener_prov_status_active( - listener[constants.LISTENER_ID]) class L7RuleToErrorOnRevertTask(BaseLifecycleTask): @@ -88,14 +102,21 @@ class L7RuleToErrorOnRevertTask(BaseLifecycleTask): def revert(self, l7rule, l7policy_id, listeners, loadbalancer_id, *args, **kwargs): - self.task_utils.mark_l7rule_prov_status_error( - l7rule[constants.L7RULE_ID]) - self.task_utils.mark_l7policy_prov_status_active(l7policy_id) + try: + self.task_utils.mark_l7rule_prov_status_error( + l7rule[constants.L7RULE_ID]) + self.task_utils.mark_l7policy_prov_status_active(l7policy_id) + for listener in listeners: + self.task_utils.mark_listener_prov_status_active( + listener[constants.LISTENER_ID]) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is called to unlock the LB (it will pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active( loadbalancer_id) - for listener in listeners: - self.task_utils.mark_listener_prov_status_active( - listener[constants.LISTENER_ID]) class ListenerToErrorOnRevertTask(BaseLifecycleTask): @@ -105,8 +126,15 @@ class ListenerToErrorOnRevertTask(BaseLifecycleTask): pass def revert(self, listener, *args, **kwargs): - self.task_utils.mark_listener_prov_status_error( - listener[constants.LISTENER_ID]) + try: + self.task_utils.mark_listener_prov_status_error( + listener[constants.LISTENER_ID]) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is called to unlock the LB (it will pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active( listener[constants.LOADBALANCER_ID]) @@ -118,9 +146,16 @@ class ListenersToErrorOnRevertTask(BaseLifecycleTask): pass def revert(self, listeners, *args, **kwargs): - for listener in listeners: - self.task_utils.mark_listener_prov_status_error( - listener[constants.LISTENER_ID]) + try: + for listener in listeners: + self.task_utils.mark_listener_prov_status_error( + listener[constants.LISTENER_ID]) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is called to unlock the LB (it will pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active( listeners[0][constants.LOADBALANCER_ID]) @@ -154,12 +189,19 @@ class MemberToErrorOnRevertTask(BaseLifecycleTask): def revert(self, member, listeners, loadbalancer, pool_id, *args, **kwargs): - self.task_utils.mark_member_prov_status_error( - member[constants.MEMBER_ID]) - for listener in listeners: - self.task_utils.mark_listener_prov_status_active( - listener[constants.LISTENER_ID]) - self.task_utils.mark_pool_prov_status_active(pool_id) + try: + self.task_utils.mark_member_prov_status_error( + member[constants.MEMBER_ID]) + for listener in listeners: + self.task_utils.mark_listener_prov_status_active( + listener[constants.LISTENER_ID]) + self.task_utils.mark_pool_prov_status_active(pool_id) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is 
called to unlock the LB (it will pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active( loadbalancer[constants.LOADBALANCER_ID]) @@ -172,13 +214,20 @@ class MembersToErrorOnRevertTask(BaseLifecycleTask): def revert(self, members, listeners, loadbalancer, pool_id, *args, **kwargs): - for m in members: - self.task_utils.mark_member_prov_status_error( - m[constants.MEMBER_ID]) - for listener in listeners: - self.task_utils.mark_listener_prov_status_active( - listener[constants.LISTENER_ID]) - self.task_utils.mark_pool_prov_status_active(pool_id) + try: + for m in members: + self.task_utils.mark_member_prov_status_error( + m[constants.MEMBER_ID]) + for listener in listeners: + self.task_utils.mark_listener_prov_status_active( + listener[constants.LISTENER_ID]) + self.task_utils.mark_pool_prov_status_active(pool_id) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is called to unlock the LB (it will pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active( loadbalancer[constants.LOADBALANCER_ID]) @@ -190,9 +239,16 @@ class PoolToErrorOnRevertTask(BaseLifecycleTask): pass def revert(self, pool_id, listeners, loadbalancer, *args, **kwargs): - self.task_utils.mark_pool_prov_status_error(pool_id) + try: + self.task_utils.mark_pool_prov_status_error(pool_id) + for listener in listeners: + self.task_utils.mark_listener_prov_status_active( + listener[constants.LISTENER_ID]) + except Exception: + # Catching and skipping, errors are already reported by task_utils + # and we want to ensure that mark_loadbalancer_prov_status_active + # is called to unlock the LB (it will pass or it will fail after a + # very long timeout) + pass self.task_utils.mark_loadbalancer_prov_status_active( loadbalancer[constants.LOADBALANCER_ID]) - for listener in listeners: - self.task_utils.mark_listener_prov_status_active( - listener[constants.LISTENER_ID]) diff --git a/octavia/tests/unit/controller/worker/test_task_utils.py b/octavia/tests/unit/controller/worker/test_task_utils.py index 6a8044d473..9f14be0475 100644 --- a/octavia/tests/unit/controller/worker/test_task_utils.py +++ b/octavia/tests/unit/controller/worker/test_task_utils.py @@ -14,6 +14,7 @@ from unittest import mock from oslo_utils import uuidutils +import tenacity from octavia.common import constants from octavia.controller.worker import task_utils as task_utilities @@ -183,7 +184,13 @@ class TestTaskUtils(base.TestCase): @mock.patch('octavia.db.api.session') @mock.patch('octavia.db.repositories.LoadBalancerRepository.update') + @mock.patch('tenacity.nap.time') + # mock LOG so we don't fill the console with log messages from + # tenacity.retry + @mock.patch('octavia.controller.worker.task_utils.LOG') def test_mark_loadbalancer_prov_status_active(self, + mock_LOG, + mock_time, mock_lb_repo_update, mock_get_session): @@ -202,14 +209,42 @@ class TestTaskUtils(base.TestCase): mock_lb_repo_update.reset_mock() mock_get_session.side_effect = Exception('fail') - self.task_utils.mark_loadbalancer_prov_status_active( + self.assertRaises( + tenacity.RetryError, + self.task_utils.mark_loadbalancer_prov_status_active, self.LOADBALANCER_ID) self.assertFalse(mock_lb_repo_update.called) + # Exceptions then happy path + mock_get_session.reset_mock(side_effect=True) + mock_lb_repo_update.reset_mock() + + mock_session = mock_get_session() + 
mock_session_context = mock_session.begin().__enter__() + mock_get_session.side_effect = [ + Exception('fail'), + Exception('fail'), + Exception('fail'), + mock_session] + + self.task_utils.mark_loadbalancer_prov_status_active( + self.LOADBALANCER_ID) + + mock_lb_repo_update.assert_called_once_with( + mock_session_context, + id=self.LOADBALANCER_ID, + provisioning_status=constants.ACTIVE) + @mock.patch('octavia.db.api.session') @mock.patch('octavia.db.repositories.LoadBalancerRepository.update') + @mock.patch('tenacity.nap.time') + # mock LOG so we don't fill the console with log messages from + # tenacity.retry + @mock.patch('octavia.controller.worker.task_utils.LOG') def test_mark_loadbalancer_prov_status_error(self, + mock_LOG, + mock_time, mock_lb_repo_update, mock_get_session): @@ -228,10 +263,31 @@ class TestTaskUtils(base.TestCase): mock_lb_repo_update.reset_mock() mock_get_session.side_effect = Exception('fail') + self.assertRaises(tenacity.RetryError, + self.task_utils.mark_loadbalancer_prov_status_error, + self.LOADBALANCER_ID) + + self.assertFalse(mock_lb_repo_update.called) + + # Exceptions then happy path + mock_get_session.reset_mock(side_effect=True) + mock_lb_repo_update.reset_mock() + + mock_session = mock_get_session() + mock_session_context = mock_session.begin().__enter__() + mock_get_session.side_effect = [ + Exception('fail'), + Exception('fail'), + Exception('fail'), + mock_session] + self.task_utils.mark_loadbalancer_prov_status_error( self.LOADBALANCER_ID) - self.assertFalse(mock_lb_repo_update.called) + mock_lb_repo_update.assert_called_once_with( + mock_session_context, + id=self.LOADBALANCER_ID, + provisioning_status=constants.ERROR) @mock.patch('octavia.db.api.session') @mock.patch('octavia.db.repositories.MemberRepository.update') diff --git a/releasenotes/notes/fix-lb-in-PENDING-on-DB-failure-1ffea71a86cd4ea9.yaml b/releasenotes/notes/fix-lb-in-PENDING-on-DB-failure-1ffea71a86cd4ea9.yaml new file mode 100644 index 0000000000..5d46a790da --- /dev/null +++ b/releasenotes/notes/fix-lb-in-PENDING-on-DB-failure-1ffea71a86cd4ea9.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixed an issue with load balancers stuck in a ``PENDING_*`` state during + database outages. Now, when a task fails in Octavia, it retries to update + the ``provisioning_status`` of the load balancer until the database is back + (giving up only after a long timeout, around 2h45min with default settings).
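
Reviewer note: the "around 2h45" figure quoted in the documentation, in the config.py comment and in the release note follows directly from the new defaults (2000 attempts, 1 second initial delay, 1 second backoff increment, 5 second maximum wait). The standalone sketch below is illustrative only, not part of the patch; it assumes tenacity's ``wait_incrementing`` strategy sleeps ``initial_delay + backoff * n`` seconds before the n-th retry, capped at the maximum::

    # Rough worst-case duration of the status-update retry loop, computed from
    # the default db_commit_retry_* values added in config.py above.
    # Illustrative Python only; not Octavia code.

    ATTEMPTS = 2000       # db_commit_retry_attempts
    INITIAL_DELAY = 1     # db_commit_retry_initial_delay (seconds)
    BACKOFF = 1           # db_commit_retry_backoff (seconds added per retry)
    MAX_WAIT = 5          # db_commit_retry_max (seconds)

    # ATTEMPTS - 1 waits separate the attempts; each wait grows by BACKOFF
    # until it reaches MAX_WAIT.
    total_seconds = sum(min(INITIAL_DELAY + BACKOFF * n, MAX_WAIT)
                        for n in range(ATTEMPTS - 1))

    hours, remainder = divmod(total_seconds, 3600)
    print(f"~{total_seconds} s, about {hours}h{remainder // 60:02d}min")
    # -> ~9985 s, about 2h46min, i.e. "around 2h45" with the defaults

Capping each wait at 5 seconds keeps the retries responsive once the database recovers, while the large attempt count bounds how long a load balancer can remain locked in a PENDING provisioning status.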