diff --git a/octavia/amphorae/backends/agent/api_server/loadbalancer.py b/octavia/amphorae/backends/agent/api_server/loadbalancer.py index 2029fbcf98..c36171dc5a 100644 --- a/octavia/amphorae/backends/agent/api_server/loadbalancer.py +++ b/octavia/amphorae/backends/agent/api_server/loadbalancer.py @@ -18,6 +18,7 @@ import re import shutil import stat import subprocess +import time import flask import jinja2 @@ -35,6 +36,8 @@ from octavia.common import utils as octavia_utils LOG = logging.getLogger(__name__) BUFFER = 100 +HAPROXY_RELOAD_RETRIES = 3 +HAPROXY_QUERY_RETRIES = 5 CONF = cfg.CONF @@ -224,6 +227,23 @@ class Loadbalancer: return res + def _check_haproxy_uptime(self, lb_id): + stat_sock_file = util.haproxy_sock_path(lb_id) + lb_query = haproxy_query.HAProxyQuery(stat_sock_file) + retries = HAPROXY_QUERY_RETRIES + for idx in range(retries): + try: + info = lb_query.show_info() + uptime_sec = info['Uptime_sec'] + except Exception as e: + LOG.warning('Failed to get haproxy info: %s, retrying.', e) + time.sleep(1) + continue + uptime = int(uptime_sec) + return uptime + LOG.error('Failed to get haproxy uptime after %d tries.', retries) + return None + def start_stop_lb(self, lb_id, action): action = action.lower() if action not in [consts.AMP_ACTION_START, @@ -257,20 +277,55 @@ class Loadbalancer: # failure! 
LOG.warning('Failed to save haproxy-%s state!', lb_id) - cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format( - lb_id=lb_id, action=action)) + retries = (HAPROXY_RELOAD_RETRIES + if action == consts.AMP_ACTION_RELOAD + else 1) + saved_exc = None + for idx in range(retries): + cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format( + lb_id=lb_id, action=action)) - try: - subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - if b'Job is already running' not in e.output: - LOG.debug( - "Failed to %(action)s haproxy-%(lb_id)s service: %(err)s " - "%(out)s", {'action': action, 'lb_id': lb_id, - 'err': e, 'out': e.output}) - return webob.Response(json={ - 'message': f"Error {action}ing haproxy", - 'details': e.output}, status=500) + try: + subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + # Mitigation for + # https://bugs.launchpad.net/octavia/+bug/2054666 + if (b'is not active, cannot reload.' in e.output and + action == consts.AMP_ACTION_RELOAD): + + saved_exc = e + + LOG.debug( + "Failed to %(action)s haproxy-%(lb_id)s service: " + "%(err)s %(out)s", {'action': action, 'lb_id': lb_id, + 'err': e, 'out': e.output}) + + # Wait a few seconds and check that haproxy was restarted + uptime = self._check_haproxy_uptime(lb_id) + # If haproxy is not reachable or was restarted more than 15 + # sec ago, let's retry (or maybe restart?) 
+ if not uptime or uptime > 15:
+ continue
+ # haproxy probably crashed and was restarted, log it and
+ # continue
+ LOG.warning("An error occurred with haproxy while it "
+ "was reloaded, check the haproxy logs for "
+ "more details.")
+ break
+ if b'Job is already running' not in e.output:
+ LOG.debug(
+ "Failed to %(action)s haproxy-%(lb_id)s service: "
+ "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+ 'err': e, 'out': e.output})
+ return webob.Response(json={
+ 'message': f"Error {action}ing haproxy",
+ 'details': e.output}, status=500)
+ break
+ else:
+ # no break, we reached the retry limit for reloads
+ return webob.Response(json={
+ 'message': f"Error {action}ing haproxy",
+ 'details': saved_exc.output if saved_exc else ''}, status=500) # If we are not in active/standby we need to send an IP # advertisement (GARP or NA). Keepalived handles this for diff --git a/octavia/tests/unit/amphorae/backends/agent/api_server/test_loadbalancer.py b/octavia/tests/unit/amphorae/backends/agent/api_server/test_loadbalancer.py index 8a059cccfb..1adbbb3c15 100644 --- a/octavia/tests/unit/amphorae/backends/agent/api_server/test_loadbalancer.py +++ b/octavia/tests/unit/amphorae/backends/agent/api_server/test_loadbalancer.py @@ -56,6 +56,8 @@ class ListenerTestCase(base.TestCase): consts.OFFLINE, self.test_loadbalancer._check_haproxy_status(LISTENER_ID1)) + @mock.patch('time.sleep') + @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.LOG') @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.' 'Loadbalancer._check_haproxy_status') @mock.patch('octavia.amphorae.backends.agent.api_server.util.' 
@@ -67,7 +69,7 @@ @mock.patch('octavia.amphorae.backends.utils.haproxy_query.HAProxyQuery') def test_start_stop_lb(self, mock_haproxy_query, mock_check_output, mock_lb_exists, mock_path_exists, mock_vrrp_update, - mock_check_status): + mock_check_status, mock_LOG, mock_time_sleep): listener_id = uuidutils.generate_uuid() conf = self.useFixture(oslo_fixture.Config(cfg.CONF)) @@ -208,6 +210,64 @@ mock_vrrp_update.assert_not_called() mock_check_output.assert_not_called() + # haproxy error on reload + mock_check_output.reset_mock() + mock_lb_exists.reset_mock() + mock_path_exists.reset_mock() + mock_vrrp_update.reset_mock() + mock_check_status.reset_mock() + mock_LOG.reset_mock() + + mock_check_output.side_effect = [ + subprocess.CalledProcessError( + output=b'haproxy.service is not active, cannot reload.', + returncode=-2, cmd='service'), + None] + mock_check_status.return_value = 'ACTIVE' + mock_check_status.side_effect = None + + mock_query = mock.Mock() + mock_haproxy_query.return_value = mock_query + mock_query.show_info.side_effect = [Exception("error"), + {'Uptime_sec': 5}] + + result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload') + self.assertEqual(202, result.status_code) + + LOG_last_call = mock_LOG.mock_calls[-1] + self.assertIn('An error occurred with haproxy', LOG_last_call[1][0]) + + # haproxy error on reload - retry limit + mock_check_output.reset_mock() + mock_lb_exists.reset_mock() + mock_path_exists.reset_mock() + mock_vrrp_update.reset_mock() + mock_check_status.reset_mock() + mock_LOG.reset_mock() + + mock_check_output.side_effect = [ + subprocess.CalledProcessError( + output=b'haproxy.service is not active, cannot reload.', + returncode=-2, cmd='service'), + subprocess.CalledProcessError( + output=b'haproxy.service is not active, cannot reload.', + returncode=-2, cmd='service'), + subprocess.CalledProcessError( + output=b'haproxy.service is not 
active, cannot reload.', + returncode=-2, cmd='service')] + mock_check_status.return_value = 'ACTIVE' + mock_check_status.side_effect = None + + mock_query = mock.Mock() + mock_haproxy_query.return_value = mock_query + mock_query.show_info.side_effect = Exception("error") + + result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload') + self.assertEqual(500, result.status_code) + self.assertEqual('Error reloading haproxy', result.json['message']) + self.assertEqual('haproxy.service is not active, cannot reload.', + result.json['details']) + @mock.patch('octavia.amphorae.backends.agent.api_server.util.' 'config_path') @mock.patch('octavia.amphorae.backends.agent.api_server.util.' diff --git a/releasenotes/notes/workaround-for-haproxy-crash-on-reload-813859171a6ac023.yaml b/releasenotes/notes/workaround-for-haproxy-crash-on-reload-813859171a6ac023.yaml new file mode 100644 index 0000000000..845f7c23a3 --- /dev/null +++ b/releasenotes/notes/workaround-for-haproxy-crash-on-reload-813859171a6ac023.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Added a workaround that prevents the listener PUT API call from failing if + haproxy crashes during a reload. The amphora-agent ensures that in case of + crashes, haproxy is correctly restarted and ready to accept incoming + requests (see https://bugs.launchpad.net/octavia/+bug/2054666)