Check haproxy status on reload
When reloading haproxy, check whether systemd reports a "<service> is not active, cannot reload." error; this error means that haproxy crashed during the reload. When it is detected, verify that haproxy came back up correctly by querying its stats socket and checking its uptime.

Related-Bug: #2054666
Change-Id: Ibadf6e529d53fb5a45b73af57243cee5a3f70d9b
parent 93dd9916c3
commit f5349aca00
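For context, a minimal sketch (not part of this change) of how haproxy's uptime can be read over its stats socket with the standard "show info" command. The socket path argument is a placeholder; the patch itself resolves the real per-load-balancer path via util.haproxy_sock_path() and queries it through Octavia's HAProxyQuery helper.

# Illustrative only: read Uptime_sec from a haproxy stats socket.
import socket

def get_haproxy_uptime_sec(sock_path):
    """Return haproxy's Uptime_sec, or None if the socket is unreachable."""
    try:
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.connect(sock_path)
            sock.sendall(b'show info\n')
            data = b''
            while chunk := sock.recv(4096):
                data += chunk
    except OSError:
        return None
    # "show info" returns one "Name: value" line per field.
    for line in data.decode().splitlines():
        if line.startswith('Uptime_sec:'):
            return int(line.split(':', 1)[1].strip())
    return None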
@@ -18,6 +18,7 @@ import re
 import shutil
 import stat
 import subprocess
+import time
 
 import flask
 import jinja2
@@ -35,6 +36,8 @@ from octavia.common import utils as octavia_utils
 
 LOG = logging.getLogger(__name__)
 BUFFER = 100
+HAPROXY_RELOAD_RETRIES = 3
+HAPROXY_QUERY_RETRIES = 5
 
 CONF = cfg.CONF
 
@@ -224,6 +227,23 @@ class Loadbalancer:
 
         return res
 
+    def _check_haproxy_uptime(self, lb_id):
+        stat_sock_file = util.haproxy_sock_path(lb_id)
+        lb_query = haproxy_query.HAProxyQuery(stat_sock_file)
+        retries = HAPROXY_QUERY_RETRIES
+        for idx in range(retries):
+            try:
+                info = lb_query.show_info()
+                uptime_sec = info['Uptime_sec']
+            except Exception as e:
+                LOG.warning('Failed to get haproxy info: %s, retrying.', e)
+                time.sleep(1)
+                continue
+            uptime = int(uptime_sec)
+            return uptime
+        LOG.error('Failed to get haproxy uptime after %d tries.', retries)
+        return None
+
     def start_stop_lb(self, lb_id, action):
         action = action.lower()
         if action not in [consts.AMP_ACTION_START,
@@ -257,20 +277,55 @@ class Loadbalancer:
                 # failure!
                 LOG.warning('Failed to save haproxy-%s state!', lb_id)
 
-        cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
-            lb_id=lb_id, action=action))
-
-        try:
-            subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError as e:
-            if b'Job is already running' not in e.output:
-                LOG.debug(
-                    "Failed to %(action)s haproxy-%(lb_id)s service: %(err)s "
-                    "%(out)s", {'action': action, 'lb_id': lb_id,
-                                'err': e, 'out': e.output})
-                return webob.Response(json={
-                    'message': f"Error {action}ing haproxy",
-                    'details': e.output}, status=500)
+        retries = (HAPROXY_RELOAD_RETRIES
+                   if action == consts.AMP_ACTION_RELOAD
+                   else 1)
+        saved_exc = None
+        for idx in range(retries):
+            cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
+                lb_id=lb_id, action=action))
+
+            try:
+                subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError as e:
+                # Mitigation for
+                # https://bugs.launchpad.net/octavia/+bug/2054666
+                if (b'is not active, cannot reload.' in e.output and
+                        action == consts.AMP_ACTION_RELOAD):
+                    saved_exc = e
+
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+
+                    # Wait a few seconds and check that haproxy was restarted
+                    uptime = self._check_haproxy_uptime(lb_id)
+                    # If haproxy is not reachable or was restarted more than 15
+                    # sec ago, let's retry (or maybe restart?)
+                    if not uptime or uptime > 15:
+                        continue
+
+                    # haproxy probably crashed and was restarted, log it and
+                    # continue
+                    LOG.warning("An error occured with haproxy while it "
+                                "was reloaded, check the haproxy logs for "
+                                "more details.")
+                    break
+
+                if b'Job is already running' not in e.output:
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+                    return webob.Response(json={
+                        'message': f"Error {action}ing haproxy",
+                        'details': e.output}, status=500)
+            break
+        else:
+            # no break, we reach the retry limit for reloads
+            return webob.Response(json={
+                'message': f"Error {action}ing haproxy",
+                'details': saved_exc.output if saved_exc else ''}, status=500)
 
         # If we are not in active/standby we need to send an IP
         # advertisement (GARP or NA). Keepalived handles this for
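The retry logic above relies on Python's for/else: the else branch runs only when the loop completes without hitting break, i.e. when every reload attempt failed. A standalone sketch of that pattern (illustrative only, not Octavia code):

# for/else retry pattern: `else` runs only if the loop never hit `break`.
def reload_with_retries(do_reload, retries=3):
    last_exc = None
    for attempt in range(retries):
        try:
            do_reload()
        except RuntimeError as exc:  # stand-in for CalledProcessError
            last_exc = exc
            continue                 # failed attempt, try again
        break                        # success, leave the loop
    else:
        # No break: all attempts failed, surface the last error.
        raise RuntimeError(f'reload failed after {retries} tries') from last_exc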
@@ -56,6 +56,8 @@ class ListenerTestCase(base.TestCase):
             consts.OFFLINE,
             self.test_loadbalancer._check_haproxy_status(LISTENER_ID1))
 
+    @mock.patch('time.sleep')
+    @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.LOG')
     @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.'
                 'Loadbalancer._check_haproxy_status')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
@@ -67,7 +69,7 @@ class ListenerTestCase(base.TestCase):
     @mock.patch('octavia.amphorae.backends.utils.haproxy_query.HAProxyQuery')
     def test_start_stop_lb(self, mock_haproxy_query, mock_check_output,
                            mock_lb_exists, mock_path_exists, mock_vrrp_update,
-                           mock_check_status):
+                           mock_check_status, mock_LOG, mock_time_sleep):
         listener_id = uuidutils.generate_uuid()
 
         conf = self.useFixture(oslo_fixture.Config(cfg.CONF))
@@ -208,6 +210,65 @@ class ListenerTestCase(base.TestCase):
         mock_vrrp_update.assert_not_called()
         mock_check_output.assert_not_called()
 
+        # haproxy error on reload
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            None]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = [Exception("error"),
+                                            {'Uptime_sec': 5}]
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(202, result.status_code)
+
+        LOG_last_call = mock_LOG.mock_calls[-1]
+        self.assertIn('An error occured with haproxy', LOG_last_call[1][0])
+
+        # haproxy error on reload - retry limit
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service')]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = Exception("error")
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(500, result.status_code)
+        self.assertEqual('Error reloading haproxy', result.json['message'])
+        self.assertEqual('haproxy.service is not active, cannot reload.',
+                         result.json['details'])
+
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
                 'config_path')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
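These tests drive the retry loop through unittest.mock side_effect lists: each call to the mock consumes the next item, and items that are exception instances are raised instead of returned. A small illustration (not from the patch):

# When side_effect is a list, exception items are raised and other items
# are returned, one per call.
from unittest import mock

m = mock.Mock(side_effect=[RuntimeError('boom'), 'ok'])
try:
    m()                     # first call raises RuntimeError
except RuntimeError:
    pass
assert m() == 'ok'          # second call returns the next item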
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Added a workaround that prevents the listener PUT API call from failing if
+    haproxy crashes during a reload. The amphora-agent ensures that in case of
+    crashes, haproxy is correctly restarted and ready to accept incoming
+    requests (see https://bugs.launchpad.net/octavia/+bug/2054666).