Check haproxy status on reload
When reloading haproxy, check whether systemd reports a
"<service> is not active, cannot reload." error; this error means that
haproxy crashed during the reload. When it is detected, verify that
haproxy has reloaded correctly (check the socket and its uptime).

Related-Bug: #2054666
Change-Id: Ibadf6e529d53fb5a45b73af57243cee5a3f70d9b
commit f5349aca00 (parent 93dd9916c3)
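For context on "check the socket and its uptime": haproxy's stats socket answers a `show info` command with `Key: value` lines, including `Uptime_sec`, which drops back to zero whenever the process is restarted. In the amphora agent this query is wrapped by `HAProxyQuery.show_info()`; the snippet below is only an illustrative standalone sketch, with a placeholder socket path that is not Octavia's real layout.

# Illustrative sketch only: placeholder socket path, not Octavia code.
import socket

def get_haproxy_uptime_sec(sock_path='/var/run/haproxy-example.sock'):
    # Ask the haproxy stats socket for its runtime info.
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.connect(sock_path)
        sock.sendall(b'show info\n')
        chunks = []
        while True:
            data = sock.recv(4096)
            if not data:
                break
            chunks.append(data)
    # "show info" returns "Key: value" lines; Uptime_sec resets on restart.
    for line in b''.join(chunks).decode().splitlines():
        if line.startswith('Uptime_sec:'):
            return int(line.split(':', 1)[1].strip())
    return None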
@@ -18,6 +18,7 @@ import re
 import shutil
 import stat
 import subprocess
+import time
 
 import flask
 import jinja2
@@ -35,6 +36,8 @@ from octavia.common import utils as octavia_utils
 
 LOG = logging.getLogger(__name__)
 BUFFER = 100
+HAPROXY_RELOAD_RETRIES = 3
+HAPROXY_QUERY_RETRIES = 5
 
 CONF = cfg.CONF
 
@@ -224,6 +227,23 @@ class Loadbalancer:
 
         return res
 
+    def _check_haproxy_uptime(self, lb_id):
+        stat_sock_file = util.haproxy_sock_path(lb_id)
+        lb_query = haproxy_query.HAProxyQuery(stat_sock_file)
+        retries = HAPROXY_QUERY_RETRIES
+        for idx in range(retries):
+            try:
+                info = lb_query.show_info()
+                uptime_sec = info['Uptime_sec']
+            except Exception as e:
+                LOG.warning('Failed to get haproxy info: %s, retrying.', e)
+                time.sleep(1)
+                continue
+            uptime = int(uptime_sec)
+            return uptime
+        LOG.error('Failed to get haproxy uptime after %d tries.', retries)
+        return None
+
     def start_stop_lb(self, lb_id, action):
         action = action.lower()
         if action not in [consts.AMP_ACTION_START,
@@ -257,20 +277,55 @@ class Loadbalancer:
            # failure!
            LOG.warning('Failed to save haproxy-%s state!', lb_id)
 
-        cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
-            lb_id=lb_id, action=action))
-
-        try:
-            subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError as e:
-            if b'Job is already running' not in e.output:
-                LOG.debug(
-                    "Failed to %(action)s haproxy-%(lb_id)s service: %(err)s "
-                    "%(out)s", {'action': action, 'lb_id': lb_id,
-                                'err': e, 'out': e.output})
-                return webob.Response(json={
-                    'message': f"Error {action}ing haproxy",
-                    'details': e.output}, status=500)
+        retries = (HAPROXY_RELOAD_RETRIES
+                   if action == consts.AMP_ACTION_RELOAD
+                   else 1)
+        saved_exc = None
+        for idx in range(retries):
+            cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
+                lb_id=lb_id, action=action))
+
+            try:
+                subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError as e:
+                # Mitigation for
+                # https://bugs.launchpad.net/octavia/+bug/2054666
+                if (b'is not active, cannot reload.' in e.output and
+                        action == consts.AMP_ACTION_RELOAD):
+
+                    saved_exc = e
+
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+
+                    # Wait a few seconds and check that haproxy was restarted
+                    uptime = self._check_haproxy_uptime(lb_id)
+                    # If haproxy is not reachable or was restarted more than 15
+                    # sec ago, let's retry (or maybe restart?)
+                    if not uptime or uptime > 15:
+                        continue
+                    # haproxy probably crashed and was restarted, log it and
+                    # continue
+                    LOG.warning("An error occurred with haproxy while it "
+                                "was reloaded, check the haproxy logs for "
+                                "more details.")
+                    break
+                if b'Job is already running' not in e.output:
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+                    return webob.Response(json={
+                        'message': f"Error {action}ing haproxy",
+                        'details': e.output}, status=500)
+            break
+        else:
+            # no break, we reached the retry limit for reloads
+            return webob.Response(json={
+                'message': f"Error {action}ing haproxy",
+                'details': saved_exc.output if saved_exc else ''}, status=500)
 
         # If we are not in active/standby we need to send an IP
         # advertisement (GARP or NA). Keepalived handles this for
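A side note on the `for ... else` used in the hunk above, since it is easy to misread: the `else` block runs only when the loop completes without hitting `break`, i.e. when every reload attempt failed, which is exactly when the 500 response is returned. A minimal, hypothetical sketch of that shape (names are illustrative, not Octavia's):

def reload_with_retries(do_reload, haproxy_recovered, retries=3):
    # Retry shape mirroring start_stop_lb(): break on success or when
    # haproxy already restarted itself; the else branch fires only if
    # the loop never breaks, i.e. all attempts failed.
    for _ in range(retries):
        try:
            do_reload()
        except RuntimeError:
            if haproxy_recovered():
                break  # crash was absorbed by a restart; treat as handled
            continue   # still down, try the reload again
        break          # reload succeeded
    else:
        raise RuntimeError('reload failed after %d attempts' % retries)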
@@ -56,6 +56,8 @@ class ListenerTestCase(base.TestCase):
             consts.OFFLINE,
             self.test_loadbalancer._check_haproxy_status(LISTENER_ID1))
 
+    @mock.patch('time.sleep')
+    @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.LOG')
     @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.'
                 'Loadbalancer._check_haproxy_status')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
@@ -67,7 +69,7 @@ class ListenerTestCase(base.TestCase):
     @mock.patch('octavia.amphorae.backends.utils.haproxy_query.HAProxyQuery')
     def test_start_stop_lb(self, mock_haproxy_query, mock_check_output,
                            mock_lb_exists, mock_path_exists, mock_vrrp_update,
-                           mock_check_status):
+                           mock_check_status, mock_LOG, mock_time_sleep):
         listener_id = uuidutils.generate_uuid()
 
         conf = self.useFixture(oslo_fixture.Config(cfg.CONF))
@@ -208,6 +210,64 @@ class ListenerTestCase(base.TestCase):
         mock_vrrp_update.assert_not_called()
         mock_check_output.assert_not_called()
 
+        # haproxy error on reload
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            None]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = [Exception("error"),
+                                            {'Uptime_sec': 5}]
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(202, result.status_code)
+
+        LOG_last_call = mock_LOG.mock_calls[-1]
+        self.assertIn('An error occurred with haproxy', LOG_last_call[1][0])
+
+        # haproxy error on reload - retry limit
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service')]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = Exception("error")
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(500, result.status_code)
+        self.assertEqual('Error reloading haproxy', result.json['message'])
+        self.assertEqual('haproxy.service is not active, cannot reload.',
+                         result.json['details'])
+
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
                 'config_path')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Added a workaround that prevents the listener PUT API call from failing
+    if haproxy crashes during a reload. The amphora-agent ensures that in
+    case of a crash, haproxy is correctly restarted and ready to accept
+    incoming requests (see https://bugs.launchpad.net/octavia/+bug/2054666).