Check haproxy status on reload

When reloading haproxy, check whether systemd reports a "<service> is
not active, cannot reload." error: this error means that haproxy
crashed during the reload. When the error is detected, verify that
haproxy has restarted correctly (check the socket and its uptime).

Related-Bug: #2054666

Change-Id: Ibadf6e529d53fb5a45b73af57243cee5a3f70d9b
Gregory Thiemonge 2024-02-22 04:16:45 -05:00
parent 93dd9916c3
commit f5349aca00
3 changed files with 137 additions and 14 deletions
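For context, "check the socket and its uptime" refers to haproxy's stats socket: its standard "show info" command reports an Uptime_sec counter, and a small value right after a failed reload indicates that systemd has just restarted the process. Below is a minimal, self-contained sketch of that query, not the agent's code: inside the amphora the agent resolves the socket path via util.haproxy_sock_path() and wraps the query in HAProxyQuery.show_info(); the path used in the example is a placeholder.

import socket

def haproxy_uptime_sec(sock_path):
    # Send "show info" to the haproxy stats socket and extract Uptime_sec.
    # "show info" and the Uptime_sec field are standard haproxy features.
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.connect(sock_path)
        sock.sendall(b'show info\n')
        data = b''
        while chunk := sock.recv(4096):
            data += chunk
    for line in data.decode().splitlines():
        if line.startswith('Uptime_sec:'):
            return int(line.split(':', 1)[1].strip())
    return None

# Example (placeholder path): a freshly restarted haproxy reports a small
# uptime, which is how a crash-and-restart can be told apart from a healthy,
# long-running process.
# print(haproxy_uptime_sec('/var/run/haproxy.sock'))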

octavia/amphorae/backends/agent/api_server/loadbalancer.py

@@ -18,6 +18,7 @@ import re
 import shutil
 import stat
 import subprocess
+import time
 
 import flask
 import jinja2

@@ -35,6 +36,8 @@ from octavia.common import utils as octavia_utils
 
 LOG = logging.getLogger(__name__)
 BUFFER = 100
+HAPROXY_RELOAD_RETRIES = 3
+HAPROXY_QUERY_RETRIES = 5
 
 CONF = cfg.CONF
@@ -224,6 +227,23 @@ class Loadbalancer:
 
         return res
 
+    def _check_haproxy_uptime(self, lb_id):
+        stat_sock_file = util.haproxy_sock_path(lb_id)
+        lb_query = haproxy_query.HAProxyQuery(stat_sock_file)
+        retries = HAPROXY_QUERY_RETRIES
+        for idx in range(retries):
+            try:
+                info = lb_query.show_info()
+                uptime_sec = info['Uptime_sec']
+            except Exception as e:
+                LOG.warning('Failed to get haproxy info: %s, retrying.', e)
+                time.sleep(1)
+                continue
+            uptime = int(uptime_sec)
+            return uptime
+        LOG.error('Failed to get haproxy uptime after %d tries.', retries)
+        return None
+
     def start_stop_lb(self, lb_id, action):
         action = action.lower()
         if action not in [consts.AMP_ACTION_START,
@@ -257,20 +277,55 @@
             # failure!
             LOG.warning('Failed to save haproxy-%s state!', lb_id)
 
-        cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
-            lb_id=lb_id, action=action))
-
-        try:
-            subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError as e:
-            if b'Job is already running' not in e.output:
-                LOG.debug(
-                    "Failed to %(action)s haproxy-%(lb_id)s service: %(err)s "
-                    "%(out)s", {'action': action, 'lb_id': lb_id,
                                'err': e, 'out': e.output})
-                return webob.Response(json={
-                    'message': f"Error {action}ing haproxy",
-                    'details': e.output}, status=500)
+        retries = (HAPROXY_RELOAD_RETRIES
+                   if action == consts.AMP_ACTION_RELOAD
+                   else 1)
+        saved_exc = None
+        for idx in range(retries):
+            cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
+                lb_id=lb_id, action=action))
+
+            try:
+                subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError as e:
+                # Mitigation for
+                # https://bugs.launchpad.net/octavia/+bug/2054666
+                if (b'is not active, cannot reload.' in e.output and
+                        action == consts.AMP_ACTION_RELOAD):
+                    saved_exc = e
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+
+                    # Wait a few seconds and check that haproxy was restarted
+                    uptime = self._check_haproxy_uptime(lb_id)
+
+                    # If haproxy is not reachable or was restarted more than 15
+                    # sec ago, let's retry (or maybe restart?)
+                    if not uptime or uptime > 15:
+                        continue
+
+                    # haproxy probably crashed and was restarted, log it and
+                    # continue
+                    LOG.warning("An error occurred with haproxy while it "
+                                "was reloaded, check the haproxy logs for "
+                                "more details.")
+                    break
+                if b'Job is already running' not in e.output:
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+                    return webob.Response(json={
+                        'message': f"Error {action}ing haproxy",
+                        'details': e.output}, status=500)
+            break
+        else:
+            # no break, we reached the retry limit for reloads
+            return webob.Response(json={
+                'message': f"Error {action}ing haproxy",
+                'details': saved_exc.output if saved_exc else ''}, status=500)
 
         # If we are not in active/standby we need to send an IP
         # advertisement (GARP or NA). Keepalived handles this for

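One detail worth calling out in the hunk above: the retry loop ends with a for/else, and the else suite only runs when the loop completes without a break, i.e. when the retry limit is reached. A short self-contained sketch of that pattern (the names here are illustrative, not Octavia's):

def reload_with_retries(attempt_reload, retries=3):
    # Python's for/else: the else suite runs only if the loop finished
    # without hitting break, i.e. every attempt failed.
    for _ in range(retries):
        if attempt_reload():
            break  # success (or a recovered crash): stop retrying
    else:
        raise RuntimeError(f'reload failed after {retries} attempts')

# Example: the first two attempts fail, the third succeeds.
attempts = iter([False, False, True])
reload_with_retries(lambda: next(attempts))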
octavia/tests/unit/amphorae/backends/agent/api_server/test_loadbalancer.py

@@ -56,6 +56,8 @@ class ListenerTestCase(base.TestCase):
             consts.OFFLINE,
             self.test_loadbalancer._check_haproxy_status(LISTENER_ID1))
 
+    @mock.patch('time.sleep')
+    @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.LOG')
     @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.'
                 'Loadbalancer._check_haproxy_status')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
@@ -67,7 +69,7 @@
     @mock.patch('octavia.amphorae.backends.utils.haproxy_query.HAProxyQuery')
     def test_start_stop_lb(self, mock_haproxy_query, mock_check_output,
                            mock_lb_exists, mock_path_exists, mock_vrrp_update,
-                           mock_check_status):
+                           mock_check_status, mock_LOG, mock_time_sleep):
         listener_id = uuidutils.generate_uuid()
 
         conf = self.useFixture(oslo_fixture.Config(cfg.CONF))
@@ -208,6 +210,65 @@
         mock_vrrp_update.assert_not_called()
         mock_check_output.assert_not_called()
 
+        # haproxy error on reload
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            None]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = [Exception("error"),
+                                            {'Uptime_sec': 5}]
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(202, result.status_code)
+        LOG_last_call = mock_LOG.mock_calls[-1]
+        self.assertIn('An error occurred with haproxy', LOG_last_call[1][0])
+
+        # haproxy error on reload - retry limit
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service')]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = Exception("error")
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(500, result.status_code)
+        self.assertEqual('Error reloading haproxy', result.json['message'])
+        self.assertEqual('haproxy.service is not active, cannot reload.',
+                         result.json['details'])
+
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
                 'config_path')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'

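The tests above simulate the failure sequences by assigning lists to side_effect on the mocks: unittest.mock consumes the list one element per call, raising exception instances and returning the other values. A tiny standalone illustration of that mechanism (the names are made up for the example):

from unittest import mock

service = mock.Mock()
# Each call consumes the next item: exceptions are raised, values returned.
service.reload.side_effect = [RuntimeError('not active, cannot reload.'),
                              'reloaded']
try:
    service.reload()               # first call raises
except RuntimeError:
    pass
assert service.reload() == 'reloaded'  # second call returns the value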

@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Added a workaround that prevents the listener PUT API call from failing
+    if haproxy crashes during a reload. The amphora-agent ensures that, in
+    case of a crash, haproxy is correctly restarted and ready to accept
+    incoming requests (see https://bugs.launchpad.net/octavia/+bug/2054666).