Check haproxy status on reload

When reloading haproxy, check whether systemd reports a "<service> is
not active, cannot reload." error: it means that haproxy crashed during
the reload. When this error is detected, verify that haproxy has
reloaded correctly (check the socket and its uptime).

Related-Bug: #2054666

Change-Id: Ibadf6e529d53fb5a45b73af57243cee5a3f70d9b
Gregory Thiemonge 2024-02-22 04:16:45 -05:00
parent 93dd9916c3
commit f5349aca00
3 changed files with 137 additions and 14 deletions
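
For context, here is a minimal standalone sketch (not part of the change) of the check the agent performs: it connects to the haproxy stats socket, sends "show info" and parses Uptime_sec, the value _check_haproxy_uptime() uses to decide whether haproxy was freshly restarted after the failed reload. The function name, timeout and socket path below are illustrative assumptions; only the "show info"/Uptime_sec query and the 15-second threshold come from the change itself.

import socket


def haproxy_uptime_sec(sock_path):
    """Return haproxy's Uptime_sec read from its stats socket, or None."""
    try:
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.settimeout(5)
            sock.connect(sock_path)
            # The stats socket answers one command per connection.
            sock.sendall(b"show info\n")
            data = b""
            while chunk := sock.recv(4096):
                data += chunk
    except OSError:
        return None
    # "show info" output is a list of "Key: value" lines.
    for line in data.decode().splitlines():
        if line.startswith("Uptime_sec:"):
            return int(line.split(":", 1)[1].strip())
    return None


# Hypothetical socket path; the agent builds the real one with
# util.haproxy_sock_path(lb_id).
uptime = haproxy_uptime_sec("/var/lib/octavia/example-lb.sock")
if uptime is None or uptime > 15:
    print("haproxy unreachable or not freshly restarted, retry the reload")
else:
    print("haproxy was restarted after the crash, treat the reload as done")

In the actual agent the socket path comes from util.haproxy_sock_path(lb_id) and the query goes through haproxy_query.HAProxyQuery, as shown in the diff below.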


@@ -18,6 +18,7 @@ import re
import shutil
import stat
import subprocess
import time
import flask
import jinja2
@@ -35,6 +36,8 @@ from octavia.common import utils as octavia_utils
LOG = logging.getLogger(__name__)
BUFFER = 100
HAPROXY_RELOAD_RETRIES = 3
HAPROXY_QUERY_RETRIES = 5
CONF = cfg.CONF
@@ -224,6 +227,23 @@ class Loadbalancer:
return res
def _check_haproxy_uptime(self, lb_id):
stat_sock_file = util.haproxy_sock_path(lb_id)
lb_query = haproxy_query.HAProxyQuery(stat_sock_file)
retries = HAPROXY_QUERY_RETRIES
for idx in range(retries):
try:
info = lb_query.show_info()
uptime_sec = info['Uptime_sec']
except Exception as e:
LOG.warning('Failed to get haproxy info: %s, retrying.', e)
time.sleep(1)
continue
uptime = int(uptime_sec)
return uptime
LOG.error('Failed to get haproxy uptime after %d tries.', retries)
return None
def start_stop_lb(self, lb_id, action):
action = action.lower()
if action not in [consts.AMP_ACTION_START,
@@ -257,20 +277,55 @@
# failure!
LOG.warning('Failed to save haproxy-%s state!', lb_id)
cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
lb_id=lb_id, action=action))
retries = (HAPROXY_RELOAD_RETRIES
if action == consts.AMP_ACTION_RELOAD
else 1)
saved_exc = None
for idx in range(retries):
cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
lb_id=lb_id, action=action))
try:
subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
if b'Job is already running' not in e.output:
LOG.debug(
"Failed to %(action)s haproxy-%(lb_id)s service: %(err)s "
"%(out)s", {'action': action, 'lb_id': lb_id,
'err': e, 'out': e.output})
return webob.Response(json={
'message': f"Error {action}ing haproxy",
'details': e.output}, status=500)
try:
subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
# Mitigation for
# https://bugs.launchpad.net/octavia/+bug/2054666
if (b'is not active, cannot reload.' in e.output and
action == consts.AMP_ACTION_RELOAD):
saved_exc = e
LOG.debug(
"Failed to %(action)s haproxy-%(lb_id)s service: "
"%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
'err': e, 'out': e.output})
# Wait a few seconds and check that haproxy was restarted
uptime = self._check_haproxy_uptime(lb_id)
# If haproxy is not reachable or was restarted more than 15
# sec ago, let's retry (or maybe restart?)
if not uptime or uptime > 15:
continue
# haproxy probably crashed and was restarted, log it and
# continue
LOG.warning("An error occured with haproxy while it "
"was reloaded, check the haproxy logs for "
"more details.")
break
if b'Job is already running' not in e.output:
LOG.debug(
"Failed to %(action)s haproxy-%(lb_id)s service: "
"%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
'err': e, 'out': e.output})
return webob.Response(json={
'message': f"Error {action}ing haproxy",
'details': e.output}, status=500)
break
else:
# no break: we reached the retry limit for reloads
return webob.Response(json={
'message': f"Error {action}ing haproxy",
'details': saved_exc.output if saved_exc else ''}, status=500)
# If we are not in active/standby we need to send an IP
# advertisement (GARP or NA). Keepalived handles this for


@@ -56,6 +56,8 @@ class ListenerTestCase(base.TestCase):
consts.OFFLINE,
self.test_loadbalancer._check_haproxy_status(LISTENER_ID1))
@mock.patch('time.sleep')
@mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.LOG')
@mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.'
'Loadbalancer._check_haproxy_status')
@mock.patch('octavia.amphorae.backends.agent.api_server.util.'
@@ -67,7 +69,7 @@
@mock.patch('octavia.amphorae.backends.utils.haproxy_query.HAProxyQuery')
def test_start_stop_lb(self, mock_haproxy_query, mock_check_output,
mock_lb_exists, mock_path_exists, mock_vrrp_update,
mock_check_status):
mock_check_status, mock_LOG, mock_time_sleep):
listener_id = uuidutils.generate_uuid()
conf = self.useFixture(oslo_fixture.Config(cfg.CONF))
@@ -208,6 +210,65 @@
mock_vrrp_update.assert_not_called()
mock_check_output.assert_not_called()
# haproxy error on reload
mock_check_output.reset_mock()
mock_lb_exists.reset_mock()
mock_path_exists.reset_mock()
mock_vrrp_update.reset_mock()
mock_check_status.reset_mock()
mock_LOG.reset_mock()
mock_check_output.side_effect = [
subprocess.CalledProcessError(
output=b'haproxy.service is not active, cannot reload.',
returncode=-2, cmd='service'),
None]
mock_check_status.return_value = 'ACTIVE'
mock_check_status.side_effect = None
mock_query = mock.Mock()
mock_haproxy_query.return_value = mock_query
mock_query.show_info.side_effect = [Exception("error"),
{'Uptime_sec': 5}]
result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
self.assertEqual(202, result.status_code)
LOG_last_call = mock_LOG.mock_calls[-1]
self.assertIn('An error occurred with haproxy', LOG_last_call[1][0])
# haproxy error on reload - retry limit
print("--")
mock_check_output.reset_mock()
mock_lb_exists.reset_mock()
mock_path_exists.reset_mock()
mock_vrrp_update.reset_mock()
mock_check_status.reset_mock()
mock_LOG.reset_mock()
mock_check_output.side_effect = [
subprocess.CalledProcessError(
output=b'haproxy.service is not active, cannot reload.',
returncode=-2, cmd='service'),
subprocess.CalledProcessError(
output=b'haproxy.service is not active, cannot reload.',
returncode=-2, cmd='service'),
subprocess.CalledProcessError(
output=b'haproxy.service is not active, cannot reload.',
returncode=-2, cmd='service')]
mock_check_status.return_value = 'ACTIVE'
mock_check_status.side_effect = None
mock_query = mock.Mock()
mock_haproxy_query.return_value = mock_query
mock_query.show_info.side_effect = Exception("error")
result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
self.assertEqual(500, result.status_code)
self.assertEqual('Error reloading haproxy', result.json['message'])
self.assertEqual('haproxy.service is not active, cannot reload.',
result.json['details'])
@mock.patch('octavia.amphorae.backends.agent.api_server.util.'
'config_path')
@mock.patch('octavia.amphorae.backends.agent.api_server.util.'


@@ -0,0 +1,7 @@
---
fixes:
- |
Added a workaround that prevents the listener PUT API call from failing if
haproxy crashes during a reload. The amphora-agent ensures that in case of
crashes, haproxy is correctly restarted and ready to accept incoming
requests (see https://bugs.launchpad.net/octavia/+bug/2054666).