Check haproxy status on reload
When reloading haproxy, check whether systemd reports a
"<service> is not active, cannot reload." error; this error means that
haproxy crashed during the reload. When it is detected, verify that
haproxy has reloaded correctly (check the socket and its uptime).

Related-Bug: #2054666
Change-Id: Ibadf6e529d53fb5a45b73af57243cee5a3f70d9b
commit f5349aca00 (parent 93dd9916c3)
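For context on "check the socket and its uptime": haproxy's stats socket answers a `show info` command with `Key: value` lines, including `Uptime_sec`, which drops back to zero whenever the process is restarted. In the amphora agent this query is wrapped by `HAProxyQuery.show_info()`; the snippet below is only an illustrative standalone sketch, with a placeholder socket path that is not Octavia's real layout.

# Illustrative sketch only: placeholder socket path, not Octavia code.
import socket

def get_haproxy_uptime_sec(sock_path='/var/run/haproxy-example.sock'):
    # Ask the haproxy stats socket for its runtime info.
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.connect(sock_path)
        sock.sendall(b'show info\n')
        chunks = []
        while True:
            data = sock.recv(4096)
            if not data:
                break
            chunks.append(data)
    # "show info" returns "Key: value" lines; Uptime_sec resets on restart.
    for line in b''.join(chunks).decode().splitlines():
        if line.startswith('Uptime_sec:'):
            return int(line.split(':', 1)[1].strip())
    return None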
@@ -18,6 +18,7 @@ import re
 import shutil
 import stat
 import subprocess
+import time
 
 import flask
 import jinja2
@@ -35,6 +36,8 @@ from octavia.common import utils as octavia_utils
 
 LOG = logging.getLogger(__name__)
 BUFFER = 100
+HAPROXY_RELOAD_RETRIES = 3
+HAPROXY_QUERY_RETRIES = 5
 
 CONF = cfg.CONF
 
@@ -224,6 +227,23 @@ class Loadbalancer:
 
         return res
 
+    def _check_haproxy_uptime(self, lb_id):
+        stat_sock_file = util.haproxy_sock_path(lb_id)
+        lb_query = haproxy_query.HAProxyQuery(stat_sock_file)
+        retries = HAPROXY_QUERY_RETRIES
+        for idx in range(retries):
+            try:
+                info = lb_query.show_info()
+                uptime_sec = info['Uptime_sec']
+            except Exception as e:
+                LOG.warning('Failed to get haproxy info: %s, retrying.', e)
+                time.sleep(1)
+                continue
+            uptime = int(uptime_sec)
+            return uptime
+        LOG.error('Failed to get haproxy uptime after %d tries.', retries)
+        return None
+
     def start_stop_lb(self, lb_id, action):
         action = action.lower()
         if action not in [consts.AMP_ACTION_START,
@@ -257,20 +277,55 @@ class Loadbalancer:
            # failure!
            LOG.warning('Failed to save haproxy-%s state!', lb_id)
 
-        cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
-            lb_id=lb_id, action=action))
-
-        try:
-            subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError as e:
-            if b'Job is already running' not in e.output:
-                LOG.debug(
-                    "Failed to %(action)s haproxy-%(lb_id)s service: %(err)s "
-                    "%(out)s", {'action': action, 'lb_id': lb_id,
-                                'err': e, 'out': e.output})
-                return webob.Response(json={
-                    'message': f"Error {action}ing haproxy",
-                    'details': e.output}, status=500)
+        retries = (HAPROXY_RELOAD_RETRIES
+                   if action == consts.AMP_ACTION_RELOAD
+                   else 1)
+        saved_exc = None
+        for idx in range(retries):
+            cmd = ("/usr/sbin/service haproxy-{lb_id} {action}".format(
+                lb_id=lb_id, action=action))
+
+            try:
+                subprocess.check_output(cmd.split(), stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError as e:
+                # Mitigation for
+                # https://bugs.launchpad.net/octavia/+bug/2054666
+                if (b'is not active, cannot reload.' in e.output and
+                        action == consts.AMP_ACTION_RELOAD):
+
+                    saved_exc = e
+
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+
+                    # Wait a few seconds and check that haproxy was restarted
+                    uptime = self._check_haproxy_uptime(lb_id)
+                    # If haproxy is not reachable or was restarted more than 15
+                    # sec ago, let's retry (or maybe restart?)
+                    if not uptime or uptime > 15:
+                        continue
+                    # haproxy probably crashed and was restarted, log it and
+                    # continue
+                    LOG.warning("An error occurred with haproxy while it "
+                                "was reloaded, check the haproxy logs for "
+                                "more details.")
+                    break
+                if b'Job is already running' not in e.output:
+                    LOG.debug(
+                        "Failed to %(action)s haproxy-%(lb_id)s service: "
+                        "%(err)s %(out)s", {'action': action, 'lb_id': lb_id,
+                                            'err': e, 'out': e.output})
+                    return webob.Response(json={
+                        'message': f"Error {action}ing haproxy",
+                        'details': e.output}, status=500)
+            break
+        else:
+            # no break, we reached the retry limit for reloads
+            return webob.Response(json={
+                'message': f"Error {action}ing haproxy",
+                'details': saved_exc.output if saved_exc else ''}, status=500)
 
         # If we are not in active/standby we need to send an IP
         # advertisement (GARP or NA). Keepalived handles this for
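A side note on the `for ... else` used in the hunk above, since it is easy to misread: the `else` block runs only when the loop completes without hitting `break`, i.e. when every reload attempt failed, which is exactly when the 500 response is returned. A minimal, hypothetical sketch of that shape (names are illustrative, not Octavia's):

def reload_with_retries(do_reload, haproxy_recovered, retries=3):
    # Retry shape mirroring start_stop_lb(): break on success or when
    # haproxy already restarted itself; the else branch fires only if
    # the loop never breaks, i.e. all attempts failed.
    for _ in range(retries):
        try:
            do_reload()
        except RuntimeError:
            if haproxy_recovered():
                break  # crash was absorbed by a restart; treat as handled
            continue   # still down, try the reload again
        break          # reload succeeded
    else:
        raise RuntimeError('reload failed after %d attempts' % retries)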
@@ -56,6 +56,8 @@ class ListenerTestCase(base.TestCase):
             consts.OFFLINE,
             self.test_loadbalancer._check_haproxy_status(LISTENER_ID1))
 
+    @mock.patch('time.sleep')
+    @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.LOG')
     @mock.patch('octavia.amphorae.backends.agent.api_server.loadbalancer.'
                 'Loadbalancer._check_haproxy_status')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
@@ -67,7 +69,7 @@ class ListenerTestCase(base.TestCase):
     @mock.patch('octavia.amphorae.backends.utils.haproxy_query.HAProxyQuery')
     def test_start_stop_lb(self, mock_haproxy_query, mock_check_output,
                            mock_lb_exists, mock_path_exists, mock_vrrp_update,
-                           mock_check_status):
+                           mock_check_status, mock_LOG, mock_time_sleep):
         listener_id = uuidutils.generate_uuid()
 
         conf = self.useFixture(oslo_fixture.Config(cfg.CONF))
@@ -208,6 +210,64 @@ class ListenerTestCase(base.TestCase):
         mock_vrrp_update.assert_not_called()
         mock_check_output.assert_not_called()
 
+        # haproxy error on reload
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            None]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = [Exception("error"),
+                                            {'Uptime_sec': 5}]
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(202, result.status_code)
+
+        LOG_last_call = mock_LOG.mock_calls[-1]
+        self.assertIn('An error occurred with haproxy', LOG_last_call[1][0])
+
+        # haproxy error on reload - retry limit
+        mock_check_output.reset_mock()
+        mock_lb_exists.reset_mock()
+        mock_path_exists.reset_mock()
+        mock_vrrp_update.reset_mock()
+        mock_check_status.reset_mock()
+        mock_LOG.reset_mock()
+
+        mock_check_output.side_effect = [
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service'),
+            subprocess.CalledProcessError(
+                output=b'haproxy.service is not active, cannot reload.',
+                returncode=-2, cmd='service')]
+        mock_check_status.return_value = 'ACTIVE'
+        mock_check_status.side_effect = None
+
+        mock_query = mock.Mock()
+        mock_haproxy_query.return_value = mock_query
+        mock_query.show_info.side_effect = Exception("error")
+
+        result = self.test_loadbalancer.start_stop_lb(listener_id, 'reload')
+        self.assertEqual(500, result.status_code)
+        self.assertEqual('Error reloading haproxy', result.json['message'])
+        self.assertEqual('haproxy.service is not active, cannot reload.',
+                         result.json['details'])
+
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
                 'config_path')
     @mock.patch('octavia.amphorae.backends.agent.api_server.util.'
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Added a workaround that prevents the listener PUT API call from failing
+    if haproxy crashes during a reload. The amphora-agent ensures that in
+    case of a crash, haproxy is correctly restarted and ready to accept
+    incoming requests (see https://bugs.launchpad.net/octavia/+bug/2054666).