Fix driver-agent cleanup

The cleanup function (that handles graceful shutdown) of the Octavia
driver-agent didn't work properly, the threads that were spawned for
stopping the Unix listeners were stuck (deadlock), so the threads leaked
and the join functions in the caller didn't return.

Now the code uses a thread for handling the connections, and the
shutdown function is called from the process.

Story 2010052
Task 45468

Change-Id: Ida33a1a24eab2706902b1ce3cca704d51fa6a599
This commit is contained in:
Gregory Thiemonge 2022-05-25 10:08:11 +02:00
parent 8a65153f08
commit 51eef0d6d7
3 changed files with 40 additions and 42 deletions

View File

@ -113,59 +113,53 @@ def _cleanup_socket_file(filename):
def status_listener(exit_event):
_cleanup_socket_file(CONF.driver_agent.status_socket_path)
server = ForkingUDSServer(CONF.driver_agent.status_socket_path,
StatusRequestHandler)
with ForkingUDSServer(CONF.driver_agent.status_socket_path,
StatusRequestHandler) as server:
server.timeout = CONF.driver_agent.status_request_timeout
server.max_children = CONF.driver_agent.status_max_processes
server.timeout = CONF.driver_agent.status_request_timeout
server.max_children = CONF.driver_agent.status_max_processes
threading.Thread(target=server.serve_forever).start()
while not exit_event.is_set():
server.handle_request()
exit_event.wait()
LOG.info('Waiting for driver status listener to shutdown...')
# Can't shut ourselves down as we would deadlock, spawn a thread
threading.Thread(target=server.shutdown).start()
LOG.info('Driver status listener shutdown finished.')
server.server_close()
LOG.info('Waiting for driver status listener to shutdown...')
server.shutdown()
LOG.info('Driver status listener shutdown finished.')
_cleanup_socket_file(CONF.driver_agent.status_socket_path)
def stats_listener(exit_event):
_cleanup_socket_file(CONF.driver_agent.stats_socket_path)
server = ForkingUDSServer(CONF.driver_agent.stats_socket_path,
StatsRequestHandler)
with ForkingUDSServer(CONF.driver_agent.stats_socket_path,
StatsRequestHandler) as server:
server.timeout = CONF.driver_agent.stats_request_timeout
server.max_children = CONF.driver_agent.stats_max_processes
server.timeout = CONF.driver_agent.stats_request_timeout
server.max_children = CONF.driver_agent.stats_max_processes
threading.Thread(target=server.serve_forever).start()
while not exit_event.is_set():
server.handle_request()
exit_event.wait()
LOG.info('Waiting for driver statistics listener to shutdown...')
# Can't shut ourselves down as we would deadlock, spawn a thread
threading.Thread(target=server.shutdown).start()
LOG.info('Driver statistics listener shutdown finished.')
server.server_close()
LOG.info('Waiting for driver statistics listener to shutdown...')
server.shutdown()
LOG.info('Driver statistics listener shutdown finished.')
_cleanup_socket_file(CONF.driver_agent.stats_socket_path)
def get_listener(exit_event):
_cleanup_socket_file(CONF.driver_agent.get_socket_path)
server = ForkingUDSServer(CONF.driver_agent.get_socket_path,
GetRequestHandler)
with ForkingUDSServer(CONF.driver_agent.get_socket_path,
GetRequestHandler) as server:
server.timeout = CONF.driver_agent.get_request_timeout
server.max_children = CONF.driver_agent.get_max_processes
server.timeout = CONF.driver_agent.get_request_timeout
server.max_children = CONF.driver_agent.get_max_processes
threading.Thread(target=server.serve_forever).start()
while not exit_event.is_set():
server.handle_request()
exit_event.wait()
LOG.info('Waiting for driver get listener to shutdown...')
# Can't shut ourselves down as we would deadlock, spawn a thread
threading.Thread(target=server.shutdown).start()
LOG.info('Driver get listener shutdown finished.')
server.server_close()
LOG.info('Waiting for driver get listener to shutdown...')
server.shutdown()
LOG.info('Driver get listener shutdown finished.')
_cleanup_socket_file(CONF.driver_agent.get_socket_path)
LOG.info("UDS server was closed and socket was cleaned up.")

View File

@ -156,13 +156,12 @@ class TestDriverListener(base.TestCase):
'a' * CONF.driver_agent.status_max_processes, 'a',
'a' * 1000, ''])
type(mock_server).active_children = mock_active_children
mock_forking_server.return_value = mock_server
mock_forking_server.return_value.__enter__.return_value = mock_server
mock_exit_event = mock.MagicMock()
mock_exit_event.is_set.side_effect = [False, False, False, False, True]
driver_listener.status_listener(mock_exit_event)
mock_server.handle_request.assert_called()
mock_server.server_close.assert_called_once()
mock_server.serve_forever.assert_called()
self.assertEqual(2, mock_cleanup.call_count)
@mock.patch('octavia.api.drivers.driver_agent.driver_listener.'
@ -176,13 +175,12 @@ class TestDriverListener(base.TestCase):
'a' * CONF.driver_agent.status_max_processes, 'a',
'a' * 1000, ''])
type(mock_server).active_children = mock_active_children
mock_forking_server.return_value = mock_server
mock_forking_server.return_value.__enter__.return_value = mock_server
mock_exit_event = mock.MagicMock()
mock_exit_event.is_set.side_effect = [False, False, False, False, True]
driver_listener.stats_listener(mock_exit_event)
mock_server.handle_request.assert_called()
mock_server.server_close.assert_called_once()
mock_server.serve_forever.assert_called()
@mock.patch('octavia.api.drivers.driver_agent.driver_listener.'
'_cleanup_socket_file')
@ -195,10 +193,9 @@ class TestDriverListener(base.TestCase):
'a' * CONF.driver_agent.status_max_processes, 'a',
'a' * 1000, ''])
type(mock_server).active_children = mock_active_children
mock_forking_server.return_value = mock_server
mock_forking_server.return_value.__enter__.return_value = mock_server
mock_exit_event = mock.MagicMock()
mock_exit_event.is_set.side_effect = [False, False, False, False, True]
driver_listener.get_listener(mock_exit_event)
mock_server.handle_request.assert_called()
mock_server.server_close.assert_called_once()
mock_server.serve_forever.assert_called()

View File

@ -0,0 +1,7 @@
---
fixes:
- |
Fix the shutdown of the driver-agent, the process might have been stuck
while waiting for threads to finish. Systemd would have killed the process
after a timeout, but some children processes might have leaked on the
controllers.