Gracefully restart dnsmasq to not break tcp DNS

When talking to dnsmasq using DNS over tcp dnsmasq will fork out for
TCP connections. Forked processes will stay until all connections have
been closed, meaning that dangling connections will keep the processes
and with that will also keep the tcp/53 port in listening state. On
dnsmasq restart (e.g. on network update, subnet create, ...) the parent
process is killed with SIGKILL and a new process is started. This new
process cannot listen on tcp/53, as it is still in use by the old child
with the dangling connection.

To prevent dangling dnsmasq connections on tcp we need to properly
shutdown the child. This is done by first sending SIGTERM and only send
a SIGKILL if the process is not shutting down properly. With that we
get proper cleanup of all children and tcp will come up after a restart.

Change-Id: Ie633148c512f5124e978648c50a4c6318c61baa8
Closes-bug: #1998621
This commit is contained in:
Sebastian Lohff 2022-12-02 17:36:44 +01:00
parent 16399a2ce5
commit 74224e79e0
2 changed files with 45 additions and 2 deletions

View File

@ -21,6 +21,7 @@ import itertools
import os
import re
import shutil
import signal
import time
import netaddr
@ -45,6 +46,7 @@ from neutron.ipam import utils as ipam_utils
from neutron.privileged.agent.linux import dhcp as priv_dhcp
LOG = logging.getLogger(__name__)
SIGTERM_TIMEOUT = 5
DNS_PORT = 53
WIN2k3_STATIC_DNS = 249
@ -349,9 +351,18 @@ class DhcpLocalProcess(DhcpBase, metaclass=abc.ABCMeta):
def disable(self, retain_port=False, block=False):
"""Disable DHCP for this network by killing the local process."""
self.process_monitor.unregister(self.network.id, DNSMASQ_SERVICE_NAME)
self._get_process_manager().disable()
pm = self._get_process_manager()
pm.disable(sig=str(int(signal.SIGTERM)))
if block:
common_utils.wait_until_true(lambda: not self.active)
try:
common_utils.wait_until_true(lambda: not self.active,
timeout=SIGTERM_TIMEOUT)
except common_utils.WaitTimeout:
LOG.warning('dnsmasq process %s did not finish after SIGTERM '
'signal in %s seconds, sending SIGKILL signal',
pm.pid, SIGTERM_TIMEOUT)
pm.disable(sig=str(int(signal.SIGKILL)))
common_utils.wait_until_true(lambda: not self.active)
self._del_running_interface(self.interface_name)
if not retain_port:
self._destroy_namespace_and_port()

View File

@ -15,6 +15,7 @@
import copy
import os
import signal
from unittest import mock
import netaddr
@ -32,6 +33,7 @@ import testtools
from neutron.agent.linux import dhcp
from neutron.agent.linux import ip_lib
from neutron.cmd import runtime_checks as checks
from neutron.common import utils as common_utils
from neutron.conf.agent import common as config
from neutron.conf.agent import dhcp as dhcp_config
from neutron.conf import common as base_config
@ -1272,6 +1274,36 @@ class TestDhcpLocalProcess(TestBase):
parent.assert_has_calls(expected)
delete_ns.assert_called_with('qdhcp-ns')
@mock.patch.object(common_utils, 'wait_until_true')
def test_disable_blocking(self, mock_wait_until):
lp = LocalChild(self.conf, FakeDualNetwork())
mock_pm = mock.Mock()
with mock.patch('neutron.agent.linux.ip_lib.'
'delete_network_namespace'), \
mock.patch.object(dhcp.DhcpLocalProcess,
'_get_process_manager',
return_value=mock_pm):
lp.disable(block=True)
self.assertEqual(1, mock_wait_until.call_count)
mock_pm.disable.assert_called_once_with(sig=str(int(signal.SIGTERM)))
@mock.patch.object(common_utils, 'wait_until_true')
def test_disable_blocking_sigterm_sigkill(self, mock_wait_until):
mock_wait_until.side_effect = [common_utils.WaitTimeout, None]
lp = LocalChild(self.conf, FakeDualNetwork())
mock_pm = mock.Mock()
with mock.patch('neutron.agent.linux.ip_lib.'
'delete_network_namespace'), \
mock.patch.object(dhcp.DhcpLocalProcess,
'_get_process_manager',
return_value=mock_pm):
lp.disable(block=True)
self.assertEqual(2, mock_wait_until.call_count)
mock_pm.disable.assert_has_calls([
mock.call(sig=str(int(signal.SIGTERM))),
mock.call(sig=str(int(signal.SIGKILL)))])
def test_get_interface_name(self):
net = FakeDualNetwork()
path = '/dhcp/%s/interface' % net.id