Gracefully restart dnsmasq to not break tcp DNS

When talking to dnsmasq using DNS over TCP, dnsmasq will fork child
processes to handle the TCP connections. Forked processes will stay
around until all of their connections have been closed, meaning that
dangling connections will keep the processes alive and with that will
also keep the tcp/53 port in listening state. On dnsmasq restart (e.g.
on network update, subnet create, ...) the parent process is killed
with SIGKILL and a new process is started. This new process cannot
listen on tcp/53, as the port is still in use by the old child with the
dangling connection.

To prevent dangling dnsmasq connections on TCP we need to properly
shut down the child. This is done by first sending SIGTERM and only
sending SIGKILL if the process does not shut down properly. With that
we get proper cleanup of all children, and TCP will come up after a
restart.

Change-Id: Ie633148c512f5124e978648c50a4c6318c61baa8
Closes-bug: #1998621
This commit is contained in:
Sebastian Lohff 2022-12-02 17:36:44 +01:00
parent 16399a2ce5
commit 74224e79e0
2 changed files with 45 additions and 2 deletions

View File

@ -21,6 +21,7 @@ import itertools
import os import os
import re import re
import shutil import shutil
import signal
import time import time
import netaddr import netaddr
@ -45,6 +46,7 @@ from neutron.ipam import utils as ipam_utils
from neutron.privileged.agent.linux import dhcp as priv_dhcp from neutron.privileged.agent.linux import dhcp as priv_dhcp
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
SIGTERM_TIMEOUT = 5
DNS_PORT = 53 DNS_PORT = 53
WIN2k3_STATIC_DNS = 249 WIN2k3_STATIC_DNS = 249
@ -349,9 +351,18 @@ class DhcpLocalProcess(DhcpBase, metaclass=abc.ABCMeta):
def disable(self, retain_port=False, block=False): def disable(self, retain_port=False, block=False):
"""Disable DHCP for this network by killing the local process.""" """Disable DHCP for this network by killing the local process."""
self.process_monitor.unregister(self.network.id, DNSMASQ_SERVICE_NAME) self.process_monitor.unregister(self.network.id, DNSMASQ_SERVICE_NAME)
self._get_process_manager().disable() pm = self._get_process_manager()
pm.disable(sig=str(int(signal.SIGTERM)))
if block: if block:
common_utils.wait_until_true(lambda: not self.active) try:
common_utils.wait_until_true(lambda: not self.active,
timeout=SIGTERM_TIMEOUT)
except common_utils.WaitTimeout:
LOG.warning('dnsmasq process %s did not finish after SIGTERM '
'signal in %s seconds, sending SIGKILL signal',
pm.pid, SIGTERM_TIMEOUT)
pm.disable(sig=str(int(signal.SIGKILL)))
common_utils.wait_until_true(lambda: not self.active)
self._del_running_interface(self.interface_name) self._del_running_interface(self.interface_name)
if not retain_port: if not retain_port:
self._destroy_namespace_and_port() self._destroy_namespace_and_port()

View File

@ -15,6 +15,7 @@
import copy import copy
import os import os
import signal
from unittest import mock from unittest import mock
import netaddr import netaddr
@ -32,6 +33,7 @@ import testtools
from neutron.agent.linux import dhcp from neutron.agent.linux import dhcp
from neutron.agent.linux import ip_lib from neutron.agent.linux import ip_lib
from neutron.cmd import runtime_checks as checks from neutron.cmd import runtime_checks as checks
from neutron.common import utils as common_utils
from neutron.conf.agent import common as config from neutron.conf.agent import common as config
from neutron.conf.agent import dhcp as dhcp_config from neutron.conf.agent import dhcp as dhcp_config
from neutron.conf import common as base_config from neutron.conf import common as base_config
@ -1272,6 +1274,36 @@ class TestDhcpLocalProcess(TestBase):
parent.assert_has_calls(expected) parent.assert_has_calls(expected)
delete_ns.assert_called_with('qdhcp-ns') delete_ns.assert_called_with('qdhcp-ns')
@mock.patch.object(common_utils, 'wait_until_true')
def test_disable_blocking(self, mock_wait_until):
    """A blocking disable() sends only SIGTERM when the wait succeeds.

    wait_until_true is patched with a mock that returns immediately,
    so the WaitTimeout handler in disable() is never entered.
    """
    expected_sig = str(int(signal.SIGTERM))
    local_process = LocalChild(self.conf, FakeDualNetwork())
    process_manager = mock.Mock()
    # Build both patchers up front; they only take effect inside the
    # ``with`` block below, in the same order as they are listed.
    patch_delete_ns = mock.patch(
        'neutron.agent.linux.ip_lib.delete_network_namespace')
    patch_get_pm = mock.patch.object(
        dhcp.DhcpLocalProcess, '_get_process_manager',
        return_value=process_manager)
    with patch_delete_ns, patch_get_pm:
        local_process.disable(block=True)
    # Exactly one wait and exactly one signal (SIGTERM) are expected.
    self.assertEqual(1, mock_wait_until.call_count)
    process_manager.disable.assert_called_once_with(sig=expected_sig)
@mock.patch.object(common_utils, 'wait_until_true')
def test_disable_blocking_sigterm_sigkill(self, mock_wait_until):
    """A blocking disable() escalates to SIGKILL after a wait timeout.

    The first patched wait raises WaitTimeout, the second returns
    None, so disable() is expected to signal the process manager
    twice: SIGTERM first, then SIGKILL.
    """
    mock_wait_until.side_effect = [common_utils.WaitTimeout, None]
    local_process = LocalChild(self.conf, FakeDualNetwork())
    process_manager = mock.Mock()
    # Build both patchers up front; they only take effect inside the
    # ``with`` block below, in the same order as they are listed.
    patch_delete_ns = mock.patch(
        'neutron.agent.linux.ip_lib.delete_network_namespace')
    patch_get_pm = mock.patch.object(
        dhcp.DhcpLocalProcess, '_get_process_manager',
        return_value=process_manager)
    with patch_delete_ns, patch_get_pm:
        local_process.disable(block=True)
    self.assertEqual(2, mock_wait_until.call_count)
    expected_signals = [
        mock.call(sig=str(int(signal.SIGTERM))),
        mock.call(sig=str(int(signal.SIGKILL))),
    ]
    process_manager.disable.assert_has_calls(expected_signals)
def test_get_interface_name(self): def test_get_interface_name(self):
net = FakeDualNetwork() net = FakeDualNetwork()
path = '/dhcp/%s/interface' % net.id path = '/dhcp/%s/interface' % net.id