Manage cleanup of .ctl/.pid files for LibreSwan

LibreSwan checks for the presence of pid/ctl files when starting up and
will error out if they already exist. However, LibreSwan's usage of the
capabilities library removes the access required to cleanup .ctl and
.pid files on shutdown if any of the directories in the path are missing
explicit permissions for root. This is not considered a bug by the
LibreSwan maintainers, so the LibreSwan driver must work around it by
checking if it is okay to remove the files and removing them on
startup. It must also wait for shutdown to complete before restarting
the daemon on LibreSwanProcess.restart().

Introduces new configuration for retrying check for process shutdown.

DocImpact


Change-Id: I5c215d70c348524979b740f882029f74e400e6d7
Closes-Bug: #1331502
This commit is contained in:
Brent Eagles 2015-08-13 13:35:36 -02:30
parent af903a2fad
commit 00b633d284
3 changed files with 315 additions and 2 deletions

View File

@ -25,3 +25,14 @@
# default_config_area=/usr/share/strongswan/templates/config/strongswan.d
# Default is for ubuntu use, /etc/strongswan.d
# default_config_area=/etc/strongswan.d
[libreswan]
# Initial interval in seconds for checking if pluto daemon is shutdown
# shutdown_check_timeout=1
#
# The maximum number of retries for checking for pluto daemon shutdown
# shutdown_check_retries=5
#
# A factor to increase the retry interval for each retry
# shutdown_check_back_off=1.5

View File

@ -12,8 +12,37 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import os
import os.path
import eventlet
from neutron.i18n import _LE
from neutron.i18n import _LW
from oslo_config import cfg
from oslo_log import log as logging
from neutron_vpnaas.services.vpn.device_drivers import ipsec
LOG = logging.getLogger(__name__)
libreswan_opts = [
cfg.IntOpt('shutdown_check_timeout',
default=1,
help=_('Initial interval in seconds for checking if pluto '
'daemon is shutdown')),
cfg.IntOpt('shutdown_check_retries',
default=5,
help=_('The maximum number of retries for checking for '
'pluto daemon shutdown')),
cfg.FloatOpt('shutdown_check_back_off',
default=1.5,
help=_('A factor to increase the retry interval for '
'each retry'))
]
cfg.CONF.register_opts(libreswan_opts, 'libreswan')
class LibreSwanProcess(ipsec.OpenSwanProcess):
"""Libreswan Process manager class.
@ -23,6 +52,7 @@ class LibreSwanProcess(ipsec.OpenSwanProcess):
def __init__(self, conf, process_id, vpnservice, namespace):
super(LibreSwanProcess, self).__init__(conf, process_id,
vpnservice, namespace)
self.pid_file = '%s.pid' % self.pid_path
def ensure_configs(self):
"""Generate config files which are needed for Libreswan.
@ -40,6 +70,96 @@ class LibreSwanProcess(ipsec.OpenSwanProcess):
except RuntimeError:
self._execute([self.binary, 'initnss', self.etc_dir])
def _process_running(self):
"""Checks if process is still running."""
# If no PID file, we assume the process is not running.
if not os.path.exists(self.pid_file):
return False
try:
# We take an ask-forgiveness-not-permission approach and rely
# on throwing to tell us something. If the pid file exists,
# delve into the process information and check if it matches
# our expected command line.
with open(self.pid_file, 'r') as f:
pid = f.readline().strip()
with open('/proc/%s/cmdline' % pid) as cmd_line_file:
cmd_line = cmd_line_file.readline()
if self.pid_path in cmd_line and 'pluto' in cmd_line:
# Okay the process is probably a libreswan process
# and it contains the pid_path in the command
# line... could be a race. Log to error and return
# that it is *NOT* okay to clean up files. We are
# logging to error instead of debug because it
# indicates something bad has happened and this is
# valuable information for figuring it out.
LOG.error(_LE('Process %(pid)s exists with command '
'line %(cmd_line)s.') %
{'pid': pid, 'cmd_line': cmd_line})
return True
except IOError as e:
LOG.error(_LE('Unable to check control files on startup for '
'router %(router)s: %(msg)s'),
{'router': self.id, 'msg': e})
return False
def _cleanup_control_files(self):
try:
ctl_file = '%s.ctl' % self.pid_path
LOG.debug('Removing %(pidfile)s and %(ctlfile)s',
{'pidfile': self.pid_file,
'ctlfile': '%s.ctl' % ctl_file})
if os.path.exists(self.pid_file):
os.remove(self.pid_file)
if os.path.exists(ctl_file):
os.remove(ctl_file)
except OSError as e:
LOG.error(_LE('Unable to remove libreswan control '
'files for router %(router)s. %(msg)s'),
{'router': self.id, 'msg': e})
def start(self):
# NOTE: The restart operation calls the parent's start() instead of
# this one to avoid having to special case the startup file check.
# If anything is added to this method that needs to run whenever
# a restart occurs, it should be either added to the restart()
# override or things refactored to special-case start() when
# called from restart().
# LibreSwan's use of the capablities library may prevent the ctl
# and pid files from being cleaned up, so we check to see if the
# process is running and if not, attempt a cleanup. In either case
# we fall through to allow the LibreSwan process to start or fail
# in the usual way.
if not self._process_running():
self._cleanup_control_files()
super(LibreSwanProcess, self).start()
def restart(self):
# stop() followed immediately by a start() runs the risk that the
# current pluto daemon has not had a chance to shutdown. We check
# the current process information to see if the daemon is still
# running and if so, wait a short interval and retry.
self.stop()
wait_interval = cfg.CONF.libreswan.shutdown_check_timeout
for i in range(cfg.CONF.libreswan.shutdown_check_retries):
if not self._process_running():
self._cleanup_control_files()
break
eventlet.sleep(wait_interval)
wait_interval *= cfg.CONF.libreswan.shutdown_check_back_off
else:
LOG.warning(_LW('Server appears to still be running, restart '
'of router %s may fail'), self.id)
super(LibreSwanProcess, self).start()
class LibreSwanDriver(ipsec.IPsecDriver):
def create_process(self, process_id, vpnservice, namespace):

View File

@ -14,6 +14,7 @@
# under the License.
import copy
import difflib
import io
import mock
import socket
@ -794,14 +795,195 @@ class TestOpenSwanProcess(BaseIPsecDeviceDriver):
class TestLibreSwanProcess(base.BaseTestCase):
_test_timeout = 1
_test_backoff = 2
_test_retries = 5
def setUp(self):
super(TestLibreSwanProcess, self).setUp()
# Insulate tests against changes to configuration defaults.
cfg.CONF.register_opts(libreswan_ipsec.libreswan_opts,
'libreswan')
cfg.CONF.set_override('shutdown_check_timeout', self._test_timeout,
group='libreswan')
cfg.CONF.set_override('shutdown_check_back_off', self._test_backoff,
group='libreswan')
cfg.CONF.set_override('shutdown_check_retries', self._test_retries,
group='libreswan')
self.addCleanup(cfg.CONF.reset)
self.vpnservice = copy.deepcopy(FAKE_VPN_SERVICE)
self.parent_start = mock.patch('neutron_vpnaas.services.'
'vpn.device_drivers.ipsec.'
'OpenSwanProcess.start').start()
self.parent_stop = mock.patch('neutron_vpnaas.services.'
'vpn.device_drivers.ipsec.'
'OpenSwanProcess.stop').start()
self.os_remove = mock.patch('os.remove').start()
self.ipsec_process = libreswan_ipsec.LibreSwanProcess(mock.ANY,
'foo-process-id',
self.vpnservice,
mock.ANY)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
def test_no_cleanups(self, cleanup_mock):
# Not an "awesome test" but more of a check box item. Basically,
# what happens if we didn't need to clean up any files.
with mock.patch.object(self.ipsec_process,
'_process_running',
return_value=True) as query_mock:
self.ipsec_process.start()
self.assertEqual(1, self.parent_start.call_count)
self.assertEqual(1, query_mock.call_count)
# This is really what is being tested here. If process is
# running, we shouldn't attempt a cleanup.
self.assertFalse(cleanup_mock.called)
@mock.patch('os.path.exists', return_value=True)
def test_cleanup_files(self, exists_mock):
# Tests the 'bones' of things really and kind of check-box-item-bogus
# test - this really needs exercising through a higher level test.
with mock.patch.object(self.ipsec_process,
'_process_running',
return_value=False) as query_mock:
fake_path = '/fake/path/run'
self.ipsec_process.pid_path = fake_path
self.ipsec_process.pid_file = '%s.pid' % fake_path
self.ipsec_process.start()
self.assertEqual(1, self.parent_start.call_count)
self.assertEqual(1, query_mock.call_count)
self.assertEqual(2, self.os_remove.call_count)
self.os_remove.assert_has_calls([mock.call('%s.pid' % fake_path),
mock.call('%s.ctl' % fake_path)])
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._process_running',
return_value=False)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
@mock.patch('eventlet.sleep')
def test_restart_process_not_running(self, sleep_mock, cleanup_mock,
query_mock):
self.ipsec_process.restart()
# Lame checks that are really for sanity
self.assertTrue(self.parent_stop.called)
self.assertTrue(self.parent_start.called)
# Really what is being tested - retry configuration exists and that
# we do the right things when process check is false.
self.assertTrue(query_mock.called)
self.assertTrue(cleanup_mock.called)
self.assertFalse(sleep_mock.called)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._process_running',
return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
@mock.patch('eventlet.sleep')
def test_restart_process_doesnt_stop(self, sleep_mock, cleanup_mock,
query_mock):
self.ipsec_process.restart()
# Lame checks that are really for sanity
self.assertTrue(self.parent_stop.called)
self.assertTrue(self.parent_start.called)
# Really what is being tested - retry configuration exists and that
# we do the right things when process check is True.
self.assertEqual(5, query_mock.call_count)
self.assertFalse(cleanup_mock.called)
self.assertEqual(5, sleep_mock.call_count)
calls = [mock.call(1), mock.call(2), mock.call(4),
mock.call(8), mock.call(16)]
sleep_mock.assert_has_calls(calls)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._process_running',
side_effect=[True, True, False])
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
@mock.patch('eventlet.sleep')
def test_restart_process_retry_until_stop(self, sleep_mock, cleanup_mock,
query_mock):
self.ipsec_process.restart()
# Lame checks that are really for sanity
self.assertTrue(self.parent_start.called)
self.assertTrue(self.parent_stop.called)
# Really what is being tested - retry configuration exists and that
# we do the right things when process check is True a few times and
# then returns False.
self.assertEqual(3, query_mock.call_count)
self.assertTrue(cleanup_mock.called)
self.assertEqual(2, sleep_mock.call_count)
def test_process_running_no_pid(self):
with mock.patch('os.path.exists', return_value=False):
self.assertFalse(
self.ipsec_process._process_running())
# open() is used elsewhere, so we need to inject a mocked open into the
# module to be tested.
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
create=True,
side_effect=IOError)
def test_process_running_open_failure(self, mock_open, mock_exists):
self.assertFalse(self.ipsec_process._process_running())
self.assertTrue(mock_exists.called)
self.assertTrue(mock_open.called)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
create=True,
side_effect=[io.StringIO(u'invalid'),
IOError])
def test_process_running_bogus_pid(self, mock_open, mock_exists):
with mock.patch.object(libreswan_ipsec.LOG, 'error'):
self.assertFalse(self.ipsec_process._process_running())
self.assertTrue(mock_exists.called)
self.assertEqual(2, mock_open.call_count)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
create=True,
side_effect=[io.StringIO(u'134'), io.StringIO(u'')])
def test_process_running_no_cmdline(self, mock_open, mock_exists):
with mock.patch.object(libreswan_ipsec.LOG, 'error') as log_mock:
self.assertFalse(self.ipsec_process._process_running())
self.assertFalse(log_mock.called)
self.assertEqual(2, mock_open.call_count)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
create=True,
side_effect=[io.StringIO(u'134'), io.StringIO(u'ps ax')])
def test_process_running_cmdline_mismatch(self, mock_open, mock_exists):
with mock.patch.object(libreswan_ipsec.LOG, 'error') as log_mock:
self.assertFalse(self.ipsec_process._process_running())
self.assertFalse(log_mock.called)
self.assertEqual(2, mock_open.call_count)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
create=True,
side_effect=[io.StringIO(u'134'),
io.StringIO(u'/usr/libexec/ipsec/pluto -ctlbase'
'/some/foo/path')])
def test_process_running_cmdline_match(self, mock_open, mock_exists):
self.ipsec_process.pid_path = '/some/foo/path'
with mock.patch.object(libreswan_ipsec.LOG, 'error') as log_mock:
self.assertTrue(self.ipsec_process._process_running())
self.assertTrue(log_mock.called)
def test_ensure_configs(self):
openswan_ipsec.OpenSwanProcess.ensure_configs = mock.Mock()
with mock.patch.object(self.ipsec_process, '_execute') as fake_execute:
@ -810,7 +992,7 @@ class TestLibreSwanProcess(base.BaseTestCase):
mock.call(['ipsec', 'checknss',
self.ipsec_process.etc_dir])]
fake_execute.assert_has_calls(expected)
self.assertEqual(fake_execute.call_count, 2)
self.assertEqual(2, fake_execute.call_count)
with mock.patch.object(self.ipsec_process, '_execute') as fake_execute:
fake_execute.side_effect = [None, RuntimeError, None]
@ -821,7 +1003,7 @@ class TestLibreSwanProcess(base.BaseTestCase):
mock.call(['ipsec', 'initnss',
self.ipsec_process.etc_dir])]
fake_execute.assert_has_calls(expected)
self.assertEqual(fake_execute.call_count, 3)
self.assertEqual(3, fake_execute.call_count)
class IPsecStrongswanDeviceDriverLegacy(IPSecDeviceLegacy):