Kill the vrrp orphan process when (re)spawn keepalived

When keepalived crashed unexpectedly, the vrrp process that
it associates with will be orphan process. This will make
the VIP unable to migrate to the router in the same host.
Also, neutron code is not able to respawn the keepalived
process, because keepalived thinks itself is still running,
according to [1-3]. As a result, neutron will report respawning
keepalived all the time. Restart l3-agent will not help.

This patch will check and delete the orphan vrrp process
if there is any, in the processmonitor of l3 agent.

More details can be found in the bug description and comments.

[1] https://goo.gl/W3GL9I
[2] https://goo.gl/F0Ixfb
[3] https://goo.gl/dUqhTo

Change-Id: Ia1759ed1365b845d404686a8cd25f882cce35caf
Closes-Bug: #1511311
This commit is contained in:
Hong Hui Xiao 2015-11-04 01:44:43 -05:00
parent 681e1ff1af
commit 2d1b53bcfa
2 changed files with 44 additions and 30 deletions

View File

@ -383,19 +383,18 @@ class KeepalivedManager(object):
def spawn(self):
config_path = self._output_config_file()
def callback(pid_file):
cmd = ['keepalived', '-P',
'-f', config_path,
'-p', pid_file,
'-r', '%s-vrrp' % pid_file]
return cmd
keepalived_pm = self.get_process()
vrrp_pm = self._get_vrrp_process(
'%s-vrrp' % keepalived_pm.get_pid_file_name())
pm = self.get_process(callback=callback)
pm.enable(reload_cfg=True)
keepalived_pm.default_cmd_callback = (
self._get_keepalived_process_callback(vrrp_pm, config_path))
keepalived_pm.enable(reload_cfg=True)
self.process_monitor.register(uuid=self.resource_id,
service_name=KEEPALIVED_SERVICE_NAME,
monitored_process=pm)
monitored_process=keepalived_pm)
LOG.debug('Keepalived spawned with config %s', config_path)
@ -413,3 +412,27 @@ class KeepalivedManager(object):
self.namespace,
pids_path=self.conf_path,
default_cmd_callback=callback)
def _get_vrrp_process(self, pid_file):
return external_process.ProcessManager(
cfg.CONF,
self.resource_id,
self.namespace,
pid_file=pid_file)
def _get_keepalived_process_callback(self, vrrp_pm, config_path):
def callback(pid_file):
# If keepalived process crashed unexpectedly, the vrrp process
# will be orphan and prevent keepalived process to be spawned.
# A check here will let the l3-agent to kill the orphan process
# and spawn keepalived successfully.
if vrrp_pm.active:
vrrp_pm.disable()
cmd = ['keepalived', '-P',
'-f', config_path,
'-p', pid_file,
'-r', '%s-vrrp' % pid_file]
return cmd
return callback

View File

@ -13,12 +13,8 @@
# License for the specific language governing permissions and limitations
# under the License.
from distutils import version
import re
from oslo_config import cfg
from oslo_log import log as logging
import testtools
from neutron._i18n import _
from neutron.agent.linux import external_process
@ -31,18 +27,6 @@ from neutron.tests.unit.agent.linux import test_keepalived
LOG = logging.getLogger(__name__)
def keepalived_version_not_supported():
try:
cmd = ['keepalived', '--version']
out = utils.execute(cmd, return_stderr=True)
ver = re.search(r"Keepalived v(\d+\.\d+\.\d+)", out[1]).group(1)
return version.LooseVersion(ver) >= version.LooseVersion("1.2.11")
except (OSError, RuntimeError, IndexError, ValueError) as e:
LOG.debug("Exception while checking keepalived version. "
"Exception: %s", e)
return False
class KeepalivedManagerTestCase(base.BaseTestCase,
test_keepalived.KeepalivedConfBaseMixin):
@ -70,8 +54,7 @@ class KeepalivedManagerTestCase(base.BaseTestCase,
self.assertEqual(self.expected_config.get_config_str(),
self.manager.get_conf_on_disk())
@testtools.skipIf(keepalived_version_not_supported(), 'bug/1511311')
def test_keepalived_respawns(self):
def _test_keepalived_respawns(self, normal_exit=True):
self.manager.spawn()
process = self.manager.get_process()
pid = process.pid
@ -81,11 +64,19 @@ class KeepalivedManagerTestCase(base.BaseTestCase,
sleep=0.01,
exception=RuntimeError(_("Keepalived didn't spawn")))
# force process crash, and see that when it comes back
# it's indeed a different process
utils.execute(['kill', '-9', pid], run_as_root=True)
exit_code = '-15' if normal_exit else '-9'
# Exit the process, and see that when it comes back
# It's indeed a different process
utils.execute(['kill', exit_code, pid], run_as_root=True)
utils.wait_until_true(
lambda: process.active and pid != process.pid,
timeout=5,
sleep=0.01,
exception=RuntimeError(_("Keepalived didn't respawn")))
def test_keepalived_respawns(self):
self._test_keepalived_respawns()
def test_keepalived_respawn_with_unexpected_exit(self):
self._test_keepalived_respawns(False)