Add janitor to cleanup orphaned fip ports

This adds a janitor worker to the L3 DB module that
will run every 5 minutes looking for floating IP ports
with the device_id of 'PENDING'. If it finds any, it
will keep track of the port ID to see if any stay in
'PENDING' with the next iteration.

If the device ID is still PENDING after 5 minutes, it
means one of two things has happened. Either the server
died after creating the floating IP port, but before
creating the floating IP itself; or, it died after creating
the floating IP port and the floating IP record, but before
updating the device_id of the floating IP port to the
floating IP ID.

The janitor handles both cases by deleting the floating IP
port if it has no associated floating IP and by updating
the floating IP port device ID if it does have an associated
floating IP.

Related-Bug: #1540844
Closes-Bug: #1648098
Change-Id: I684a822553a5a0c54513ca7d20ccaf3c74180593
This commit is contained in:
Kevin Benton 2016-11-14 19:05:16 -08:00 committed by Brian Haley
parent b8347e16d1
commit 6948467b77
3 changed files with 146 additions and 4 deletions
neutron
db
tests/unit/extensions
releasenotes/notes

@ -14,6 +14,7 @@
import functools
import itertools
import random
from debtcollector import removals
import netaddr
@ -28,7 +29,7 @@ import six
from sqlalchemy import orm
from sqlalchemy.orm import exc
from neutron._i18n import _, _LI
from neutron._i18n import _, _LE, _LI, _LW
from neutron.api.rpc.agentnotifiers import l3_rpc_agent_api
from neutron.callbacks import events
from neutron.callbacks import exceptions
@ -39,6 +40,7 @@ from neutron.common import constants as n_const
from neutron.common import ipv6_utils
from neutron.common import rpc as n_rpc
from neutron.common import utils
from neutron import context as n_ctx
from neutron.db import _utils as db_utils
from neutron.db import api as db_api
from neutron.db import common_db_mixin
@ -48,6 +50,7 @@ from neutron.db import standardattrdescription_db as st_attr
from neutron.extensions import external_net
from neutron.extensions import l3
from neutron.plugins.common import utils as p_utils
from neutron import worker as neutron_worker
LOG = logging.getLogger(__name__)
@ -71,6 +74,7 @@ CORE_ROUTER_ATTRS = ('id', 'name', 'tenant_id', 'admin_state_up', 'status')
class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
neutron_worker.WorkerSupportServiceMixin,
st_attr.StandardAttrDescriptionMixin):
"""Mixin class to add L3/NAT router methods to db_base_plugin_v2."""
@ -90,7 +94,9 @@ class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
# in each and every l3 plugin out there.
def __new__(cls):
L3_NAT_dbonly_mixin._subscribe_callbacks()
return super(L3_NAT_dbonly_mixin, cls).__new__(cls)
inst = super(L3_NAT_dbonly_mixin, cls).__new__(cls)
inst._start_janitor()
return inst
@staticmethod
def _subscribe_callbacks():
@ -109,6 +115,58 @@ class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
def _core_plugin(self):
return directory.get_plugin()
def _start_janitor(self):
"""Starts the periodic job that cleans up broken complex resources.
This job will look for things like floating IP ports without an
associated floating IP and delete them 5 minutes after detection.
"""
interval = 60 * 5 # only every 5 minutes. cleanups should be rare
initial_delay = random.randint(0, interval) # splay multiple servers
janitor = neutron_worker.PeriodicWorker(self._clean_garbage, interval,
initial_delay)
self.add_worker(janitor)
def _clean_garbage(self):
if not hasattr(self, '_candidate_broken_fip_ports'):
self._candidate_broken_fip_ports = set()
context = n_ctx.get_admin_context()
candidates = self._get_dead_floating_port_candidates(context)
# just because a port is in 'candidates' doesn't necessarily mean
# it's broken, we could have just caught it before it was updated.
# We confirm by waiting until the next call of this function to see
# if it persists.
to_cleanup = candidates & self._candidate_broken_fip_ports
self._candidate_broken_fip_ports = candidates - to_cleanup
for port_id in to_cleanup:
# ensure it wasn't just a failure to update device_id before we
# delete it
try:
self._fix_or_kill_floating_port(context, port_id)
except Exception:
LOG.exception(_LE("Error cleaning up floating IP port: %s"),
port_id)
def _fix_or_kill_floating_port(self, context, port_id):
fip = (context.session.query(l3_models.FloatingIP).
filter_by(floating_port_id=port_id).first())
if fip:
LOG.warning(_LW("Found incorrect device_id on floating port "
"%(pid)s, correcting to %(fip)s."),
{'pid': port_id, 'fip': fip.id})
self._core_plugin.update_port(
context, port_id, {'port': {'device_id': fip.id}})
else:
LOG.warning(_LW("Found floating IP port %s without floating IP, "
"deleting."), port_id)
self._core_plugin.delete_port(
context, port_id, l3_port_check=False)
def _get_dead_floating_port_candidates(self, context):
filters = {'device_id': ['PENDING'],
'device_owner': [DEVICE_OWNER_FLOATINGIP]}
return {p['id'] for p in self._core_plugin.get_ports(context, filters)}
def _get_router(self, context, router_id):
try:
router = self._get_by_id(context, l3_models.Router, router_id)
@ -1256,8 +1314,6 @@ class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
dns_data = self._process_dns_floatingip_create_precommit(
context, floatingip_dict, fip)
# TODO(kevinbenton): garbage collector that deletes ports that didn't
# make it this far to handle servers dying in this process
self._core_plugin.update_port(context.elevated(), external_port['id'],
{'port': {'device_id': fip_id}})
registry.notify(resources.FLOATING_IP,

@ -3240,6 +3240,72 @@ class L3NatTestCaseBase(L3NatTestCaseMixin):
context=admin_ctx, subnetpool_id=subnetpool_id)
chk_method.assert_called_with(admin_ctx, [router['router']['id']])
def test_janitor_clears_orphaned_floatingip_port(self):
plugin = directory.get_plugin(lib_constants.L3)
with self.network() as n:
# floating IP ports are initially created with a device ID of
# PENDING and are updated after the floating IP is actually
# created.
port_res = self._create_port(
self.fmt, n['network']['id'],
tenant_id=n['network']['tenant_id'], device_id='PENDING',
device_owner=lib_constants.DEVICE_OWNER_FLOATINGIP)
port = self.deserialize(self.fmt, port_res)
plugin._clean_garbage()
# first call should just have marked it as a candidate so port
# should still exist
port = self._show('ports', port['port']['id'])
self.assertEqual('PENDING', port['port']['device_id'])
# second call will delete the port since it has no associated
# floating IP
plugin._clean_garbage()
self._show('ports', port['port']['id'],
expected_code=exc.HTTPNotFound.code)
def test_janitor_updates_port_device_id(self):
# if a server dies after the floating IP is created but before it
# updates the floating IP port device ID, the janitor will be
# responsible for updating the device ID to the correct value.
plugin = directory.get_plugin(lib_constants.L3)
with self.floatingip_with_assoc() as fip:
fip_port = self._list('ports',
query_params='device_owner=network:floatingip')['ports'][0]
# simulate a failed update by just setting the device_id of
# the fip port back to PENDING
data = {'port': {'device_id': 'PENDING'}}
self._update('ports', fip_port['id'], data)
plugin._clean_garbage()
# first call just marks as candidate, so it shouldn't be changed
port = self._show('ports', fip_port['id'])
self.assertEqual('PENDING', port['port']['device_id'])
# second call updates device ID to fip
plugin._clean_garbage()
# first call just marks as candidate, so it shouldn't be changed
port = self._show('ports', fip_port['id'])
self.assertEqual(fip['floatingip']['id'],
port['port']['device_id'])
def test_janitor_doesnt_delete_if_fixed_in_interim(self):
# here we ensure that the janitor doesn't delete the port on the second
# call if the conditions have been fixed
plugin = directory.get_plugin(lib_constants.L3)
with self.network() as n:
port_res = self._create_port(
self.fmt, n['network']['id'],
tenant_id=n['network']['tenant_id'], device_id='PENDING',
device_owner=lib_constants.DEVICE_OWNER_FLOATINGIP)
port = self.deserialize(self.fmt, port_res)
plugin._clean_garbage()
# first call should just have marked it as a candidate so port
# should still exist
port = self._show('ports', port['port']['id'])
self.assertEqual('PENDING', port['port']['device_id'])
data = {'port': {'device_id': 'something_else'}}
self._update('ports', port['port']['id'], data)
# now that the device ID has changed, the janitor shouldn't delete
plugin._clean_garbage()
self._show('ports', port['port']['id'])
class L3AgentDbTestCaseBase(L3NatTestCaseMixin):

@ -0,0 +1,20 @@
---
prelude: >
Due to changes in internal L3 logic, a server
crash/backend failure during FIP creation may
leave dangling ports attached on external
networks. These ports can be identified by a
'PENDING' device_id. The neutron server will
attempt a cleanup periodically to address the issue.
other:
- If a floating IP creation gets interrupted by
a server crash or backend failure, a port can
be left behind on the external network. Neutron
will now automatically clean these up after
approximately 10 minutes. This time value is not
configurable.
- Ports in this state will be visible on the external
network to admins, and will have a device_id value
of 'PENDING'. They can also be removed manually by
an admin if waiting for the periodic job to do it is
undesired.