Add janitor to cleanup orphaned fip ports
This adds a janitor worker to the L3 DB module that will run every 5 minutes looking for floating IP ports with the device_id of 'PENDING'. If it finds any, it will keep track of the port ID to see if any stay in 'PENDING' with the next iteration. If the device ID is still PENDING after 5 minutes, it means one of two things has happened. Either the server died after creating the floating IP port, but before creating the floating IP itself; or, it died after creating the floating IP port and the floating IP record, but before updating the device_id of the floating IP port to the floating IP ID. The janitor handles both cases by deleting the floating IP port if it has no associated floating IP and by updating the floating IP port device ID if it does have an associated floating IP. Related-Bug: #1540844 Closes-Bug: #1648098 Change-Id: I684a822553a5a0c54513ca7d20ccaf3c74180593
This commit is contained in:
parent
b8347e16d1
commit
6948467b77
@ -14,6 +14,7 @@
|
||||
|
||||
import functools
|
||||
import itertools
|
||||
import random
|
||||
|
||||
from debtcollector import removals
|
||||
import netaddr
|
||||
@ -28,7 +29,7 @@ import six
|
||||
from sqlalchemy import orm
|
||||
from sqlalchemy.orm import exc
|
||||
|
||||
from neutron._i18n import _, _LI
|
||||
from neutron._i18n import _, _LE, _LI, _LW
|
||||
from neutron.api.rpc.agentnotifiers import l3_rpc_agent_api
|
||||
from neutron.callbacks import events
|
||||
from neutron.callbacks import exceptions
|
||||
@ -39,6 +40,7 @@ from neutron.common import constants as n_const
|
||||
from neutron.common import ipv6_utils
|
||||
from neutron.common import rpc as n_rpc
|
||||
from neutron.common import utils
|
||||
from neutron import context as n_ctx
|
||||
from neutron.db import _utils as db_utils
|
||||
from neutron.db import api as db_api
|
||||
from neutron.db import common_db_mixin
|
||||
@ -48,6 +50,7 @@ from neutron.db import standardattrdescription_db as st_attr
|
||||
from neutron.extensions import external_net
|
||||
from neutron.extensions import l3
|
||||
from neutron.plugins.common import utils as p_utils
|
||||
from neutron import worker as neutron_worker
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@ -71,6 +74,7 @@ CORE_ROUTER_ATTRS = ('id', 'name', 'tenant_id', 'admin_state_up', 'status')
|
||||
|
||||
|
||||
class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
|
||||
neutron_worker.WorkerSupportServiceMixin,
|
||||
st_attr.StandardAttrDescriptionMixin):
|
||||
"""Mixin class to add L3/NAT router methods to db_base_plugin_v2."""
|
||||
|
||||
@ -90,7 +94,9 @@ class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
|
||||
# in each and every l3 plugin out there.
|
||||
def __new__(cls):
|
||||
L3_NAT_dbonly_mixin._subscribe_callbacks()
|
||||
return super(L3_NAT_dbonly_mixin, cls).__new__(cls)
|
||||
inst = super(L3_NAT_dbonly_mixin, cls).__new__(cls)
|
||||
inst._start_janitor()
|
||||
return inst
|
||||
|
||||
@staticmethod
|
||||
def _subscribe_callbacks():
|
||||
@ -109,6 +115,58 @@ class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
|
||||
def _core_plugin(self):
|
||||
return directory.get_plugin()
|
||||
|
||||
def _start_janitor(self):
|
||||
"""Starts the periodic job that cleans up broken complex resources.
|
||||
|
||||
This job will look for things like floating IP ports without an
|
||||
associated floating IP and delete them 5 minutes after detection.
|
||||
"""
|
||||
interval = 60 * 5 # only every 5 minutes. cleanups should be rare
|
||||
initial_delay = random.randint(0, interval) # splay multiple servers
|
||||
janitor = neutron_worker.PeriodicWorker(self._clean_garbage, interval,
|
||||
initial_delay)
|
||||
self.add_worker(janitor)
|
||||
|
||||
def _clean_garbage(self):
|
||||
if not hasattr(self, '_candidate_broken_fip_ports'):
|
||||
self._candidate_broken_fip_ports = set()
|
||||
context = n_ctx.get_admin_context()
|
||||
candidates = self._get_dead_floating_port_candidates(context)
|
||||
# just because a port is in 'candidates' doesn't necessarily mean
|
||||
# it's broken, we could have just caught it before it was updated.
|
||||
# We confirm by waiting until the next call of this function to see
|
||||
# if it persists.
|
||||
to_cleanup = candidates & self._candidate_broken_fip_ports
|
||||
self._candidate_broken_fip_ports = candidates - to_cleanup
|
||||
for port_id in to_cleanup:
|
||||
# ensure it wasn't just a failure to update device_id before we
|
||||
# delete it
|
||||
try:
|
||||
self._fix_or_kill_floating_port(context, port_id)
|
||||
except Exception:
|
||||
LOG.exception(_LE("Error cleaning up floating IP port: %s"),
|
||||
port_id)
|
||||
|
||||
def _fix_or_kill_floating_port(self, context, port_id):
|
||||
fip = (context.session.query(l3_models.FloatingIP).
|
||||
filter_by(floating_port_id=port_id).first())
|
||||
if fip:
|
||||
LOG.warning(_LW("Found incorrect device_id on floating port "
|
||||
"%(pid)s, correcting to %(fip)s."),
|
||||
{'pid': port_id, 'fip': fip.id})
|
||||
self._core_plugin.update_port(
|
||||
context, port_id, {'port': {'device_id': fip.id}})
|
||||
else:
|
||||
LOG.warning(_LW("Found floating IP port %s without floating IP, "
|
||||
"deleting."), port_id)
|
||||
self._core_plugin.delete_port(
|
||||
context, port_id, l3_port_check=False)
|
||||
|
||||
def _get_dead_floating_port_candidates(self, context):
|
||||
filters = {'device_id': ['PENDING'],
|
||||
'device_owner': [DEVICE_OWNER_FLOATINGIP]}
|
||||
return {p['id'] for p in self._core_plugin.get_ports(context, filters)}
|
||||
|
||||
def _get_router(self, context, router_id):
|
||||
try:
|
||||
router = self._get_by_id(context, l3_models.Router, router_id)
|
||||
@ -1256,8 +1314,6 @@ class L3_NAT_dbonly_mixin(l3.RouterPluginBase,
|
||||
dns_data = self._process_dns_floatingip_create_precommit(
|
||||
context, floatingip_dict, fip)
|
||||
|
||||
# TODO(kevinbenton): garbage collector that deletes ports that didn't
|
||||
# make it this far to handle servers dying in this process
|
||||
self._core_plugin.update_port(context.elevated(), external_port['id'],
|
||||
{'port': {'device_id': fip_id}})
|
||||
registry.notify(resources.FLOATING_IP,
|
||||
|
@ -3240,6 +3240,72 @@ class L3NatTestCaseBase(L3NatTestCaseMixin):
|
||||
context=admin_ctx, subnetpool_id=subnetpool_id)
|
||||
chk_method.assert_called_with(admin_ctx, [router['router']['id']])
|
||||
|
||||
def test_janitor_clears_orphaned_floatingip_port(self):
|
||||
plugin = directory.get_plugin(lib_constants.L3)
|
||||
with self.network() as n:
|
||||
# floating IP ports are initially created with a device ID of
|
||||
# PENDING and are updated after the floating IP is actually
|
||||
# created.
|
||||
port_res = self._create_port(
|
||||
self.fmt, n['network']['id'],
|
||||
tenant_id=n['network']['tenant_id'], device_id='PENDING',
|
||||
device_owner=lib_constants.DEVICE_OWNER_FLOATINGIP)
|
||||
port = self.deserialize(self.fmt, port_res)
|
||||
plugin._clean_garbage()
|
||||
# first call should just have marked it as a candidate so port
|
||||
# should still exist
|
||||
port = self._show('ports', port['port']['id'])
|
||||
self.assertEqual('PENDING', port['port']['device_id'])
|
||||
# second call will delete the port since it has no associated
|
||||
# floating IP
|
||||
plugin._clean_garbage()
|
||||
self._show('ports', port['port']['id'],
|
||||
expected_code=exc.HTTPNotFound.code)
|
||||
|
||||
def test_janitor_updates_port_device_id(self):
|
||||
# if a server dies after the floating IP is created but before it
|
||||
# updates the floating IP port device ID, the janitor will be
|
||||
# responsible for updating the device ID to the correct value.
|
||||
plugin = directory.get_plugin(lib_constants.L3)
|
||||
with self.floatingip_with_assoc() as fip:
|
||||
fip_port = self._list('ports',
|
||||
query_params='device_owner=network:floatingip')['ports'][0]
|
||||
# simulate a failed update by just setting the device_id of
|
||||
# the fip port back to PENDING
|
||||
data = {'port': {'device_id': 'PENDING'}}
|
||||
self._update('ports', fip_port['id'], data)
|
||||
plugin._clean_garbage()
|
||||
# first call just marks as candidate, so it shouldn't be changed
|
||||
port = self._show('ports', fip_port['id'])
|
||||
self.assertEqual('PENDING', port['port']['device_id'])
|
||||
# second call updates device ID to fip
|
||||
plugin._clean_garbage()
|
||||
# first call just marks as candidate, so it shouldn't be changed
|
||||
port = self._show('ports', fip_port['id'])
|
||||
self.assertEqual(fip['floatingip']['id'],
|
||||
port['port']['device_id'])
|
||||
|
||||
def test_janitor_doesnt_delete_if_fixed_in_interim(self):
|
||||
# here we ensure that the janitor doesn't delete the port on the second
|
||||
# call if the conditions have been fixed
|
||||
plugin = directory.get_plugin(lib_constants.L3)
|
||||
with self.network() as n:
|
||||
port_res = self._create_port(
|
||||
self.fmt, n['network']['id'],
|
||||
tenant_id=n['network']['tenant_id'], device_id='PENDING',
|
||||
device_owner=lib_constants.DEVICE_OWNER_FLOATINGIP)
|
||||
port = self.deserialize(self.fmt, port_res)
|
||||
plugin._clean_garbage()
|
||||
# first call should just have marked it as a candidate so port
|
||||
# should still exist
|
||||
port = self._show('ports', port['port']['id'])
|
||||
self.assertEqual('PENDING', port['port']['device_id'])
|
||||
data = {'port': {'device_id': 'something_else'}}
|
||||
self._update('ports', port['port']['id'], data)
|
||||
# now that the device ID has changed, the janitor shouldn't delete
|
||||
plugin._clean_garbage()
|
||||
self._show('ports', port['port']['id'])
|
||||
|
||||
|
||||
class L3AgentDbTestCaseBase(L3NatTestCaseMixin):
|
||||
|
||||
|
20
releasenotes/notes/fip-janitor-53f0d42a7471c5ed.yaml
Normal file
20
releasenotes/notes/fip-janitor-53f0d42a7471c5ed.yaml
Normal file
@ -0,0 +1,20 @@
|
||||
---
|
||||
prelude: >
|
||||
Due to changes in internal L3 logic, a server
|
||||
crash/backend failure during FIP creation may
|
||||
leave dangling ports attached on external
|
||||
networks. These ports can be identified by a
|
||||
'PENDING' device_id. The neutron server will
|
||||
attempt a cleanup periodically to address the issue.
|
||||
other:
|
||||
- If a floating IP creation gets interrupted by
|
||||
a server crash or backend failure, a port can
|
||||
be left behind on the external network. Neutron
|
||||
will now automatically clean these up after
|
||||
approximately 10 minutes. This time value is not
|
||||
configurable.
|
||||
- Ports in this state will be visible on the external
|
||||
network to admins, and will have a device_id value
|
||||
of 'PENDING'. They can also be removed manually by
|
||||
an admin if waiting for the periodic job to do it is
|
||||
undesired.
|
Loading…
x
Reference in New Issue
Block a user