Lower spacing time of the OVN maintenance tasks which should be run once

Some of the OVN maintenance tasks are expected to be run just once and
then they raise periodic.NeverAgain() to not be run anymore. Those tasks
also require to have acquried ovn db lock so that only one of the
maintenance workers really runs them.
All those tasks had set 600 seconds as a spacing time so they were run
every 600 seconds. This works fine usually but that may cause small
issue in the environments were Neutron is run in POD as k8s/openshift
application. In such case, when e.g. configuration of neutron is
updated, it may happen that first new POD with Neutron is spawned and
only once it is already running, k8s will stop old POD. Because of that
maintenance worker running in the new neutron-server POD will not
acquire lock on the OVN DB (old POD still holds the lock) and will not
run all those maintenance tasks immediately. After old POD will be
terminated, one of the new PODs will at some point acquire that lock and
then will run all those maintenance tasks but this would cause 600
seconds delay in running them.

To avoid such long waiting time to run those maintenance tasks, this
patch lowers its spacing time from 600 to just 5 seconds.
Additionally maintenance tasks which are supposed to be run only once and
only by the maintenance worker which has acquired ovn db lock will now be
stopped (periodic.NeverAgain will be raised) after 100 attempts of
run.
This will avoid running them every 5 seconds forever on the workers
which don't acquire lock on the OVN DB at all.

Closes-bug: #2074209
Change-Id: Iabb4bb427588c1a5da27a5d313f75b5bd23805b2
This commit is contained in:
Slawek Kaplonski 2024-07-30 14:17:44 +02:00
parent 0488bdafa1
commit 04c217bcd0
3 changed files with 162 additions and 23 deletions
neutron
common/ovn
plugins/ml2/drivers/ovn/mech_driver/ovsdb
tests/unit/plugins/ml2/drivers/ovn/mech_driver/ovsdb

@ -275,6 +275,8 @@ _TYPES_PRIORITY_ORDER = (
TYPE_SECURITY_GROUP_RULES)
DB_CONSISTENCY_CHECK_INTERVAL = 300 # 5 minutes
MAINTENANCE_TASK_RETRY_LIMIT = 100 # times
MAINTENANCE_ONE_RUN_TASK_SPACING = 5 # seconds
# The order in which the resources should be created or updated by the
# maintenance task: Root ones first and leafs at the end.

@ -56,14 +56,28 @@ INCONSISTENCY_TYPE_CREATE_UPDATE = 'create/update'
INCONSISTENCY_TYPE_DELETE = 'delete'
def has_lock_periodic(*args, **kwargs):
def has_lock_periodic(*args, periodic_run_limit=0, **kwargs):
def wrapper(f):
_retries = 0
@functools.wraps(f)
@periodics.periodic(*args, **kwargs)
def decorator(self, *args, **kwargs):
# This periodic task is included in DBInconsistenciesPeriodics
# since it uses the lock to ensure only one worker is executing
# additonally, if periodic_run_limit parameter with value > 0 is
# provided and lock is not acquired for periodic_run_limit
# times, task will not be run anymore by this maintenance worker
nonlocal _retries
if not self.has_lock:
if periodic_run_limit > 0:
if _retries >= periodic_run_limit:
LOG.debug("Have not been able to acquire lock to run "
"task '%s' after %s tries, limit reached. "
"No more attempts will be made.",
f, _retries)
raise periodics.NeverAgain()
_retries += 1
return
return f(self, *args, **kwargs)
return decorator
@ -444,7 +458,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_global_dhcp_opts(self):
if (not ovn_conf.get_global_dhcpv4_opts() and
not ovn_conf.get_global_dhcpv6_opts()):
@ -474,7 +491,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_for_igmp_snoop_support(self):
snooping_conf = ovs_conf.get_igmp_snooping_enabled()
flood_conf = ovs_conf.get_igmp_flood_unregistered()
@ -504,7 +524,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# TODO(czesla): Remove this in the A+4 cycle
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_port_has_address_scope(self):
ports = self._nb_idl.db_find_rows(
"Logical_Switch_Port", ("type", "!=", ovn_const.LSP_TYPE_LOCALNET)
@ -539,7 +562,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_for_ha_chassis_group(self):
# If external ports is not supported stop running
# this periodic task
@ -567,7 +593,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# TODO(lucasagomes): Remove this in the B+3 cycle
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_for_mcast_flood_reports(self):
mcast_flood_conf = ovs_conf.get_igmp_flood()
mcast_flood_reports_conf = ovs_conf.get_igmp_flood_reports()
@ -625,7 +654,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_localnet_port_has_learn_fdb(self):
ports = self._nb_idl.db_find_rows(
"Logical_Switch_Port", ("type", "=", ovn_const.LSP_TYPE_LOCALNET)
@ -652,7 +684,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_redirect_type_router_gateway_ports(self):
"""Check OVN router gateway ports
Check for the option "redirect-type=bridged" value for
@ -714,7 +749,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_vlan_distributed_ports(self):
"""Check VLAN distributed ports
Check for the option "reside-on-redirect-chassis" value for
@ -752,7 +790,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_fdb_aging_settings(self):
"""Check FDB aging settings
Ensure FDB aging settings are enforced.
@ -788,7 +829,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def update_mac_aging_settings(self):
"""Ensure that MAC_Binding aging options are set"""
with self._nb_idl.transaction(check_error=True) as txn:
@ -804,7 +848,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# "external_ids:OVN_GW_PORT_EXT_ID_KEY" from to each router.
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def remove_gw_ext_ids_from_logical_router(self):
"""Remove `gw_port_id` and `gw_network_id` external_ids from LRs"""
cmds = []
@ -832,7 +879,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
# A static spacing value is used here, but this method will only run
# once per lock due to the use of periodics.NeverAgain().
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_baremetal_ports_dhcp_options(self):
"""Update baremetal ports DHCP options
@ -881,7 +931,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
# TODO(ralonsoh): Remove this in the Antelope+4 cycle
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def create_router_extra_attributes_registers(self):
"""Create missing ``RouterExtraAttributes`` registers.
@ -902,7 +955,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
# TODO(slaweq): Remove this in the E cycle (C+2 as it will be next SLURP)
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def add_gw_port_info_to_logical_router_port(self):
"""Add info if LRP is connecting internal subnet or ext gateway."""
cmds = []
@ -938,7 +994,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
txn.add(cmd)
raise periodics.NeverAgain()
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_router_default_route_empty_dst_ip(self):
"""Check routers with default route with empty dst-ip (LP: #2002993).
"""
@ -965,7 +1024,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
# TODO(ralonsoh): Remove this in the Antelope+4 cycle
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def add_vnic_type_and_pb_capabilities_to_lsp(self):
"""Add the port VNIC type and port binding capabilities to the LSP.
@ -997,7 +1059,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_fair_meter_consistency(self):
"""Update the logging meter after neutron-server reload
@ -1060,7 +1125,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def update_nat_floating_ip_with_gateway_port_reference(self):
"""Set NAT rule gateway_port column to any floating IP without
router gateway port uuid reference - LP#2035281.
@ -1107,7 +1175,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
# TODO(ralonsoh): Remove this method in the C+2 cycle (next SLURP release)
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def add_provider_resource_association_to_routers(self):
"""Add the ``ProviderResourceAssociation`` register to all routers"""
provider_name = 'ovn'
@ -1126,7 +1197,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
# TODO(ralonsoh): Remove this method in the C+2 cycle (next SLURP release)
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def remove_invalid_gateway_chassis_from_unbound_lrp(self):
"""Removes all invalid 'Gateway_Chassis' from unbound LRPs"""
is_gw = ovn_const.OVN_ROUTER_IS_EXT_GW
@ -1149,7 +1223,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
# TODO(ralonsoh): Remove this method in the E cycle (SLURP release)
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def set_network_type(self):
"""Add the network type to the Logical_Switch registers"""
context = n_context.get_admin_context()
@ -1172,7 +1249,10 @@ class DBInconsistenciesPeriodics(SchemaAwarePeriodicsBase):
raise periodics.NeverAgain()
@has_lock_periodic(spacing=600, run_immediately=True)
@has_lock_periodic(
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
run_immediately=True)
def check_network_broadcast_arps_to_all_routers(self):
"""Check the broadcast-arps-to-all-routers config

@ -32,12 +32,69 @@ from neutron.db.models import ovn as ovn_models
from neutron.db import ovn_revision_numbers_db
from neutron.objects import ports as ports_obj
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import maintenance
from neutron.tests import base
from neutron.tests.unit import fake_resources as fakes
from neutron.tests.unit.plugins.ml2 import test_security_group as test_sg
from neutron.tests.unit import testlib_api
from neutron_lib import exceptions as n_exc
class TestHasLockPeriodicDecorator(base.BaseTestCase):
def test_decorator_no_limit_have_lock(self):
run_counter = 0
@maintenance.has_lock_periodic(
periodic_run_limit=0, spacing=30)
def test_maintenance_task(worker):
nonlocal run_counter
run_counter += 1
worker_mock = mock.MagicMock()
worker_mock.has_lock = True
for _ in range(3):
test_maintenance_task(worker_mock)
self.assertEqual(3, run_counter)
def test_decorator_no_lock_no_limit(self):
run_counter = 0
@maintenance.has_lock_periodic(
periodic_run_limit=0, spacing=30)
def test_maintenance_task(worker):
nonlocal run_counter
run_counter += 1
worker_mock = mock.MagicMock()
has_lock_values = [False, False, True]
for has_lock in has_lock_values:
worker_mock.has_lock = has_lock
test_maintenance_task(worker_mock)
self.assertEqual(1, run_counter)
def test_decorator_no_lock_with_limit(self):
run_counter = 0
@maintenance.has_lock_periodic(
periodic_run_limit=1, spacing=30)
def test_maintenance_task(worker):
nonlocal run_counter
run_counter += 1
worker_mock = mock.MagicMock()
worker_mock.has_lock = False
test_maintenance_task(worker_mock)
self.assertEqual(0, run_counter)
worker_mock.has_lock = False
self.assertRaises(periodics.NeverAgain,
test_maintenance_task, worker_mock)
self.assertEqual(0, run_counter)
class TestSchemaAwarePeriodicsBase(testlib_api.SqlTestCaseLight):
def test__set_schema_aware_periodics(self):