Fix race condition when creating two routers without HA network

When a HA router is created and the HA is not yet, before creating
the router, the Neutron server creates the HA network and the
corresponding subnet.

The HA network cannot be duplicated (see previous patches related to
this bug). But the subnet, that is created in another database
transaction, cannot be present when the router creation call tries
to create the HA port.

This patch adds a HA subnet check before creating the router and the
HA port. Even if the subnet check fails and the worker tries to
create this subnet, if the process fails with ``InvalidInput``, that
means other worker created the subnet before and the current one
fails because tries to create the same subnet with the same CIDR.
In this case, we dismiss the exception and continue with the router
creation.

Closes-Bug: #2016198

Change-Id: I82225fcc6248bb0fd68959ceb1daabff423d81ff
This commit is contained in:
Rodolfo Alonso Hernandez 2023-04-28 13:56:02 +00:00 committed by Rodolfo Alonso
parent 4109ee9bb4
commit e6fb32e27d
2 changed files with 54 additions and 5 deletions

View File

@ -53,6 +53,7 @@ from neutron.db import l3_dvr_db
from neutron.objects import base
from neutron.objects import l3_hamode
from neutron.objects import router as l3_obj
from neutron.objects import subnet as subnet_obj
VR_ID_RANGE = set(range(1, 255))
@ -222,8 +223,7 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
network[providernet.PHYSICAL_NETWORK] = (
cfg.CONF.l3_ha_network_physical_name)
def _create_ha_network(self, context, tenant_id):
admin_ctx = context.elevated()
def _create_ha_network(self, admin_ctx, tenant_id):
# The project ID is needed to create the ``L3HARouterNetwork``
# resource; the project ID cannot be retrieved from the network because
# is explicitly created without it.
@ -371,13 +371,32 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
"""Event handler to create HA resources before router creation."""
if not self._is_ha(router):
return
admin_ctx = context.elevated()
# ensure the HA network exists before we start router creation so
# we can provide meaningful errors back to the user if no network
# can be allocated
# TODO(ralonsoh): remove once bp/keystone-v3 migration finishes.
project_id = router.get('project_id') or router['tenant_id']
if not self.get_ha_network(context, project_id):
self._create_ha_network(context, project_id)
ha_network = self.get_ha_network(admin_ctx, project_id)
if not ha_network:
self._create_ha_network(admin_ctx, project_id)
else:
# Check the HA network subnet. As reported in LP#2016198, two
# consecutive router creation can try to create the HA network at
# the same time. Because the network and subnet creation operations
# cannot be executed in the same transaction, it is needed to make
# this check before leaving this callback.
network_id = ha_network['network_id']
if subnet_obj.Subnet.count(admin_ctx, network_id=network_id) > 0:
return
try:
self._create_ha_subnet(admin_ctx, network_id, project_id)
except n_exc.InvalidInput:
# That could happen when the previous method tries to create
# the HA subnet with the same CIDR. In this case, dismiss the
# exception.
pass
@registry.receives(resources.ROUTER, [events.PRECOMMIT_CREATE],
priority_group.PRIORITY_ROUTER_EXTENDED_ATTRIBUTE)

View File

@ -48,6 +48,7 @@ from neutron.db import l3_agentschedulers_db
from neutron.db import l3_hamode_db
from neutron.objects import l3_hamode
from neutron.objects import network as network_obj
from neutron.objects import subnet as subnet_obj
from neutron import quota
from neutron.scheduler import l3_agent_scheduler
from neutron.services.revisions import revision_plugin
@ -1399,7 +1400,7 @@ class L3HAModeDbTestCase(L3HATestFramework):
self.assertEqual(self.agent2['host'], routers[0]['gw_port_host'])
def test__before_router_create_no_network(self):
project_id = 'project1'
project_id = uuidutils.generate_uuid()
ha_network = self.plugin.get_ha_network(self.admin_ctx, project_id)
self.assertIsNone(ha_network)
@ -1413,6 +1414,35 @@ class L3HAModeDbTestCase(L3HATestFramework):
ha_network = self.plugin.get_ha_network(self.admin_ctx, project_id)
self.assertEqual(project_id, ha_network.project_id)
def test__before_router_create_no_subnet(self):
project_id = uuidutils.generate_uuid()
admin_ctx = self.admin_ctx
admin_ctx.project_id = project_id
self._create_network(self.core_plugin, admin_ctx,
tenant_id=project_id, ha=True)
ha_network = self.plugin.get_ha_network(self.admin_ctx, project_id)
self.assertEqual(project_id, ha_network.project_id)
subnets = subnet_obj.Subnet.get_objects(
admin_ctx, network_id=ha_network.network_id)
self.assertEqual([], subnets)
router = {'ha': True, 'project_id': project_id}
self.plugin._before_router_create(mock.ANY, admin_ctx, router)
ha_network = self.plugin.get_ha_network(admin_ctx, project_id)
self.assertEqual(project_id, ha_network.project_id)
subnets = subnet_obj.Subnet.get_objects(
admin_ctx, network_id=ha_network.network_id)
self.assertEqual(1, len(subnets))
# This test is simulating a concurrent update. The "Subnet.count"
# method returns 0 and "_before_router_create" tries to create the
# subnet again. The exception is catch and dismissed.
with mock.patch.object(subnet_obj.Subnet, 'count', return_value=0):
self.plugin._before_router_create(mock.ANY, admin_ctx, router)
subnets = subnet_obj.Subnet.get_objects(
admin_ctx, network_id=ha_network.network_id)
self.assertEqual(1, len(subnets))
class L3HAUserTestCase(L3HATestFramework):