L3 scheduler: add retry indicators for bind_router

Change I3447ea5bcb7c57365c6f50efe12a1671e86588b3 added a binding_index
to the RouterL3AgentBinding table. In certain (concurrent) cases, a row
with the same binding_index might be used twice, which will raise
DBDuplicateEntry. However, that change didn't retry on this case at all
code-paths, so this patch rectifies this issue.

Closes-Bug: #1633042
Change-Id: I228b0084a8e7c48e78bc2ea6a3fccf7437210e73
This commit is contained in:
John Schwarz 2016-09-04 13:48:58 +03:00
parent ff9c6e219b
commit 1e195afaf1
4 changed files with 124 additions and 87 deletions

View File

@ -183,16 +183,17 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
router_id = router['id'] router_id = router['id']
agent_id = agent['id'] agent_id = agent['id']
if self.router_scheduler: if self.router_scheduler:
plugin = manager.NeutronManager.get_service_plugins().get(
service_constants.L3_ROUTER_NAT)
try: try:
if router.get('ha'): if router.get('ha'):
plugin = manager.NeutronManager.get_service_plugins().get(
service_constants.L3_ROUTER_NAT)
self.router_scheduler.create_ha_port_and_bind( self.router_scheduler.create_ha_port_and_bind(
plugin, context, router['id'], plugin, context, router['id'],
router['tenant_id'], agent, is_manual_scheduling=True) router['tenant_id'], agent,
is_manual_scheduling=True)
else: else:
self.router_scheduler.bind_router( self.router_scheduler.bind_router(
context, router_id, agent) plugin, context, router_id, agent.id)
except db_exc.DBError: except db_exc.DBError:
raise l3agentscheduler.RouterSchedulingFailed( raise l3agentscheduler.RouterSchedulingFailed(
router_id=router_id, agent_id=agent_id) router_id=router_id, agent_id=agent_id)

View File

@ -199,44 +199,72 @@ class L3Scheduler(object):
plugin, context, router['id'], plugin, context, router['id'],
router['tenant_id'], l3_agent) router['tenant_id'], l3_agent)
else: else:
self.bind_router(context, router['id'], l3_agent) self.bind_router(plugin, context, router['id'], l3_agent.id)
def bind_router(self, context, router_id, chosen_agent, @db_api.retry_db_errors
binding_index=rb_model.LOWEST_BINDING_INDEX): def bind_router(self, plugin, context, router_id, agent_id,
"""Bind the router to the l3 agent which has been chosen.""" is_manual_scheduling=False, is_ha=False):
# Pre-cache the agent's id so that if an exception is raised we can """Bind the router to the l3 agent which has been chosen.
# safely access its value. Otherwise, sqlalchemy will try to fetch it
# from the database during a rollback, which is bad for us. The function tries to create a RouterL3AgentBinding object and add it
agent_id = chosen_agent.id to the database. It returns the binding that was created or None if it
failed to create it due to some conflict.
In the HA router case, when creating a RouterL3AgentBinding (with some
binding_index) fails because some other RouterL3AgentBinding was
concurrently created using the same binding_index, then the function
will retry to create an entry with a new binding_index. This creation
will be retried up to db_api.MAX_RETRIES times.
If, still in the HA router case, the creation failed because the
router has already been bound to the l3 agent in question or has been
removed (by a concurrent operation), then no further attempts will be
made and the function will return None.
Note that for non-HA routers, the function will always perform exactly
one try, regardless of the error preventing the addition of a new
RouterL3AgentBinding object to the database.
"""
bindings = context.session.query(
rb_model.RouterL3AgentBinding).filter_by(router_id=router_id)
if bindings.filter_by(l3_agent_id=agent_id).first():
LOG.debug('Router %(router_id)s has already been scheduled '
'to L3 agent %(agent_id)s.',
{'router_id': router_id, 'agent_id': agent_id})
return
if not is_ha:
binding_index = rb_model.LOWEST_BINDING_INDEX
if bindings.filter_by(binding_index=binding_index).first():
LOG.debug('Non-HA router %s has already been scheduled',
router_id)
return
else:
binding_index = plugin.get_vacant_binding_index(
context, router_id, is_manual_scheduling)
if binding_index < rb_model.LOWEST_BINDING_INDEX:
LOG.debug('Unable to find a vacant binding_index for '
'router %(router_id)s and agent %(agent_id)s',
{'router_id': router_id,
'agent_id': agent_id})
return
try: try:
with context.session.begin(subtransactions=True): with context.session.begin(subtransactions=True):
binding = rb_model.RouterL3AgentBinding() binding = rb_model.RouterL3AgentBinding()
binding.l3_agent = chosen_agent binding.l3_agent_id = agent_id
binding.router_id = router_id binding.router_id = router_id
binding.binding_index = binding_index binding.binding_index = binding_index
context.session.add(binding) context.session.add(binding)
except db_exc.DBDuplicateEntry as error: LOG.debug('Router %(router_id)s is scheduled to L3 agent '
LOG.debug('Router %(router_id)s has already been scheduled ' '%(agent_id)s with binding_index %(binding_index)d',
'to L3 agent %(agent_id)s (tried to bind with ' {'router_id': router_id,
'binding_index %(binding_index)d). The conflict was ' 'agent_id': agent_id,
'with columns %(columns)r.', 'binding_index': binding_index})
{'agent_id': agent_id, return binding
'router_id': router_id,
'binding_index': binding_index,
'columns': error.columns})
return
except db_exc.DBReferenceError: except db_exc.DBReferenceError:
LOG.debug('Router %s has already been removed ' LOG.debug('Router %s has already been removed '
'by concurrent operation', router_id) 'by concurrent operation', router_id)
return
LOG.debug('Router %(router_id)s is scheduled to L3 agent '
'%(agent_id)s with binding_index %(binding_index)d',
{'router_id': router_id,
'agent_id': agent_id,
'binding_index': binding_index})
return binding
def _schedule_router(self, plugin, context, router_id, def _schedule_router(self, plugin, context, router_id,
candidates=None): candidates=None):
@ -256,7 +284,7 @@ class L3Scheduler(object):
else: else:
chosen_agent = self._choose_router_agent( chosen_agent = self._choose_router_agent(
plugin, context, candidates) plugin, context, candidates)
self.bind_router(context, router_id, chosen_agent) self.bind_router(plugin, context, router_id, chosen_agent.id)
return chosen_agent return chosen_agent
@abc.abstractmethod @abc.abstractmethod
@ -297,35 +325,27 @@ class L3Scheduler(object):
dep_deleter = functools.partial(plugin._delete_ha_network, ctxt) dep_deleter = functools.partial(plugin._delete_ha_network, ctxt)
dep_id_attr = 'network_id' dep_id_attr = 'network_id'
for attempts in range(1, db_api.MAX_RETRIES + 1): # This might fail in case of concurrent calls, which is good for us
binding_index = plugin.get_vacant_binding_index( # as we can skip the rest of this function.
context, router_id, is_manual_scheduling) binding = self.bind_router(
if binding_index == -1: plugin, context, router_id, agent['id'],
LOG.debug("Couldn't find a vacant binding_index for router %s", is_manual_scheduling=is_manual_scheduling, is_ha=True)
router_id) if not binding:
return return
# This might fail in case of concurrent calls, which is good for us try:
# as we can skip the rest of this function. port_binding = utils.create_object_with_dependency(
if not self.bind_router(context, router_id, agent, binding_index): creator, dep_getter, dep_creator,
return dep_id_attr, dep_deleter)[0]
with db_api.autonested_transaction(context.session):
try: port_binding.l3_agent_id = agent['id']
port_binding = utils.create_object_with_dependency( except db_exc.DBDuplicateEntry:
creator, dep_getter, dep_creator, LOG.debug("Router %(router)s already scheduled for agent "
dep_id_attr, dep_deleter)[0] "%(agent)s", {'router': router_id,
with db_api.autonested_transaction(context.session): 'agent': agent['id']})
port_binding.l3_agent_id = agent['id'] except l3.RouterNotFound:
return LOG.debug('Router %s has already been removed '
except db_exc.DBDuplicateEntry: 'by concurrent operation', router_id)
LOG.debug("Router %(router)s already scheduled for agent "
"%(agent)s", {'router': router_id,
'agent': agent['id']})
return
except l3.RouterNotFound:
LOG.debug('Router %s has already been removed '
'by concurrent operation', router_id)
return
def get_ha_routers_l3_agents_counts(self, plugin, context, filters=None): def get_ha_routers_l3_agents_counts(self, plugin, context, filters=None):
"""Return a mapping (router, # agents) matching specified filters.""" """Return a mapping (router, # agents) matching specified filters."""
@ -359,13 +379,11 @@ class L3Scheduler(object):
chosen_agents): chosen_agents):
port_bindings = plugin.get_ha_router_port_bindings(context, port_bindings = plugin.get_ha_router_port_bindings(context,
[router_id]) [router_id])
binding_indices = range(rb_model.LOWEST_BINDING_INDEX, for port_binding, agent in zip(port_bindings, chosen_agents):
len(port_bindings) + 1) if not self.bind_router(plugin, context, router_id, agent.id,
for port_binding, agent, binding_index in zip( is_ha=True):
port_bindings, chosen_agents, binding_indices): break
if not self.bind_router(context, router_id, agent, binding_index):
continue
try: try:
with db_api.autonested_transaction(context.session): with db_api.autonested_transaction(context.session):
port_binding.l3_agent_id = agent.id port_binding.l3_agent_id = agent.id

View File

@ -89,7 +89,8 @@ class L3SchedulerBaseTest(test_db_base_plugin_v2.NeutronDbPluginV2TestCase):
for i in range(count): for i in range(count):
router = self.routers[i] router = self.routers[i]
agent = random.choice(self.l3_agents) agent = random.choice(self.l3_agents)
scheduler.bind_router(self.adminContext, router['id'], agent) scheduler.bind_router(self.l3_plugin, self.adminContext,
router['id'], agent.id)
hosting_agents.append(agent) hosting_agents.append(agent)
return hosting_agents return hosting_agents
@ -531,7 +532,8 @@ class L3AZAutoScheduleTestCaseBase(L3AZSchedulerBaseTest):
az = 'az%s' % i az = 'az%s' % i
for j in range(self.scheduled_agent_count[i]): for j in range(self.scheduled_agent_count[i]):
agent = l3_agents[az][j + self.down_agent_count[i]] agent = l3_agents[az][j + self.down_agent_count[i]]
scheduler.bind_router(self.adminContext, router['id'], agent) scheduler.bind_router(self.l3_plugin, self.adminContext,
router['id'], agent.id)
# activate down agent and call auto_schedule_routers # activate down agent and call auto_schedule_routers
activate_agent = l3_agents[self.agent_az][0] activate_agent = l3_agents[self.agent_az][0]
@ -768,9 +770,10 @@ class L3DVRSchedulerTestCase(L3DVRSchedulerBaseTest):
def _test_schedule_router(self): def _test_schedule_router(self):
if self.router_already_hosted: if self.router_already_hosted:
self.scheduler.bind_router(self.adminContext, self.scheduler.bind_router(self.l3_plugin,
self.adminContext,
self.router_to_schedule['id'], self.router_to_schedule['id'],
self.l3_agents[0]) self.l3_agents[0].id)
# schedule: # schedule:
actual_scheduled_agent = self.scheduler.schedule( actual_scheduled_agent = self.scheduler.schedule(
@ -785,9 +788,10 @@ class L3DVRSchedulerTestCase(L3DVRSchedulerBaseTest):
def _test_auto_schedule_routers(self): def _test_auto_schedule_routers(self):
if self.router_already_hosted: if self.router_already_hosted:
self.scheduler.bind_router(self.adminContext, self.scheduler.bind_router(self.l3_plugin,
self.adminContext,
self.router_to_schedule['id'], self.router_to_schedule['id'],
self.l3_agents[0]) self.l3_agents[0].id)
# schedule: # schedule:
hosting_before = self.l3_plugin.get_l3_agents_hosting_routers( hosting_before = self.l3_plugin.get_l3_agents_hosting_routers(
self.adminContext, [self.router_to_schedule['id']]) self.adminContext, [self.router_to_schedule['id']])

View File

@ -254,9 +254,11 @@ class L3SchedulerBaseTestCase(base.BaseTestCase):
def test__bind_routers_centralized(self): def test__bind_routers_centralized(self):
routers = [{'id': 'foo_router'}] routers = [{'id': 'foo_router'}]
agent = agent_model.Agent(id='foo_agent')
with mock.patch.object(self.scheduler, 'bind_router') as mock_bind: with mock.patch.object(self.scheduler, 'bind_router') as mock_bind:
self.scheduler._bind_routers(mock.ANY, mock.ANY, routers, mock.ANY) self.scheduler._bind_routers(mock.ANY, mock.ANY, routers, agent)
mock_bind.assert_called_once_with(mock.ANY, 'foo_router', mock.ANY) mock_bind.assert_called_once_with(mock.ANY, mock.ANY,
'foo_router', agent.id)
def _test__bind_routers_ha(self, has_binding): def _test__bind_routers_ha(self, has_binding):
routers = [{'id': 'foo_router', 'ha': True, 'tenant_id': '42'}] routers = [{'id': 'foo_router', 'ha': True, 'tenant_id': '42'}]
@ -577,7 +579,7 @@ class L3SchedulerTestBaseMixin(object):
scheduler = l3_agent_scheduler.ChanceScheduler() scheduler = l3_agent_scheduler.ChanceScheduler()
rid = router['router']['id'] rid = router['router']['id']
scheduler.bind_router(ctx, rid, agent) scheduler.bind_router(self.plugin, ctx, rid, agent.id)
results = (session.query(db).filter_by(router_id=rid).all()) results = (session.query(db).filter_by(router_id=rid).all())
self.assertGreater(len(results), 0) self.assertGreater(len(results), 0)
self.assertIn(agent.id, [bind.l3_agent_id for bind in results]) self.assertIn(agent.id, [bind.l3_agent_id for bind in results])
@ -596,7 +598,8 @@ class L3SchedulerTestBaseMixin(object):
scheduler = l3_agent_scheduler.ChanceScheduler() scheduler = l3_agent_scheduler.ChanceScheduler()
# checking that bind_router() is not throwing # checking that bind_router() is not throwing
# when supplied with router_id of non-existing router # when supplied with router_id of non-existing router
scheduler.bind_router(self.adminContext, "dummyID", self.agent1) scheduler.bind_router(self.plugin, self.adminContext,
"dummyID", self.agent_id1)
def test_bind_existing_router(self): def test_bind_existing_router(self):
router = self._make_router(self.fmt, router = self._make_router(self.fmt,
@ -732,7 +735,8 @@ class L3SchedulerTestBaseMixin(object):
name='r1') name='r1')
ctx = self.adminContext ctx = self.adminContext
router_id = router['router']['id'] router_id = router['router']['id']
self.plugin.router_scheduler.bind_router(ctx, router_id, agent) self.plugin.router_scheduler.bind_router(self.plugin, ctx,
router_id, agent.id)
agents = self.plugin.get_l3_agents_hosting_routers(ctx, agents = self.plugin.get_l3_agents_hosting_routers(ctx,
[router_id]) [router_id])
self.assertEqual([agent.id], [agt.id for agt in agents]) self.assertEqual([agent.id], [agt.id for agt in agents])
@ -1490,8 +1494,7 @@ class L3HATestCaseMixin(testlib_api.SqlTestCase,
instance.router_id = 'nonexistent_router' instance.router_id = 'nonexistent_router'
return orig_fn(s, instance) return orig_fn(s, instance)
with mock.patch.object(self.plugin.router_scheduler, with mock.patch.object(self.plugin.router_scheduler, 'bind_router'):
'bind_router') as bind_router:
with mock.patch.object( with mock.patch.object(
orm.Session, 'add', orm.Session, 'add',
side_effect=db_ref_err_for_add_haportbinding, side_effect=db_ref_err_for_add_haportbinding,
@ -1499,21 +1502,30 @@ class L3HATestCaseMixin(testlib_api.SqlTestCase,
self.plugin.router_scheduler.create_ha_port_and_bind( self.plugin.router_scheduler.create_ha_port_and_bind(
self.plugin, self.adminContext, self.plugin, self.adminContext,
router['id'], router['tenant_id'], agent) router['id'], router['tenant_id'], agent)
self.assertFalse(bind_router.called)
def test_create_ha_port_and_bind_catch_router_not_found(self): def test_create_ha_port_and_bind_catch_router_not_found(self):
router = self._create_ha_router(tenant_id='foo_tenant') router = self._create_ha_router(tenant_id='foo_tenant')
agent = {'id': 'foo_agent'} agent = {'id': 'foo_agent'}
with mock.patch.object(self.plugin.router_scheduler, with mock.patch.object(self.plugin.router_scheduler, 'bind_router'):
'bind_router') as bind_router:
with mock.patch.object( with mock.patch.object(
self.plugin, 'add_ha_port', self.plugin, 'add_ha_port',
side_effect=l3.RouterNotFound(router_id='foo_router')): side_effect=l3.RouterNotFound(router_id='foo_router')):
self.plugin.router_scheduler.create_ha_port_and_bind( self.plugin.router_scheduler.create_ha_port_and_bind(
self.plugin, self.adminContext, self.plugin, self.adminContext,
router['id'], router['tenant_id'], agent) router['id'], router['tenant_id'], agent)
self.assertFalse(bind_router.called)
def test_create_ha_port_and_bind_bind_router_returns_None(self):
router = self._create_ha_router(tenant_id='foo_tenant')
agent = {'id': 'foo_agent'}
with mock.patch.object(self.plugin.router_scheduler, 'bind_router',
return_value=None):
with mock.patch.object(self.plugin, 'add_ha_port') as add_ha_port:
self.plugin.router_scheduler.create_ha_port_and_bind(
self.plugin, self.adminContext,
router['id'], router['tenant_id'], agent)
self.assertFalse(add_ha_port.called)
class VacantBindingIndexTestCase(L3HATestCaseMixin): class VacantBindingIndexTestCase(L3HATestCaseMixin):
@ -1763,12 +1775,14 @@ class L3AgentSchedulerDbMixinTestCase(L3HATestCaseMixin):
def test_bind_router_twice_for_non_ha(self): def test_bind_router_twice_for_non_ha(self):
router = self._create_ha_router(ha=False) router = self._create_ha_router(ha=False)
self.plugin.router_scheduler.bind_router(self.adminContext, self.plugin.router_scheduler.bind_router(self.plugin,
self.adminContext,
router['id'], router['id'],
self.agent1) self.agent_id1)
self.plugin.router_scheduler.bind_router(self.adminContext, self.plugin.router_scheduler.bind_router(self.plugin,
self.adminContext,
router['id'], router['id'],
self.agent2) self.agent_id2)
# Make sure the second bind_router call didn't schedule the router to # Make sure the second bind_router call didn't schedule the router to
# more agents than allowed. # more agents than allowed.