Browse Source

Race condition of L3-agent to add/remove routers

This race condition happens when repeatedly calling
l3-agent-router-add and l3-agent-router-remove
by different neutron-servers at the same time.

The primary key constraint is added for the pair of
(router_id and l3_agent_id).

During migration, verification is done if the current
records violate the PK constraint defined in this bug
fix, and sanitize the data before schema modification.

Due to different dialects of database engines, different
sql statements are executed correspondingly to do
the verification.

Change-Id: Ia541e023b757b2e77c4eec9bb1670632c7a271fa
Closes-Bug: #1230323
changes/34/73234/56
Li Ma 9 years ago
parent
commit
fbc6b991a7
  1. 10
      neutron/db/l3_agentschedulers_db.py
  2. 127
      neutron/db/migration/alembic_migrations/versions/31d7f831a591_add_constraint_for_routerid.py
  3. 2
      neutron/db/migration/alembic_migrations/versions/HEAD
  4. 26
      neutron/scheduler/l3_agent_scheduler.py
  5. 35
      neutron/tests/unit/test_l3_schedulers.py

10
neutron/db/l3_agentschedulers_db.py

@ -24,7 +24,6 @@ from neutron.common import constants
from neutron.db import agents_db
from neutron.db import agentschedulers_db
from neutron.db import model_base
from neutron.db import models_v2
from neutron.extensions import l3agentscheduler
@ -40,15 +39,16 @@ L3_AGENTS_SCHEDULER_OPTS = [
cfg.CONF.register_opts(L3_AGENTS_SCHEDULER_OPTS)
class RouterL3AgentBinding(model_base.BASEV2, models_v2.HasId):
class RouterL3AgentBinding(model_base.BASEV2):
"""Represents binding between neutron routers and L3 agents."""
router_id = sa.Column(sa.String(36),
sa.ForeignKey("routers.id", ondelete='CASCADE'))
sa.ForeignKey("routers.id", ondelete='CASCADE'),
primary_key=True)
l3_agent = orm.relation(agents_db.Agent)
l3_agent_id = sa.Column(sa.String(36),
sa.ForeignKey("agents.id",
ondelete='CASCADE'))
sa.ForeignKey("agents.id", ondelete='CASCADE'),
primary_key=True)
class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,

127
neutron/db/migration/alembic_migrations/versions/31d7f831a591_add_constraint_for_routerid.py

@ -0,0 +1,127 @@
# Copyright 2014 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
"""add constraint for routerid
Revision ID: 31d7f831a591
Revises: 37f322991f59
Create Date: 2014-02-26 06:47:16.494393
"""
# revision identifiers, used by Alembic.
revision = '31d7f831a591'
down_revision = '37f322991f59'
from alembic import op
import sqlalchemy as sa
TABLE_NAME = 'routerl3agentbindings'
PK_NAME = 'pk_routerl3agentbindings'
fk_names = {'postgresql':
{'router_id':
'routerl3agentbindings_router_id_fkey',
'l3_agent_id':
'routerl3agentbindings_l3_agent_id_fkey'},
'mysql':
{'router_id':
'routerl3agentbindings_ibfk_2',
'l3_agent_id':
'routerl3agentbindings_ibfk_1'}}
def upgrade(active_plugins=None, options=None):
# In order to sanitize the data during migration,
# the current records in the table need to be verified
# and all the duplicate records which violate the PK
# constraint need to be removed.
context = op.get_context()
if context.bind.dialect.name == 'postgresql':
op.execute('DELETE FROM %(table)s WHERE id in ('
'SELECT %(table)s.id FROM %(table)s LEFT OUTER JOIN '
'(SELECT MIN(id) as id, router_id, l3_agent_id '
' FROM %(table)s GROUP BY router_id, l3_agent_id) AS temp '
'ON %(table)s.id = temp.id WHERE temp.id is NULL);'
% {'table': TABLE_NAME})
else:
op.execute('DELETE %(table)s FROM %(table)s LEFT OUTER JOIN '
'(SELECT MIN(id) as id, router_id, l3_agent_id '
' FROM %(table)s GROUP BY router_id, l3_agent_id) AS temp '
'ON %(table)s.id = temp.id WHERE temp.id is NULL;'
% {'table': TABLE_NAME})
op.drop_column(TABLE_NAME, 'id')
op.create_primary_key(
name=PK_NAME,
table_name=TABLE_NAME,
cols=['router_id', 'l3_agent_id']
)
def downgrade(active_plugins=None, options=None):
context = op.get_context()
dialect = context.bind.dialect.name
# Drop the existed foreign key constraints
# In order to perform primary key changes
op.drop_constraint(
name=fk_names[dialect]['l3_agent_id'],
table_name=TABLE_NAME,
type_='foreignkey'
)
op.drop_constraint(
name=fk_names[dialect]['router_id'],
table_name=TABLE_NAME,
type_='foreignkey'
)
op.drop_constraint(
name=PK_NAME,
table_name=TABLE_NAME,
type_='primary'
)
op.add_column(
TABLE_NAME,
sa.Column('id', sa.String(32))
)
# Restore the foreign key constraints
op.create_foreign_key(
name=fk_names[dialect]['router_id'],
source=TABLE_NAME,
referent='routers',
local_cols=['router_id'],
remote_cols=['id'],
ondelete='CASCADE'
)
op.create_foreign_key(
name=fk_names[dialect]['l3_agent_id'],
source=TABLE_NAME,
referent='agents',
local_cols=['l3_agent_id'],
remote_cols=['id'],
ondelete='CASCADE'
)
op.create_primary_key(
name=PK_NAME,
table_name=TABLE_NAME,
cols=['id']
)

2
neutron/db/migration/alembic_migrations/versions/HEAD

@ -1 +1 @@
37f322991f59
31d7f831a591

26
neutron/scheduler/l3_agent_scheduler.py

@ -16,6 +16,7 @@
import abc
import random
from oslo.db import exception as db_exc
import six
from sqlalchemy.orm import exc
from sqlalchemy import sql
@ -145,15 +146,22 @@ class L3Scheduler(object):
def bind_router(self, context, router_id, chosen_agent):
"""Bind the router to the l3 agent which has been chosen."""
with context.session.begin(subtransactions=True):
binding = l3_agentschedulers_db.RouterL3AgentBinding()
binding.l3_agent = chosen_agent
binding.router_id = router_id
context.session.add(binding)
LOG.debug(_('Router %(router_id)s is scheduled to '
'L3 agent %(agent_id)s'),
{'router_id': router_id,
'agent_id': chosen_agent.id})
try:
with context.session.begin(subtransactions=True):
binding = l3_agentschedulers_db.RouterL3AgentBinding()
binding.l3_agent = chosen_agent
binding.router_id = router_id
context.session.add(binding)
except db_exc.DBDuplicateEntry:
LOG.debug('Router %(router_id)s has already been scheduled '
'to L3 agent %(agent_id)s.',
{'agent_id': chosen_agent.id,
'router_id': router_id})
return
LOG.debug('Router %(router_id)s is scheduled to L3 agent '
'%(agent_id)s', {'router_id': router_id,
'agent_id': chosen_agent.id})
class ChanceScheduler(L3Scheduler):

35
neutron/tests/unit/test_l3_schedulers.py

@ -31,6 +31,7 @@ from neutron.db import l3_agentschedulers_db
from neutron.extensions import l3 as ext_l3
from neutron import manager
from neutron.openstack.common import timeutils
from neutron.scheduler import l3_agent_scheduler
from neutron.tests.unit import test_db_plugin
from neutron.tests.unit import test_l3_plugin
@ -93,6 +94,7 @@ class L3SchedulerTestCase(l3_agentschedulers_db.L3AgentSchedulerDbMixin,
agent_db = self.plugin.get_agents_db(self.adminContext,
filters={'host': [HOST]})
self.agent_id1 = agent_db[0].id
self.agent1 = agent_db[0]
callback.report_state(self.adminContext,
agent_state={'agent_state': SECOND_L3_AGENT},
@ -124,6 +126,39 @@ class L3SchedulerTestCase(l3_agentschedulers_db.L3AgentSchedulerDbMixin,
router['router']['id'], subnet['subnet']['network_id'])
self._delete('routers', router['router']['id'])
def _test_schedule_bind_router(self, agent, router):
ctx = self.adminContext
session = ctx.session
db = l3_agentschedulers_db.RouterL3AgentBinding
scheduler = l3_agent_scheduler.ChanceScheduler()
rid = router['router']['id']
scheduler.bind_router(ctx, rid, agent)
results = (session.query(db).filter_by(router_id=rid).all())
self.assertTrue(len(results) > 0)
self.assertIn(agent.id, [bind.l3_agent_id for bind in results])
def test_bind_new_router(self):
router = self._make_router(self.fmt,
tenant_id=str(uuid.uuid4()),
name='r1')
with mock.patch.object(l3_agent_scheduler.LOG, 'debug') as flog:
self._test_schedule_bind_router(self.agent1, router)
self.assertEqual(1, flog.call_count)
args, kwargs = flog.call_args
self.assertIn('is scheduled', args[0])
def test_bind_existing_router(self):
router = self._make_router(self.fmt,
tenant_id=str(uuid.uuid4()),
name='r2')
self._test_schedule_bind_router(self.agent1, router)
with mock.patch.object(l3_agent_scheduler.LOG, 'debug') as flog:
self._test_schedule_bind_router(self.agent1, router)
self.assertEqual(1, flog.call_count)
args, kwargs = flog.call_args
self.assertIn('has already been scheduled', args[0])
class L3AgentChanceSchedulerTestCase(L3SchedulerTestCase):

Loading…
Cancel
Save