From 2ad9c679ed8718633732da1e97307f9fd9647dcc Mon Sep 17 00:00:00 2001
From: John Schwarz
Date: Thu, 13 Oct 2016 13:54:07 +0300
Subject: [PATCH] Don't create HA resources until needed

Change I3447ea5bcb7c57365c6f50efe12a1671e86588b3 introduced a new
running-index for RouterL3AgentBinding, binding_index, which helps to
keep count of how many bindings a router has for each agent (and how
many bindings in total). Since we were able to use this DB column to
make sure concurrency doesn't break when creating a new HA router, we
also postponed the creation of L3HARouterAgentPortBinding until after
the first binding was successfully created.

This patch proposes a change to the way routers are scheduled to an
agent: when creating a new HA router, no L3HARouterAgentPortBinding
entities will be created until after the corresponding
RouterL3AgentBinding was successfully created. In other words, instead
of pre-creating the L3HARouterAgentPortBinding objects without
assigning them to an agent, we now create each one only after its
RouterL3AgentBinding has been successfully created.

Related-Bug: #1609738
Change-Id: Ie98d5e3760cdb17450aea546f4b61f5ba14baf1c
---
 neutron/db/l3_hamode_db.py                     |  56 ++++------
 neutron/scheduler/l3_agent_scheduler.py        |  38 ++-----
 neutron/tests/unit/db/test_l3_hamode_db.py     | 100 ++++++------
 .../ml2/drivers/l2pop/test_mech_driver.py      |  21 ++--
 .../unit/scheduler/test_l3_agent_scheduler.py  |   6 +-
 5 files changed, 76 insertions(+), 145 deletions(-)

diff --git a/neutron/db/l3_hamode_db.py b/neutron/db/l3_hamode_db.py
index 3d6d4bfb8cc..ea26d015699 100644
--- a/neutron/db/l3_hamode_db.py
+++ b/neutron/db/l3_hamode_db.py
@@ -332,24 +332,6 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
                                                 transaction=False)
         return bindings
 
-    def _create_ha_interfaces(self, context, router, ha_network):
-        admin_ctx = context.elevated()
-
-        num_agents = self.get_number_of_agents_for_scheduling(context)
-
-        port_ids = []
-        try:
-            for index in range(num_agents):
-                binding = self.add_ha_port(admin_ctx, router.id,
-                                           ha_network.network['id'],
-                                           router.tenant_id)
-                port_ids.append(binding.port_id)
-        except Exception:
-            with excutils.save_and_reraise_exception():
-                for port_id in port_ids:
-                    self._core_plugin.delete_port(admin_ctx, port_id,
-                                                  l3_port_check=False)
-
     def _delete_ha_interfaces(self, context, router_id):
         admin_ctx = context.elevated()
         device_filter = {'device_id': [router_id],
@@ -370,8 +352,8 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
             self._core_plugin.delete_port(admin_ctx, port_id,
                                           l3_port_check=False)
 
-    def _notify_ha_interfaces_updated(self, context, router_id,
-                                      schedule_routers=True):
+    def _notify_router_updated(self, context, router_id,
+                               schedule_routers=True):
         self.l3_rpc_notifier.routers_updated(
             context, [router_id], shuffle_agents=True,
             schedule_routers=schedule_routers)
@@ -394,9 +376,9 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
                      self)._get_device_owner(context, router)
 
     @n_utils.transaction_guard
-    def _create_ha_interfaces_and_ensure_network(self, context, router_db):
-        """Attach interfaces to a network while tolerating network deletes."""
-        creator = functools.partial(self._create_ha_interfaces,
+    def _set_vr_id_and_ensure_network(self, context, router_db):
+        """Attach vr_id to router while tolerating network deletes."""
+        creator = functools.partial(self._set_vr_id,
                                     context, router_db)
         dep_getter = functools.partial(self.get_ha_network,
                                        context, router_db.tenant_id)
@@ -405,7 +387,7 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
         dep_deleter = functools.partial(self._delete_ha_network, context)
         dep_id_attr = 'network_id'
         return n_utils.create_object_with_dependency(
-            creator, dep_getter, dep_creator, dep_id_attr, dep_deleter)
+            creator, dep_getter, dep_creator, dep_id_attr, dep_deleter)[1]
 
     def _process_extra_attr_router_create(self, context, router_db,
                                           router_res):
@@ -417,6 +399,10 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
     def create_router(self, context, router):
         is_ha = self._is_ha(router['router'])
         if is_ha:
+            # This will throw an exception if there aren't enough agents to
+            # handle this HA router
+            self.get_number_of_agents_for_scheduling(context)
+
             # we set the allocating status to hide it from the L3 agents
             # until we have created all of the requisite interfaces/networks
             router['router']['status'] = n_const.ROUTER_STATUS_ALLOCATING
@@ -426,20 +412,16 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
         if is_ha:
             try:
                 router_db = self._get_router(context, router_dict['id'])
-                # the following returns interfaces and the network we only
-                # care about the network
-                ha_network = self._create_ha_interfaces_and_ensure_network(
-                    context, router_db)[1]
-
-                self._set_vr_id(context, router_db, ha_network)
-                router_dict['ha_vr_id'] = router_db.extra_attributes.ha_vr_id
+                self._set_vr_id_and_ensure_network(context, router_db)
                 self.schedule_router(context, router_dict['id'])
+
+                router_dict['ha_vr_id'] = router_db.extra_attributes.ha_vr_id
                 router_dict['status'] = self._update_router_db(
                     context, router_dict['id'],
                     {'status': n_const.ROUTER_STATUS_ACTIVE})['status']
-                self._notify_ha_interfaces_updated(context, router_db.id,
-                                                   schedule_routers=False)
+                self._notify_router_updated(context, router_db.id,
+                                            schedule_routers=False)
             except Exception:
                 with excutils.save_and_reraise_exception():
                     self.delete_router(context, router_dict['id'])
@@ -520,9 +502,7 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
             self._unbind_ha_router(context, router_id)
 
             if requested_ha_state:
-                ha_network = self._create_ha_interfaces_and_ensure_network(
-                    context, router_db)[1]
-                self._set_vr_id(context, router_db, ha_network)
+                self._set_vr_id_and_ensure_network(context, router_db)
             else:
                 self._delete_ha_interfaces(context, router_db.id)
                 # always attempt to cleanup the network as the router is
@@ -533,8 +513,8 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
         self.schedule_router(context, router_id)
         router_db = super(L3_HA_NAT_db_mixin, self)._update_router_db(
             context, router_id, {'status': n_const.ROUTER_STATUS_ACTIVE})
-        self._notify_ha_interfaces_updated(context, router_db.id,
-                                           schedule_routers=False)
+        self._notify_router_updated(context, router_db.id,
+                                    schedule_routers=False)
 
         return router_db
 
diff --git a/neutron/scheduler/l3_agent_scheduler.py b/neutron/scheduler/l3_agent_scheduler.py
index eb86ae66e47..d78c579be86 100644
--- a/neutron/scheduler/l3_agent_scheduler.py
+++ b/neutron/scheduler/l3_agent_scheduler.py
@@ -277,7 +277,9 @@ class L3Scheduler(object):
             return
         elif sync_router.get('ha', False):
             chosen_agents = self._bind_ha_router(plugin, context,
-                                                 router_id, candidates)
+                                                 router_id,
+                                                 sync_router.get('tenant_id'),
+                                                 candidates)
             if not chosen_agents:
                 return
             chosen_agent = chosen_agents[-1]
@@ -317,11 +319,12 @@ class L3Scheduler(object):
                                 tenant_id, agent,
                                 is_manual_scheduling=False):
         """Creates and binds a new HA port for this agent."""
         ctxt = context.elevated()
+        router_db = plugin._get_router(ctxt, router_id)
         creator = functools.partial(self._add_port_from_net,
                                     plugin, ctxt, router_id, tenant_id)
         dep_getter = functools.partial(plugin.get_ha_network, ctxt, tenant_id)
-        dep_creator = functools.partial(plugin._create_ha_network,
-                                        ctxt, tenant_id)
+        dep_creator = functools.partial(plugin._set_vr_id_and_ensure_network,
+                                        ctxt, router_db)
         dep_deleter = functools.partial(plugin._delete_ha_network, ctxt)
         dep_id_attr = 'network_id'
@@ -375,28 +378,8 @@ class L3Scheduler(object):
                                              router['tenant_id'], agent)
 
-    def _bind_ha_router_to_agents(self, plugin, context, router_id,
-                                  chosen_agents):
-        port_bindings = plugin.get_ha_router_port_bindings(context,
-                                                           [router_id])
-        for port_binding, agent in zip(port_bindings, chosen_agents):
-            if not self.bind_router(plugin, context, router_id, agent.id,
-                                    is_ha=True):
-                break
-
-            try:
-                with db_api.autonested_transaction(context.session):
-                    port_binding.l3_agent_id = agent.id
-            except db_exc.DBDuplicateEntry:
-                LOG.debug("Router %(router)s already scheduled for agent "
-                          "%(agent)s", {'router': router_id,
-                                        'agent': agent.id})
-            else:
-                LOG.debug('HA Router %(router_id)s is scheduled to L3 agent '
-                          '%(agent_id)s)',
-                          {'router_id': router_id, 'agent_id': agent.id})
-
-    def _bind_ha_router(self, plugin, context, router_id, candidates):
+    def _bind_ha_router(self, plugin, context, router_id,
+                        tenant_id, candidates):
         """Bind a HA router to agents based on a specific policy."""
 
         if not self._enough_candidates_for_ha(candidates):
@@ -405,8 +388,9 @@ class L3Scheduler(object):
 
         chosen_agents = self._choose_router_agents_for_ha(
             plugin, context, candidates)
-        self._bind_ha_router_to_agents(plugin, context, router_id,
-                                       chosen_agents)
+        for agent in chosen_agents:
+            self.create_ha_port_and_bind(plugin, context, router_id,
+                                         tenant_id, agent)
 
         return chosen_agents
 
diff --git a/neutron/tests/unit/db/test_l3_hamode_db.py b/neutron/tests/unit/db/test_l3_hamode_db.py
index 834f38b086f..934d2dece35 100644
--- a/neutron/tests/unit/db/test_l3_hamode_db.py
+++ b/neutron/tests/unit/db/test_l3_hamode_db.py
@@ -59,7 +59,7 @@ class L3HATestFramework(testlib_api.SqlTestCase):
         self.setup_coreplugin('ml2')
         self.core_plugin = manager.NeutronManager.get_plugin()
         notif_p = mock.patch.object(l3_hamode_db.L3_HA_NAT_db_mixin,
-                                    '_notify_ha_interfaces_updated')
+                                    '_notify_router_updated')
         self.notif_m = notif_p.start()
 
         cfg.CONF.set_override('allow_overlapping_ips', True)
@@ -162,21 +162,6 @@ class L3HATestCase(L3HATestFramework):
         self.assertIn((self.agent1['id'], 'active'), agent_ids)
         self.assertIn((self.agent2['id'], 'standby'), agent_ids)
 
-    def test_get_l3_bindings_hosting_router_with_ha_states_agent_none(self):
-        with mock.patch.object(self.plugin, 'schedule_router'):
-            # Do not bind router to leave agents as None
-            router = self._create_router()
-
-        res = self.admin_ctx.session.query(
-            l3ha_model.L3HARouterAgentPortBinding).filter(
-            l3ha_model.L3HARouterAgentPortBinding.router_id == router['id']
-        ).all()
-        # Check that agents are None
-        self.assertEqual([None, None], [r.agent for r in res])
-        bindings = self.plugin.get_l3_bindings_hosting_router_with_ha_states(
-            self.admin_ctx, router['id'])
-        self.assertEqual([], bindings)
-
     def test_get_l3_bindings_hosting_router_with_ha_states_not_scheduled(self):
         router = self._create_router(ha=False)
         # Check that there no L3 agents scheduled for this router
@@ -262,16 +247,16 @@ class L3HATestCase(L3HATestFramework):
                          router['status'])
 
     def test_router_created_allocating_state_during_interface_create(self):
-        _orig = self.plugin._create_ha_interfaces
+        _orig = self.plugin._set_vr_id_and_ensure_network
 
-        def check_state(context, router_db, ha_network):
+        def check_state(context, router_db):
             self.assertEqual(n_const.ROUTER_STATUS_ALLOCATING,
                              router_db.status)
-            return _orig(context, router_db, ha_network)
-        with mock.patch.object(self.plugin, '_create_ha_interfaces',
-                               side_effect=check_state) as ha_mock:
+            return _orig(context, router_db)
+        with mock.patch.object(self.plugin, '_set_vr_id_and_ensure_network',
+                               side_effect=check_state) as vr_id_mock:
             router = self._create_router()
-        self.assertTrue(ha_mock.called)
+        self.assertTrue(vr_id_mock.called)
         self.assertEqual(n_const.ROUTER_STATUS_ACTIVE, router['status'])
 
     def test_ha_router_create(self):
@@ -649,15 +634,15 @@ class L3HATestCase(L3HATestFramework):
         networks_after = self.core_plugin.get_networks(self.admin_ctx)
         self.assertEqual(networks_before, networks_after)
 
-    def test_create_ha_interfaces_and_ensure_network_net_exists(self):
+    def test_set_vr_id_and_ensure_network_net_exists(self):
         router = self._create_router()
         router_db = self.plugin._get_router(self.admin_ctx, router['id'])
         with mock.patch.object(self.plugin, '_create_ha_network') as create:
-            self.plugin._create_ha_interfaces_and_ensure_network(
+            self.plugin._set_vr_id_and_ensure_network(
                 self.admin_ctx, router_db)
         self.assertFalse(create.called)
 
-    def test_create_ha_interfaces_and_ensure_network_concurrent_create(self):
+    def test_set_vr_id_and_ensure_network_concurrent_create(self):
         # create a non-ha router so we can manually invoke the create ha
         # interfaces call down below
         router = self._create_router(ha=False)
@@ -672,56 +657,56 @@ class L3HATestCase(L3HATestFramework):
             raise db_exc.DBDuplicateEntry(columns=['tenant_id'])
         with mock.patch.object(self.plugin, '_create_ha_network',
                                new=_create_ha_network):
-            net = self.plugin._create_ha_interfaces_and_ensure_network(
-                self.admin_ctx, router_db)[1]
+            net = self.plugin._set_vr_id_and_ensure_network(
+                self.admin_ctx, router_db)
         # ensure that it used the concurrently created network
         self.assertEqual([net], created_nets)
 
-    def _test_ensure_with_patched_int_create(self, _create_ha_interfaces):
+    def _test_ensure_with_patched_set_vr_id(self, _set_vr_id):
         # create a non-ha router so we can manually invoke the create ha
         # interfaces call down below
         router = self._create_router(ha=False)
         router_db = self.plugin._get_router(self.admin_ctx, router['id'])
-        with mock.patch.object(self.plugin, '_create_ha_interfaces',
-                               new=_create_ha_interfaces):
-            self.plugin._create_ha_interfaces_and_ensure_network(
+        with mock.patch.object(self.plugin, '_set_vr_id',
+                               new=_set_vr_id):
+            self.plugin._set_vr_id_and_ensure_network(
                 self.admin_ctx, router_db)
-        self.assertTrue(_create_ha_interfaces.called)
+        self.assertTrue(_set_vr_id.called)
 
-    def test_create_ha_interfaces_and_ensure_network_interface_failure(self):
+    def test_set_vr_id_and_ensure_network_interface_failure(self):
 
-        def _create_ha_interfaces(ctx, rdb, ha_net):
+        def _set_vr_id(ctx, rdb, ha_net):
             raise ValueError('broken')
         with testtools.ExpectedException(ValueError):
-            self._test_ensure_with_patched_int_create(_create_ha_interfaces)
+            self._test_ensure_with_patched_set_vr_id(_set_vr_id)
         self.assertEqual([], self.core_plugin.get_networks(self.admin_ctx))
 
-    def test_create_ha_interfaces_and_ensure_network_concurrent_delete(self):
-        orig_create = self.plugin._create_ha_interfaces
+    def test_set_vr_id_and_ensure_network_concurrent_delete(self):
+        orig_create = self.plugin._set_vr_id
 
-        def _create_ha_interfaces(ctx, rdb, ha_net):
+        def _set_vr_id(ctx, rdb, ha_net):
             # concurrent delete on the first attempt
-            if not getattr(_create_ha_interfaces, 'called', False):
-                setattr(_create_ha_interfaces, 'called', True)
+            if not getattr(_set_vr_id, 'called', False):
+                setattr(_set_vr_id, 'called', True)
                 self.core_plugin.delete_network(self.admin_ctx,
                                                 ha_net['network_id'])
             return orig_create(ctx, rdb, ha_net)
-        self._test_ensure_with_patched_int_create(_create_ha_interfaces)
+        self._test_ensure_with_patched_set_vr_id(_set_vr_id)
 
-    def test_create_ha_interfaces_and_ensure_network_concurrent_swap(self):
-        orig_create = self.plugin._create_ha_interfaces
+    def test_set_vr_id_and_ensure_network_concurrent_swap(self):
+        orig_create = self.plugin._set_vr_id
 
-        def _create_ha_interfaces(ctx, rdb, ha_net):
+        def _set_vr_id(ctx, rdb, ha_net):
             # concurrent delete on the first attempt
-            if not getattr(_create_ha_interfaces, 'called', False):
-                setattr(_create_ha_interfaces, 'called', True)
+            if not getattr(_set_vr_id, 'called', False):
+                setattr(_set_vr_id, 'called', True)
                 self.core_plugin.delete_network(self.admin_ctx,
                                                 ha_net['network_id'])
                 self.plugin._create_ha_network(self.admin_ctx,
                                                rdb.tenant_id)
             return orig_create(ctx, rdb, ha_net)
-        self._test_ensure_with_patched_int_create(_create_ha_interfaces)
+        self._test_ensure_with_patched_set_vr_id(_set_vr_id)
 
     def test_create_ha_network_tenant_binding_raises_duplicate(self):
         router = self._create_router()
@@ -733,30 +718,11 @@ class L3HATestCase(L3HATestFramework):
             self.plugin._create_ha_network_tenant_binding(
                 self.admin_ctx, 't1', network['network_id'])
 
-    def test_create_ha_interfaces_binding_failure_rolls_back_ports(self):
-        router = self._create_router()
-        network = self.plugin.get_ha_network(self.admin_ctx,
-                                             router['tenant_id'])
-        device_filter = {'device_id': [router['id']]}
-        ports_before = self.core_plugin.get_ports(
-            self.admin_ctx, filters=device_filter)
-
-        router_db = self.plugin._get_router(self.admin_ctx, router['id'])
-        with mock.patch.object(l3ha_model, 'L3HARouterAgentPortBinding',
-                               side_effect=ValueError):
-            self.assertRaises(ValueError, self.plugin._create_ha_interfaces,
-                              self.admin_ctx, router_db, network)
-
-        ports_after = self.core_plugin.get_ports(
-            self.admin_ctx, filters=device_filter)
-        self.assertEqual(ports_before, ports_after)
-
     def test_create_router_db_ha_attribute_failure_rolls_back_router(self):
         routers_before = self.plugin.get_routers(self.admin_ctx)
         for method in ('_set_vr_id',
-                       '_create_ha_interfaces',
-                       '_notify_ha_interfaces_updated'):
+                       '_notify_router_updated'):
             with mock.patch.object(self.plugin, method,
                                    side_effect=ValueError):
                 self.assertRaises(ValueError, self._create_router)
diff --git a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
index e3ec9ccc5df..710dc09c402 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
@@ -119,7 +119,7 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
 
     def _setup_l3(self):
         notif_p = mock.patch.object(l3_hamode_db.L3_HA_NAT_db_mixin,
-                                    '_notify_ha_interfaces_updated')
+                                    '_notify_router_updated')
         self.notif_m = notif_p.start()
         self.plugin = FakeL3PluginWithAgents()
         self._register_ml2_agents()
@@ -208,17 +208,18 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
         router['distributed'] = distributed
         return self.plugin.create_router(ctx, {'router': router})
 
-    def _bind_router(self, router_id):
-        with self.adminContext.session.begin(subtransactions=True):
-            scheduler = l3_agent_scheduler.ChanceScheduler()
-            filters = {'agent_type': [constants.AGENT_TYPE_L3]}
-            agents_db = self.plugin.get_agents_db(self.adminContext,
-                                                  filters=filters)
-            scheduler._bind_ha_router_to_agents(
+    def _bind_router(self, router_id, tenant_id):
+        scheduler = l3_agent_scheduler.ChanceScheduler()
+        filters = {'agent_type': [constants.AGENT_TYPE_L3]}
+        agents_db = self.plugin.get_agents_db(self.adminContext,
+                                              filters=filters)
+        for agent_db in agents_db:
+            scheduler.create_ha_port_and_bind(
                 self.plugin,
                 self.adminContext,
                 router_id,
-                agents_db)
+                tenant_id,
+                agent_db)
         self._bind_ha_network_ports(router_id)
 
     def _bind_ha_network_ports(self, router_id):
@@ -262,7 +263,7 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
     def _create_ha_router(self):
         self._setup_l3()
         router = self._create_router()
-        self._bind_router(router['id'])
+        self._bind_router(router['id'], router['tenant_id'])
         return router
 
     def _verify_remove_fdb(self, expected, agent_id, device, host=None):
diff --git a/neutron/tests/unit/scheduler/test_l3_agent_scheduler.py b/neutron/tests/unit/scheduler/test_l3_agent_scheduler.py
index 41a461ec787..bbe18b10250 100644
--- a/neutron/tests/unit/scheduler/test_l3_agent_scheduler.py
+++ b/neutron/tests/unit/scheduler/test_l3_agent_scheduler.py
@@ -1623,7 +1623,7 @@ class L3_HA_scheduler_db_mixinTestCase(L3HATestCaseMixin):
     def test_get_ordered_l3_agents_by_num_routers(self):
         # Mock scheduling so that the test can control it explicitly
         mock.patch.object(l3_hamode_db.L3_HA_NAT_db_mixin,
-                          '_notify_ha_interfaces_updated').start()
+                          '_notify_router_updated').start()
         with mock.patch.object(self.plugin, 'schedule_router'):
             router1 = self._create_ha_router()
             router2 = self._create_ha_router()
@@ -1817,7 +1817,7 @@ class L3HAChanceSchedulerTestCase(L3HATestCaseMixin):
     def test_auto_schedule(self):
         # Mock scheduling so that the test can control it explicitly
         mock.patch.object(l3_hamode_db.L3_HA_NAT_db_mixin,
-                          '_notify_ha_interfaces_updated').start()
+                          '_notify_router_updated').start()
         router = self._create_ha_router()
 
         self.plugin.auto_schedule_routers(
@@ -2036,7 +2036,7 @@ class L3AgentAZLeastRoutersSchedulerTestCase(L3HATestCaseMixin):
             'neutron.scheduler.l3_agent_scheduler.AZLeastRoutersScheduler')
         # Mock scheduling so that the test can control it explicitly
         mock.patch.object(l3_hamode_db.L3_HA_NAT_db_mixin,
-                          '_notify_ha_interfaces_updated').start()
+                          '_notify_router_updated').start()
 
     def _register_l3_agents(self):
         self.agent1 = helpers.register_l3_agent(host='az1-host1', az='az1')
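
The ordering this patch enforces can be summarized with a short sketch.
The following is illustrative only, not Neutron code: FakeDB,
DuplicateEntry and bind_ha_router are invented stand-ins for the real
RouterL3AgentBinding and L3HARouterAgentPortBinding tables,
db_exc.DBDuplicateEntry, and L3Scheduler._bind_ha_router. It models the
core idea: the router/agent binding row, guarded by a unique
(router_id, binding_index) constraint, is committed first, and the HA
port binding for the same agent is created only if that succeeds, so a
scheduler that loses a concurrent race never leaves an agent-less port
binding behind.

    class DuplicateEntry(Exception):
        """Stand-in for oslo.db's DBDuplicateEntry."""


    class FakeDB(object):
        """Toy store for the two binding tables involved."""

        def __init__(self):
            self.router_bindings = set()   # unique (router_id, binding_index)
            self.ha_port_bindings = []     # (router_id, agent_id)

        def add_router_binding(self, router_id, agent_id, binding_index):
            key = (router_id, binding_index)
            if key in self.router_bindings:
                # Another scheduler committed this binding_index first.
                raise DuplicateEntry()
            self.router_bindings.add(key)

        def add_ha_port_binding(self, router_id, agent_id):
            self.ha_port_bindings.append((router_id, agent_id))


    def bind_ha_router(db, router_id, candidates, max_agents=2):
        """Bind up to max_agents agents; HA port rows only follow successes."""
        bound = []
        for agent_id in candidates:
            try:
                # Step 1: the RouterL3AgentBinding-like row is the guard.
                db.add_router_binding(router_id, agent_id, len(bound) + 1)
            except DuplicateEntry:
                continue   # lost the race for this slot, try the next agent
            # Step 2: only now create the L3HARouterAgentPortBinding-like row.
            db.add_ha_port_binding(router_id, agent_id)
            bound.append(agent_id)
            if len(bound) >= max_agents:
                break
        return bound


    db = FakeDB()
    print(bind_ha_router(db, 'router-1', ['agent-1', 'agent-2', 'agent-3']))
    # ['agent-1', 'agent-2'] -- every row in db.ha_port_bindings has an agent.

Contrast this with the removed _bind_ha_router_to_agents, which zipped
agents onto L3HARouterAgentPortBinding rows that had been created up
front with no agent assigned.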