From 74c51a2e5390f258290ee890c9218beb5fdfd29c Mon Sep 17 00:00:00 2001 From: Bence Romsics Date: Tue, 26 Jun 2018 11:16:54 +0200 Subject: [PATCH] Drive binding by placement allocation Drive the choice of mechanism driver during binding as inferred from the resource provider allocated by nova and as told to neutron via the port's binding:profile. As discussed on a neutron qos irc meeting some time ago this patch introduces a new assumption on bind_port() implementations. That is an implementation of bind_port() in any mech driver supporting Guaranteed Minimum Bandwidth bind_port() must not have a non-idempotent side effect. Because the last binding level will be redone for a 2nd time with a narrowed down list of mechanism drivers. And if the 2nd call does not give the same result as the first all kind of weird things can happen. Change-Id: I2b7573ec6795170ce45a13d5d0ad7844fb85182d Depends-On: https://review.openstack.org/574781 Depends-On: https://review.openstack.org/635160 Partial-Bug: #1578989 See-Also: https://review.openstack.org/502306 (nova spec) See-Also: https://review.openstack.org/508149 (neutron spec) --- neutron/plugins/ml2/driver_context.py | 10 ++ neutron/plugins/ml2/drivers/mech_agent.py | 69 ++++++++++ neutron/plugins/ml2/managers.py | 123 +++++++++++++++++- .../unit/plugins/ml2/_test_mech_agent.py | 16 +++ .../plugins/ml2/drivers/mech_fake_agent.py | 4 + .../mech_driver/test_mech_sriov_nic_switch.py | 22 ++++ .../mech_driver/test_mech_openvswitch.py | 22 +++- .../tests/unit/plugins/ml2/test_managers.py | 55 ++++++++ .../unit/plugins/ml2/test_port_binding.py | 17 +++ setup.cfg | 1 + 10 files changed, 334 insertions(+), 5 deletions(-) diff --git a/neutron/plugins/ml2/driver_context.py b/neutron/plugins/ml2/driver_context.py index c61982729e3..4e4afaf5bd8 100644 --- a/neutron/plugins/ml2/driver_context.py +++ b/neutron/plugins/ml2/driver_context.py @@ -304,6 +304,16 @@ class PortContext(MechanismDriverContext, api.PortContext): 
self._binding.vif_details = jsonutils.dumps(vif_details) self._new_port_status = status + def _unset_binding(self): + '''Undo a previous call to set_binding() before it gets committed. + + This method is for MechanismManager and is not part of the driver API. + ''' + self._new_bound_segment = None + self._binding.vif_type = portbindings.VIF_TYPE_UNBOUND + self._binding.vif_details = '' + self._new_port_status = None + def continue_binding(self, segment_id, next_segments_to_bind): # TODO(rkukura) Verify binding allowed, segment in network self._new_bound_segment = segment_id diff --git a/neutron/plugins/ml2/drivers/mech_agent.py b/neutron/plugins/ml2/drivers/mech_agent.py index ee1fa12e8e3..5b41cdfea79 100644 --- a/neutron/plugins/ml2/drivers/mech_agent.py +++ b/neutron/plugins/ml2/drivers/mech_agent.py @@ -14,10 +14,12 @@ # under the License. import abc +import uuid from neutron_lib.api.definitions import portbindings from neutron_lib.callbacks import resources from neutron_lib import constants as const +from neutron_lib.placement import utils as place_utils from neutron_lib.plugins.ml2 import api from oslo_log import log import six @@ -161,6 +163,73 @@ class AgentMechanismDriverBase(api.MechanismDriver): "%s mechanism driver!") % self.agent_type) return supported_vnic_types + def _possible_agents_for_port(self, context): + agent_filters = { + 'host': [context.current['binding:host_id']], + 'agent_type': [self.agent_type], + 'admin_state_up': [True], + # By not filtering for 'alive' we may report being responsible + # and still not being able to handle the binding. But that case + # will be properly logged and handled very soon. That is when + # trying to bind with a dead agent. + } + return context._plugin.get_agents( + context._plugin_context, + filters=agent_filters, + ) + + def responsible_for_ports_allocation(self, context): + """Report if an agent is responsible for a resource provider. 
+ + :param context: PortContext instance describing the port + :returns: True for responsible, False for not responsible + + An agent based mechanism driver is responsible for a resource provider + if an agent of it is responsible for that resource provider. An agent + reports responsibility by including the resource provider in the + configurations field of the agent heartbeat. + """ + uuid_ns = self.resource_provider_uuid5_namespace + if uuid_ns is None: + return False + if 'allocation' not in context.current['binding:profile']: + return False + + rp = uuid.UUID(context.current['binding:profile']['allocation']) + host_agents = self._possible_agents_for_port(context) + + reported = {} + for agent in host_agents: + if 'resource_provider_bandwidths' in agent['configurations']: + for device in agent['configurations'][ + 'resource_provider_bandwidths'].keys(): + device_rp_uuid = place_utils.device_resource_provider_uuid( + namespace=uuid_ns, + host=agent['host'], + device=device) + if device_rp_uuid == rp: + reported[agent['id']] = agent + + if len(reported) == 1: + agent = list(reported.values())[0] + LOG.debug("Agent %(agent)s of type %(agent_type)s reports to be " + "responsible for resource provider %(rsc_provider)s", + {'agent': agent['id'], + 'agent_type': agent['agent_type'], + 'rsc_provider': rp}) + return True + elif len(reported) > 1: + LOG.error("Agent misconfiguration, multiple agents on the same " + "host %(host)s reports being responsible for resource " + "provider %(rsc_provider)s: %(agents)s", + {'host': context.current['binding:host_id'], + 'rsc_provider': rp, + 'agents': reported.keys()}) + return False + else: + # not responsible, must be somebody else + return False + @six.add_metaclass(abc.ABCMeta) class SimpleAgentMechanismDriverBase(AgentMechanismDriverBase): diff --git a/neutron/plugins/ml2/managers.py b/neutron/plugins/ml2/managers.py index 5ca415d735c..1e8fd01f012 100644 --- a/neutron/plugins/ml2/managers.py +++
b/neutron/plugins/ml2/managers.py @@ -22,6 +22,7 @@ from neutron_lib import constants from neutron_lib.db import api as db_api from neutron_lib import exceptions as exc from neutron_lib.exceptions import multiprovidernet as mpnet_exc +from neutron_lib.exceptions import placement as place_exc from neutron_lib.exceptions import vlantransparent as vlan_exc from neutron_lib.plugins.ml2 import api from oslo_config import cfg @@ -757,12 +758,18 @@ class MechanismManager(stevedore.named.NamedExtensionManager): 'vnic_type': binding.vnic_type, 'segments': context.network.network_segments}) - def _bind_port_level(self, context, level, segments_to_bind): + def _bind_port_level(self, context, level, segments_to_bind, + drivers=None, redoing_bottom=False): + if drivers is None: + drivers = self.ordered_mech_drivers + binding = context._binding port_id = context.current['id'] - LOG.debug("Attempting to bind port %(port)s on host %(host)s " - "at level %(level)s using segments %(segments)s", + LOG.debug("Attempting to bind port %(port)s by drivers %(drivers)s " + "on host %(host)s at level %(level)s using " + "segments %(segments)s", {'port': port_id, + 'drivers': ','.join([driver.name for driver in drivers]), 'host': context.host, 'level': level, 'segments': segments_to_bind}) @@ -774,7 +781,7 @@ class MechanismManager(stevedore.named.NamedExtensionManager): 'host': context.host}) return False - for driver in self.ordered_mech_drivers: + for driver in drivers: if not self._check_driver_to_bind(driver, segments_to_bind, context._binding_levels): continue @@ -806,6 +813,61 @@ class MechanismManager(stevedore.named.NamedExtensionManager): 'lvl': level + 1}) context._pop_binding_level() else: + # NOTE(bence romsics): Consider: "In case of + # hierarchical port binding binding_profile.allocation + # [decided and sent by Placement and Nova] + # is meant to drive the binding only on the binding + # level that represents the closest physical interface + # to the nova server." 
Link to spec: + # + # https://review.openstack.org/#/c/508149/14/specs\ + # /rocky/minimum-bandwidth-\ + # allocation-placement-api.rst@582 + # + # But we cannot tell if a binding level is + # the bottom binding level before set_binding() + # gets called, and that's already too late. So we + # must undo the last binding after set_binding() + # was called and redo the last level trying to + # bind only with one driver as inferred from + # the allocation. In order to undo the binding + # here we must also assume that each driver's + # bind_port() implementation is side effect free + # beyond calling set_binding(). + # + # Also please note that technically we allow for + # a redo to call continue_binding() instead of + # set_binding() and by that turn what was supposed + # to be the bottom level into a non-bottom binding + # level. A thorough discussion is recommended if + # you think of taking advantage of this. + # + # Also if we find use cases requiring + # diamond-shaped selections of drivers on different + # levels (eg. driverA and driverB can be both + # a valid choice on level 0, but on level 1 both + # previous choice leads to driverC) then we need + # to restrict segment selection too based on + # traits of the allocated resource provider on + # the top binding_level (==0). + if (context.current['binding:profile'] is not None and + 'allocation' in context.current[ + 'binding:profile'] and + not redoing_bottom): + LOG.debug( + "Undo bottom bound level and redo it " + "according to binding_profile.allocation, " + "resource provider uuid: %s", + context.current[ + 'binding:profile']['allocation']) + context._pop_binding_level() + context._unset_binding() + return self._bind_port_level( + context, level, segments_to_bind, + drivers=[self._infer_driver_from_allocation( + context)], + redoing_bottom=True) + # Binding complete. 
LOG.debug("Bound port: %(port)s, " "host: %(host)s, " @@ -823,6 +885,59 @@ class MechanismManager(stevedore.named.NamedExtensionManager): "bind_port", driver.name) + def _infer_driver_from_allocation(self, context): + """Choose mechanism driver as implied by allocation in placement. + + :param context: PortContext instance describing the port + :returns: a single MechanismDriver instance + + Ports allocated to a resource provider (ie. a physical network + interface) in Placement have the UUID of the provider in their + binding:profile.allocation. The choice of a physical network + interface (as recorded in the allocation) implies a choice of + mechanism driver too. When an allocation was received we expect + exactly one mechanism driver to be responsible for that physical + network interface resource provider. + """ + + drivers = [] + for driver in self.ordered_mech_drivers: + if driver.obj.responsible_for_ports_allocation(context): + drivers.append(driver) + + if len(drivers) == 0: + LOG.error("Failed to bind port %(port)s on host " + "%(host)s allocated on resource provider " + "%(rsc_provider)s, because no mechanism driver " + "reports being responsible", + {'port': context.current['id'], + 'host': context.host, + 'rsc_provider': context.current[ + 'binding:profile']['allocation']}) + raise place_exc.UnknownResourceProvider( + rsc_provider=context.current['binding:profile']['allocation']) + + if len(drivers) >= 2: + raise place_exc.AmbiguousResponsibilityForResourceProvider( + rsc_provider=context.current['binding:profile']['allocation'], + drivers=','.join([driver.name for driver in drivers])) + + # NOTE(bence romsics): The error conditions for raising either + # UnknownResourceProvider or AmbiguousResponsibilityForResourceProvider + # are pretty static therefore the usual 10-times-retry of a binding + # failure could easily be unnecessary in those cases. 
However at this + # point special handling of these exceptions in the binding retry loop + # seems like premature optimization to me since these exceptions are + # always a sign of a misconfigured neutron deployment. + + LOG.debug("Restricting possible bindings of port %(port)s " + "(as inferred from placement allocation) to " + "mechanism driver '%(driver)s'", + {'port': context.current['id'], + 'driver': drivers[0].name}) + + return drivers[0] + def is_host_filtering_supported(self): return all(driver.obj.is_host_filtering_supported() for driver in self.ordered_mech_drivers) diff --git a/neutron/tests/unit/plugins/ml2/_test_mech_agent.py b/neutron/tests/unit/plugins/ml2/_test_mech_agent.py index 89398c29171..edc30ce9ced 100644 --- a/neutron/tests/unit/plugins/ml2/_test_mech_agent.py +++ b/neutron/tests/unit/plugins/ml2/_test_mech_agent.py @@ -246,6 +246,22 @@ class AgentMechanismGenericTestCase(AgentMechanismBaseTestCase): self.driver.bind_port(context) self._check_unbound(context) + def test_driver_not_responsible_for_ports_allocation(self): + agents = [ + {'configurations': {'rp_bandwidths': {'eth0': {}}}, + 'host': 'host'}, + ] + profile = {} + segments = [] + port_ctx = FakePortContext( + self.AGENT_TYPE, + agents, + segments, + vnic_type=portbindings.VNIC_DIRECT, + profile=profile) + self.assertFalse( + self.driver.responsible_for_ports_allocation(port_ctx)) + class AgentMechanismLocalTestCase(AgentMechanismBaseTestCase): LOCAL_SEGMENTS = [{api.ID: 'unknown_segment_id', diff --git a/neutron/tests/unit/plugins/ml2/drivers/mech_fake_agent.py b/neutron/tests/unit/plugins/ml2/drivers/mech_fake_agent.py index 41a5e32c75e..c71c0f2ff5c 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/mech_fake_agent.py +++ b/neutron/tests/unit/plugins/ml2/drivers/mech_fake_agent.py @@ -60,3 +60,7 @@ class FakeAgentMechanismDriver(mech_agent.SimpleAgentMechanismDriverBase): def get_mappings(self, agent): return dict(agent['configurations'].get('interface_mappings', {})) + + 
+class AnotherFakeAgentMechanismDriver(FakeAgentMechanismDriver): + pass diff --git a/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/mech_driver/test_mech_sriov_nic_switch.py b/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/mech_driver/test_mech_sriov_nic_switch.py index 8e084d7a701..35d0afecb78 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/mech_driver/test_mech_sriov_nic_switch.py +++ b/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/mech_driver/test_mech_sriov_nic_switch.py @@ -87,6 +87,28 @@ class SriovSwitchMechGenericTestCase(SriovNicSwitchMechanismBaseTestCase, segment = {api.NETWORK_TYPE: network_type} self.assertTrue(self.driver.check_segment_for_agent(segment)) + def test_driver_responsible_for_ports_allocation(self): + agents = [ + {'agent_type': 'NIC Switch agent', + 'configurations': {'resource_provider_bandwidths': {'eth0': {}}}, + 'host': 'host', + 'id': '1'} + ] + segments = [] + # uuid -v5 87f1895c-73bb-11e8-9008-c4d987b2a692 host:eth0 + profile = {'allocation': '5762cf50-781b-5f01-8ebc-0cce8c9e74cd'} + + port_ctx = base.FakePortContext( + self.AGENT_TYPE, + agents, + segments, + vnic_type=portbindings.VNIC_DIRECT, + profile=profile) + with mock.patch.object(self.driver, '_possible_agents_for_port', + return_value=agents): + self.assertTrue( + self.driver.responsible_for_ports_allocation(port_ctx)) + class SriovMechVlanTestCase(SriovNicSwitchMechanismBaseTestCase, base.AgentMechanismBaseTestCase): diff --git a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/mech_driver/test_mech_openvswitch.py b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/mech_driver/test_mech_openvswitch.py index b67783f175b..cf073b5471a 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/mech_driver/test_mech_openvswitch.py +++ b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/mech_driver/test_mech_openvswitch.py @@ -166,7 +166,27 @@ class OpenvswitchMechanismHybridPlugTestCase(OpenvswitchMechanismBaseTestCase): class 
OpenvswitchMechanismGenericTestCase(OpenvswitchMechanismBaseTestCase, base.AgentMechanismGenericTestCase): - pass + def test_driver_responsible_for_ports_allocation(self): + agents = [ + {'agent_type': 'Open vSwitch agent', + 'configurations': {'resource_provider_bandwidths': {'eth0': {}}}, + 'id': '1', + 'host': 'host'} + ] + segments = [] + # uuid -v5 87ee7d5c-73bb-11e8-9008-c4d987b2a692 host:eth0 + profile = {'allocation': '13cc0ed9-e802-5eaa-b4c7-3441855e31f2'} + + port_ctx = base.FakePortContext( + self.AGENT_TYPE, + agents, + segments, + vnic_type=portbindings.VNIC_NORMAL, + profile=profile) + with mock.patch.object(self.driver, '_possible_agents_for_port', + return_value=agents): + self.assertTrue( + self.driver.responsible_for_ports_allocation(port_ctx)) class OpenvswitchMechanismLocalTestCase(OpenvswitchMechanismBaseTestCase, diff --git a/neutron/tests/unit/plugins/ml2/test_managers.py b/neutron/tests/unit/plugins/ml2/test_managers.py index 38e42981049..dfb53080286 100644 --- a/neutron/tests/unit/plugins/ml2/test_managers.py +++ b/neutron/tests/unit/plugins/ml2/test_managers.py @@ -16,6 +16,7 @@ import mock +from neutron_lib.exceptions import placement as place_exc from neutron_lib.plugins.ml2 import api from oslo_config import cfg from oslo_db import exception as db_exc @@ -68,6 +69,60 @@ class TestManagers(base.BaseTestCase): manager._bind_port_level(self.context, 0, self.segments_to_bind) self.assertEqual(0, bind_port.call_count) + def test__infer_driver_from_allocation_positive(self): + cfg.CONF.set_override( + 'mechanism_drivers', ['fake_agent'], group='ml2') + manager = managers.MechanismManager() + with mock.patch.object(mech_fake_agent.FakeAgentMechanismDriver, + 'responsible_for_ports_allocation', + return_value=True): + responsible_driver = manager._infer_driver_from_allocation( + FakePortContext( + None, + None, + self.segments_to_bind, + profile={'allocation': 'fake_resource_provider'})) + self.assertEqual(responsible_driver.name, 'fake_agent') 
+ + def test__infer_driver_from_allocation_negative(self): + cfg.CONF.set_override( + 'mechanism_drivers', ['fake_agent'], group='ml2') + manager = managers.MechanismManager() + with mock.patch.object(mech_fake_agent.FakeAgentMechanismDriver, + 'responsible_for_ports_allocation', + return_value=False): + self.assertRaises( + place_exc.UnknownResourceProvider, + manager._infer_driver_from_allocation, + FakePortContext( + None, + None, + self.segments_to_bind, + profile={'allocation': 'fake_resource_provider'}) + ) + + def test__infer_driver_from_allocation_ambiguous(self): + cfg.CONF.set_override( + 'mechanism_drivers', + ['fake_agent', 'another_fake_agent'], + group='ml2') + manager = managers.MechanismManager() + with mock.patch.object(mech_fake_agent.FakeAgentMechanismDriver, + 'responsible_for_ports_allocation', + return_value=True), \ + mock.patch.object(mech_fake_agent.AnotherFakeAgentMechanismDriver, + 'responsible_for_ports_allocation', + return_value=True): + self.assertRaises( + place_exc.AmbiguousResponsibilityForResourceProvider, + manager._infer_driver_from_allocation, + FakePortContext( + None, + None, + self.segments_to_bind, + profile={'allocation': 'fake_resource_provider'}) + ) + @mock.patch.object(managers.LOG, 'critical') @mock.patch.object(managers.MechanismManager, '_driver_not_loaded') def test__driver_not_found(self, mock_not_loaded, mock_log): diff --git a/neutron/tests/unit/plugins/ml2/test_port_binding.py b/neutron/tests/unit/plugins/ml2/test_port_binding.py index 18e51c480d8..39fa94a3a0a 100644 --- a/neutron/tests/unit/plugins/ml2/test_port_binding.py +++ b/neutron/tests/unit/plugins/ml2/test_port_binding.py @@ -640,3 +640,20 @@ class ExtendedPortBindingTestCase(test_plugin.NeutronDbPluginV2TestCase): port, new_binding = self._create_port_and_binding() response = self._delete_port_binding(port['id'], 'other-host') self.assertEqual(webob.exc.HTTPNotFound.code, response.status_int) + + def test_binding_fail_for_unknown_allocation(self): + # 
The UUID is a random one - which of course is unknown to neutron + # as a resource provider UUID. + profile = {'allocation': 'ccccbb4c-2adf-11e9-91bc-db7063775d06'} + kwargs = {pbe_ext.PROFILE: profile} + device_owner = '%s%s' % (const.DEVICE_OWNER_COMPUTE_PREFIX, 'nova') + + with self.port(device_owner=device_owner) as port: + port_id = port['port']['id'] + response = self._create_port_binding( + self.fmt, port_id, self.host, **kwargs) + + self.assertEqual(webob.exc.HTTPInternalServerError.code, + response.status_int) + self.assertTrue(exceptions.PortBindingError.__name__ in + response.text) diff --git a/setup.cfg b/setup.cfg index 5b0e3aa08ae..9c587c07578 100644 --- a/setup.cfg +++ b/setup.cfg @@ -92,6 +92,7 @@ neutron.ml2.mechanism_drivers = l2population = neutron.plugins.ml2.drivers.l2pop.mech_driver:L2populationMechanismDriver sriovnicswitch = neutron.plugins.ml2.drivers.mech_sriov.mech_driver.mech_driver:SriovNicSwitchMechanismDriver fake_agent = neutron.tests.unit.plugins.ml2.drivers.mech_fake_agent:FakeAgentMechanismDriver + another_fake_agent = neutron.tests.unit.plugins.ml2.drivers.mech_fake_agent:AnotherFakeAgentMechanismDriver faulty_agent = neutron.tests.unit.plugins.ml2.drivers.mech_faulty_agent:FaultyAgentMechanismDriver neutron.ml2.extension_drivers = test = neutron.tests.unit.plugins.ml2.drivers.ext_test:TestExtensionDriver