Drive binding by placement allocation

Drive the choice of mechanism driver during binding as inferred from
the resource provider allocated by nova and as told to neutron via the
port's binding:profile.

As discussed on a neutron qos irc meeting some time ago
this patch introduces a new assumption on bind_port() implementations.
That is an implementation of bind_port() in any mech driver supporting
Guaranteed Minimum Bandwidth bind_port() must not have a non-idempotent
side effect. Because the last binding level will be redone for a 2nd
time with a narrowed down list of mechanism drivers. And if the 2nd call
does not give the same result as the first all kind of weird things can
happen.

Change-Id: I2b7573ec6795170ce45a13d5d0ad7844fb85182d
Depends-On: https://review.openstack.org/574781
Depends-On: https://review.openstack.org/635160
Partial-Bug: #1578989
See-Also: https://review.openstack.org/502306 (nova spec)
See-Also: https://review.openstack.org/508149 (neutron spec)
This commit is contained in:
Bence Romsics 2018-06-26 11:16:54 +02:00 committed by Slawek Kaplonski
parent fe73e8c9b3
commit 74c51a2e53
10 changed files with 334 additions and 5 deletions

View File

@ -304,6 +304,16 @@ class PortContext(MechanismDriverContext, api.PortContext):
self._binding.vif_details = jsonutils.dumps(vif_details)
self._new_port_status = status
def _unset_binding(self):
'''Undo a previous call to set_binding() before it gets committed.
This method is for MechanismManager and is not part of the driver API.
'''
self._new_bound_segment = None
self._binding.vif_type = portbindings.VIF_TYPE_UNBOUND
self._binding.vif_details = ''
self._new_port_status = None
def continue_binding(self, segment_id, next_segments_to_bind):
# TODO(rkukura) Verify binding allowed, segment in network
self._new_bound_segment = segment_id

View File

@ -14,10 +14,12 @@
# under the License.
import abc
import uuid
from neutron_lib.api.definitions import portbindings
from neutron_lib.callbacks import resources
from neutron_lib import constants as const
from neutron_lib.placement import utils as place_utils
from neutron_lib.plugins.ml2 import api
from oslo_log import log
import six
@ -161,6 +163,73 @@ class AgentMechanismDriverBase(api.MechanismDriver):
"%s mechanism driver!") % self.agent_type)
return supported_vnic_types
def _possible_agents_for_port(self, context):
agent_filters = {
'host': [context.current['binding:host_id']],
'agent_type': [self.agent_type],
'admin_state_up': [True],
# By not filtering for 'alive' we may report being responsible
# and still not being able to handle the binding. But that case
# will be properly logged and handled very soon. That is when
# trying to bind with a dead agent.
}
return context._plugin.get_agents(
context._plugin_context,
filters=agent_filters,
)
def responsible_for_ports_allocation(self, context):
"""Report if an agent is responsible for a resource provider.
:param context: PortContext instance describing the port
:returns: True for responsible, False for not responsible
An agent based mechanism driver is reponsible for a resource provider
if an agent of it is responsible for that resource provider. An agent
reports responsibility by including the resource provider in the
configurations field of the agent heartbeat.
"""
uuid_ns = self.resource_provider_uuid5_namespace
if uuid_ns is None:
return False
if 'allocation' not in context.current['binding:profile']:
return False
rp = uuid.UUID(context.current['binding:profile']['allocation'])
host_agents = self._possible_agents_for_port(context)
reported = {}
for agent in host_agents:
if 'resource_provider_bandwidths' in agent['configurations']:
for device in agent['configurations'][
'resource_provider_bandwidths'].keys():
device_rp_uuid = place_utils.device_resource_provider_uuid(
namespace=uuid_ns,
host=agent['host'],
device=device)
if device_rp_uuid == rp:
reported[agent['id']] = agent
if len(reported) == 1:
agent = list(reported.values())[0]
LOG.debug("Agent %(agent)s of type %(agent_type)s reports to be "
"responsible for resource provider %(rsc_provider)s",
{'agent': agent['id'],
'agent_type': agent['agent_type'],
'rsc_provider': rp})
return True
elif len(reported) > 1:
LOG.error("Agent misconfiguration, multiple agents on the same "
"host %(host)s reports being responsible for resource "
"provider %(rsc_provider)s: %(agents)s",
{'host': context.current['binding:host_id'],
'rsc_provider': rp,
'agents': reported.keys()})
return False
else:
# not responsible, must be somebody else
return False
@six.add_metaclass(abc.ABCMeta)
class SimpleAgentMechanismDriverBase(AgentMechanismDriverBase):

View File

@ -22,6 +22,7 @@ from neutron_lib import constants
from neutron_lib.db import api as db_api
from neutron_lib import exceptions as exc
from neutron_lib.exceptions import multiprovidernet as mpnet_exc
from neutron_lib.exceptions import placement as place_exc
from neutron_lib.exceptions import vlantransparent as vlan_exc
from neutron_lib.plugins.ml2 import api
from oslo_config import cfg
@ -757,12 +758,18 @@ class MechanismManager(stevedore.named.NamedExtensionManager):
'vnic_type': binding.vnic_type,
'segments': context.network.network_segments})
def _bind_port_level(self, context, level, segments_to_bind):
def _bind_port_level(self, context, level, segments_to_bind,
drivers=None, redoing_bottom=False):
if drivers is None:
drivers = self.ordered_mech_drivers
binding = context._binding
port_id = context.current['id']
LOG.debug("Attempting to bind port %(port)s on host %(host)s "
"at level %(level)s using segments %(segments)s",
LOG.debug("Attempting to bind port %(port)s by drivers %(drivers)s "
"on host %(host)s at level %(level)s using "
"segments %(segments)s",
{'port': port_id,
'drivers': ','.join([driver.name for driver in drivers]),
'host': context.host,
'level': level,
'segments': segments_to_bind})
@ -774,7 +781,7 @@ class MechanismManager(stevedore.named.NamedExtensionManager):
'host': context.host})
return False
for driver in self.ordered_mech_drivers:
for driver in drivers:
if not self._check_driver_to_bind(driver, segments_to_bind,
context._binding_levels):
continue
@ -806,6 +813,61 @@ class MechanismManager(stevedore.named.NamedExtensionManager):
'lvl': level + 1})
context._pop_binding_level()
else:
# NOTE(bence romsics): Consider: "In case of
# hierarchical port binding binding_profile.allocation
# [decided and sent by Placement and Nova]
# is meant to drive the binding only on the binding
# level that represents the closest physical interface
# to the nova server." Link to spec:
#
# https://review.openstack.org/#/c/508149/14/specs\
# /rocky/minimum-bandwidth-\
# allocation-placement-api.rst@582
#
# But we cannot tell if a binding level is
# the bottom binding level before set_binding()
# gets called, and that's already too late. So we
# must undo the last binding after set_binding()
# was called and redo the last level trying to
# bind only with one driver as inferred from
# the allocation. In order to undo the binding
# here we must also assume that each driver's
# bind_port() implementation is side effect free
# beyond calling set_binding().
#
# Also please note that technically we allow for
# a redo to call continue_binding() instead of
# set_binding() and by that turn what was supposed
# to be the bottom level into a non-bottom binding
# level. A thorough discussion is recommended if
# you think of taking advantage of this.
#
# Also if we find use cases requiring
# diamond-shaped selections of drivers on different
# levels (eg. driverA and driverB can be both
# a valid choice on level 0, but on level 1 both
# previous choice leads to driverC) then we need
# to restrict segment selection too based on
# traits of the allocated resource provider on
# the top binding_level (==0).
if (context.current['binding:profile'] is not None and
'allocation' in context.current[
'binding:profile'] and
not redoing_bottom):
LOG.debug(
"Undo bottom bound level and redo it "
"according to binding_profile.allocation, "
"resource provider uuid: %s",
context.current[
'binding:profile']['allocation'])
context._pop_binding_level()
context._unset_binding()
return self._bind_port_level(
context, level, segments_to_bind,
drivers=[self._infer_driver_from_allocation(
context)],
redoing_bottom=True)
# Binding complete.
LOG.debug("Bound port: %(port)s, "
"host: %(host)s, "
@ -823,6 +885,59 @@ class MechanismManager(stevedore.named.NamedExtensionManager):
"bind_port",
driver.name)
def _infer_driver_from_allocation(self, context):
"""Choose mechanism driver as implied by allocation in placement.
:param context: PortContext instance describing the port
:returns: a single MechanismDriver instance
Ports allocated to a resource provider (ie. a physical network
interface) in Placement have the UUID of the provider in their
binding:profile.allocation. The choice of a physical network
interface (as recorded in the allocation) implies a choice of
mechanism driver too. When an allocation was received we expect
exactly one mechanism driver to be responsible for that physical
network interface resource provider.
"""
drivers = []
for driver in self.ordered_mech_drivers:
if driver.obj.responsible_for_ports_allocation(context):
drivers.append(driver)
if len(drivers) == 0:
LOG.error("Failed to bind port %(port)s on host "
"%(host)s allocated on resource provider "
"%(rsc_provider)s, because no mechanism driver "
"reports being responsible",
{'port': context.current['id'],
'host': context.host,
'rsc_provider': context.current[
'binding:profile']['allocation']})
raise place_exc.UnknownResourceProvider(
rsc_provider=context.current['binding:profile']['allocation'])
if len(drivers) >= 2:
raise place_exc.AmbiguousResponsibilityForResourceProvider(
rsc_provider=context.current['binding:profile']['allocation'],
drivers=','.join([driver.name for driver in drivers]))
# NOTE(bence romsics): The error conditions for raising either
# UnknownResourceProvider or AmbiguousResponsibilityForResourceProvider
# are pretty static therefore the usual 10-times-retry of a binding
# failure could easily be unnecessary in those cases. However at this
# point special handling of these exceptions in the binding retry loop
# seems like premature optimization to me since these exceptions are
# always a sign of a misconfigured neutron deployment.
LOG.debug("Restricting possible bindings of port %(port)s "
"(as inferred from placement allocation) to "
"mechanism driver '%(driver)s'",
{'port': context.current['id'],
'driver': drivers[0].name})
return drivers[0]
def is_host_filtering_supported(self):
return all(driver.obj.is_host_filtering_supported()
for driver in self.ordered_mech_drivers)

View File

@ -246,6 +246,22 @@ class AgentMechanismGenericTestCase(AgentMechanismBaseTestCase):
self.driver.bind_port(context)
self._check_unbound(context)
def test_driver_not_responsible_for_ports_allocation(self):
agents = [
{'configurations': {'rp_bandwidths': {'eth0': {}}},
'host': 'host'},
]
profile = {}
segments = []
port_ctx = FakePortContext(
self.AGENT_TYPE,
agents,
segments,
vnic_type=portbindings.VNIC_DIRECT,
profile=profile)
self.assertFalse(
self.driver.responsible_for_ports_allocation(port_ctx))
class AgentMechanismLocalTestCase(AgentMechanismBaseTestCase):
LOCAL_SEGMENTS = [{api.ID: 'unknown_segment_id',

View File

@ -60,3 +60,7 @@ class FakeAgentMechanismDriver(mech_agent.SimpleAgentMechanismDriverBase):
def get_mappings(self, agent):
return dict(agent['configurations'].get('interface_mappings', {}))
class AnotherFakeAgentMechanismDriver(FakeAgentMechanismDriver):
pass

View File

@ -87,6 +87,28 @@ class SriovSwitchMechGenericTestCase(SriovNicSwitchMechanismBaseTestCase,
segment = {api.NETWORK_TYPE: network_type}
self.assertTrue(self.driver.check_segment_for_agent(segment))
def test_driver_responsible_for_ports_allocation(self):
agents = [
{'agent_type': 'NIC Switch agent',
'configurations': {'resource_provider_bandwidths': {'eth0': {}}},
'host': 'host',
'id': '1'}
]
segments = []
# uuid -v5 87f1895c-73bb-11e8-9008-c4d987b2a692 host:eth0
profile = {'allocation': '5762cf50-781b-5f01-8ebc-0cce8c9e74cd'}
port_ctx = base.FakePortContext(
self.AGENT_TYPE,
agents,
segments,
vnic_type=portbindings.VNIC_DIRECT,
profile=profile)
with mock.patch.object(self.driver, '_possible_agents_for_port',
return_value=agents):
self.assertTrue(
self.driver.responsible_for_ports_allocation(port_ctx))
class SriovMechVlanTestCase(SriovNicSwitchMechanismBaseTestCase,
base.AgentMechanismBaseTestCase):

View File

@ -166,7 +166,27 @@ class OpenvswitchMechanismHybridPlugTestCase(OpenvswitchMechanismBaseTestCase):
class OpenvswitchMechanismGenericTestCase(OpenvswitchMechanismBaseTestCase,
base.AgentMechanismGenericTestCase):
pass
def test_driver_responsible_for_ports_allocation(self):
agents = [
{'agent_type': 'Open vSwitch agent',
'configurations': {'resource_provider_bandwidths': {'eth0': {}}},
'id': '1',
'host': 'host'}
]
segments = []
# uuid -v5 87ee7d5c-73bb-11e8-9008-c4d987b2a692 host:eth0
profile = {'allocation': '13cc0ed9-e802-5eaa-b4c7-3441855e31f2'}
port_ctx = base.FakePortContext(
self.AGENT_TYPE,
agents,
segments,
vnic_type=portbindings.VNIC_NORMAL,
profile=profile)
with mock.patch.object(self.driver, '_possible_agents_for_port',
return_value=agents):
self.assertTrue(
self.driver.responsible_for_ports_allocation(port_ctx))
class OpenvswitchMechanismLocalTestCase(OpenvswitchMechanismBaseTestCase,

View File

@ -16,6 +16,7 @@
import mock
from neutron_lib.exceptions import placement as place_exc
from neutron_lib.plugins.ml2 import api
from oslo_config import cfg
from oslo_db import exception as db_exc
@ -68,6 +69,60 @@ class TestManagers(base.BaseTestCase):
manager._bind_port_level(self.context, 0, self.segments_to_bind)
self.assertEqual(0, bind_port.call_count)
def test__infer_driver_from_allocation_positive(self):
cfg.CONF.set_override(
'mechanism_drivers', ['fake_agent'], group='ml2')
manager = managers.MechanismManager()
with mock.patch.object(mech_fake_agent.FakeAgentMechanismDriver,
'responsible_for_ports_allocation',
return_value=True):
responsible_driver = manager._infer_driver_from_allocation(
FakePortContext(
None,
None,
self.segments_to_bind,
profile={'allocation': 'fake_resource_provider'}))
self.assertEqual(responsible_driver.name, 'fake_agent')
def test__infer_driver_from_allocation_negative(self):
cfg.CONF.set_override(
'mechanism_drivers', ['fake_agent'], group='ml2')
manager = managers.MechanismManager()
with mock.patch.object(mech_fake_agent.FakeAgentMechanismDriver,
'responsible_for_ports_allocation',
return_value=False):
self.assertRaises(
place_exc.UnknownResourceProvider,
manager._infer_driver_from_allocation,
FakePortContext(
None,
None,
self.segments_to_bind,
profile={'allocation': 'fake_resource_provider'})
)
def test__infer_driver_from_allocation_ambiguous(self):
cfg.CONF.set_override(
'mechanism_drivers',
['fake_agent', 'another_fake_agent'],
group='ml2')
manager = managers.MechanismManager()
with mock.patch.object(mech_fake_agent.FakeAgentMechanismDriver,
'responsible_for_ports_allocation',
return_value=True), \
mock.patch.object(mech_fake_agent.AnotherFakeAgentMechanismDriver,
'responsible_for_ports_allocation',
return_value=True):
self.assertRaises(
place_exc.AmbiguousResponsibilityForResourceProvider,
manager._infer_driver_from_allocation,
FakePortContext(
None,
None,
self.segments_to_bind,
profile={'allocation': 'fake_resource_provider'})
)
@mock.patch.object(managers.LOG, 'critical')
@mock.patch.object(managers.MechanismManager, '_driver_not_loaded')
def test__driver_not_found(self, mock_not_loaded, mock_log):

View File

@ -640,3 +640,20 @@ class ExtendedPortBindingTestCase(test_plugin.NeutronDbPluginV2TestCase):
port, new_binding = self._create_port_and_binding()
response = self._delete_port_binding(port['id'], 'other-host')
self.assertEqual(webob.exc.HTTPNotFound.code, response.status_int)
def test_binding_fail_for_unknown_allocation(self):
# The UUID is a random one - which of course is unknown to neutron
# as a resource provider UUID.
profile = {'allocation': 'ccccbb4c-2adf-11e9-91bc-db7063775d06'}
kwargs = {pbe_ext.PROFILE: profile}
device_owner = '%s%s' % (const.DEVICE_OWNER_COMPUTE_PREFIX, 'nova')
with self.port(device_owner=device_owner) as port:
port_id = port['port']['id']
response = self._create_port_binding(
self.fmt, port_id, self.host, **kwargs)
self.assertEqual(webob.exc.HTTPInternalServerError.code,
response.status_int)
self.assertTrue(exceptions.PortBindingError.__name__ in
response.text)

View File

@ -92,6 +92,7 @@ neutron.ml2.mechanism_drivers =
l2population = neutron.plugins.ml2.drivers.l2pop.mech_driver:L2populationMechanismDriver
sriovnicswitch = neutron.plugins.ml2.drivers.mech_sriov.mech_driver.mech_driver:SriovNicSwitchMechanismDriver
fake_agent = neutron.tests.unit.plugins.ml2.drivers.mech_fake_agent:FakeAgentMechanismDriver
another_fake_agent = neutron.tests.unit.plugins.ml2.drivers.mech_fake_agent:AnotherFakeAgentMechanismDriver
faulty_agent = neutron.tests.unit.plugins.ml2.drivers.mech_faulty_agent:FaultyAgentMechanismDriver
neutron.ml2.extension_drivers =
test = neutron.tests.unit.plugins.ml2.drivers.ext_test:TestExtensionDriver