From 7f35e4e857f7c6e83c635125ce9b42df6e10a510 Mon Sep 17 00:00:00 2001 From: Bence Romsics Date: Tue, 23 Mar 2021 14:07:36 +0100 Subject: [PATCH] Physical NIC RP should be child of agent RP In the fix for #1853840 I made a mistake and since then we created the physical NIC resource providers as a child of the hypervisor resource provider instead of the agent resource provider. Here: https://review.opendev.org/c/openstack/neutron/+/696600/3/neutron/agent/common/placement_report.py#159 This *did not* break the minimum bandwidth aware scheduling. But still there are multiple problems: 1) If you created your physical NIC RPs before the fix for #1853840 but upgraded to after the fix for #1853840, then resource syncs will throw an error in neutron-server at each physical NIC RP update. That pollutes the logs and wastes some resources since the prohibited update will be forever retried. 2) If you created your physical NIC RPs after the fix for #1853840 then your physical NIC RPs have the wrong parent. Which again does not break minimum bandwidth aware scheduling. But it may pose problems for later features wanting to build on the originally planned RP tree structure. 3) Cleanup of decommissioned RPs is a bit different than expected. This cleanup was always left to the admin, so it only affects a manual process. The proper RP structure was and should be the following: The hypervisor RP(s) must be the root(s). As a child of each hypervisor RP, there should be an agent RP. The physical NIC RPs should be the children of the agent RPs. Unfortunately at the moment the Placement API generically prohibits update of the parent resource provider id in a PUT request: https://docs.openstack.org/api-ref/placement/?expanded=update-resource-provider-detail#update-resource-provider Therefore without a later Placement change we cannot fix the RPs already created with the wrong parent. However we can fix the RPs to be created later. We do that here. We also fix a bug in the unit tests that allowed the wrong parent to pass unnoticed. Plus we add an extra log message to direct the user seeing the pollution in the logs to the proper bug report. There may be a follow up patch later, because not all RP re-parenting operations are problematic, therefore we are thinking of relaxing this blanket prohibition in Placement. When Placement allows updates to the parent id we can fix RPs already created with the wrong parent too. Change-Id: I7caa8827d22103600ca685a58294640fc831dbd9 Closes-Bug: #1921150 Co-Authored-By: "Balazs Gibizer" Related-Bug: #1853840 --- neutron/agent/common/placement_report.py | 4 +- neutron/services/placement_report/plugin.py | 16 ++++-- .../agent/common/test_placement_report.py | 8 +-- .../notes/bug-1921150-c02692e548a3750e.yaml | 17 ++++++ tools/bug-1921150-re-parent-device-rps.sql | 52 +++++++++++++++++++ 5 files changed, 86 insertions(+), 11 deletions(-) create mode 100644 releasenotes/notes/bug-1921150-c02692e548a3750e.yaml create mode 100644 tools/bug-1921150-re-parent-device-rps.sql diff --git a/neutron/agent/common/placement_report.py b/neutron/agent/common/placement_report.py index cd37d19d1cd..6e447256c97 100644 --- a/neutron/agent/common/placement_report.py +++ b/neutron/agent/common/placement_report.py @@ -151,12 +151,14 @@ class PlacementState(object): self._driver_uuid_namespace, hypervisor['name'], device) + agent_rp_uuid = place_utils.agent_resource_provider_uuid( + self._driver_uuid_namespace, hypervisor['name']) rps.append( DeferredCall( self._client.ensure_resource_provider, {'name': rp_name, 'uuid': rp_uuid, - 'parent_provider_uuid': hypervisor['uuid']})) + 'parent_provider_uuid': agent_rp_uuid})) return rps def deferred_create_resource_providers(self): diff --git a/neutron/services/placement_report/plugin.py b/neutron/services/placement_report/plugin.py index 49e4987f7d2..cd1ad712b37 100644 --- a/neutron/services/placement_report/plugin.py +++ b/neutron/services/placement_report/plugin.py @@ -161,11 +161,19 @@ class PlacementReportPlugin(service_base.ServicePluginBase): try: LOG.debug('placement client: {}'.format(deferred)) deferred.execute() - except Exception: + except Exception as e: errors = True - LOG.exception( - 'placement client call failed: %s', - str(deferred)) + placement_error_str = \ + 're-parenting a provider is not currently allowed' + if (placement_error_str in str(e)): + msg = ( + 'placement client call failed' + ' (this may be due to bug' + ' https://launchpad.net/bugs/1921150): %s' + ) + else: + msg = 'placement client call failed: %s' + LOG.exception(msg, str(deferred)) resources_synced = not errors agent_db.resources_synced = resources_synced diff --git a/neutron/tests/unit/agent/common/test_placement_report.py b/neutron/tests/unit/agent/common/test_placement_report.py index 4432f144cdb..082e3724cc6 100644 --- a/neutron/tests/unit/agent/common/test_placement_report.py +++ b/neutron/tests/unit/agent/common/test_placement_report.py @@ -47,14 +47,10 @@ class PlacementStateTestCase(base.BaseTestCase): self.client_mock = mock.Mock() self.driver_uuid_namespace = uuid.UUID( '00000000-0000-0000-0000-000000000001') - # uuid below generated by the following command: - # uuid -v5 '00000000-0000-0000-0000-000000000001' 'fakehost' self.hypervisor1_rp_uuid = uuid.UUID( - 'c0b4abe5-516f-54b8-b965-ff94060dcbcc') - # uuid below generated by the following command: - # uuid -v5 '00000000-0000-0000-0000-000000000001' 'fakehost2' + '00000000-0000-0000-0000-000000000002') self.hypervisor2_rp_uuid = uuid.UUID( - '544155b7-1295-5f10-b5f0-eadc50abc6d4') + '00000000-0000-0000-0000-000000000003') self.kwargs = { 'rp_bandwidths': {}, 'rp_inventory_defaults': {}, diff --git a/releasenotes/notes/bug-1921150-c02692e548a3750e.yaml b/releasenotes/notes/bug-1921150-c02692e548a3750e.yaml new file mode 100644 index 00000000000..22e16f0cece --- /dev/null +++ b/releasenotes/notes/bug-1921150-c02692e548a3750e.yaml @@ -0,0 +1,17 @@ +--- +issues: + - | + When using the minimim-bandwidth QoS feature due to bug + https://launchpad.net/bugs/1921150 physical NIC resource providers + were for some time created with the wrong parent (i.e. the + hypervisor RP). This is now partially fixed and new resource + providers are created now with the expected parent (i.e. the agent + RP). However Placement does not allow re-parenting an already + existing resource provider, therefore the following Placement + DB update may be needed after the fix for bug 1921150 is applied: + neutron/tools/bug-1921150-re-parent-device-rps.sql + Until all resource providers have the proper parent, neutron-server + will retry the re-parenting update, which will be rejected every time, + therefore expect polluted logs and some wasted load on Placement. + However please note that the bandwidth-aware scheduling is supposed + to work even with the wrongly parented resource providers. diff --git a/tools/bug-1921150-re-parent-device-rps.sql b/tools/bug-1921150-re-parent-device-rps.sql new file mode 100644 index 00000000000..c87a0c2845c --- /dev/null +++ b/tools/bug-1921150-re-parent-device-rps.sql @@ -0,0 +1,52 @@ +/* + Licensed under the Apache License, Version 2.0 (the "License"); you may + not use this file except in compliance with the License. You may obtain + a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + License for the specific language governing permissions and limitations + under the License. +*/ + +/* +Fix wrongly parented physical NIC resource providers due to bug +https://bugs.launchpad.net/neutron/+bug/1921150 + +Compatible with MySQL. +*/ + +USE placement; + +SELECT 'Affected device RPs:' as ''; + +SELECT * +FROM resource_providers +WHERE + (name LIKE '%:NIC Switch agent:%' OR + name LIKE '%:Open vSwitch agent:%') AND + parent_provider_id=root_provider_id; + + +/* +To find the proper parent we have to use the naming scheme of the RPs which +is :: +The name of the proper parent for deviceRP is name of the deviceRP minus +everything after the second ':'. +*/ + +UPDATE resource_providers as rp +INNER JOIN resource_providers as parent_rp +ON + parent_rp.name=SUBSTRING(rp.name, 1, LOCATE(':', rp.name, LOCATE(':', rp.name) + 1) -1) +SET + rp.parent_provider_id = parent_rp.id +WHERE + (rp.name LIKE '%:NIC Switch agent:%' OR + rp.name LIKE '%:Open vSwitch agent:%') AND + rp.parent_provider_id=rp.root_provider_id; + +SELECT CONCAT('Fixed ', ROW_COUNT(), ' RPs') as '';