Correct power state handling for managed in-band inspection

Do not try to configure networks while the node is powered on, unless it
is a node with a Smart NIC, in which case power it on before configuring
the networks.

A new helper is created based on existing code in agent.py.

Change-Id: I3a8fab7a39b604ed17a690fa9c31b3cd1dbdc6a7
Story: #1528920
Task: #37753
This commit is contained in:
Dmitry Tantsur 2019-12-05 12:47:55 +01:00 committed by Julia Kreger
parent 7a7e9689a3
commit fd064a4f6b
7 changed files with 60 additions and 63 deletions

View File

@ -12,6 +12,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import contextlib
import datetime
import time
@ -831,6 +832,20 @@ def restore_power_state_if_needed(task, power_state_to_restore):
node_power_action(task, power_state_to_restore)
@contextlib.contextmanager
def power_state_for_network_configuration(task):
    """Handle the power state for a node reconfiguration.

    Powers the node on if and only if it has a Smart NIC port. Yields for
    the actual reconfiguration, then restores the power state. The
    restoration also happens when the reconfiguration raises, so that a
    node powered on only for this purpose is not left running; the original
    exception then continues to propagate to the caller.

    :param task: A TaskManager object.
    :returns: A context manager that yields the same ``task``.
    """
    previous = power_on_node_if_needed(task)
    try:
        yield task
    finally:
        # Without the ``finally`` an exception raised inside the ``with``
        # body is thrown in at the ``yield`` and the restore step is skipped.
        restore_power_state_if_needed(task, previous)
def build_configdrive(node, configdrive):
"""Build a configdrive from provided meta_data, network_data and user_data.

View File

@ -478,12 +478,9 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
# This is not being done now as it is expected to be
# refactored in the near future.
manager_utils.node_power_action(task, states.POWER_OFF)
power_state_to_restore = (
manager_utils.power_on_node_if_needed(task))
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
task.driver.boot.prepare_instance(task)
manager_utils.node_power_action(task, states.POWER_ON)
LOG.info('Deployment to node %s done', task.node.uuid)
@ -507,13 +504,11 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
manager_utils.node_power_action(task, states.POWER_OFF)
task.driver.storage.detach_volumes(task)
deploy_utils.tear_down_storage_configuration(task)
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.unconfigure_tenant_networks(task)
# NOTE(mgoddard): If the deployment was unsuccessful the node may have
# ports on the provisioning network which were not deleted.
task.driver.network.remove_provisioning_network(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.unconfigure_tenant_networks(task)
# NOTE(mgoddard): If the deployment was unsuccessful the node may
# have ports on the provisioning network which were not deleted.
task.driver.network.remove_provisioning_network(task)
return states.DELETED
@METRICS.timer('AgentDeploy.prepare')
@ -853,11 +848,9 @@ class AgentRescue(base.RescueInterface):
task.node.save()
task.driver.boot.clean_up_instance(task)
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.unconfigure_tenant_networks(task)
task.driver.network.add_rescuing_network(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.unconfigure_tenant_networks(task)
task.driver.network.add_rescuing_network(task)
if CONF.agent.manage_agent_boot:
ramdisk_opts = deploy_utils.build_agent_options(task.node)
# prepare_ramdisk will set the boot device
@ -892,10 +885,8 @@ class AgentRescue(base.RescueInterface):
task.node.save()
self.clean_up(task)
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.configure_tenant_networks(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.configure_tenant_networks(task)
task.driver.boot.prepare_instance(task)
manager_utils.node_power_action(task, states.POWER_ON)
@ -947,7 +938,5 @@ class AgentRescue(base.RescueInterface):
manager_utils.remove_node_rescue_password(task.node, save=True)
if CONF.agent.manage_agent_boot:
task.driver.boot.clean_up_ramdisk(task)
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.remove_rescuing_network(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.remove_rescuing_network(task)

View File

@ -464,10 +464,8 @@ class HeartbeatMixin(object):
reason=fail_reason)
task.process_event('resume')
task.driver.rescue.clean_up(task)
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.configure_tenant_networks(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.configure_tenant_networks(task)
task.process_event('done')
@ -736,12 +734,9 @@ class AgentDeployMixin(HeartbeatMixin):
log_and_raise_deployment_error(task, msg, exc=e)
try:
power_state_to_restore = (
manager_utils.power_on_node_if_needed(task))
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
manager_utils.node_power_action(task, states.POWER_ON)
except Exception as e:
msg = (_('Error rebooting node %(node)s after deploy. '

View File

@ -114,7 +114,8 @@ def _tear_down_managed_boot(task):
LOG.exception('Unable to clean up ramdisk boot for node %s',
task.node.uuid)
try:
task.driver.network.remove_inspection_network(task)
with cond_utils.power_state_for_network_configuration(task):
task.driver.network.remove_inspection_network(task)
except Exception as exc:
errors.append(_('unable to remove inspection ports: %s') % exc)
LOG.exception('Unable to remove inspection network for node %s',
@ -194,10 +195,12 @@ def _start_managed_inspection(task):
params = dict(_parse_kernel_params(),
**{'ipa-inspection-callback-url': endpoint})
task.driver.network.add_inspection_network(task)
cond_utils.node_power_action(task, states.POWER_OFF)
with cond_utils.power_state_for_network_configuration(task):
task.driver.network.add_inspection_network(task)
task.driver.boot.prepare_ramdisk(task, ramdisk_params=params)
client.start_introspection(task.node.uuid, manage_boot=False)
cond_utils.node_power_action(task, states.REBOOT)
cond_utils.node_power_action(task, states.POWER_ON)
except Exception as exc:
LOG.exception('Unable to start managed inspection for node %(uuid)s: '
'%(err)s', {'uuid': task.node.uuid, 'err': exc})

View File

@ -440,12 +440,9 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface):
# This is not being done now as it is expected to be
# refactored in the near future.
manager_utils.node_power_action(task, states.POWER_OFF)
power_state_to_restore = (
manager_utils.power_on_node_if_needed(task))
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
task.driver.boot.prepare_instance(task)
manager_utils.node_power_action(task, states.POWER_ON)
@ -471,13 +468,11 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface):
manager_utils.node_power_action(task, states.POWER_OFF)
task.driver.storage.detach_volumes(task)
deploy_utils.tear_down_storage_configuration(task)
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.unconfigure_tenant_networks(task)
# NOTE(mgoddard): If the deployment was unsuccessful the node may have
# ports on the provisioning network which were not deleted.
task.driver.network.remove_provisioning_network(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.unconfigure_tenant_networks(task)
# NOTE(mgoddard): If the deployment was unsuccessful the node may
# have ports on the provisioning network which were not deleted.
task.driver.network.remove_provisioning_network(task)
return states.DELETED
@METRICS.timer('ISCSIDeploy.prepare')

View File

@ -357,10 +357,8 @@ class PXERamdiskDeploy(agent.AgentDeploy):
# IDEA(TheJulia): Maybe a "trusted environment" mode flag
# that we otherwise fail validation on for drivers that
# require explicit security postures.
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.configure_tenant_networks(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.configure_tenant_networks(task)
# calling boot.prepare_instance will also set the node
# to PXE boot, and update PXE templates accordingly

View File

@ -192,11 +192,12 @@ class InspectHardwareTestCase(BaseTestCase):
})
self.driver.network.add_inspection_network.assert_called_once_with(
self.task)
self.driver.power.reboot.assert_called_once_with(
self.task, timeout=None)
self.driver.power.set_power_state.assert_has_calls([
mock.call(self.task, states.POWER_OFF, timeout=None),
mock.call(self.task, states.POWER_ON, timeout=None),
])
self.assertFalse(self.driver.network.remove_inspection_network.called)
self.assertFalse(self.driver.boot.clean_up_ramdisk.called)
self.assertFalse(self.driver.power.set_power_state.called)
def test_managed_custom_params(self, mock_client):
CONF.set_override('extra_kernel_params',
@ -219,11 +220,12 @@ class InspectHardwareTestCase(BaseTestCase):
})
self.driver.network.add_inspection_network.assert_called_once_with(
self.task)
self.driver.power.reboot.assert_called_once_with(
self.task, timeout=None)
self.driver.power.set_power_state.assert_has_calls([
mock.call(self.task, states.POWER_OFF, timeout=None),
mock.call(self.task, states.POWER_ON, timeout=None),
])
self.assertFalse(self.driver.network.remove_inspection_network.called)
self.assertFalse(self.driver.boot.clean_up_ramdisk.called)
self.assertFalse(self.driver.power.set_power_state.called)
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test_managed_error(self, mock_acquire, mock_client):
@ -246,7 +248,7 @@ class InspectHardwareTestCase(BaseTestCase):
self.driver.network.remove_inspection_network.assert_called_once_with(
self.task)
self.driver.boot.clean_up_ramdisk.assert_called_once_with(self.task)
self.driver.power.set_power_state.assert_called_once_with(
self.driver.power.set_power_state.assert_called_with(
self.task, 'power off', timeout=None)