From f6c286dbe882111b8de3b4f53391f1e96ad2d120 Mon Sep 17 00:00:00 2001 From: Oliver Walsh Date: Wed, 1 Feb 2017 02:51:15 +0000 Subject: [PATCH] Fix race in undercloud cell_v2 host discovery Ensure that the ironic nodes have been picked up by the nova resource tracker before running nova-manage cell_v2 host discovery. Also adds logging of the verbose command output to mistral engine log. Change-Id: I4cc67935df8f37cdb2d8b0bfd96cf90eb7a6ce25 Closes-Bug: #1660160 --- setup.cfg | 1 + sudoers | 2 +- tripleo_common/actions/baremetal.py | 22 ++++++ .../tests/actions/test_baremetal.py | 23 ++++++ tripleo_common/tests/utils/test_nodes.py | 20 ++---- tripleo_common/utils/nodes.py | 4 +- workbooks/baremetal.yaml | 71 ++++++++++++++++++- 7 files changed, 124 insertions(+), 19 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8952e1afd..12a402c41 100644 --- a/setup.cfg +++ b/setup.cfg @@ -64,6 +64,7 @@ mistral.actions = tripleo.baremetal.configure_root_device = tripleo_common.actions.baremetal:ConfigureRootDeviceAction tripleo.baremetal.register_or_update_nodes = tripleo_common.actions.baremetal:RegisterOrUpdateNodes tripleo.baremetal.update_node_capability = tripleo_common.actions.baremetal:UpdateNodeCapability + tripleo.baremetal.cell_v2_discover_hosts = tripleo_common.actions.baremetal:CellV2DiscoverHostsAction tripleo.deployment.config = tripleo_common.actions.deployment:OrchestrationDeployAction tripleo.deployment.deploy = tripleo_common.actions.deployment:DeployStackAction tripleo.deployment.overcloudrc = tripleo_common.actions.deployment:OvercloudRcAction diff --git a/sudoers b/sudoers index a7cb35eb8..d3b9787b7 100644 --- a/sudoers +++ b/sudoers @@ -4,5 +4,5 @@ Defaults:mistral !requiretty mistral ALL = (validations) NOPASSWD:SETENV: /usr/bin/run-validation mistral ALL = NOPASSWD: /usr/bin/chown validations\: /tmp/validations_identity_* mistral ALL = NOPASSWD: /usr/bin/rm -f /tmp/validations_identity_* -mistral ALL = NOPASSWD: /bin/nova-manage cell_v2 discover_hosts +mistral ALL = NOPASSWD: /bin/nova-manage cell_v2 discover_hosts * validations ALL = NOPASSWD: ALL diff --git a/tripleo_common/actions/baremetal.py b/tripleo_common/actions/baremetal.py index e51afe8d2..c9c56c7da 100644 --- a/tripleo_common/actions/baremetal.py +++ b/tripleo_common/actions/baremetal.py @@ -288,3 +288,25 @@ class UpdateNodeCapability(base.TripleOAction): return mistral_workflow_utils.Result( error="%s: %s" % (type(err).__name__, str(err)) ) + + +class CellV2DiscoverHostsAction(base.TripleOAction): + """Run cell_v2 host discovery + + Runs cell_v2 host discovery to map any newly available ironic nodes. + + """ + + def run(self): + try: + result = nodes.run_nova_cell_v2_discovery() + LOG.info( + 'Successfully ran cell_v2 discover_hosts\n' + 'stdout: %(stdout)r\n', + {"stdout": result[0]} + ) + except Exception as err: + LOG.exception("Error running cell_v2 discover_hosts") + return mistral_workflow_utils.Result( + error="%s: %s" % (type(err).__name__, str(err)) + ) diff --git a/tripleo_common/tests/actions/test_baremetal.py b/tripleo_common/tests/actions/test_baremetal.py index ef7335eb4..c2aa6cec3 100644 --- a/tripleo_common/tests/actions/test_baremetal.py +++ b/tripleo_common/tests/actions/test_baremetal.py @@ -15,6 +15,7 @@ import mock from glanceclient import exc as glance_exceptions import ironic_inspector_client +from oslo_concurrency import processutils from oslo_utils import units from tripleo_common.actions import baremetal @@ -330,3 +331,25 @@ class TestConfigureRootDeviceAction(base.TestCase): "Cannot find a disk", action.run) self.assertEqual(self.ironic.node.update.call_count, 0) + + +class TestCellV2DiscoverHostsAction(base.TestCase): + + @mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') + def test_run(self, mock_command): + action = baremetal.CellV2DiscoverHostsAction() + action.run() + mock_command.assert_called_once() + + @mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') + def test_failure(self, mock_command): + mock_command.side_effect = processutils.ProcessExecutionError( + exit_code=1, + stdout='captured stdout', + stderr='captured stderr', + cmd='command' + ) + action = baremetal.CellV2DiscoverHostsAction() + result = action.run() + self.assertTrue(result.is_error()) + mock_command.assert_called_once() diff --git a/tripleo_common/tests/utils/test_nodes.py b/tripleo_common/tests/utils/test_nodes.py index 45c4f850a..63e4f8a52 100644 --- a/tripleo_common/tests/utils/test_nodes.py +++ b/tripleo_common/tests/utils/test_nodes.py @@ -190,8 +190,7 @@ class NodesTest(base.TestCase): 'pm_password': 'random', 'pm_type': 'pxe_ssh', 'name': 'node1', 'capabilities': 'num_nics:6'} - @mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') - def test_register_all_nodes_ironic_no_hw_stats(self, mock_discovery): + def test_register_all_nodes_ironic_no_hw_stats(self): node_list = [self._get_node()] # Remove the hardware stats from the node dictionary @@ -219,10 +218,8 @@ class NodesTest(base.TestCase): address='aaa') ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.port.create.assert_has_calls([port_call]) - mock_discovery.assert_called_once() - @mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') - def test_register_all_nodes(self, mock_discovery): + def test_register_all_nodes(self): node_list = [self._get_node()] node_properties = {"cpus": "1", "memory_mb": "2048", @@ -243,10 +240,8 @@ class NodesTest(base.TestCase): address='aaa') ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.port.create.assert_has_calls([port_call]) - mock_discovery.assert_called_once() - @mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') - def test_register_all_nodes_kernel_ramdisk(self, mock_discovery): + def test_register_all_nodes_kernel_ramdisk(self): node_list = [self._get_node()] node_properties = {"cpus": "1", "memory_mb": "2048", @@ -275,10 +270,8 @@ class NodesTest(base.TestCase): address='aaa') ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.port.create.assert_has_calls([port_call]) - mock_discovery.assert_called_once() - @mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') - def test_register_all_nodes_uuid(self, mock_discovery): + def test_register_all_nodes_uuid(self): node_list = [self._get_node()] node_list[0]['uuid'] = 'abcdef' node_properties = {"cpus": "1", @@ -301,10 +294,8 @@ class NodesTest(base.TestCase): address='aaa') ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.port.create.assert_has_calls([port_call]) - mock_discovery.assert_called_once() - @mock.patch('tripleo_common.utils.nodes.run_nova_cell_v2_discovery') - def test_register_all_nodes_caps_dict(self, mock_discovery): + def test_register_all_nodes_caps_dict(self): node_list = [self._get_node()] node_list[0]['capabilities'] = { 'num_nics': 7 @@ -328,7 +319,6 @@ class NodesTest(base.TestCase): address='aaa') ironic.node.create.assert_has_calls([pxe_node, mock.ANY]) ironic.port.create.assert_has_calls([port_call]) - mock_discovery.assert_called_once() def test_register_update(self): node = self._get_node() diff --git a/tripleo_common/utils/nodes.py b/tripleo_common/utils/nodes.py index 503e5805d..65e88063c 100644 --- a/tripleo_common/utils/nodes.py +++ b/tripleo_common/utils/nodes.py @@ -382,7 +382,6 @@ def register_all_nodes(nodes_list, client, remove=False, glance_client=None, seen.append(node) _clean_up_extra_nodes(seen, client, remove=remove) - run_nova_cell_v2_discovery() return seen @@ -452,5 +451,6 @@ def run_nova_cell_v2_discovery(): '/usr/bin/sudo', '/bin/nova-manage', 'cell_v2', - 'discover_hosts' + 'discover_hosts', + '--verbose' ) diff --git a/workbooks/baremetal.yaml b/workbooks/baremetal.yaml index e09407b64..3c7f0305f 100644 --- a/workbooks/baremetal.yaml +++ b/workbooks/baremetal.yaml @@ -193,7 +193,7 @@ workflows: tasks: set_nodes_available: - on-success: try_power_off + on-success: wait_for_nova_resources on-error: set_status_failed_nodes_available with-items: uuid in <% $.node_uuids %> workflow: tripleo.baremetal.v1.set_node_state @@ -209,6 +209,36 @@ workflows: status: FAILED message: <% task(set_nodes_available).result %> + wait_for_nova_resources: + on-success: cell_v2_discover_hosts + on-error: wait_for_nova_resources_failed + with-items: node_uuid in <% $.node_uuids %> + action: nova.hypervisors_find hypervisor_hostname=<% $.node_uuid %> + timeout: 900 #15 minutes + retry: + delay: 30 + count: 30 + + wait_for_nova_resources_failed: + on-success: send_message + publish: + status: FAILED + message: <% task(wait_for_nova_resources).result %> + + cell_v2_discover_hosts: + on-success: try_power_off + on-error: cell_v2_discover_hosts_failed + workflow: tripleo.baremetal.v1.cellv2_discovery + input: + node_uuids: <% $.node_uuids %> + queue_name: <% $.queue_name %> + + cell_v2_discover_hosts_failed: + on-success: send_message + publish: + status: FAILED + message: <% task(cell_v2_discover_hosts).result %> + try_power_off: on-success: send_message on-error: power_off_failed @@ -758,3 +788,42 @@ workflows: fail_workflow: action: std.fail + + + cellv2_discovery: + description: Run cell_v2 host discovery + + input: + - node_uuids + - queue_name: tripleo + + tasks: + + cell_v2_discover_hosts: + on-success: send_message + on-error: cell_v2_discover_hosts_failed + action: tripleo.baremetal.cell_v2_discover_hosts + + cell_v2_discover_hosts_failed: + on-success: send_message + publish: + status: FAILED + message: <% task(cell_v2_discover_hosts).result %> + + send_message: + action: zaqar.queue_post + retry: count=5 delay=1 + input: + queue_name: <% $.queue_name %> + messages: + body: + type: tripleo.baremetal.v1.cellv2_discovery + payload: + status: <% $.get('status', 'SUCCESS') %> + message: <% $.get('message', '') %> + execution: <% execution() %> + on-success: + - fail_workflow: <% $.get('status') = "FAILED" %> + + fail_workflow: + action: std.fail