Reimplement os_tripleo_baremetal_node_introspection

Triggering introspection via the introspection API is discouraged when
also using the Ironic API, because Ironic will be out of sync with
introspection state. This is suspected to be causing occasional errors
in CI and with end users.

This module was one of the very first to be converted from mistral, so
hasn't benefited from the concurrency and logging patterns in later
modules.

The internal implementation of this module has been rewritten, and now
does the following:

- Concurrency uses a ThreadPoolExecutor instead of a custom pool with
  async calls
- Waiting for state changes uses existing openstacksdk wait functions
- Logging ends up in a 'logging' result value instead of calling
  module.log which ansible does not pass back to the caller
- Introspection is triggered by setting the node provision state to
  'inspect'
- Node is powered off after a successful introspection
- Auth configuration relies on environment, as for other modules
- Debug tasks are added to the playbook to print the logging

This was reverted previously due to a 50% failure rate in the periodic
featureset001, the locking interaction between inspector and ironic
has now been fixed so the root cause of these failures is now
resolved.

Change-Id: I24eb7014cc657f50be63767af439208eb854dfb6
Closes-Bug: 1903786
(cherry picked from commit e54faf1e10)
This commit is contained in:
Steve Baker 2020-11-12 13:31:59 +13:00
parent 2ac36a494f
commit be0c1dd6af
3 changed files with 432 additions and 230 deletions

View File

@ -1,4 +1,3 @@
#!/usr/bin/python
# Copyright (c) 2019 OpenStack Foundation
# All Rights Reserved.
#
@ -13,7 +12,18 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
__metaclass__ = type
from concurrent import futures
import io
import logging
import yaml
from ansible.module_utils.basic import AnsibleModule
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_full_argument_spec
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_module_kwargs
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_cloud_from_module
LOG = logging.getLogger('os_tripleo_baremetal_node_introspection')
ANSIBLE_METADATA = {'metadata_version': '1.1',
'status': ['preview'],
@ -70,6 +80,16 @@ options:
- Don't provide instrospection info in output of the module
type: bool
default: False
log_level:
description:
- Set the logging level for the log which is available in the
returned 'logging' result.
default: info
choices:
- debug
- info
- warning
- error
'''
RETURN = '''
@ -127,205 +147,153 @@ EXAMPLES = '''
'''
import time
import yaml
from ansible.module_utils.basic import AnsibleModule
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_full_argument_spec
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_module_kwargs
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_cloud_from_module
BASE_LOG_MAP = {
'debug': logging.DEBUG,
'info': logging.INFO,
'warning': logging.WARNING,
'error': logging.ERROR
}
class IntrospectionManagement(object):
def __init__(self,
cloud,
module,
concurrency,
max_retries,
node_timeout,
retry_timeout):
self.client = cloud.baremetal_introspection
self.cloud = cloud
self.module = module
self.concurrency = concurrency
self.max_retries = max_retries
self.node_timeout = node_timeout
self.retry_timeout = retry_timeout
def _configure_logging(log_level):
log_fmt = ('%(asctime)s %(levelname)s %(name)s: %(message)s')
urllib_level = logging.CRITICAL
def log(self, msg):
self.module.log("os_tripleo_baremetal_node_introspection: %s" % msg)
log_stream = io.StringIO()
handler = logging.StreamHandler(log_stream)
logging.basicConfig(level=BASE_LOG_MAP[log_level], format=log_fmt,
handlers=[handler])
logging.getLogger('urllib3.connectionpool').setLevel(urllib_level)
return log_stream
def push_next(self, pool, queue):
try:
next_introspection = next(queue)
pool.append(next_introspection)
except StopIteration:
pass
return pool
def introspect(self, node_uuids):
result = {}
queue = (NodeIntrospection(
uuid,
self.client,
self.cloud,
self.node_timeout,
self.max_retries,
self.retry_timeout,
self.log) for uuid in node_uuids)
pool = []
for i in range(self.concurrency):
pool = self.push_next(pool, queue)
while len(pool) > 0:
finished = []
for intro in pool:
if not intro.started:
try:
intro.start_introspection()
continue
except Exception as e:
self.log("ERROR Node %s can't start introspection"
" because: %s" % (intro.node_id, str(e)))
result[intro.node_id] = {
"error": "Error for introspection node %s: %s " % (
intro.node_id, str(e)),
"failed": True,
"status": ''
}
finished.append(intro)
continue
status = intro.get_introspection()
if (not status.is_finished and intro.timeouted()) or (
status.is_finished and status.error is not None
):
if status.is_finished:
self.log("ERROR Introspection of node %s "
"failed: %s" % (
status.id, str(status.error))
)
if intro.last_retry():
result[status.id] = (intro.error_msg()
if status.is_finished
else intro.timeout_msg())
finished.append(intro)
else:
intro.restart_introspection()
if status.is_finished and status.error is None:
result[status.id] = {
'status': intro.get_introspection_data(),
'failed': False,
'error': None}
finished.append(intro)
for i in finished:
pool.remove(i)
pool = self.push_next(pool, queue)
# Let's not DDOS Ironic service
if pool:
time.sleep(min(10, self.node_timeout))
def introspect(cloud, node_uuids, node_timeout, retry_timeout, max_retries,
concurrency, fetch_data):
result = {}
if not node_uuids:
return result
introspect_jobs = []
class NodeIntrospection:
started = False
def __init__(
self,
node_id,
os_client,
os_cloud,
timeout,
max_retries,
retry_timeout,
log):
self.node_id = node_id
self.os_client = os_client
self.os_cloud = os_cloud
self.timeout = timeout
self.max_retries = max_retries
self.log = log
self.start = int(time.time())
self.retries = 0
self.retry_timeout = retry_timeout
self.last_status = None
def restart_introspection(self):
self.retries += 1
try:
self.os_client.abort_introspection(self.node_id)
except Exception as e:
# Node is locked
self.log("ERROR Node %s can't abort introspection: %s" % (
self.node_id, str(e)))
return
# need to wait before restarting introspection till it's aborted
# to prevent hanging let's use introspect timeout for that
try:
self.os_client.wait_for_introspection(
self.node_id, timeout=self.timeout, ignore_error=True)
# Wait until node is unlocked
self.os_cloud.baremetal.wait_for_node_reservation(
self.node_id, timeout=self.retry_timeout)
except Exception as e:
self.log("ERROR Node %s can't restart introspection because can't "
"abort and unlock it: %s" % (self.node_id, str(e)))
return
self.start = int(time.time())
return self.start_introspection(restart=True)
def start_introspection(self, restart=False):
self.started = True
if restart:
self.log("INFO Restarting (try %s of %s) introspection of "
"node %s" % (
self.retries, self.max_retries, self.node_id))
else:
self.log("INFO Starting introspection of node %s" % (self.node_id))
return self.os_client.start_introspection(self.node_id)
def get_introspection(self):
self.last_status = self.os_client.get_introspection(self.node_id)
return self.last_status
def get_introspection_data(self):
self.log(
"Instrospection of node %s finished successfully!" % self.node_id)
return self.os_client.get_introspection_data(self.node_id)
def time_elapsed(self):
return int(time.time()) - self.start
def timeouted(self):
return self.time_elapsed() > self.timeout
def last_retry(self):
return self.retries >= self.max_retries
def timeout_msg(self):
self.log(
"ERROR Retry limit %s reached for introspection "
"node %s: exceeded timeout" % (
self.max_retries, self.node_id))
return {"error": "Timeout error for introspection node %s: %s "
"sec exceeded max timeout of %s sec" % (
self.node_id, self.time_elapsed(), self.timeout),
with futures.ThreadPoolExecutor(max_workers=concurrency) as p:
for node_uuid in node_uuids:
introspect_jobs.append(p.submit(
introspect_node, cloud, node_uuid,
node_timeout, retry_timeout, max_retries, fetch_data
))
for job in futures.as_completed(introspect_jobs):
e = job.exception()
if e:
# This should not happen, but handle it anyway
result[node_uuid] = {
"error": str(e),
"failed": True,
"status": self.last_status
}
"status": 'failed'
}
LOG.error('Unexpected error: %s', e)
else:
result[node_uuid] = job.result()
return result
def error_msg(self):
self.log(
"ERROR Retry limit %s reached for introspection "
"node %s: %s" % (
self.max_retries, self.node_id, self.last_status.error))
return {"error": "Error for introspection node %s: %s " % (
self.node_id, self.last_status.error),
"failed": True,
"status": self.last_status
}
def introspect_node(cloud, node_uuid, node_timeout, retry_timeout,
max_retries, fetch_data):
last_error = None
attempt = 0
status = ''
while attempt <= max_retries:
attempt += 1
node = cloud.baremetal.get_node(node_uuid)
# Put into required state for attempt
LOG.info("Preparing for attempt %s for node: %s", attempt, node_uuid)
node = prepare_for_attempt(cloud, node, node_timeout, retry_timeout)
try:
# Start introspection
LOG.info("Introspecting node: %s", node_uuid)
node = cloud.baremetal.set_node_provision_state(
node, 'inspect', wait=True, timeout=node_timeout)
if node.power_state != 'power off':
# power off the node
LOG.info('Power off node: %s', node_uuid)
cloud.baremetal.set_node_power_state(
node, 'power off', wait=True, timeout=node_timeout
)
if fetch_data:
# Get the introspection data for the result
LOG.info("Fetching introspection data: %s", node_uuid)
status = cloud.baremetal_introspection.get_introspection_data(
node_uuid)
LOG.info("Introspecting node complete: %s", node_uuid)
# Success
return {
'status': status,
'failed': False,
'error': None
}
except Exception as e:
last_error = str(e)
LOG.error("Introspection of node %s failed on attempt %s: "
"%s", node_uuid, attempt, last_error)
message = 'unknown error'
# All attempts failed, fetch node to get the reason
try:
node = cloud.baremetal.get_node(node_uuid)
message = node.last_error
status = node.provision_state
except Exception:
if last_error:
# Couldn't fetch the node, use the last exception message instead
message = last_error
return {
"error": "Error for introspection node %s on attempt %s: %s " %
(node_uuid, attempt, message),
"failed": True,
"status": status
}
def prepare_for_attempt(cloud, node, node_timeout, retry_timeout):
if node.provision_state not in ('manageable', 'inspect failed'):
# Attempt to abort any existing introspection
try:
LOG.info('Node in state "%s", calling abort: %s',
node.provision_state, node.id)
node = cloud.baremetal.set_node_provision_state(
node, 'abort', wait=True, timeout=node_timeout)
except Exception as e:
LOG.warn("Abort introspection of node %s failed: %s",
node.id, str(e))
if node.power_state != 'power off':
# Attempt to power off the node
try:
LOG.info('Power off node: %s', node.id)
cloud.baremetal.set_node_power_state(
node, 'power off', wait=True, timeout=node_timeout
)
except Exception as e:
LOG.warn("Power off of node %s failed: %s",
node.id, str(e))
if node.reservation:
# Wait until node is unlocked
try:
node = cloud.baremetal.wait_for_node_reservation(
node, timeout=retry_timeout)
except Exception as e:
LOG.warn("Waiting for node unlock %s failed: %s",
node.id, str(e))
return node
def main():
@ -349,40 +317,58 @@ def main():
else:
module.params['auth'] = {'endpoint': ironic_url}
_, cloud = openstack_cloud_from_module(module)
log_stream = _configure_logging(module.params['log_level'])
sdk, cloud = openstack_cloud_from_module(module)
quiet = module.params['quiet']
failed_nodes = []
passed_nodes = []
try:
result = introspect(
cloud,
node_uuids=module.params["node_uuids"],
node_timeout=module.params["node_timeout"],
retry_timeout=module.params["retry_timeout"],
max_retries=module.params["max_retries"],
concurrency=module.params["concurrency"],
fetch_data=not quiet)
except Exception as e:
# This should not happen, but handle it anyway
LOG.error('Unexpected error: %s', e)
module.fail_json(
msg=str(e),
failed_nodes=module.params["node_uuids"],
passed_nodes=[],
logging=log_stream.getvalue().split('\n')
)
for node_uuid, result in result.items():
if result['failed']:
failed_nodes.append(node_uuid)
else:
passed_nodes.append(node_uuid)
introspector = IntrospectionManagement(
cloud,
module,
module.params["concurrency"],
module.params["max_retries"],
module.params["node_timeout"],
module.params["retry_timeout"]
)
module_results = {"changed": True}
result = introspector.introspect(module.params["node_uuids"])
failed_nodes = [k for k, v in result.items() if v['failed']]
passed_nodes = [k for k, v in result.items() if not v['failed']]
failed = len(failed_nodes)
if failed > 0:
message = ("Introspection completed with failures. %s node(s) failed."
% failed)
module.log("os_tripleo_baremetal_node_introspection ERROR %s" %
message)
module_results.update({'failed': True})
else:
message = "Introspection completed successfully: %s nodes" % len(
module.params["node_uuids"])
passed_nodes)
module.log("os_tripleo_baremetal_node_introspection INFO %s" %
message)
module_results.update({
"introspection_data": result if not module.params['quiet'] else {},
"failed_nodes": failed_nodes,
"passed_nodes": passed_nodes,
"msg": message
})
module.exit_json(**module_results)
module.exit_json(
changed=True,
failed=failed > 0,
introspection_data=result if not quiet else {},
failed_nodes=failed_nodes,
passed_nodes=passed_nodes,
msg=message,
logging=log_stream.getvalue().split('\n')
)
if __name__ == "__main__":

View File

@ -96,30 +96,28 @@
max_retries: "{{ max_retries }}"
node_timeout: "{{ node_timeout }}"
retry_timeout: "{{ retry_timeout }}"
log_level: info
register: baremetal_introspection_result
failed_when: false
- name: Nodes that passed introspection
- name: Introspection log
debug:
msg: >-
{% if baremetal_introspection_result.passed_nodes != [] %}
{{ baremetal_introspection_result.passed_nodes | join(' ') }}{% else %}
No nodes completed introspection successfully!{% endif %}
when: baremetal_introspection_result.passed_nodes is defined
var: baremetal_introspection_result.logging
- name: Nodes that failed introspection
- name: Node introspection summary
debug:
msg: >-
{% if baremetal_introspection_result.failed_nodes != [] %}
{{ baremetal_introspection_result.failed_nodes | join(' ') }}{% else %}
All nodes completed introspection successfully!{% endif %}
failed_when: baremetal_introspection_result.failed_nodes != []
when: baremetal_introspection_result.failed_nodes is defined
msg: |-
Passed: [{{ baremetal_introspection_result.passed_nodes | join(', ') }}]
Failed: [{{ baremetal_introspection_result.failed_nodes | join(', ') }}]
{% if baremetal_introspection_result.passed_nodes == [] %}
No nodes passed introspection
{% elif baremetal_introspection_result.failed_nodes == [] %}
All nodes completed introspection successfully!
{% endif %}
- name: Node introspection failed and no results are provided
fail:
msg: >-
Nodes failed introspection and no info was provided
Nodes failed introspection
when:
- baremetal_introspection_result.failed_nodes is not defined
- baremetal_introspection_result.passed_nodes is not defined
- baremetal_introspection_result.failed_nodes != []

View File

@ -0,0 +1,218 @@
# Copyright 2020 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from tripleo_ansible.ansible_plugins.modules import (
os_tripleo_baremetal_node_introspection as module)
from tripleo_ansible.tests import base as tests_base
class TestIntrospect(tests_base.TestCase):
def setUp(self):
super(TestIntrospect, self).setUp()
c = mock.Mock()
self.cloud = c
self.node = mock.Mock(
uuid='1234abcd',
power_state='power on',
provision_state='manageable',
last_error=None,
)
c.baremetal.get_node.return_value = self.node
c.baremetal.set_node_provision_state.return_value = self.node
c.baremetal.wait_for_node_reservation.return_value = self.node
c.baremetal_introspection.get_introspection_data.return_value = {
'foo': 'bar'
}
@mock.patch.object(module, 'prepare_for_attempt')
def test_introspect_node(self, mock_pfa):
mock_pfa.return_value = self.node
c = self.cloud
result = module.introspect_node(
c, '1234abcd', 1200, 120, 3, True)
self.assertEqual({
'status': {'foo': 'bar'},
'failed': False,
'error': None},
result)
mock_pfa.assert_called_once_with(c, self.node, 1200, 120)
c.baremetal.set_node_provision_state.assert_called_once_with(
self.node, 'inspect', wait=True, timeout=1200
)
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
c.baremetal_introspection.get_introspection_data.assert_called_once_with(
'1234abcd'
)
@mock.patch.object(module, 'prepare_for_attempt')
def test_introspect_node_retries(self, mock_pfa):
mock_pfa.return_value = self.node
c = self.cloud
ouch = Exception('ouch')
c.baremetal.set_node_provision_state.side_effect = [
ouch,
ouch,
ouch,
self.node
]
result = module.introspect_node(
c, '1234abcd', 1200, 120, 3, fetch_data=False)
self.assertEqual({
'status': '',
'failed': False,
'error': None},
result)
mock_pfa.assert_has_calls([
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120)
])
c.baremetal.set_node_provision_state.assert_has_calls([
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
])
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
# fetch_data is False
c.baremetal_introspection.get_introspection_data.assert_not_called()
@mock.patch.object(module, 'prepare_for_attempt')
def test_introspect_node_retries_failed(self, mock_pfa):
mock_pfa.return_value = self.node
c = self.cloud
ouch = Exception('ouch')
c.baremetal.set_node_provision_state.side_effect = [
ouch,
ouch,
ouch,
ouch,
]
result = module.introspect_node(
c, '1234abcd', 1200, 120, 3, True)
self.assertEqual({
'error': 'Error for introspection node 1234abcd on attempt 4: None ',
'failed': True,
'status': 'manageable'}, result)
mock_pfa.assert_has_calls([
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
])
c.baremetal.set_node_provision_state.assert_has_calls([
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
])
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal_introspection.get_introspection_data.assert_not_called()
def test_prepare_for_attempt_noop(self):
c = self.cloud
self.node.provision_state = 'manageable'
self.node.power_state = 'power off'
self.node.reservation = None
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_not_called()
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal.wait_for_node_reservation.assert_not_called()
def test_prepare_for_attempt_not_manageable(self):
c = self.cloud
self.node.provision_state = 'inspect wait'
self.node.power_state = 'power off'
self.node.reservation = None
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_called_once_with(
self.node, 'abort', wait=True, timeout=1200
)
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal.wait_for_node_reservation.assert_not_called()
def test_prepare_for_attempt_powered_on(self):
c = self.cloud
self.node.provision_state = 'manageable'
self.node.power_state = 'power on'
self.node.reservation = None
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_not_called()
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
c.baremetal.wait_for_node_reservation.assert_not_called()
def test_prepare_for_attempt_reserved(self):
c = self.cloud
self.node.provision_state = 'manageable'
self.node.power_state = 'power off'
self.node.reservation = 'conductor1'
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_not_called()
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal.wait_for_node_reservation.assert_called_once_with(
self.node, timeout=120
)
def test_prepare_for_attempt_everything_failed(self):
c = self.cloud
ouch = Exception('ouch')
c.baremetal.set_node_provision_state.side_effect = ouch
c.baremetal.set_node_power_state.side_effect = ouch
c.baremetal.wait_for_node_reservation.side_effect = ouch
self.node.provision_state = 'inspect wait'
self.node.power_state = 'power on'
self.node.reservation = 'conductor1'
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_called_once_with(
self.node, 'abort', wait=True, timeout=1200
)
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
c.baremetal.wait_for_node_reservation.assert_called_once_with(
self.node, timeout=120
)