Merge "Reimplement os_tripleo_baremetal_node_introspection" into stable/wallaby

This commit is contained in:
Zuul 2021-10-04 21:05:10 +00:00 committed by Gerrit Code Review
commit e43151323b
3 changed files with 432 additions and 230 deletions

View File

@ -1,4 +1,3 @@
#!/usr/bin/python
# Copyright (c) 2019 OpenStack Foundation
# All Rights Reserved.
#
@ -13,7 +12,18 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
__metaclass__ = type
from concurrent import futures
import io
import logging
import yaml
from ansible.module_utils.basic import AnsibleModule
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_full_argument_spec
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_module_kwargs
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_cloud_from_module
LOG = logging.getLogger('os_tripleo_baremetal_node_introspection')
ANSIBLE_METADATA = {'metadata_version': '1.1',
'status': ['preview'],
@ -70,6 +80,16 @@ options:
- Don't provide instrospection info in output of the module
type: bool
default: False
log_level:
description:
- Set the logging level for the log which is available in the
returned 'logging' result.
default: info
choices:
- debug
- info
- warning
- error
'''
RETURN = '''
@ -127,205 +147,153 @@ EXAMPLES = '''
'''
import time
import yaml
from ansible.module_utils.basic import AnsibleModule
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_full_argument_spec
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_module_kwargs
from ansible_collections.openstack.cloud.plugins.module_utils.openstack import openstack_cloud_from_module
BASE_LOG_MAP = {
'debug': logging.DEBUG,
'info': logging.INFO,
'warning': logging.WARNING,
'error': logging.ERROR
}
class IntrospectionManagement(object):
def __init__(self,
cloud,
module,
concurrency,
max_retries,
node_timeout,
retry_timeout):
self.client = cloud.baremetal_introspection
self.cloud = cloud
self.module = module
self.concurrency = concurrency
self.max_retries = max_retries
self.node_timeout = node_timeout
self.retry_timeout = retry_timeout
def _configure_logging(log_level):
log_fmt = ('%(asctime)s %(levelname)s %(name)s: %(message)s')
urllib_level = logging.CRITICAL
def log(self, msg):
self.module.log("os_tripleo_baremetal_node_introspection: %s" % msg)
log_stream = io.StringIO()
handler = logging.StreamHandler(log_stream)
logging.basicConfig(level=BASE_LOG_MAP[log_level], format=log_fmt,
handlers=[handler])
logging.getLogger('urllib3.connectionpool').setLevel(urllib_level)
return log_stream
def push_next(self, pool, queue):
try:
next_introspection = next(queue)
pool.append(next_introspection)
except StopIteration:
pass
return pool
def introspect(self, node_uuids):
result = {}
queue = (NodeIntrospection(
uuid,
self.client,
self.cloud,
self.node_timeout,
self.max_retries,
self.retry_timeout,
self.log) for uuid in node_uuids)
pool = []
for i in range(self.concurrency):
pool = self.push_next(pool, queue)
while len(pool) > 0:
finished = []
for intro in pool:
if not intro.started:
try:
intro.start_introspection()
continue
except Exception as e:
self.log("ERROR Node %s can't start introspection"
" because: %s" % (intro.node_id, str(e)))
result[intro.node_id] = {
"error": "Error for introspection node %s: %s " % (
intro.node_id, str(e)),
"failed": True,
"status": ''
}
finished.append(intro)
continue
status = intro.get_introspection()
if (not status.is_finished and intro.timeouted()) or (
status.is_finished and status.error is not None
):
if status.is_finished:
self.log("ERROR Introspection of node %s "
"failed: %s" % (
status.id, str(status.error))
)
if intro.last_retry():
result[status.id] = (intro.error_msg()
if status.is_finished
else intro.timeout_msg())
finished.append(intro)
else:
intro.restart_introspection()
if status.is_finished and status.error is None:
result[status.id] = {
'status': intro.get_introspection_data(),
'failed': False,
'error': None}
finished.append(intro)
for i in finished:
pool.remove(i)
pool = self.push_next(pool, queue)
# Let's not DDOS Ironic service
if pool:
time.sleep(min(10, self.node_timeout))
def introspect(cloud, node_uuids, node_timeout, retry_timeout, max_retries,
concurrency, fetch_data):
result = {}
if not node_uuids:
return result
introspect_jobs = []
class NodeIntrospection:
started = False
def __init__(
self,
node_id,
os_client,
os_cloud,
timeout,
max_retries,
retry_timeout,
log):
self.node_id = node_id
self.os_client = os_client
self.os_cloud = os_cloud
self.timeout = timeout
self.max_retries = max_retries
self.log = log
self.start = int(time.time())
self.retries = 0
self.retry_timeout = retry_timeout
self.last_status = None
def restart_introspection(self):
self.retries += 1
try:
self.os_client.abort_introspection(self.node_id)
except Exception as e:
# Node is locked
self.log("ERROR Node %s can't abort introspection: %s" % (
self.node_id, str(e)))
return
# need to wait before restarting introspection till it's aborted
# to prevent hanging let's use introspect timeout for that
try:
self.os_client.wait_for_introspection(
self.node_id, timeout=self.timeout, ignore_error=True)
# Wait until node is unlocked
self.os_cloud.baremetal.wait_for_node_reservation(
self.node_id, timeout=self.retry_timeout)
except Exception as e:
self.log("ERROR Node %s can't restart introspection because can't "
"abort and unlock it: %s" % (self.node_id, str(e)))
return
self.start = int(time.time())
return self.start_introspection(restart=True)
def start_introspection(self, restart=False):
self.started = True
if restart:
self.log("INFO Restarting (try %s of %s) introspection of "
"node %s" % (
self.retries, self.max_retries, self.node_id))
else:
self.log("INFO Starting introspection of node %s" % (self.node_id))
return self.os_client.start_introspection(self.node_id)
def get_introspection(self):
self.last_status = self.os_client.get_introspection(self.node_id)
return self.last_status
def get_introspection_data(self):
self.log(
"Instrospection of node %s finished successfully!" % self.node_id)
return self.os_client.get_introspection_data(self.node_id)
def time_elapsed(self):
return int(time.time()) - self.start
def timeouted(self):
return self.time_elapsed() > self.timeout
def last_retry(self):
return self.retries >= self.max_retries
def timeout_msg(self):
self.log(
"ERROR Retry limit %s reached for introspection "
"node %s: exceeded timeout" % (
self.max_retries, self.node_id))
return {"error": "Timeout error for introspection node %s: %s "
"sec exceeded max timeout of %s sec" % (
self.node_id, self.time_elapsed(), self.timeout),
with futures.ThreadPoolExecutor(max_workers=concurrency) as p:
for node_uuid in node_uuids:
introspect_jobs.append(p.submit(
introspect_node, cloud, node_uuid,
node_timeout, retry_timeout, max_retries, fetch_data
))
for job in futures.as_completed(introspect_jobs):
e = job.exception()
if e:
# This should not happen, but handle it anyway
result[node_uuid] = {
"error": str(e),
"failed": True,
"status": self.last_status
}
"status": 'failed'
}
LOG.error('Unexpected error: %s', e)
else:
result[node_uuid] = job.result()
return result
def error_msg(self):
self.log(
"ERROR Retry limit %s reached for introspection "
"node %s: %s" % (
self.max_retries, self.node_id, self.last_status.error))
return {"error": "Error for introspection node %s: %s " % (
self.node_id, self.last_status.error),
"failed": True,
"status": self.last_status
}
def introspect_node(cloud, node_uuid, node_timeout, retry_timeout,
max_retries, fetch_data):
last_error = None
attempt = 0
status = ''
while attempt <= max_retries:
attempt += 1
node = cloud.baremetal.get_node(node_uuid)
# Put into required state for attempt
LOG.info("Preparing for attempt %s for node: %s", attempt, node_uuid)
node = prepare_for_attempt(cloud, node, node_timeout, retry_timeout)
try:
# Start introspection
LOG.info("Introspecting node: %s", node_uuid)
node = cloud.baremetal.set_node_provision_state(
node, 'inspect', wait=True, timeout=node_timeout)
if node.power_state != 'power off':
# power off the node
LOG.info('Power off node: %s', node_uuid)
cloud.baremetal.set_node_power_state(
node, 'power off', wait=True, timeout=node_timeout
)
if fetch_data:
# Get the introspection data for the result
LOG.info("Fetching introspection data: %s", node_uuid)
status = cloud.baremetal_introspection.get_introspection_data(
node_uuid)
LOG.info("Introspecting node complete: %s", node_uuid)
# Success
return {
'status': status,
'failed': False,
'error': None
}
except Exception as e:
last_error = str(e)
LOG.error("Introspection of node %s failed on attempt %s: "
"%s", node_uuid, attempt, last_error)
message = 'unknown error'
# All attempts failed, fetch node to get the reason
try:
node = cloud.baremetal.get_node(node_uuid)
message = node.last_error
status = node.provision_state
except Exception:
if last_error:
# Couldn't fetch the node, use the last exception message instead
message = last_error
return {
"error": "Error for introspection node %s on attempt %s: %s " %
(node_uuid, attempt, message),
"failed": True,
"status": status
}
def prepare_for_attempt(cloud, node, node_timeout, retry_timeout):
if node.provision_state not in ('manageable', 'inspect failed'):
# Attempt to abort any existing introspection
try:
LOG.info('Node in state "%s", calling abort: %s',
node.provision_state, node.id)
node = cloud.baremetal.set_node_provision_state(
node, 'abort', wait=True, timeout=node_timeout)
except Exception as e:
LOG.warn("Abort introspection of node %s failed: %s",
node.id, str(e))
if node.power_state != 'power off':
# Attempt to power off the node
try:
LOG.info('Power off node: %s', node.id)
cloud.baremetal.set_node_power_state(
node, 'power off', wait=True, timeout=node_timeout
)
except Exception as e:
LOG.warn("Power off of node %s failed: %s",
node.id, str(e))
if node.reservation:
# Wait until node is unlocked
try:
node = cloud.baremetal.wait_for_node_reservation(
node, timeout=retry_timeout)
except Exception as e:
LOG.warn("Waiting for node unlock %s failed: %s",
node.id, str(e))
return node
def main():
@ -349,40 +317,58 @@ def main():
else:
module.params['auth'] = {'endpoint': ironic_url}
_, cloud = openstack_cloud_from_module(module)
log_stream = _configure_logging(module.params['log_level'])
sdk, cloud = openstack_cloud_from_module(module)
quiet = module.params['quiet']
failed_nodes = []
passed_nodes = []
try:
result = introspect(
cloud,
node_uuids=module.params["node_uuids"],
node_timeout=module.params["node_timeout"],
retry_timeout=module.params["retry_timeout"],
max_retries=module.params["max_retries"],
concurrency=module.params["concurrency"],
fetch_data=not quiet)
except Exception as e:
# This should not happen, but handle it anyway
LOG.error('Unexpected error: %s', e)
module.fail_json(
msg=str(e),
failed_nodes=module.params["node_uuids"],
passed_nodes=[],
logging=log_stream.getvalue().split('\n')
)
for node_uuid, result in result.items():
if result['failed']:
failed_nodes.append(node_uuid)
else:
passed_nodes.append(node_uuid)
introspector = IntrospectionManagement(
cloud,
module,
module.params["concurrency"],
module.params["max_retries"],
module.params["node_timeout"],
module.params["retry_timeout"]
)
module_results = {"changed": True}
result = introspector.introspect(module.params["node_uuids"])
failed_nodes = [k for k, v in result.items() if v['failed']]
passed_nodes = [k for k, v in result.items() if not v['failed']]
failed = len(failed_nodes)
if failed > 0:
message = ("Introspection completed with failures. %s node(s) failed."
% failed)
module.log("os_tripleo_baremetal_node_introspection ERROR %s" %
message)
module_results.update({'failed': True})
else:
message = "Introspection completed successfully: %s nodes" % len(
module.params["node_uuids"])
passed_nodes)
module.log("os_tripleo_baremetal_node_introspection INFO %s" %
message)
module_results.update({
"introspection_data": result if not module.params['quiet'] else {},
"failed_nodes": failed_nodes,
"passed_nodes": passed_nodes,
"msg": message
})
module.exit_json(**module_results)
module.exit_json(
changed=True,
failed=failed > 0,
introspection_data=result if not quiet else {},
failed_nodes=failed_nodes,
passed_nodes=passed_nodes,
msg=message,
logging=log_stream.getvalue().split('\n')
)
if __name__ == "__main__":

View File

@ -96,30 +96,28 @@
max_retries: "{{ max_retries }}"
node_timeout: "{{ node_timeout }}"
retry_timeout: "{{ retry_timeout }}"
log_level: info
register: baremetal_introspection_result
failed_when: false
- name: Nodes that passed introspection
- name: Introspection log
debug:
msg: >-
{% if baremetal_introspection_result.passed_nodes != [] %}
{{ baremetal_introspection_result.passed_nodes | join(' ') }}{% else %}
No nodes completed introspection successfully!{% endif %}
when: baremetal_introspection_result.passed_nodes is defined
var: baremetal_introspection_result.logging
- name: Nodes that failed introspection
- name: Node introspection summary
debug:
msg: >-
{% if baremetal_introspection_result.failed_nodes != [] %}
{{ baremetal_introspection_result.failed_nodes | join(' ') }}{% else %}
All nodes completed introspection successfully!{% endif %}
failed_when: baremetal_introspection_result.failed_nodes != []
when: baremetal_introspection_result.failed_nodes is defined
msg: |-
Passed: [{{ baremetal_introspection_result.passed_nodes | join(', ') }}]
Failed: [{{ baremetal_introspection_result.failed_nodes | join(', ') }}]
{% if baremetal_introspection_result.passed_nodes == [] %}
No nodes passed introspection
{% elif baremetal_introspection_result.failed_nodes == [] %}
All nodes completed introspection successfully!
{% endif %}
- name: Node introspection failed and no results are provided
fail:
msg: >-
Nodes failed introspection and no info was provided
Nodes failed introspection
when:
- baremetal_introspection_result.failed_nodes is not defined
- baremetal_introspection_result.passed_nodes is not defined
- baremetal_introspection_result.failed_nodes != []

View File

@ -0,0 +1,218 @@
# Copyright 2020 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from tripleo_ansible.ansible_plugins.modules import (
os_tripleo_baremetal_node_introspection as module)
from tripleo_ansible.tests import base as tests_base
class TestIntrospect(tests_base.TestCase):
def setUp(self):
super(TestIntrospect, self).setUp()
c = mock.Mock()
self.cloud = c
self.node = mock.Mock(
uuid='1234abcd',
power_state='power on',
provision_state='manageable',
last_error=None,
)
c.baremetal.get_node.return_value = self.node
c.baremetal.set_node_provision_state.return_value = self.node
c.baremetal.wait_for_node_reservation.return_value = self.node
c.baremetal_introspection.get_introspection_data.return_value = {
'foo': 'bar'
}
@mock.patch.object(module, 'prepare_for_attempt')
def test_introspect_node(self, mock_pfa):
mock_pfa.return_value = self.node
c = self.cloud
result = module.introspect_node(
c, '1234abcd', 1200, 120, 3, True)
self.assertEqual({
'status': {'foo': 'bar'},
'failed': False,
'error': None},
result)
mock_pfa.assert_called_once_with(c, self.node, 1200, 120)
c.baremetal.set_node_provision_state.assert_called_once_with(
self.node, 'inspect', wait=True, timeout=1200
)
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
c.baremetal_introspection.get_introspection_data.assert_called_once_with(
'1234abcd'
)
@mock.patch.object(module, 'prepare_for_attempt')
def test_introspect_node_retries(self, mock_pfa):
mock_pfa.return_value = self.node
c = self.cloud
ouch = Exception('ouch')
c.baremetal.set_node_provision_state.side_effect = [
ouch,
ouch,
ouch,
self.node
]
result = module.introspect_node(
c, '1234abcd', 1200, 120, 3, fetch_data=False)
self.assertEqual({
'status': '',
'failed': False,
'error': None},
result)
mock_pfa.assert_has_calls([
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120)
])
c.baremetal.set_node_provision_state.assert_has_calls([
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
])
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
# fetch_data is False
c.baremetal_introspection.get_introspection_data.assert_not_called()
@mock.patch.object(module, 'prepare_for_attempt')
def test_introspect_node_retries_failed(self, mock_pfa):
mock_pfa.return_value = self.node
c = self.cloud
ouch = Exception('ouch')
c.baremetal.set_node_provision_state.side_effect = [
ouch,
ouch,
ouch,
ouch,
]
result = module.introspect_node(
c, '1234abcd', 1200, 120, 3, True)
self.assertEqual({
'error': 'Error for introspection node 1234abcd on attempt 4: None ',
'failed': True,
'status': 'manageable'}, result)
mock_pfa.assert_has_calls([
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
mock.call(c, self.node, 1200, 120),
])
c.baremetal.set_node_provision_state.assert_has_calls([
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
mock.call(self.node, 'inspect', wait=True, timeout=1200),
])
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal_introspection.get_introspection_data.assert_not_called()
def test_prepare_for_attempt_noop(self):
c = self.cloud
self.node.provision_state = 'manageable'
self.node.power_state = 'power off'
self.node.reservation = None
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_not_called()
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal.wait_for_node_reservation.assert_not_called()
def test_prepare_for_attempt_not_manageable(self):
c = self.cloud
self.node.provision_state = 'inspect wait'
self.node.power_state = 'power off'
self.node.reservation = None
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_called_once_with(
self.node, 'abort', wait=True, timeout=1200
)
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal.wait_for_node_reservation.assert_not_called()
def test_prepare_for_attempt_powered_on(self):
c = self.cloud
self.node.provision_state = 'manageable'
self.node.power_state = 'power on'
self.node.reservation = None
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_not_called()
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
c.baremetal.wait_for_node_reservation.assert_not_called()
def test_prepare_for_attempt_reserved(self):
c = self.cloud
self.node.provision_state = 'manageable'
self.node.power_state = 'power off'
self.node.reservation = 'conductor1'
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_not_called()
c.baremetal.set_node_power_state.assert_not_called()
c.baremetal.wait_for_node_reservation.assert_called_once_with(
self.node, timeout=120
)
def test_prepare_for_attempt_everything_failed(self):
c = self.cloud
ouch = Exception('ouch')
c.baremetal.set_node_provision_state.side_effect = ouch
c.baremetal.set_node_power_state.side_effect = ouch
c.baremetal.wait_for_node_reservation.side_effect = ouch
self.node.provision_state = 'inspect wait'
self.node.power_state = 'power on'
self.node.reservation = 'conductor1'
node = module.prepare_for_attempt(c, self.node, 1200, 120)
self.assertEqual(node, self.node)
c.baremetal.set_node_provision_state.assert_called_once_with(
self.node, 'abort', wait=True, timeout=1200
)
c.baremetal.set_node_power_state.assert_called_once_with(
self.node, 'power off', wait=True, timeout=1200
)
c.baremetal.wait_for_node_reservation.assert_called_once_with(
self.node, timeout=120
)