Merge "Refactoring node-health validation"

This commit is contained in:
Zuul 2022-10-21 16:36:14 +00:00 committed by Gerrit Code Review
commit 54aa994870
4 changed files with 61 additions and 33 deletions

View File

@ -4,3 +4,4 @@ collections:
- community.general
- community.crypto
- ansible.posix
- openstack.cloud

View File

@ -2,5 +2,12 @@
node_health
===========
This role is used by the :ref:`pre-upgrade_node-health` validation to verify the state of the overcloud
compute services and the baremetal nodes they are running on.
As the clients contacted require Keystone authentication, the role requires
relevant values, such as the Keystone endpoint and username, for correct operation.
Otherwise, it will produce an authentication error.
.. ansibleautoplugin::
:role: roles/node_health

View File

@ -1,11 +1,12 @@
---
- hosts: undercloud
- hosts: localhost
vars:
metadata:
name: Node health check
description: |
Check if all overcloud nodes can be connected to before starting a
scale-up or an upgrade.
scale-up or an upgrade. Validation requires cloud authentication details
in the form of accessible clouds.yaml file to be correctly executed.
groups:
- pre-upgrade
categories:

View File

@ -1,35 +1,54 @@
---
- name: Collect IPs for allovercloud nodes
set_fact: ansible_host="{{ hostvars[item]['ansible_host'] }}"
register: oc_ips
with_items: "{{ groups.allovercloud }}"
- name: Ping all overcloud nodes
icmp_ping:
host: "{{ item }}"
with_items: "{{ oc_ips.results | map(attribute='ansible_facts.ansible_host') | list }}"
- name: Retrieving compute services
ignore_errors: true
register: ping_results
openstack.cloud.compute_service_info:
cloud: overcloud
register: result
- name: Extract failed pings
set_fact:
failed_ips: "{{ ping_results.results | selectattr('failed', 'equalto', True) | map(attribute='item') | list }}"
- name: Lookup nova servers for each failed IP
set_fact:
servers: "{{ lookup('nova_servers', 'ip', 'ctlplane', failed_ips, wantlist=True) }}"
- name: Extract nova ids
set_fact:
server_ids: "{{ servers | map(attribute='id') | list }}"
- name: Lookup ironic nodes for unreachable nova servers
set_fact:
nodes: "{{ lookup('ironic_nodes', 'instance_uuid', server_ids, wantlist=True) }}"
- name: Fail if there are unreachable nodes
- name: Fail if the compute services can't be queried
fail:
msg: |
{{ lookup('template', './templates/unreachable_nodes.j2',
template_vars=dict(nodes=nodes)) }}
when: nodes|length > 0
msg: Compute services query failed with {{ result.msg }}
when: result.failed
# Keep only the entries of result.openstack_compute_services whose name
# contains 'nova'; the filtered list is stored in the nova_nodes fact.
- name: Get nova nodes
  vars:
    nova_services_query: "[?contains(name, 'nova')]"
  set_fact:
    nova_nodes: >-
      {{ result.openstack_compute_services
      | community.general.json_query(nova_services_query) }}
# Narrow nova_nodes down to the entries whose state is anything other than
# 'up'; the result is stored in the failed_nodes fact.
- name: Get failed nova nodes
  vars:
    not_up_query: "[?state!='up']"
  set_fact:
    failed_nodes: >-
      {{ nova_nodes | community.general.json_query(not_up_query) }}
# Runs only when at least one nova service is not 'up'. Looks up the
# baremetal nodes whose name contains the short host name of a failed
# service and fails with a rendered report of every match.
- when: failed_nodes | length > 0
  block:
    - name: Get baremetal nodes info
      become: true
      openstack.cloud.baremetal_node_info:
        cloud: undercloud
      register: result

    - name: Get baremetal nodes
      set_fact:
        baremetal_nodes: "{{ result.baremetal_nodes }}"

    # Build the complete list of short host names (text before the first
    # dot) in a single expression. A set_fact inside a loop overwrites the
    # fact on every iteration, which would keep only the last failed host.
    - name: Get failed node names
      set_fact:
        node_names: >-
          {{ failed_nodes
          | map(attribute='host')
          | map('regex_replace', '[.].*$', '')
          | unique
          | list }}

    # Start from an empty list so the accumulation below never picks up a
    # value left over from an earlier run of these tasks.
    - name: Reset failed baremetal nodes
      set_fact:
        failed_baremetal_nodes: []

    # Append the matches for each failed host name instead of replacing the
    # fact, so that every failed node ends up in the report. The
    # `name &&` guard skips baremetal nodes that have no name, for which
    # contains() would otherwise raise a type error.
    - name: Get failed baremetal nodes
      set_fact:
        failed_baremetal_nodes: "{{ failed_baremetal_nodes + matching_nodes }}"
      loop: "{{ node_names }}"
      vars:
        query: "[?name && contains(name, '{{ item }}')]"
        matching_nodes: >-
          {{ baremetal_nodes | to_json | from_json
          | community.general.json_query(query)
          | default([], true) }}

    - name: Fail if there are unreachable nodes
      fail:
        msg: |
          {{ lookup('template', './templates/unreachable_nodes.j2',
              template_vars=dict(nodes=failed_baremetal_nodes)) }}
      when: failed_baremetal_nodes|length > 0