Merge "nova: improve compute service registration failure handling" into stable/yoga

This commit is contained in:
Zuul 2022-04-29 09:56:38 +00:00 committed by Gerrit Code Review
commit 4cd70b71f0
6 changed files with 133 additions and 73 deletions

View File

@ -488,6 +488,12 @@ nova_compute_startup_delay: 30
# nova_cell_conductor_has_api_database to no.
nova_cell_conductor_has_api_database: "yes"
# Whether the failure of any nova-compute service to register itself is fatal
# to the Kolla Ansible run. This is evaluated on a per-cell basis: when true,
# all hosts in a cell fail if any compute service in that cell fails to
# register. The default (false) fails only the hosts on which the compute
# service failed to register itself.
nova_compute_registration_fatal: false
####################
# Notification
####################

View File

@ -16,7 +16,7 @@
- name: Flush handlers
meta: flush_handlers
- include_tasks: discover_computes.yml
- import_tasks: wait_discover_computes.yml
vars:
# List of virtualised compute hypervisors in this Ansible play batch.
virt_computes_in_batch: >-
@ -34,5 +34,4 @@
# Run discovery when one or more compute hosts are in the Ansible batch,
# and there is a cell conductor in the inventory to delegate to.
- all_computes_in_batch | length > 0
- inventory_hostname == all_computes_in_batch[0]
- groups[nova_cell_conductor_group] | length > 0

View File

@ -1,79 +1,23 @@
---
# We need to wait for all expected compute services to register before running
# cells v2 host discovery. This includes virtualised compute services and
# ironic compute services.
# Work with --limit by including only hosts in ansible_play_batch.
- name: Build a list of expected compute service hosts
vars:
# For virt, use ansible_facts.nodename rather than inventory_hostname, since this
# is similar to what nova uses internally as its default for the
# [DEFAULT] host config option.
virt_compute_service_hosts: >-
{{ virt_computes_in_batch |
map('extract', hostvars, ['ansible_facts', 'nodename']) |
list }}
# For ironic, use {{ansible_facts.hostname}}-ironic since this is what we
# configure for [DEFAULT] host in nova.conf.
ironic_compute_service_hosts: >-
{{ ironic_computes_in_batch |
map('extract', hostvars, ['ansible_facts', 'hostname']) |
map('regex_replace', '^(.*)$', '\1-ironic') |
list }}
set_fact:
expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
# Discover compute hosts for a cell.
- name: Waiting for nova-compute services to register themselves
become: true
command: >
docker exec kolla_toolbox openstack
--os-interface {{ openstack_interface }}
--os-auth-url {{ openstack_auth.auth_url }}
--os-username {{ openstack_auth.username }}
--os-password {{ openstack_auth.password }}
--os-identity-api-version 3
--os-user-domain-name {{ openstack_auth.user_domain_name }}
--os-system-scope {{ openstack_auth.system_scope }}
--os-region-name {{ openstack_region_name }}
{% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %}
compute service list --format json --column Host --service nova-compute
register: nova_compute_services
changed_when: false
retries: 20
delay: 10
until:
- nova_compute_services is success
# A list containing the 'Host' field of compute services that have
# registered themselves. Don't exclude compute services that are disabled
# since these could have been explicitly disabled by the operator. While we
# could exclude services that are down, the nova-manage cell_v2
# discover_hosts does not do this so let's not block on it here.
# NOTE(mgoddard): Cannot factor this out into an intermediary variable
# before ansible 2.8, due to
# https://bugs.launchpad.net/kolla-ansible/+bug/1835817.
- (nova_compute_services.stdout |
from_json |
map(attribute='Host') |
list)
is superset(expected_compute_service_hosts)
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
- block:
- import_tasks: get_cell_settings.yml
- import_tasks: get_cell_settings.yml
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
- name: Fail if cell settings not found
fail:
msg: >-
Unable to find settings for {{ nova_cell_name or 'the default cell' }}.
when: not nova_cell_settings
- name: Fail if cell settings not found
fail:
msg: >-
Unable to find settings for {{ nova_cell_name or 'the default cell' }}.
when: not nova_cell_settings
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
# TODO(yoctozepto): no need to do --by-service if ironic not used
- name: Discover nova hosts
become: true
command: >
docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }}
changed_when: False
# TODO(yoctozepto): no need to do --by-service if ironic not used
- name: Discover nova hosts
become: true
command: >
docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }}
changed_when: False
# Delegate to a cell conductor.
delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"
# Fail all hosts if any of these once-per-cell tasks fail.
any_errors_fatal: true

View File

@ -0,0 +1,88 @@
---
# We need to wait for all expected compute services to register before running
# cells v2 host discovery. This includes virtualised compute services and
# ironic compute services.
# Work with --limit by including only hosts in ansible_play_batch.
#
# This file references the following variables without defining them; they
# are expected to be provided by whoever imports it:
#   virt_computes_in_batch, ironic_computes_in_batch, all_computes_in_batch,
#   nova_cell_conductor_group, nova_compute_registration_fatal.

- block:
    # Poll the compute service list until every expected host appears in it.
    - name: Waiting for nova-compute services to register themselves
      become: true
      command: >
        docker exec kolla_toolbox openstack
        --os-interface {{ openstack_interface }}
        --os-auth-url {{ openstack_auth.auth_url }}
        --os-username {{ openstack_auth.username }}
        --os-password {{ openstack_auth.password }}
        --os-identity-api-version 3
        --os-user-domain-name {{ openstack_auth.user_domain_name }}
        --os-system-scope {{ openstack_auth.system_scope }}
        --os-region-name {{ openstack_region_name }}
        {% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %}
        compute service list --format json --column Host --service nova-compute
      register: nova_compute_services
      changed_when: false
      # Never fail in this task: the separate fail task below decides which
      # hosts are failed, based on the registered result.
      failed_when: false
      # Up to 20 attempts, 10 seconds apart.
      retries: 20
      delay: 10
      until:
        - nova_compute_services is success
        # A list containing the 'Host' field of compute services that have
        # registered themselves. Don't exclude compute services that are disabled
        # since these could have been explicitly disabled by the operator. While we
        # could exclude services that are down, the nova-manage cell_v2
        # discover_hosts does not do this so let's not block on it here.
        - (nova_compute_services.stdout |
          from_json |
          map(attribute='Host') |
          list)
          is superset(expected_compute_service_hosts)
      # Execute on one compute per cell, and delegate to a cell conductor.
      when: inventory_hostname == all_computes_in_batch[0]
      delegate_to: "{{ groups[nova_cell_conductor_group][0] }}"

    # NOTE(mgoddard): Use a separate fail task to ensure we fail only those hosts
    # that failed to register.
    # This task has no host restriction, so it is evaluated for each host that
    # runs this file. The query result was registered only for
    # all_computes_in_batch[0], hence the hostvars lookup.
    - name: Fail if nova-compute service failed to register
      vars:
        # 'Host' field of all registered compute services.
        nova_compute_service_hosts: >-
          {{ hostvars[all_computes_in_batch[0]].nova_compute_services.stdout |
             from_json |
             map(attribute='Host') |
             list }}
        # 'Host' field of failed compute services.
        failed_compute_service_hosts: >-
          {{ expected_compute_service_hosts | difference(nova_compute_service_hosts) | list }}
        # Whether any compute services failed on this host.
        any_failed_services: >-
          {{ ansible_facts.nodename in failed_compute_service_hosts or
             (ansible_facts.hostname ~ "-ironic") in failed_compute_service_hosts }}
      fail:
        msg: >-
          The Nova compute service failed to register itself on the following
          hosts: {{ failed_compute_service_hosts | join(',') }}
      # Fail this host if its own service is missing, or fail every host when
      # registration failures are configured to be fatal and any are missing.
      when: >-
        any_failed_services or
        (nova_compute_registration_fatal | bool and
         failed_compute_service_hosts | length > 0)

  # Block-level vars, shared by both tasks above.
  vars:
    # For virt, use ansible_facts.nodename rather than inventory_hostname, since this
    # is similar to what nova uses internally as its default for the
    # [DEFAULT] host config option.
    virt_compute_service_hosts: >-
      {{ virt_computes_in_batch |
         map('extract', hostvars, ['ansible_facts', 'nodename']) |
         list }}
    # For ironic, use {{ansible_facts.hostname}}-ironic since this is what we
    # configure for [DEFAULT] host in nova.conf.
    ironic_compute_service_hosts: >-
      {{ ironic_computes_in_batch |
         map('extract', hostvars, ['ansible_facts', 'hostname']) |
         map('regex_replace', '^(.*)$', '\1-ironic') |
         list }}
    expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"

- name: Include discover_computes.yml
  include_tasks: discover_computes.yml
  # Execute on one compute host per cell.
  when: inventory_hostname == all_computes_in_batch[0]

View File

@ -65,3 +65,17 @@ concept known as Vendordata. If a Vendordata file is located in the
following path within the Kolla configuration, Kolla will
automatically use it when the Nova service is deployed or
reconfigured: ``/etc/kolla/config/nova/vendordata.json``.
Failure handling
================
Compute service registration
----------------------------
During deployment, Kolla Ansible waits for Nova compute services to register
themselves. By default, if a compute service does not register itself before
the timeout, only the host running that service is marked as failed in the
Ansible run. This behaviour is useful at scale, where failures of individual
hosts are more frequent.
Alternatively, to fail all hosts in a cell when any compute service fails
to register, set ``nova_compute_registration_fatal`` to ``true``.

View File

@ -0,0 +1,9 @@
---
# Bug-fix release note for the nova-compute registration failure handling
# change. The entry body is a literal block scalar containing RST.
fixes:
  - |
    Fixes an issue where a failure of any Nova compute service to register
    itself would cause only the host querying the nova API to fail.
    Now, only hosts that fail to register will fail the Kolla Ansible run.
    Alternatively, to fail all hosts in a cell when any compute service fails
    to register, set ``nova_compute_registration_fatal`` to ``true``.
    `LP#1940119 <https://bugs.launchpad.net/kolla-ansible/+bug/1940119>`__