From f1d3ff11d0e43b7e70fe9c36709975d96dfa86e8 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 16 Aug 2021 17:10:18 +0100 Subject: [PATCH] nova: improve compute service registration failure handling If any nova compute service fails to register itself, Kolla Ansible will fail the host that queries the Nova API. This is the first compute host in the inventory, and fails in the task: Waiting for nova-compute services to register themselves Other hosts continue, often leading to further errors later on. Clearly this is not ideal. This change modifies the behaviour to query the compute service list until all expected hosts are present, but does not fail the querying host if they are not. A new task is added that executes for all hosts, and fails only those hosts that have not registered successfully. Alternatively, to fail all hosts in a cell when any compute service fails to register, set nova_compute_registration_fatal to true. Change-Id: I12c1928cf1f1fb9e28f1741e7fe4968004ea1816 Closes-Bug: #1940119 --- ansible/roles/nova-cell/defaults/main.yml | 6 ++ ansible/roles/nova-cell/tasks/deploy.yml | 3 +- .../nova-cell/tasks/discover_computes.yml | 86 ++++-------------- .../tasks/wait_discover_computes.yml | 88 +++++++++++++++++++ doc/source/reference/compute/nova-guide.rst | 14 +++ .../nova-discover-hosts-0353e9274f22195c.yaml | 9 ++ 6 files changed, 133 insertions(+), 73 deletions(-) create mode 100644 ansible/roles/nova-cell/tasks/wait_discover_computes.yml create mode 100644 releasenotes/notes/nova-discover-hosts-0353e9274f22195c.yaml diff --git a/ansible/roles/nova-cell/defaults/main.yml b/ansible/roles/nova-cell/defaults/main.yml index b5f1d87057..1eedb8a2a9 100644 --- a/ansible/roles/nova-cell/defaults/main.yml +++ b/ansible/roles/nova-cell/defaults/main.yml @@ -488,6 +488,12 @@ nova_compute_startup_delay: 30 # nova_cell_conductor_has_api_database to no. 
nova_cell_conductor_has_api_database: "yes" +# Whether the failure of a nova-compute service to register itself is fatal to +# the Kolla Ansible run. This is evaluated on a per-cell basis. Default +# behaviour is to only fail the host on which the compute service failed to +# register itself. +nova_compute_registration_fatal: false + #################### # Notification #################### diff --git a/ansible/roles/nova-cell/tasks/deploy.yml b/ansible/roles/nova-cell/tasks/deploy.yml index 795739f29b..beecc7d448 100644 --- a/ansible/roles/nova-cell/tasks/deploy.yml +++ b/ansible/roles/nova-cell/tasks/deploy.yml @@ -16,7 +16,7 @@ - name: Flush handlers meta: flush_handlers -- include_tasks: discover_computes.yml +- import_tasks: wait_discover_computes.yml vars: # List of virtualised compute hypervisors in this Ansible play batch. virt_computes_in_batch: >- @@ -34,5 +34,4 @@ # Run discovery when one or more compute hosts are in the Ansible batch, # and there is a cell conductor in the inventory to delegate to. - all_computes_in_batch | length > 0 - - inventory_hostname == all_computes_in_batch[0] - groups[nova_cell_conductor_group] | length > 0 diff --git a/ansible/roles/nova-cell/tasks/discover_computes.yml b/ansible/roles/nova-cell/tasks/discover_computes.yml index d13589cca8..43b2f3c1cd 100644 --- a/ansible/roles/nova-cell/tasks/discover_computes.yml +++ b/ansible/roles/nova-cell/tasks/discover_computes.yml @@ -1,77 +1,21 @@ --- -# We need to wait for all expected compute services to register before running -# cells v2 host discovery. This includes virtualised compute services and -# ironic compute services. -# Work with --limit by including only hosts in ansible_play_batch. -- name: Build a list of expected compute service hosts - vars: - # For virt, use ansible_facts.nodename rather than inventory_hostname, since this - # is similar to what nova uses internally as its default for the - # [DEFAULT] host config option. 
- virt_compute_service_hosts: >- - {{ virt_computes_in_batch | - map('extract', hostvars, ['ansible_facts', 'nodename']) | - list }} - # For ironic, use {{ansible_facts.hostname}}-ironic since this is what we - # configure for [DEFAULT] host in nova.conf. - ironic_compute_service_hosts: >- - {{ ironic_computes_in_batch | - map('extract', hostvars, ['ansible_facts', 'hostname']) | - map('regex_replace', '^(.*)$', '\1-ironic') | - list }} - set_fact: - expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}" - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" +# Discover compute hosts for a cell. -- name: Waiting for nova-compute services to register themselves - become: true - command: > - docker exec kolla_toolbox openstack - --os-interface {{ openstack_interface }} - --os-auth-url {{ openstack_auth.auth_url }} - --os-username {{ openstack_auth.username }} - --os-password {{ openstack_auth.password }} - --os-identity-api-version 3 - --os-user-domain-name {{ openstack_auth.user_domain_name }} - --os-system-scope {{ openstack_auth.system_scope }} - --os-region-name {{ openstack_region_name }} - {% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %} - compute service list --format json --column Host --service nova-compute - register: nova_compute_services - changed_when: false - retries: 20 - delay: 10 - until: - - nova_compute_services is success - # A list containing the 'Host' field of compute services that have - # registered themselves. Don't exclude compute services that are disabled - # since these could have been explicitly disabled by the operator. While we - # could exclude services that are down, the nova-manage cell_v2 - # discover_hosts does not do this so let's not block on it here. - # NOTE(mgoddard): Cannot factor this out into an intermediary variable - # before ansible 2.8, due to - # https://bugs.launchpad.net/kolla-ansible/+bug/1835817. 
- - (nova_compute_services.stdout | - from_json | - map(attribute='Host') | - list) - is superset(expected_compute_service_hosts) - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" +- block: + - import_tasks: get_cell_settings.yml -- import_tasks: get_cell_settings.yml - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" + - name: Fail if cell settings not found + fail: + msg: >- + Unable to find settings for {{ nova_cell_name or 'the default cell' }}. + when: not nova_cell_settings -- name: Fail if cell settings not found - fail: - msg: >- - Unable to find settings for {{ nova_cell_name or 'the default cell' }}. - when: not nova_cell_settings - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" + # TODO(yoctozepto): no need to do --by-service if ironic not used + - name: Discover nova hosts + become: true + command: > + docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }} + changed_when: False -# TODO(yoctozepto): no need to do --by-service if ironic not used -- name: Discover nova hosts - become: true - command: > - docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }} - changed_when: False + # Delegate to a cell conductor. delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" diff --git a/ansible/roles/nova-cell/tasks/wait_discover_computes.yml b/ansible/roles/nova-cell/tasks/wait_discover_computes.yml new file mode 100644 index 0000000000..89587dab2c --- /dev/null +++ b/ansible/roles/nova-cell/tasks/wait_discover_computes.yml @@ -0,0 +1,88 @@ +--- +# We need to wait for all expected compute services to register before running +# cells v2 host discovery. This includes virtualised compute services and +# ironic compute services. +# Work with --limit by including only hosts in ansible_play_batch. 
+ +- block: + - name: Waiting for nova-compute services to register themselves + become: true + command: > + docker exec kolla_toolbox openstack + --os-interface {{ openstack_interface }} + --os-auth-url {{ openstack_auth.auth_url }} + --os-username {{ openstack_auth.username }} + --os-password {{ openstack_auth.password }} + --os-identity-api-version 3 + --os-user-domain-name {{ openstack_auth.user_domain_name }} + --os-system-scope {{ openstack_auth.system_scope }} + --os-region-name {{ openstack_region_name }} + {% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %} + compute service list --format json --column Host --service nova-compute + register: nova_compute_services + changed_when: false + failed_when: false + retries: 20 + delay: 10 + until: + - nova_compute_services is success + # A list containing the 'Host' field of compute services that have + # registered themselves. Don't exclude compute services that are disabled + # since these could have been explicitly disabled by the operator. While we + # could exclude services that are down, the nova-manage cell_v2 + # discover_hosts does not do this so let's not block on it here. + - (nova_compute_services.stdout | + from_json | + map(attribute='Host') | + list) + is superset(expected_compute_service_hosts) + # Execute on one compute per cell, and delegate to a cell conductor. + when: inventory_hostname == all_computes_in_batch[0] + delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" + + # NOTE(mgoddard): Use a separate fail task to ensure we fail only those hosts + # that failed to register. + - name: Fail if nova-compute service failed to register + vars: + # 'Host' field of all registered compute services. + nova_compute_service_hosts: >- + {{ hostvars[all_computes_in_batch[0]].nova_compute_services.stdout | + from_json | + map(attribute='Host') | + list }} + # 'Host' field of failed compute services. 
+ failed_compute_service_hosts: >- + {{ expected_compute_service_hosts | difference(nova_compute_service_hosts) | list }} + # Whether any compute services failed on this host. + any_failed_services: >- + {{ ansible_facts.nodename in failed_compute_service_hosts or + (ansible_facts.hostname ~ "-ironic") in failed_compute_service_hosts }} + fail: + msg: >- + The Nova compute service failed to register itself on the following + hosts: {{ failed_compute_service_hosts | join(',') }} + when: >- + any_failed_services or + (nova_compute_registration_fatal | bool and + failed_compute_service_hosts | length > 0) + vars: + # For virt, use ansible_facts.nodename rather than inventory_hostname, since this + # is similar to what nova uses internally as its default for the + # [DEFAULT] host config option. + virt_compute_service_hosts: >- + {{ virt_computes_in_batch | + map('extract', hostvars, ['ansible_facts', 'nodename']) | + list }} + # For ironic, use {{ansible_facts.hostname}}-ironic since this is what we + # configure for [DEFAULT] host in nova.conf. + ironic_compute_service_hosts: >- + {{ ironic_computes_in_batch | + map('extract', hostvars, ['ansible_facts', 'hostname']) | + map('regex_replace', '^(.*)$', '\1-ironic') | + list }} + expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}" + +- name: Include discover_computes.yml + include_tasks: discover_computes.yml + # Execute on one compute host per cell. + when: inventory_hostname == all_computes_in_batch[0] diff --git a/doc/source/reference/compute/nova-guide.rst b/doc/source/reference/compute/nova-guide.rst index 4905169099..3deccee3e9 100644 --- a/doc/source/reference/compute/nova-guide.rst +++ b/doc/source/reference/compute/nova-guide.rst @@ -65,3 +65,17 @@ concept known as Vendordata. 
If a Vendordata file is located in the following path within the Kolla configuration, Kolla will automatically use it when the Nova service is deployed or reconfigured: ``/etc/kolla/config/nova/vendordata.json``. + +Failure handling +================ + +Compute service registration +---------------------------- + +During deployment, Kolla Ansible waits for Nova compute services to register +themselves. By default, if a compute service does not register itself before +the timeout, that host will be marked as failed in the Ansible run. This +behaviour is useful at scale, where failures are more frequent. + +Alternatively, to fail all hosts in a cell when any compute service fails +to register, set ``nova_compute_registration_fatal`` to ``true``. diff --git a/releasenotes/notes/nova-discover-hosts-0353e9274f22195c.yaml b/releasenotes/notes/nova-discover-hosts-0353e9274f22195c.yaml new file mode 100644 index 0000000000..dc43e90122 --- /dev/null +++ b/releasenotes/notes/nova-discover-hosts-0353e9274f22195c.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes an issue where a failure of any Nova compute service to register + itself would cause only the host querying the nova API to fail. + Now, only hosts that fail to register will fail the Kolla Ansible run. + Alternatively, to fail all hosts in a cell when any compute service fails + to register, set ``nova_compute_registration_fatal`` to ``true``. + `LP#1940119 <https://launchpad.net/bugs/1940119>`__