diff --git a/ansible/roles/nova-cell/defaults/main.yml b/ansible/roles/nova-cell/defaults/main.yml index b5f1d87057..1eedb8a2a9 100644 --- a/ansible/roles/nova-cell/defaults/main.yml +++ b/ansible/roles/nova-cell/defaults/main.yml @@ -488,6 +488,12 @@ nova_compute_startup_delay: 30 # nova_cell_conductor_has_api_database to no. nova_cell_conductor_has_api_database: "yes" +# Whether the failure of a nova-compute service to register itself is fatal to +# the Kolla Ansible run. This is evaluated on a per-cell basis. Default +# behaviour is to only fail the host on which the compute service failed to +# register itself. +nova_compute_registration_fatal: false + #################### # Notification #################### diff --git a/ansible/roles/nova-cell/tasks/deploy.yml b/ansible/roles/nova-cell/tasks/deploy.yml index 795739f29b..beecc7d448 100644 --- a/ansible/roles/nova-cell/tasks/deploy.yml +++ b/ansible/roles/nova-cell/tasks/deploy.yml @@ -16,7 +16,7 @@ - name: Flush handlers meta: flush_handlers -- include_tasks: discover_computes.yml +- import_tasks: wait_discover_computes.yml vars: # List of virtualised compute hypervisors in this Ansible play batch. virt_computes_in_batch: >- @@ -34,5 +34,4 @@ # Run discovery when one or more compute hosts are in the Ansible batch, # and there is a cell conductor in the inventory to delegate to. - all_computes_in_batch | length > 0 - - inventory_hostname == all_computes_in_batch[0] - groups[nova_cell_conductor_group] | length > 0 diff --git a/ansible/roles/nova-cell/tasks/discover_computes.yml b/ansible/roles/nova-cell/tasks/discover_computes.yml index 43dfe08e9c..49d38b6c37 100644 --- a/ansible/roles/nova-cell/tasks/discover_computes.yml +++ b/ansible/roles/nova-cell/tasks/discover_computes.yml @@ -1,79 +1,23 @@ --- -# We need to wait for all expected compute services to register before running -# cells v2 host discovery. This includes virtualised compute services and -# ironic compute services. 
-# Work with --limit by including only hosts in ansible_play_batch. -- name: Build a list of expected compute service hosts - vars: - # For virt, use ansible_facts.nodename rather than inventory_hostname, since this - # is similar to what nova uses internally as its default for the - # [DEFAULT] host config option. - virt_compute_service_hosts: >- - {{ virt_computes_in_batch | - map('extract', hostvars, ['ansible_facts', 'nodename']) | - list }} - # For ironic, use {{ansible_facts.hostname}}-ironic since this is what we - # configure for [DEFAULT] host in nova.conf. - ironic_compute_service_hosts: >- - {{ ironic_computes_in_batch | - map('extract', hostvars, ['ansible_facts', 'hostname']) | - map('regex_replace', '^(.*)$', '\1-ironic') | - list }} - set_fact: - expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}" - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" +# Discover compute hosts for a cell. -- name: Waiting for nova-compute services to register themselves - become: true - command: > - docker exec kolla_toolbox openstack - --os-interface {{ openstack_interface }} - --os-auth-url {{ openstack_auth.auth_url }} - --os-username {{ openstack_auth.username }} - --os-password {{ openstack_auth.password }} - --os-identity-api-version 3 - --os-user-domain-name {{ openstack_auth.user_domain_name }} - --os-system-scope {{ openstack_auth.system_scope }} - --os-region-name {{ openstack_region_name }} - {% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %} - compute service list --format json --column Host --service nova-compute - register: nova_compute_services - changed_when: false - retries: 20 - delay: 10 - until: - - nova_compute_services is success - # A list containing the 'Host' field of compute services that have - # registered themselves. Don't exclude compute services that are disabled - # since these could have been explicitly disabled by the operator. 
While we - # could exclude services that are down, the nova-manage cell_v2 - # discover_hosts does not do this so let's not block on it here. - # NOTE(mgoddard): Cannot factor this out into an intermediary variable - # before ansible 2.8, due to - # https://bugs.launchpad.net/kolla-ansible/+bug/1835817. - - (nova_compute_services.stdout | - from_json | - map(attribute='Host') | - list) - is superset(expected_compute_service_hosts) - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" +- block: + - import_tasks: get_cell_settings.yml -- import_tasks: get_cell_settings.yml - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" + - name: Fail if cell settings not found + fail: + msg: >- + Unable to find settings for {{ nova_cell_name or 'the default cell' }}. + when: not nova_cell_settings -- name: Fail if cell settings not found - fail: - msg: >- - Unable to find settings for {{ nova_cell_name or 'the default cell' }}. - when: not nova_cell_settings - delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" + # TODO(yoctozepto): no need to do --by-service if ironic not used + - name: Discover nova hosts + become: true + command: > + docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }} + changed_when: False -# TODO(yoctozepto): no need to do --by-service if ironic not used -- name: Discover nova hosts - become: true - command: > - docker exec nova_conductor nova-manage cell_v2 discover_hosts --by-service --cell_uuid {{ nova_cell_settings.cell_uuid }} - changed_when: False + # Delegate to a cell conductor. delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" # Fail all hosts if any of these once-per-cell tasks fail. 
any_errors_fatal: true diff --git a/ansible/roles/nova-cell/tasks/wait_discover_computes.yml b/ansible/roles/nova-cell/tasks/wait_discover_computes.yml new file mode 100644 index 0000000000..89587dab2c --- /dev/null +++ b/ansible/roles/nova-cell/tasks/wait_discover_computes.yml @@ -0,0 +1,88 @@ +--- +# We need to wait for all expected compute services to register before running +# cells v2 host discovery. This includes virtualised compute services and +# ironic compute services. +# Work with --limit by including only hosts in ansible_play_batch. + +- block: + - name: Waiting for nova-compute services to register themselves + become: true + command: > + docker exec kolla_toolbox openstack + --os-interface {{ openstack_interface }} + --os-auth-url {{ openstack_auth.auth_url }} + --os-username {{ openstack_auth.username }} + --os-password {{ openstack_auth.password }} + --os-identity-api-version 3 + --os-user-domain-name {{ openstack_auth.user_domain_name }} + --os-system-scope {{ openstack_auth.system_scope }} + --os-region-name {{ openstack_region_name }} + {% if openstack_cacert != '' %}--os-cacert {{ openstack_cacert }}{% endif %} + compute service list --format json --column Host --service nova-compute + register: nova_compute_services + changed_when: false + failed_when: false + retries: 20 + delay: 10 + until: + - nova_compute_services is success + # A list containing the 'Host' field of compute services that have + # registered themselves. Don't exclude compute services that are disabled + # since these could have been explicitly disabled by the operator. While we + # could exclude services that are down, the nova-manage cell_v2 + # discover_hosts does not do this so let's not block on it here. + - (nova_compute_services.stdout | + from_json | + map(attribute='Host') | + list) + is superset(expected_compute_service_hosts) + # Execute on one compute per cell, and delegate to a cell conductor. 
+ when: inventory_hostname == all_computes_in_batch[0] + delegate_to: "{{ groups[nova_cell_conductor_group][0] }}" + + # NOTE(mgoddard): Use a separate fail task to ensure we fail only those hosts + # that failed to register. + - name: Fail if nova-compute service failed to register + vars: + # 'Host' field of all registered compute services. + nova_compute_service_hosts: >- + {{ hostvars[all_computes_in_batch[0]].nova_compute_services.stdout | + from_json | + map(attribute='Host') | + list }} + # 'Host' field of failed compute services. + failed_compute_service_hosts: >- + {{ expected_compute_service_hosts | difference(nova_compute_service_hosts) | list }} + # Whether any compute services failed on this host. + any_failed_services: >- + {{ ansible_facts.nodename in failed_compute_service_hosts or + (ansible_facts.hostname ~ "-ironic") in failed_compute_service_hosts }} + fail: + msg: >- + The Nova compute service failed to register itself on the following + hosts: {{ failed_compute_service_hosts | join(',') }} + when: >- + any_failed_services or + (nova_compute_registration_fatal | bool and + failed_compute_service_hosts | length > 0) + vars: + # For virt, use ansible_facts.nodename rather than inventory_hostname, since this + # is similar to what nova uses internally as its default for the + # [DEFAULT] host config option. + virt_compute_service_hosts: >- + {{ virt_computes_in_batch | + map('extract', hostvars, ['ansible_facts', 'nodename']) | + list }} + # For ironic, use {{ansible_facts.hostname}}-ironic since this is what we + # configure for [DEFAULT] host in nova.conf. 
+ ironic_compute_service_hosts: >- + {{ ironic_computes_in_batch | + map('extract', hostvars, ['ansible_facts', 'hostname']) | + map('regex_replace', '^(.*)$', '\1-ironic') | + list }} + expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}" + +- name: Include discover_computes.yml + include_tasks: discover_computes.yml + # Execute on one compute host per cell. + when: inventory_hostname == all_computes_in_batch[0] diff --git a/doc/source/reference/compute/nova-guide.rst b/doc/source/reference/compute/nova-guide.rst index 4905169099..3deccee3e9 100644 --- a/doc/source/reference/compute/nova-guide.rst +++ b/doc/source/reference/compute/nova-guide.rst @@ -65,3 +65,17 @@ concept known as Vendordata. If a Vendordata file is located in the following path within the Kolla configuration, Kolla will automatically use it when the Nova service is deployed or reconfigured: ``/etc/kolla/config/nova/vendordata.json``. + +Failure handling +================ + +Compute service registration +---------------------------- + +During deployment, Kolla Ansible waits for Nova compute services to register +themselves. By default, if a compute service does not register itself before +the timeout, that host will be marked as failed in the Ansible run. This +behaviour is useful at scale, where failures are more frequent. + +Alternatively, to fail all hosts in a cell when any compute service fails +to register, set ``nova_compute_registration_fatal`` to ``true``. diff --git a/releasenotes/notes/nova-discover-hosts-0353e9274f22195c.yaml b/releasenotes/notes/nova-discover-hosts-0353e9274f22195c.yaml new file mode 100644 index 0000000000..dc43e90122 --- /dev/null +++ b/releasenotes/notes/nova-discover-hosts-0353e9274f22195c.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes an issue where a failure of any Nova compute service to register + itself would cause only the host querying the nova API to fail. 
+ Now, only hosts that fail to register will fail the Kolla Ansible run. + Alternatively, to fail all hosts in a cell when any compute service fails + to register, set ``nova_compute_registration_fatal`` to ``true``. + `LP#1940119 <https://launchpad.net/bugs/1940119>`__