From ce680bcfe2208f19a23630ee3feff4b9bd40c595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Piliszek?= Date: Sun, 16 Jun 2019 20:37:35 +0200 Subject: [PATCH] Avoid parallel discover_hosts (nova-related race condition) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In rare cases, both kolla-ansible and nova-scheduler try to do the mapping at the same time, and one of them fails. Since kolla-ansible runs host discovery on each deployment, there is no need to override the default, which disables periodic host discovery. I added some notes for the future; they are not critical. I made the decision explicit in the comments. I changed the task name to satisfy recommendations. I removed the registered variable because it is not used (to avoid future doubts). Closes-Bug: #1832987 Change-Id: I3128472f028a2dbd7ace02abc179a9629ad74ceb Signed-off-by: Radosław Piliszek --- ansible/roles/nova/tasks/discover_computes.yml | 5 +++-- ansible/roles/nova/templates/nova.conf.j2 | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ansible/roles/nova/tasks/discover_computes.yml b/ansible/roles/nova/tasks/discover_computes.yml index 1a0c904f26..647bd5d59c 100644 --- a/ansible/roles/nova/tasks/discover_computes.yml +++ b/ansible/roles/nova/tasks/discover_computes.yml @@ -23,15 +23,16 @@ - nova_compute_services is success - nova_compute_services.stdout | from_json | length != 0 -- name: Discovering nova hosts +# TODO(yoctozepto): no need to do --by-service if ironic not used +- name: Discover nova hosts become: true command: > docker exec nova_api nova-manage cell_v2 discover_hosts --by-service - register: discover_hosts changed_when: False run_once: True delegate_to: "{{ groups['nova-api'][0] }}" +# NOTE(yoctozepto): SIGHUP is probably unnecessary - name: Refresh cell cache in nova scheduler become: true command: docker kill --signal HUP nova_scheduler diff --git a/ansible/roles/nova/templates/nova.conf.j2 
b/ansible/roles/nova/templates/nova.conf.j2 index 04b196b916..39bb43d9f9 100644 --- a/ansible/roles/nova/templates/nova.conf.j2 +++ b/ansible/roles/nova/templates/nova.conf.j2 @@ -254,7 +254,10 @@ secure_proxy_ssl_header = HTTP_X_FORWARDED_PROTO [scheduler] max_attempts = 10 -discover_hosts_in_cells_interval = 60 +# NOTE(yoctozepto): kolla-ansible handles cell mapping by itself on each deploy +# periodic run must be disabled to avoid random failures (where both try to map) +# -1 is default and means periodic discovery is disabled +discover_hosts_in_cells_interval = -1 {% if enable_nova_fake | bool %} default_filters = RetryFilter,AvailabilityZoneFilter,ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter