From ed3b27cc923144a9f437e931a5af3571e801ea33 Mon Sep 17 00:00:00 2001 From: Jan Gutter Date: Sun, 9 Jul 2023 11:49:04 +0100 Subject: [PATCH] etcd: Add support for more scenarios This commit addresses a few shortcomings in the etcd service: * Adding or removing etcd nodes required manual intervention. * The etcd service would have brief outages during upgrades or reconfigures because restarts weren't always serialised. This makes the etcd service follow a similar pattern to mariadb: * There is now a distinction between bootstrapping the cluster and adding / removing another member. * This more closely follows etcd's upstream bootstrapping guidelines. * The etcd role now serialises restarts internally so the kolla_serial pattern is no longer appropriate (or necessary). This does not remove the need for manual intervention in all failure modes: the documentation has been updated to address the most common issues. Note that there's repetition in the container specifications: this is somewhat deliberate. In a future cleanup, it's intended to reduce the duplication. 
Change-Id: I39829ba0c5894f8e549f9b83b416e6db4fafd96f --- ansible/roles/etcd/defaults/main.yml | 16 ++- ansible/roles/etcd/handlers/main.yml | 69 ++++++++++--- ansible/roles/etcd/tasks/bootstrap.yml | 25 +++++ .../roles/etcd/tasks/bootstrap_cluster.yml | 60 ++++++++++++ .../roles/etcd/tasks/bootstrap_services.yml | 55 +++++++++++ ansible/roles/etcd/tasks/deploy.yml | 2 + ansible/roles/etcd/tasks/lookup_cluster.yml | 26 +++++ ansible/roles/etcd/tasks/lookup_leader.yml | 41 ++++++++ .../etcd/tasks/remove_deleted_members.yml | 39 ++++++++ ansible/roles/etcd/tasks/restart_services.yml | 25 +++++ ansible/site.yml | 1 - doc/source/admin/etcd.rst | 97 +++++++++++++++++++ doc/source/admin/index.rst | 1 + doc/source/user/adding-and-removing-hosts.rst | 8 ++ etc/kolla/globals.yml | 7 ++ .../notes/managed-etcd-72fb2d3fbba516d9.yaml | 12 +++ tests/setup_gate.sh | 2 +- tests/templates/globals-default.j2 | 4 + 18 files changed, 471 insertions(+), 19 deletions(-) create mode 100644 ansible/roles/etcd/tasks/bootstrap.yml create mode 100644 ansible/roles/etcd/tasks/bootstrap_cluster.yml create mode 100644 ansible/roles/etcd/tasks/bootstrap_services.yml create mode 100644 ansible/roles/etcd/tasks/lookup_cluster.yml create mode 100644 ansible/roles/etcd/tasks/lookup_leader.yml create mode 100644 ansible/roles/etcd/tasks/remove_deleted_members.yml create mode 100644 ansible/roles/etcd/tasks/restart_services.yml create mode 100644 doc/source/admin/etcd.rst create mode 100644 releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml diff --git a/ansible/roles/etcd/defaults/main.yml b/ansible/roles/etcd/defaults/main.yml index c85af4be9a..2421f16faf 100644 --- a/ansible/roles/etcd/defaults/main.yml +++ b/ansible/roles/etcd/defaults/main.yml @@ -5,15 +5,18 @@ etcd_services: group: etcd enabled: true environment: + # KOLLA_BOOTSTRAP_STATUS is used to indicate whether the container should + # be recreated. 
Otherwise the kolla_container task doesn't detect that the + # environment has changed if variables are removed. + KOLLA_BOOTSTRAP_STATUS: "bootstrap completed" + ETCDCTL_API: "3" + ETCDCTL_ENDPOINTS: "{{ etcd_client_internal_endpoint }}" + ETCDCTL_WRITE_OUT: "json" ETCD_DATA_DIR: "/var/lib/etcd" ETCD_NAME: "{{ ansible_facts.hostname }}" ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}" ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}" - ETCD_INITIAL_ADVERTISE_PEER_URLS: "{{ etcd_peer_internal_endpoint }}" ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}" - ETCD_INITIAL_CLUSTER_TOKEN: "{{ etcd_cluster_token }}" - ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}" - ETCD_INITIAL_CLUSTER_STATE: "new" ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log" KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}" ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}" @@ -52,3 +55,8 @@ etcd_extra_volumes: "{{ default_extra_volumes }}" ############ etcd_client_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_client_port }}" etcd_peer_internal_endpoint: "{{ etcd_protocol }}://{{ api_interface_address | put_address_in_context('url') }}:{{ etcd_peer_port }}" + +################### +# Managing members +################### +etcd_remove_deleted_members: "no" diff --git a/ansible/roles/etcd/handlers/main.yml b/ansible/roles/etcd/handlers/main.yml index 1c397aa35e..5813fcfaf5 100644 --- a/ansible/roles/etcd/handlers/main.yml +++ b/ansible/roles/etcd/handlers/main.yml @@ -1,16 +1,59 @@ --- -- name: Restart etcd container - vars: - service_name: "etcd" - service: "{{ etcd_services[service_name] }}" - become: true - kolla_container: - action: 
"recreate_or_restart_container" - common_options: "{{ docker_common_options }}" - name: "{{ service.container_name }}" - image: "{{ service.image }}" - environment: "{{ service.environment }}" - volumes: "{{ service.volumes }}" - dimensions: "{{ service.dimensions }}" +- name: Bootstrap etcd on new cluster + include_tasks: 'bootstrap_cluster.yml' when: - kolla_action != "config" + listen: + - Bootstrap etcd cluster + +- name: Look up the cluster leader + include_tasks: 'lookup_leader.yml' + when: + - kolla_action != "config" + listen: + - Restart etcd container + - Bootstrap etcd services + - Bootstrap etcd cluster + - Check for deleted members + +- name: Bootstrap etcd on new services + include_tasks: 'bootstrap_services.yml' + when: + - groups.etcd_had_volume_False is defined + - inventory_hostname in groups.etcd_had_volume_False + - kolla_action != "config" + listen: + - Bootstrap etcd services + +- name: Rolling restart of etcd non-leaders + include_tasks: 'restart_services.yml' + when: + - inventory_hostname not in (groups.etcd_is_leader_True | default([])) + - groups.etcd.index(inventory_hostname) % 4 == item + - kolla_action != "config" + listen: + - Restart etcd container + - Bootstrap etcd services + - Bootstrap etcd cluster + loop: + - 0 + - 1 + - 2 + - 3 + +- name: Restart etcd leader + include_tasks: 'restart_services.yml' + when: + - inventory_hostname in (groups.etcd_is_leader_True | default([])) + - kolla_action != "config" + listen: + - Restart etcd container + - Bootstrap etcd services + - Bootstrap etcd cluster + +- name: Remove deleted members + include_tasks: 'remove_deleted_members.yml' + when: + - kolla_action != "config" + listen: + - Check for deleted members diff --git a/ansible/roles/etcd/tasks/bootstrap.yml b/ansible/roles/etcd/tasks/bootstrap.yml new file mode 100644 index 0000000000..eb0d00a20d --- /dev/null +++ b/ansible/roles/etcd/tasks/bootstrap.yml @@ -0,0 +1,25 @@ +--- +- import_tasks: lookup_cluster.yml + +# NOTE(jan.gutter): The 
following two tasks set facts that aren't really used. +# They serve the purpose to trigger the handlers for bootstrapping: +# If no etcd data volumes exist, bootstrap a new initial cluster. +# If some volumes exist, add the new nodes to an existing cluster. + +- name: Determine whether a new cluster needs bootstrapping + set_fact: + etcd_bootstrap_cluster: "{% for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}" + when: not (etcd_cluster_exists | bool) + changed_when: not (etcd_cluster_exists | bool) + notify: Bootstrap etcd cluster + +- name: Determine when new services need bootstrapping + set_fact: + etcd_bootstrap_services: "{% for host in groups['etcd_had_volume_False'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}" + when: + - etcd_cluster_exists | bool + - groups.etcd_had_volume_False is defined + changed_when: + - etcd_cluster_exists | bool + - groups.etcd_had_volume_False is defined + notify: Bootstrap etcd services diff --git a/ansible/roles/etcd/tasks/bootstrap_cluster.yml b/ansible/roles/etcd/tasks/bootstrap_cluster.yml new file mode 100644 index 0000000000..5c627f9e5d --- /dev/null +++ b/ansible/roles/etcd/tasks/bootstrap_cluster.yml @@ -0,0 +1,60 @@ +--- +- name: Bootstrapping etcd cluster + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + become: true + kolla_container: + action: "start_container" + common_options: "{{ docker_common_options }}" + environment: + KOLLA_BOOTSTRAP_STATUS: "bootstrap cluster" + ETCD_INITIAL_CLUSTER_STATE: "new" + ETCD_INITIAL_ADVERTISE_PEER_URLS: "{{ etcd_peer_internal_endpoint }}" + ETCD_INITIAL_CLUSTER_TOKEN: "{{ etcd_cluster_token }}" + ETCD_INITIAL_CLUSTER: "{% 
for host in groups['etcd'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }}{% if not loop.last %},{% endif %}{% endfor %}" + ETCDCTL_API: "3" + ETCDCTL_ENDPOINTS: "{{ etcd_client_internal_endpoint }}" + ETCDCTL_WRITE_OUT: "json" + ETCD_DATA_DIR: "/var/lib/etcd" + ETCD_NAME: "{{ ansible_facts.hostname }}" + ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}" + ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}" + ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}" + ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log" + KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}" + ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}" + ETCD_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}" + ETCD_PEER_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}" + ETCD_PEER_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}" + image: "{{ service.image }}" + name: "{{ service.container_name }}" + volumes: "{{ service.volumes }}" + dimensions: "{{ service.dimensions }}" + +- name: Wait for etcd service port liveness + wait_for: + host: "{{ api_interface_address }}" + port: "{{ etcd_client_port }}" + connect_timeout: 1 + timeout: 60 + register: check_etcd_port + until: check_etcd_port is success + retries: 10 + delay: 6 + +- name: Wait for etcd endpoints to be healthy + become: true + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + command: >- + {{ kolla_container_engine }} exec {{ service.container_name }} + etcdctl endpoint health + changed_when: false + register: result + until: + - result is success + - ((result.stdout | from_json | first)['health'] | default(False) | bool) + retries: 10 + delay: 6 diff --git a/ansible/roles/etcd/tasks/bootstrap_services.yml 
b/ansible/roles/etcd/tasks/bootstrap_services.yml new file mode 100644 index 0000000000..05e18c4971 --- /dev/null +++ b/ansible/roles/etcd/tasks/bootstrap_services.yml @@ -0,0 +1,55 @@ +--- +- name: Add new member to etcd cluster + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + become: true + command: >- + {{ kolla_container_engine }} exec {{ service.container_name }} + etcdctl member add {{ ansible_facts.hostname }} + --peer-urls={{ etcd_protocol }}://{{ 'api' | kolla_address(inventory_hostname) | put_address_in_context('url') }}:{{ etcd_peer_port }} + delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}" + +- name: Bootstrapping etcd containers + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + become: true + kolla_container: + action: "start_container" + common_options: "{{ docker_common_options }}" + environment: + KOLLA_BOOTSTRAP_STATUS: "bootstrap service" + ETCD_INITIAL_CLUSTER_STATE: "existing" + ETCD_INITIAL_ADVERTISE_PEER_URLS: "{{ etcd_peer_internal_endpoint }}" + ETCD_INITIAL_CLUSTER_TOKEN: "{{ etcd_cluster_token }}" + ETCD_INITIAL_CLUSTER: "{% for host in groups['etcd_had_volume_True'] %}{{ hostvars[host].ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(host) | put_address_in_context('url') }}:{{ etcd_peer_port }},{% endfor %}{{ ansible_facts.hostname }}={{ etcd_protocol }}://{{ 'api' | kolla_address(inventory_hostname) | put_address_in_context('url') }}:{{ etcd_peer_port }}" + ETCDCTL_API: "3" + ETCDCTL_ENDPOINTS: "{{ etcd_client_internal_endpoint }}" + ETCDCTL_WRITE_OUT: "json" + ETCD_DATA_DIR: "/var/lib/etcd" + ETCD_NAME: "{{ ansible_facts.hostname }}" + ETCD_ADVERTISE_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}" + ETCD_LISTEN_CLIENT_URLS: "{{ etcd_client_internal_endpoint }}" + ETCD_LISTEN_PEER_URLS: "{{ etcd_peer_internal_endpoint }}" + ETCD_OUT_FILE: "/var/log/kolla/etcd/etcd.log" + KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}" + 
ETCD_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}" + ETCD_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}" + ETCD_PEER_CERT_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-cert.pem{% endif %}" + ETCD_PEER_KEY_FILE: "{% if etcd_enable_tls | bool %}/etc/etcd/certs/etcd-key.pem{% endif %}" + image: "{{ service.image }}" + name: "{{ service.container_name }}" + volumes: "{{ service.volumes }}" + dimensions: "{{ service.dimensions }}" + +- name: Wait for etcd service port liveness + wait_for: + host: "{{ api_interface_address }}" + port: "{{ etcd_client_port }}" + connect_timeout: 1 + timeout: 60 + register: check_etcd_client_port + until: check_etcd_client_port is success + retries: 10 + delay: 6 diff --git a/ansible/roles/etcd/tasks/deploy.yml b/ansible/roles/etcd/tasks/deploy.yml index 49edff81e3..d0b36cb78b 100644 --- a/ansible/roles/etcd/tasks/deploy.yml +++ b/ansible/roles/etcd/tasks/deploy.yml @@ -3,5 +3,7 @@ - import_tasks: check-containers.yml +- import_tasks: bootstrap.yml + - name: Flush handlers meta: flush_handlers diff --git a/ansible/roles/etcd/tasks/lookup_cluster.yml b/ansible/roles/etcd/tasks/lookup_cluster.yml new file mode 100644 index 0000000000..bd95d573b0 --- /dev/null +++ b/ansible/roles/etcd/tasks/lookup_cluster.yml @@ -0,0 +1,26 @@ +--- +- name: Ensure etcd volume + become: true + kolla_container: + action: "create_volume" + common_options: "{{ docker_common_options }}" + name: "kolla_etcd" + register: etcd_volume + +# NOTE(jan.gutter): If the play is interrupted before properly bootstrapping, +# we will incorrectly assume that an etcd cluster exists. This likely requires +# manual intervention to unwedge. If a volume exists we must assume there's +# data on it. 
+ +- name: Divide hosts by their etcd volume availability + group_by: + key: etcd_had_volume_{{ etcd_volume is not changed }} + changed_when: false + +- name: Establish whether the cluster has already existed + set_fact: + etcd_cluster_exists: "{{ groups.etcd_had_volume_True is defined }}" + changed_when: + - etcd_remove_deleted_members | bool + - groups.etcd_had_volume_True is defined + notify: Check for deleted members diff --git a/ansible/roles/etcd/tasks/lookup_leader.yml b/ansible/roles/etcd/tasks/lookup_leader.yml new file mode 100644 index 0000000000..aebd851a14 --- /dev/null +++ b/ansible/roles/etcd/tasks/lookup_leader.yml @@ -0,0 +1,41 @@ +--- +# NOTE(jan.gutter): These tasks assume a cluster is running +- name: Check for the etcd leader + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + become: true + # NOTE(jan.gutter): We need to set the ETCD environment vars here to + # handle an upgrade scenario from older etcd containers. These can be + # removed once the new workflow has been in place for a cycle or two. 
+ command: >- + {{ kolla_container_engine }} exec + -e ETCDCTL_API=3 + -e ETCDCTL_ENDPOINTS="{{ etcd_client_internal_endpoint }}" + -e ETCDCTL_WRITE_OUT="json" + {{ service.container_name }} + etcdctl endpoint status + changed_when: false + when: + - inventory_hostname in (groups.etcd_had_volume_True | default([])) + register: etcd_endpoint_status_result + +- name: Divide hosts by their etcd leader status + vars: + etcd_endpoint_status: >- + {{ etcd_endpoint_status_result.stdout | default('[]') | from_json }} + etcd_member_id: >- + {{ etcd_endpoint_status[0]['Status']['header']['member_id'] + | default('') }} + etcd_leader_id: >- + {{ etcd_endpoint_status[0]['Status']['leader'] + | default('none') }} + group_by: + key: etcd_is_leader_{{ etcd_member_id == etcd_leader_id }} + changed_when: false + +- name: Set the etcd cluster leader + set_fact: + etcd_cluster_leader: "{{ groups.etcd_is_leader_True | sort | first }}" + when: groups.etcd_is_leader_True is defined + changed_when: false diff --git a/ansible/roles/etcd/tasks/remove_deleted_members.yml b/ansible/roles/etcd/tasks/remove_deleted_members.yml new file mode 100644 index 0000000000..188498e5cb --- /dev/null +++ b/ansible/roles/etcd/tasks/remove_deleted_members.yml @@ -0,0 +1,39 @@ +--- +- name: List the etcd members + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + become: true + command: >- + {{ kolla_container_engine }} exec {{ service.container_name }} + etcdctl member list + changed_when: false + run_once: true + delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}" + register: etcd_member_list_result + +- name: Remove deleted members from the etcd cluster + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + etcd_members_from_inventory: >- + {{ groups['etcd'] + | map('extract', hostvars, 'ansible_facts') + | map(attribute='hostname') + | list }} + etcd_deleted_members: >- + {{ etcd_member_list_result.stdout | from_json + | 
json_query('members[].name') + | difference(etcd_members_from_inventory) }} + etcd_member_id: >- + {{ etcd_member_list_result.stdout | from_json + | json_query('members[].{key: name, value: ID}') | items2dict }} + become: true + command: >- + {{ kolla_container_engine }} exec {{ service.container_name }} + etcdctl member remove {{ '%x' % etcd_member_id[etcd_deleted_member] }} + run_once: true + delegate_to: "{{ etcd_cluster_leader | default(groups[service.group][0]) }}" + loop: "{{ etcd_deleted_members }}" + loop_control: + loop_var: etcd_deleted_member diff --git a/ansible/roles/etcd/tasks/restart_services.yml b/ansible/roles/etcd/tasks/restart_services.yml new file mode 100644 index 0000000000..b3c4de5264 --- /dev/null +++ b/ansible/roles/etcd/tasks/restart_services.yml @@ -0,0 +1,25 @@ +--- +- name: Restart etcd container + vars: + service_name: "etcd" + service: "{{ etcd_services[service_name] }}" + become: true + kolla_container: + action: "recreate_or_restart_container" + common_options: "{{ docker_common_options }}" + name: "{{ service.container_name }}" + image: "{{ service.image }}" + volumes: "{{ service.volumes }}" + dimensions: "{{ service.dimensions }}" + environment: "{{ service.environment }}" + +- name: Wait for etcd service port liveness + wait_for: + host: "{{ api_interface_address }}" + port: "{{ etcd_client_port }}" + connect_timeout: 1 + timeout: 60 + register: check_etcd_client_port + until: check_etcd_client_port is success + retries: 10 + delay: 6 diff --git a/ansible/site.yml b/ansible/site.yml index 144b608cd5..04167fc9ef 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -458,7 +458,6 @@ hosts: - etcd - '&enable_etcd_True' - serial: '{{ kolla_serial|default("0") }}' roles: - { role: etcd, tags: etcd } diff --git a/doc/source/admin/etcd.rst b/doc/source/admin/etcd.rst new file mode 100644 index 0000000000..35c6f31624 --- /dev/null +++ b/doc/source/admin/etcd.rst @@ -0,0 +1,97 @@ +.. 
etcd: + +============= +Managing etcd +============= + +Kolla Ansible can manage the lifecycle of an etcd cluster and supports the +following operations: + +* Bootstrapping a clean multi-node etcd cluster +* Adding a new member to the etcd cluster +* Optionally, automatically removing a deleted node from the etcd cluster. + +It is highly recommended to read the operator documentation for the version +of etcd deployed in the cluster. + +.. note:: + + Once an etcd cluster is bootstrapped, the etcd service takes most of its + configuration from the etcd database itself. + + This pattern is very different from many other Kolla Ansible services, and + is a source of confusion for operators unfamiliar with etcd. + +Cluster vs Node Bootstrapping +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Kolla Ansible distinguishes between two forms of bootstrapping in an etcd +cluster: + +* Bootstrapping multiple nodes at the same time to bring up a new cluster +* Bootstrapping a single node to add it to an existing cluster + +These correspond to the `new` and `existing` parameters for +`ETCD_INITIAL_CLUSTER_STATE` in the upstream documentation. Once an etcd node +has completed bootstrap, the bootstrap configuration is ignored, even if it is +changed. + +Kolla Ansible will decide to perform a new cluster bootstrap if it detects that +there is no existing data on the etcd nodes. Otherwise it assumes that there is +a healthy etcd cluster and it will add a new node to it. + +Forcing Bootstrapping +~~~~~~~~~~~~~~~~~~~~~ + +Kolla Ansible looks for the `kolla_etcd` volume on the node. If this volume +is available, it assumes that the bootstrap process has run on the node and +the volume contains the required config. + +However, if the process was interrupted (externally, or by an error), this +volume might be misconfigured. In order to prevent data loss, manual +intervention is required. + +Before retriggering bootstrap make sure that there is no valuable data on the +volume. 
This could be because the node was not in service, or because the data +is persisted elsewhere. + +To retrigger a bootstrap (for either the cluster, or for a single node), +remove the volume from all affected nodes: + +``docker volume rm kolla_etcd`` + +Rerunning Kolla Ansible will then trigger the appropriate workflow and either +a blank cluster will be bootstrapped, or an empty member will be added to +the existing cluster. + +Manual Commands +~~~~~~~~~~~~~~~ + +In order to manage etcd manually, the ``etcdctl`` command can be used inside +the `etcd` container. This command has been set up with the appropriate +environment variables for integrating with automation. + +``etcdctl`` is configured with json output by default: + +.. code-block:: console + + # list cluster members in a human-readable table + docker exec -it etcd etcdctl -w table member list + +Removing Dead Nodes +~~~~~~~~~~~~~~~~~~~ + +If ``globals.yml`` has the value ``etcd_remove_deleted_members: "yes"`` then +etcd nodes that are not in the inventory will be removed from the etcd cluster. + +Any errors in the inventory can therefore cause unintended removal. + +To manually remove a dead node from the etcd cluster, use the following +commands: + +.. 
code-block:: console + + # list cluster members and identify dead member + docker exec -it etcd etcdctl -w table member list + # remove dead member + docker exec -it etcd etcdctl member remove MEMBER_ID_IN_HEX diff --git a/doc/source/admin/index.rst b/doc/source/admin/index.rst index 720b663c3f..f668312514 100644 --- a/doc/source/admin/index.rst +++ b/doc/source/admin/index.rst @@ -9,5 +9,6 @@ Admin Guides tls acme mariadb-backup-and-restore + etcd production-architecture-guide deployment-philosophy diff --git a/doc/source/user/adding-and-removing-hosts.rst b/doc/source/user/adding-and-removing-hosts.rst index 60fd396d31..4e244e995b 100644 --- a/doc/source/user/adding-and-removing-hosts.rst +++ b/doc/source/user/adding-and-removing-hosts.rst @@ -173,6 +173,14 @@ For each host, clean up its services: .. _removing-existing-compute-nodes: +If the node is also running the `etcd` service, set +``etcd_remove_deleted_members: "yes"`` in `globals.yml` to automatically +remove nodes from the `etcd` cluster that have been removed from the inventory. + +Alternatively the `etcd` members can be removed manually with `etcdctl`. For +more details, please consult the `runtime reconfiguration` documentation +section for the version of etcd in operation. + Removing existing compute nodes ------------------------------- diff --git a/etc/kolla/globals.yml b/etc/kolla/globals.yml index 3eec3ba452..92d1d7147a 100644 --- a/etc/kolla/globals.yml +++ b/etc/kolla/globals.yml @@ -903,3 +903,10 @@ workaround_ansible_issue_8743: yes # this is UDP port #hacluster_corosync_port: 5405 + +############## +# etcd options +############## +# If `etcd_remove_deleted_members` is enabled, Kolla Ansible will automatically +# remove etcd members from the cluster that are no longer in the inventory. 
+#etcd_remove_deleted_members: "no" diff --git a/releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml b/releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml new file mode 100644 index 0000000000..6c8a91a54a --- /dev/null +++ b/releasenotes/notes/managed-etcd-72fb2d3fbba516d9.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + The `etcd` tooling has been updated to better serialize restarts when + applying configuration or updates. Previously minor outages might occur + since all services were restarted in the same task. + - | + The `etcd` tooling has been updated to handle adding and removing nodes. + Previously this was an undocumented manual process and required creating + service containers. Operators can refer to the + `etcd admin guide <https://docs.openstack.org/kolla-ansible/latest/admin/etcd.html>`__ + for more details. diff --git a/tests/setup_gate.sh b/tests/setup_gate.sh index 995edbe151..3a1a8f0ce2 100755 --- a/tests/setup_gate.sh +++ b/tests/setup_gate.sh @@ -52,7 +52,7 @@ function prepare_images { fi if [[ $SCENARIO == "cephadm" ]]; then - GATE_IMAGES+=",^cinder" + GATE_IMAGES+=",^cinder,^etcd" fi if [[ $SCENARIO == "cells" ]]; then diff --git a/tests/templates/globals-default.j2 b/tests/templates/globals-default.j2 index 73029ab62b..4ba6877a84 100644 --- a/tests/templates/globals-default.j2 +++ b/tests/templates/globals-default.j2 @@ -77,6 +77,7 @@ openstack_tag_suffix: "{{ docker_image_tag_suffix }}" enable_zun: "yes" enable_kuryr: "yes" enable_etcd: "yes" +etcd_remove_deleted_members: "yes" docker_configure_for_zun: "yes" containerd_configure_for_zun: "yes" enable_cinder: "yes" @@ -132,6 +133,9 @@ enable_cinder: "yes" glance_backend_ceph: "yes" cinder_backend_ceph: "yes" nova_backend_ceph: "yes" +# Internal etcd +enable_etcd: "yes" +etcd_remove_deleted_members: "yes" enable_ceph_rgw: "yes" ceph_rgw_hosts: