Fix multiple issues with MariaDB handling

These affected both deploy (and reconfigure) and upgrade,
resulting in WSREP issues, failed deploys or the need to
recover the cluster.

This patch makes sure Kolla Ansible does not abruptly terminate
nodes, which would break the cluster.
This is achieved by cleaner separation between stages
(bootstrap, restart current, deploy new) and 3 phases
for restarts (to keep the quorum).

Upgrade actions, which operate on a healthy cluster,
were moved to their own section.

Service restart was refactored.

We no longer rely on the master/slave distinction as
all nodes are masters in Galera.

Closes-bug: #1857908
Closes-bug: #1859145
Change-Id: I83600c69141714fc412df0976f49019a857655f5
This commit is contained in:
Radosław Piliszek 2020-01-03 11:20:00 +01:00
parent ac62b560ff
commit 9f14ad651a
9 changed files with 182 additions and 210 deletions

View File

@ -17,16 +17,10 @@
restart_policy: no restart_policy: no
volumes: "{{ service.volumes }}" volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}" dimensions: "{{ service.dimensions }}"
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster listen: Bootstrap MariaDB cluster
notify:
- restart mariadb
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed # NOTE(yoctozepto): We have to loop this to avoid breaking on connection resets
# https://github.com/ansible/ansible-modules-core/issues/2788 - name: Wait for first MariaDB service port liveness
- name: wait first mariadb container
wait_for: wait_for:
host: "{{ api_interface_address }}" host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}" port: "{{ mariadb_port }}"
@ -37,157 +31,60 @@
until: check_mariadb_port is success until: check_mariadb_port is success
retries: 10 retries: 10
delay: 6 delay: 6
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster listen: Bootstrap MariaDB cluster
- name: Wait for MariaDB to become operational - name: Wait for first MariaDB service to sync WSREP
become: true become: true
command: >- command: >-
docker exec {{ mariadb_service.container_name }} docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }} mysql -uroot -p{{ database_password }}
--silent --skip-column-names --silent --skip-column-names
-e 'SHOW STATUS LIKE "wsrep_evs_state"' -e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
changed_when: false changed_when: false
register: result register: result
until: '"OPERATIONAL" in result.stdout' until: result.stdout == "wsrep_local_state_comment\tSynced"
retries: 10 retries: 10
delay: 6 delay: 6
no_log: true no_log: true
when:
- bootstrap_host is defined
- bootstrap_host == inventory_hostname
listen: Bootstrap MariaDB cluster listen: Bootstrap MariaDB cluster
- name: restart slave mariadb - name: Creating haproxy mysql user
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true become: true
kolla_docker: kolla_toolbox:
action: "recreate_or_restart_container" module_name: mysql_user
common_options: "{{ docker_common_options }}" module_args:
name: "{{ service.container_name }}" login_host: "{{ api_interface_address }}"
image: "{{ service.image }}" login_port: "{{ mariadb_port }}"
volumes: "{{ service.volumes }}" login_user: "{{ database_user }}"
dimensions: "{{ service.dimensions }}" login_password: "{{ database_password }}"
name: "haproxy"
password: ""
host: "%"
priv: "*.*:USAGE"
listen: Bootstrap MariaDB cluster
- name: Restart MariaDB on existing cluster members
include_tasks: 'restart_services.yml'
when: when:
- groups.mariadb_port_alive_True is defined
- inventory_hostname in groups.mariadb_port_alive_True
- groups.mariadb_port_alive_True.index(inventory_hostname) % 3 == item
- kolla_action != "config" - kolla_action != "config"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb listen: restart mariadb
loop:
- 0
- 1
- 2
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed - name: Start MariaDB on new nodes
# https://github.com/ansible/ansible-modules-core/issues/2788 include_tasks: 'restart_services.yml'
- name: wait for slave mariadb
wait_for:
host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}"
connect_timeout: 1
timeout: 60
search_regex: "MariaDB"
register: check_mariadb_port
until: check_mariadb_port is success
retries: 10
delay: 6
when: when:
- bootstrap_host is not defined or bootstrap_host != inventory_hostname
- groups.mariadb_port_alive_False is defined
- inventory_hostname in groups.mariadb_port_alive_False
- kolla_action != "config" - kolla_action != "config"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb listen: restart mariadb
- name: run upgrade on slave - name: Ensure MariaDB is running normally on bootstrap host
vars: include_tasks: 'restart_services.yml'
service_name: "mariadb" listen: Bootstrap MariaDB cluster
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "start_container"
common_options: "{{ docker_common_options }}"
detach: False
dimensions: "{{ service.dimensions }}"
environment:
KOLLA_UPGRADE:
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
DB_HOST: "{{ api_interface_address }}"
DB_PORT: "{{ mariadb_port }}"
DB_ROOT_PASSWORD: "{{ database_password }}"
image: "{{ service.image }}"
labels:
UPGRADE:
name: "upgrade_mariadb"
restart_policy: no
volumes: "{{ service.volumes }}"
no_log: true
when:
- kolla_action == "upgrade"
- inventory_hostname != master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: restart master mariadb
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "recreate_or_restart_container"
common_options: "{{ docker_common_options }}"
name: "{{ service.container_name }}"
image: "{{ service.image }}"
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
# TODO(jeffrey4l), remove the task check when the wait_for bug is fixed
# https://github.com/ansible/ansible-modules-core/issues/2788
- name: Waiting for master mariadb
wait_for:
host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}"
connect_timeout: 1
timeout: 60
search_regex: "MariaDB"
register: check_mariadb_port
until: check_mariadb_port is success
retries: 10
delay: 6
when:
- kolla_action != "config"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb
- name: run upgrade on master
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "start_container"
common_options: "{{ docker_common_options }}"
detach: False
dimensions: "{{ service.dimensions }}"
environment:
KOLLA_UPGRADE:
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
DB_HOST: "{{ api_interface_address }}"
DB_PORT: "{{ mariadb_port }}"
DB_ROOT_PASSWORD: "{{ database_password }}"
image: "{{ service.image }}"
labels:
UPGRADE:
name: "upgrade_mariadb"
restart_policy: no
volumes: "{{ service.volumes }}"
no_log: true
when:
- kolla_action == "upgrade"
- inventory_hostname == master_host
- not mariadb_recover | default(false)
listen: restart mariadb

View File

@ -1,13 +1,9 @@
--- ---
- name: Set a fact about the master host
set_fact:
master_host: "{{ groups['mariadb'][0] }}"
- include_tasks: lookup_cluster.yml - include_tasks: lookup_cluster.yml
- include_tasks: bootstrap_cluster.yml - include_tasks: bootstrap_cluster.yml
when: when:
- not has_cluster | bool - not mariadb_cluster_exists
- inventory_hostname == groups['mariadb'][0] - inventory_hostname == groups['mariadb'][0]
- include_tasks: recover_cluster.yml - include_tasks: recover_cluster.yml

View File

@ -1,6 +1,5 @@
--- ---
- name: Set a fact about the master host
set_fact:
master_host: "{{ groups['mariadb'][0] }}"
- import_tasks: check-containers.yml - import_tasks: check-containers.yml
# NOTE(yoctozepto): handlers prerequisite
- import_tasks: lookup_cluster.yml

View File

@ -1,25 +1,5 @@
--- ---
- name: Cleaning up temp file on localhost - name: Create MariaDB volume
file:
path: /tmp/kolla_mariadb_cluster
state: absent
delegate_to: localhost
changed_when: False
check_mode: no
run_once: True
# NOTE(mnasiadka): Due to the way that we are setting fact has_cluster - content needs to be ''
- name: Creating temp file on localhost
copy:
content: ''
dest: /tmp/kolla_mariadb_cluster
mode: 0644
delegate_to: localhost
changed_when: False
check_mode: no
run_once: True
- name: Creating mariadb volume
become: true become: true
kolla_docker: kolla_docker:
action: "create_volume" action: "create_volume"
@ -27,25 +7,59 @@
name: "mariadb" name: "mariadb"
register: mariadb_volume register: mariadb_volume
- name: Writing hostname of host with existing cluster files to temp file - name: Divide hosts by their MariaDB volume availability
copy: group_by:
content: "{{ ansible_hostname }}" key: mariadb_had_volume_{{ mariadb_volume is not changed }}
dest: /tmp/kolla_mariadb_cluster
mode: 0644
delegate_to: localhost
changed_when: False
check_mode: no
when: mariadb_volume is not changed
- name: Registering host from temp file - name: Establish whether the cluster has already existed
set_fact: set_fact:
has_cluster: "{{ lookup('file', '/tmp/kolla_mariadb_cluster') | length > 0 }}" mariadb_cluster_exists: "{{ groups.mariadb_had_volume_True is defined }}"
- name: Cleaning up temp file on localhost - block:
file: - name: Check MariaDB service port liveness
path: /tmp/kolla_mariadb_cluster wait_for:
state: absent host: "{{ api_interface_address }}"
delegate_to: localhost port: "{{ mariadb_port }}"
changed_when: False connect_timeout: 1
check_mode: no timeout: 10
run_once: True search_regex: "MariaDB"
register: check_mariadb_port_liveness
ignore_errors: yes
- name: Divide hosts by their MariaDB service port liveness
group_by:
key: mariadb_port_alive_{{ check_mariadb_port_liveness is success }}
- block:
- name: Check MariaDB service WSREP sync status
become: true
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
changed_when: false
register: check_mariadb_sync_status
no_log: true
# NOTE(yoctozepto): this is extracted separately to properly escape
# the TAB character which likes to go wrong due to interaction between
# Python/Ansible/Jinja2/YAML, the way below works
- name: Extract MariaDB service WSREP sync status
set_fact:
mariadb_sync_status: "{{ check_mariadb_sync_status.stdout.split('\t')[1] }}"
- name: Divide hosts by their MariaDB service WSREP sync status
group_by:
key: mariadb_sync_status_{{ mariadb_sync_status }}
- name: Fail when MariaDB service is not synced
fail:
msg: MariaDB service is not synced. Please wait for WSREP sync before proceeding.
when:
- groups.mariadb_sync_status_Synced is not defined or
inventory_hostname not in groups.mariadb_sync_status_Synced
when:
- groups.mariadb_port_alive_True is defined
- inventory_hostname in groups.mariadb_port_alive_True
when: not mariadb_recover | default(False)

View File

@ -1,7 +1,7 @@
--- ---
- fail: - fail:
msg: "MariaDB cluster was not found. Is your inventory correct?" msg: "MariaDB cluster was not found. Is your inventory correct?"
when: not has_cluster | bool when: not mariadb_cluster_exists
- name: Cleaning up temp file on mariadb hosts - name: Cleaning up temp file on mariadb hosts
file: file:
@ -97,8 +97,6 @@
- set_fact: - set_fact:
bootstrap_host: "{{ mariadb_recover_inventory_name }}" bootstrap_host: "{{ mariadb_recover_inventory_name }}"
master_host: "{{ mariadb_recover_inventory_name }}"
changed_when: true
- name: Copying grastate.dat file from MariaDB container in bootstrap host - name: Copying grastate.dat file from MariaDB container in bootstrap host
become: true become: true

View File

@ -1,19 +1,4 @@
--- ---
- name: Creating haproxy mysql user
become: true
kolla_toolbox:
module_name: mysql_user
module_args:
login_host: "{{ api_interface_address }}"
login_port: "{{ mariadb_port }}"
login_user: "{{ database_user }}"
login_password: "{{ database_password }}"
name: "haproxy"
password: ""
host: "%"
priv: "*.*:USAGE"
run_once: True
- import_tasks: wait_for_loadbalancer.yml - import_tasks: wait_for_loadbalancer.yml
- name: Creating the Mariabackup database - name: Creating the Mariabackup database
@ -65,7 +50,3 @@
run_once: True run_once: True
when: when:
- enable_mariabackup | bool - enable_mariabackup | bool
- name: Cleaning up facts
set_fact:
delegate_host: "bootstraped"

View File

@ -0,0 +1,46 @@
---
- name: Restart MariaDB container
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "recreate_or_restart_container"
common_options: "{{ docker_common_options }}"
name: "{{ service.container_name }}"
image: "{{ service.image }}"
volumes: "{{ service.volumes }}"
dimensions: "{{ service.dimensions }}"
# NOTE(yoctozepto): We have to loop this to avoid breaking on connection resets
- name: Wait for MariaDB service port liveness
wait_for:
host: "{{ api_interface_address }}"
port: "{{ mariadb_port }}"
connect_timeout: 1
timeout: 60
search_regex: "MariaDB"
register: check_mariadb_port
until: check_mariadb_port is success
retries: 10
delay: 6
- name: Wait for MariaDB service to sync WSREP
become: true
command: >-
docker exec {{ mariadb_service.container_name }}
mysql -uroot -p{{ database_password }}
--silent --skip-column-names
-e 'SHOW STATUS LIKE "wsrep_local_state_comment"'
changed_when: false
register: result
until: result.stdout == "wsrep_local_state_comment\tSynced"
retries: 10
delay: 6
no_log: true
when:
# NOTE(yoctozepto): we don't want to wait for new nodes to fully sync
# with an existing cluster as this could take time
- not mariadb_cluster_exists or
(groups.mariadb_port_alive_True is defined and
inventory_hostname in groups.mariadb_port_alive_True)

View File

@ -1,2 +1,26 @@
--- ---
- include_tasks: deploy.yml - include_tasks: deploy.yml
- name: Run upgrade in MariaDB container
vars:
service_name: "mariadb"
service: "{{ mariadb_services[service_name] }}"
become: true
kolla_docker:
action: "start_container"
common_options: "{{ docker_common_options }}"
detach: False
dimensions: "{{ service.dimensions }}"
environment:
KOLLA_UPGRADE:
KOLLA_CONFIG_STRATEGY: "{{ config_strategy }}"
DB_HOST: "{{ api_interface_address }}"
DB_PORT: "{{ mariadb_port }}"
DB_ROOT_PASSWORD: "{{ database_password }}"
image: "{{ service.image }}"
labels:
UPGRADE:
name: "upgrade_mariadb"
restart_policy: no
volumes: "{{ service.volumes }}"
no_log: true

View File

@ -0,0 +1,17 @@
---
fixes:
- |
Fixes MariaDB issues in multinode scenarios which affected
deployment, reconfiguration, upgrade and Galera cluster resizing.
They were usually manifested by WSREP issues in various places
    and could lead to the need to recover the Galera cluster.
    Note these issues were due to how MariaDB was handled during
    Kolla Ansible runs and did not affect the Galera cluster during
    normal operations unless MariaDB was later touched by Kolla Ansible.
Users wishing to run actions on their Galera clusters using
Kolla Ansible are strongly advised to update.
For details please see the following Launchpad bug records:
`bug 1857908
<https://bugs.launchpad.net/kolla-ansible/+bug/1857908>`__ and
`bug 1859145
<https://bugs.launchpad.net/kolla-ansible/+bug/1859145>`__.