diff --git a/releasenotes/notes/mariadb-rolling-upgrades-323510425c3c7751.yaml b/releasenotes/notes/mariadb-rolling-upgrades-323510425c3c7751.yaml new file mode 100644 index 0000000000..81d8c596a2 --- /dev/null +++ b/releasenotes/notes/mariadb-rolling-upgrades-323510425c3c7751.yaml @@ -0,0 +1,8 @@ +--- +upgrade: + - During upgrades, container and service restarts for the mariadb/galera + cluster were being triggered multiple times and causing the cluster to + become unstable and often unrecoverable. This situation has been improved + immensely, and we now have tight control such that restarts of the galera + containers only need to happen once, and are done so in a controlled, + predictable and repeatable way. diff --git a/scripts/run-upgrade.sh b/scripts/run-upgrade.sh index 5b1569408f..524e0872a8 100755 --- a/scripts/run-upgrade.sh +++ b/scripts/run-upgrade.sh @@ -145,11 +145,24 @@ function main { RUN_TASKS+=("${UPGRADE_PLAYBOOKS}/deploy-config-changes.yml") RUN_TASKS+=("${UPGRADE_PLAYBOOKS}/user-secrets-adjustment.yml") RUN_TASKS+=("${UPGRADE_PLAYBOOKS}/db-collation-alter.yml") - RUN_TASKS+=("setup-hosts.yml --limit '!galera_all[0]'") - RUN_TASKS+=("lxc-containers-create.yml --limit galera_all[0]") + # we don't want to trigger galera container restarts yet + RUN_TASKS+=("setup-hosts.yml --limit '!galera_all'") + # add new container config to galera containers but don't restart + RUN_TASKS+=("lxc-containers-create.yml -e 'lxc_container_allow_restarts=false' --limit galera_all") + # rebuild the repo servers + RUN_TASKS+=("repo-install.yml") RUN_TASKS+=("${UPGRADE_PLAYBOOKS}/repo-server-pip-conf-removal.yml") RUN_TASKS+=("${UPGRADE_PLAYBOOKS}/old-hostname-compatibility.yml") - RUN_TASKS+=("setup-infrastructure.yml -e 'galera_upgrade=true' -e 'rabbitmq_upgrade=true'") + # explicitly perform mariadb upgrade + RUN_TASKS+=("galera-install.yml -e 'galera_upgrade=true'") + # explicitly perform controlled galera cluster restart + 
RUN_TASKS+=("${UPGRADE_PLAYBOOKS}/galera-cluster-rolling-restart.yml") + # individually run each of the remaining plays from setup-infrastructure + RUN_TASKS+=("haproxy-install.yml") + RUN_TASKS+=("memcached-install.yml") + RUN_TASKS+=("rabbitmq-install.yml -e 'rabbitmq_upgrade=true'") + RUN_TASKS+=("utility-install.yml") + RUN_TASKS+=("rsyslog-install.yml") RUN_TASKS+=("${UPGRADE_PLAYBOOKS}/memcached-flush.yml") RUN_TASKS+=("setup-openstack.yml") # Run the tasks in order diff --git a/scripts/upgrade-utilities/playbooks/galera-cluster-rolling-restart.yml b/scripts/upgrade-utilities/playbooks/galera-cluster-rolling-restart.yml new file mode 100644 index 0000000000..e125b99d13 --- /dev/null +++ b/scripts/upgrade-utilities/playbooks/galera-cluster-rolling-restart.yml @@ -0,0 +1,59 @@ +--- +# Copyright 2016, Rackspace US, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Rolling restart of the mariadb/galera cluster.
#
# The play walks through the galera_all group one host at a time
# (serial: 1) and aborts as soon as any host fails
# (max_fail_percentage: 0). For each member it stops mariadb, stops and
# starts the LXC container from its physical host, and then, in
# post_tasks, waits until the node answers on port 3306 and reports the
# "Synced" WSREP state before the next member is touched.
#
# NOTE(review): physical_host is assumed to be supplied by the inventory
# for every galera container - confirm.
- name: Gracefully restart mariadb/galera cluster
  hosts: galera_all
  serial: 1
  max_fail_percentage: 0
  gather_facts: false
  user: root
  tasks:
    - name: Stop mariadb
      service:
        name: mysql
        state: stopped
      register: mariadb_stop
      # retries/delay are only guaranteed to apply when paired with an
      # until condition, so retry explicitly until the stop succeeds.
      until: not (mariadb_stop.failed | default(false))
      retries: 5
      delay: 10

    # The container is addressed by its inventory name, but the lxc
    # operations have to run on the host that owns it.
    - name: Stop container
      lxc_container:
        name: "{{ inventory_hostname }}"
        state: "stopped"
      delegate_to: "{{ physical_host }}"

    - name: Start container
      lxc_container:
        name: "{{ inventory_hostname }}"
        state: "started"
      delegate_to: "{{ physical_host }}"

  post_tasks:
    # Probed from the deployment host: the port must be open and the
    # data read from it must match "MariaDB".
    - name: Wait for mariadb port 3306 to be available
      local_action:
        module: wait_for
        port: "3306"
        host: "{{ ansible_ssh_host | default(inventory_hostname) }}"
        search_regex: MariaDB
      register: mariadb_port
      # Same reasoning as for "Stop mariadb": make the retries effective.
      until: not (mariadb_port.failed | default(false))
      retries: 10
      delay: 10

    # Poll once per second, for up to 60 attempts, until the node
    # reports wsrep_local_state_comment as Synced.
    - name: Check that WSREP is ready and Synced
      shell: >-
        /usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf
        extended-status | grep -E '(wsrep_local_state_comment)'
      register: mysql_ready
      until:
        - mysql_ready.rc == 0
        - "'Synced' in mysql_ready.stdout"
      retries: 60
      delay: 1
      # Read-only status query; never report it as a change.
      changed_when: false