8827e4f7f1
Add better idempotency checks on editing the pacemaker resources and fetching and re-tagging new images, which prevents the upgrade from failing. The latest status after staged upgrade looks like this: Online: [ controller-0 controller-1 controller-2 ] GuestOnline: [ galera-bundle-0@controller-0 galera-bundle-1@controller-1 galera-bundle-2@controller-2 rabbitmq-bundle-0@controller-0 rabbitmq-bundle-1@controller-1 redis-bundle-0@controller-0 redis-bundle-1@controller-1 ] Full list of resources: podman container set: galera-bundle [brew-pulp-docker01.web.prod.ext.phx2.redhat.com:8888/rhosp15/openstack-mariadb:pcmklatest] galera-bundle-0 (ocf::heartbeat:galera): Master controller-0 galera-bundle-1 (ocf::heartbeat:galera): Master controller-1 galera-bundle-2 (ocf::heartbeat:galera): Master controller-2 podman container set: rabbitmq-bundle [brew-pulp-docker01.web.prod.ext.phx2.redhat.com:8888/rhosp15/openstack-rabbitmq:pcmklatest] rabbitmq-bundle-0 (ocf::heartbeat:rabbitmq-cluster): Started controller-0 rabbitmq-bundle-1 (ocf::heartbeat:rabbitmq-cluster): Started controller-1 podman container set: redis-bundle [brew-pulp-docker01.web.prod.ext.phx2.redhat.com:8888/rhosp15/openstack-redis:pcmklatest] redis-bundle-0 (ocf::heartbeat:redis): Master controller-0 redis-bundle-1 (ocf::heartbeat:redis): Slave controller-1 ip-192.168.24.8 (ocf::heartbeat:IPaddr2): Started controller-0 ip-10.0.0.106 (ocf::heartbeat:IPaddr2): Started controller-0 ip-172.17.1.16 (ocf::heartbeat:IPaddr2): Started controller-0 ip-172.17.1.23 (ocf::heartbeat:IPaddr2): Started controller-0 ip-172.17.3.11 (ocf::heartbeat:IPaddr2): Started controller-0 ip-172.17.4.25 (ocf::heartbeat:IPaddr2): Started controller-0 podman container set: haproxy-bundle [brew-pulp-docker01.web.prod.ext.phx2.redhat.com:8888/rhosp15/openstack-haproxy:pcmklatest] haproxy-bundle-podman-0 (ocf::heartbeat:podman): Started controller-0 haproxy-bundle-podman-1 (ocf::heartbeat:podman): Started controller-1 haproxy-bundle-podman-2 (ocf::heartbeat:podman): Stopped podman container: openstack-cinder-volume
[brew-pulp-docker01.web.prod.ext.phx2.redhat.com:8888/rhosp15/openstack-cinder-volume:pcmklatest] openstack-cinder-volume-podman-0 (ocf::heartbeat:podman): Started controller-1 Failed Resource Actions: * rabbitmq_monitor_10000 on rabbitmq-bundle-0 'unknown error' (1): call=4861, status=Timed Out, exitreason='', last-rc-change='Mon Aug 5 10:37:51 2019', queued=0ms, exec=0ms * rabbitmq_monitor_10000 on rabbitmq-bundle-1 'unknown error' (1): call=42, status=Timed Out, exitreason='', last-rc-change='Mon Aug 5 10:15:55 2019', queued=0ms, exec=0ms This indicates that there are still issues we'll need to solve, but at least the upgrade passes now and we can keep solving the follow-up issues while the critical upgrade path is unblocked. Closes-Bug: #1838971 Change-Id: I2e88dc34fa59624523de4c52a1873438c78e972f
462 lines
19 KiB
YAML
462 lines
19 KiB
YAML
# Heat Orchestration Template header: template version and a short summary.
heat_template_version: rocky

description: >
  OpenStack containerized Rabbitmq service
|
|
|
|
# Input parameters of the template. Multi-line descriptions use folded
# scalars (>-) so the continuation lines are joined with single spaces.
parameters:
  # Container images.
  ContainerRabbitmqImage:
    type: string
    description: image
  ContainerRabbitmqConfigImage:
    type: string
    description: The container image to use for the rabbitmq config_volume

  # Generic per-service inputs, forwarded to the base rabbitmq template.
  EndpointMap:
    type: json
    default: {}
    description: >-
      Mapping of service endpoint -> protocol. Typically set
      via parameter_defaults in the resource registry.
  ServiceData:
    type: json
    default: {}
    description: Dictionary packing service data
  ServiceNetMap:
    type: json
    default: {}
    description: >-
      Mapping of service_name -> network name. Typically set
      via parameter_defaults in the resource registry. This
      mapping overrides those in ServiceNetMapDefaults.
  DefaultPasswords:
    type: json
    default: {}
  # Cluster cookie; when left empty the rabbit_cookie entry of
  # DefaultPasswords is used instead (see the rabbitmq_bootstrap container).
  RabbitCookie:
    type: string
    default: ''
    hidden: true
  RoleName:
    type: string
    default: ''
    description: Role name on which the service is applied
  RoleParameters:
    type: json
    default: {}
    description: Parameters specific to the role

  # Behaviour switches.
  ConfigDebug:
    type: boolean
    default: false
    description: Whether to run config management (e.g. Puppet) in debug mode.
  PcmkConfigRestartTimeout:
    type: number
    default: 600
    description: >-
      Time in seconds to wait for a pcmk resource to restart when
      a config change is detected and the resource is being restarted
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values:
          - 'docker'
          - 'podman'
  DeployIdentifier:
    type: string
    default: ''
    description: >
      Setting this to a unique value will re-run any deployment tasks which
      perform configuration on a Heat stack-update.
|
|
|
|
# Template conditions, evaluated from parameters.
conditions:
  # Mirrors the ConfigDebug parameter; used to decide whether '--debug' is
  # passed to the puppet apply script.
  puppet_debug_enabled:
    get_param: ConfigDebug
|
|
|
|
# Nested templates this service builds on.
resources:

  # Source of the shared volumes and container config scripts read by the
  # outputs section.
  ContainersCommon:
    type: ../containers-common.yaml

  # Base rabbitmq container service; the generic service parameters are
  # passed straight through.
  RabbitMQServiceBase:
    type: ./rabbitmq-container-puppet.yaml
    properties:
      ServiceData:
        get_param: ServiceData
      ServiceNetMap:
        get_param: ServiceNetMap
      DefaultPasswords:
        get_param: DefaultPasswords
      EndpointMap:
        get_param: EndpointMap
      RoleName:
        get_param: RoleName
      RoleParameters:
        get_param: RoleParameters
|
|
|
|
# role_data describes the pacemaker-managed rabbitmq bundle: puppet/hiera
# settings, kolla config, the one-shot containers run per deployment step,
# and the Ansible tasks used on host prep, deploy, update and upgrade.
outputs:
  role_data:
    description: Role data for the Rabbitmq API role.
    value:
      service_name: rabbitmq
      monitoring_subscription: {get_attr: [RabbitMQServiceBase, role_data, monitoring_subscription]}
      config_settings:
        map_merge:
          - get_attr: [RabbitMQServiceBase, role_data, config_settings]
          - rabbitmq::service_manage: false
            # Image name with its tag replaced by 'pcmklatest'. Anchored
            # because the tagging tasks further down reuse the same value.
            tripleo::profile::pacemaker::rabbitmq_bundle::rabbitmq_docker_image: &rabbitmq_image_pcmklatest
              list_join:
                - ':'
                - - yaql:
                      data: {get_param: ContainerRabbitmqImage}
                      # Split on the last ':' only and keep the left-hand part.
                      expression: '$.data.rightSplit(separator => ":", maxSplits => 1)[0]'
                  - 'pcmklatest'
            tripleo::profile::pacemaker::rabbitmq_bundle::control_port: 3122
            tripleo::profile::pacemaker::rabbitmq_bundle::container_backend: {get_param: ContainerCli}
            tripleo::rabbitmq::firewall_rules:
              '109 rabbitmq-bundle':
                dport:
                  - 3122
                  - 4369
                  - 5672
                  - 25672
      # The base service resource in this template is RabbitMQServiceBase;
      # get_attr must name a resource that is declared under 'resources'.
      service_config_settings: {get_attr: [RabbitMQServiceBase, role_data, service_config_settings]}
      # BEGIN DOCKER SETTINGS
      puppet_config:
        config_volume: rabbitmq
        puppet_tags: 'file,file_line'
        step_config:
          list_join:
            - "\n"
            - - "['Rabbitmq_policy', 'Rabbitmq_user'].each |String $val| { noop_resource($val) }"
              - "include ::tripleo::profile::pacemaker::rabbitmq_bundle"
        config_image: {get_param: ContainerRabbitmqConfigImage}
      kolla_config:
        /var/lib/kolla/config_files/rabbitmq.json:
          command: /usr/sbin/pacemaker_remoted
          config_files:
            - dest: /etc/libqb/force-filesystem-sockets
              source: /dev/null
              owner: root
              perm: '0644'
            - dest: /var/log/btmp
              source: /dev/null
              owner: root:utmp
              perm: '0600'
            - source: "/var/lib/kolla/config_files/src/*"
              dest: "/"
              merge: true
              preserve_properties: true
            - source: "/var/lib/kolla/config_files/src-tls/*"
              dest: "/"
              merge: true
              optional: true
              preserve_properties: true
          permissions:
            - path: /var/lib/rabbitmq
              owner: rabbitmq:rabbitmq
              recurse: true
            - path: /var/log/rabbitmq
              owner: rabbitmq:rabbitmq
              recurse: true
            - path: /etc/pki/tls/certs/rabbitmq.crt
              owner: rabbitmq:rabbitmq
              perm: '0600'
              optional: true
            - path: /etc/pki/tls/private/rabbitmq.key
              owner: rabbitmq:rabbitmq
              perm: '0600'
              optional: true
      # When using pacemaker we don't launch the container, instead that is done by pacemaker
      # itself.
      container_config_scripts: {get_attr: [ContainersCommon, container_config_scripts]}
      docker_config:
        step_1:
          rabbitmq_bootstrap:
            start_order: 0
            image: {get_param: ContainerRabbitmqImage}
            net: host
            privileged: false
            volumes:
              - /var/lib/kolla/config_files/rabbitmq.json:/var/lib/kolla/config_files/config.json:ro
              - /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
              - /etc/hosts:/etc/hosts:ro
              - /etc/localtime:/etc/localtime:ro
              - /var/lib/rabbitmq:/var/lib/rabbitmq:z
            environment:
              - KOLLA_CONFIG_STRATEGY=COPY_ALWAYS
              - KOLLA_BOOTSTRAP=True
              # RABBITMQ_CLUSTER_COOKIE=<first non-empty of RabbitCookie and
              # DefaultPasswords.rabbit_cookie>
              - list_join:
                  - '='
                  - - 'RABBITMQ_CLUSTER_COOKIE'
                    - yaql:
                        expression: "$.data.passwords.where($ != '').first()"
                        data:
                          passwords:
                            - {get_param: RabbitCookie}
                            - {get_param: [DefaultPasswords, rabbit_cookie]}
        step_2:
          # One-shot container: restarts the bundle through pcs when the
          # resource exists and TRIPLEO_MINOR_UPDATE is not "true".
          rabbitmq_restart_bundle:
            start_order: 0
            config_volume: rabbitmq
            detach: false
            net: host
            ipc: host
            user: root
            environment:
              - TRIPLEO_MINOR_UPDATE
            command:
              - '/usr/bin/bootstrap_host_exec'
              - 'rabbitmq'
              - str_replace:
                  template:
                    'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show rabbitmq-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT rabbitmq-bundle; echo "rabbitmq-bundle restart invoked"; fi'
                  params:
                    PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
            image: {get_param: ContainerRabbitmqImage}
            volumes:
              list_concat:
                - {get_attr: [ContainersCommon, volumes]}
                - - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
                  - /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
          # One-shot container: applies the pacemaker rabbitmq_bundle puppet
          # profile for step 2.
          rabbitmq_init_bundle:
            start_order: 1
            detach: false
            net: host
            ipc: host
            user: root
            # '/container_puppet_apply.sh "STEP" "TAGS" "CONFIG" "DEBUG"'
            command:
              list_concat:
                - - '/container_puppet_apply.sh'
                  - '2'
                  - 'file,file_line,concat,augeas,pacemaker::resource::bundle,pacemaker::property,pacemaker::resource::ocf,pacemaker::constraint::order,pacemaker::constraint::colocation,rabbitmq_policy,rabbitmq_user,rabbitmq_ready'
                  - 'include ::tripleo::profile::base::pacemaker;include ::tripleo::profile::pacemaker::rabbitmq_bundle'
                - if:
                    - puppet_debug_enabled
                    - - '--debug'
                    - - ''
            image: {get_param: ContainerRabbitmqImage}
            volumes:
              list_concat:
                - {get_attr: [ContainersCommon, container_puppet_apply_volumes]}
                - - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
                  - /bin/true:/bin/epmd
            environment:
              # https://launchpad.net/bugs/1822673 (lang/lc_all to utf-8 are an elixir requirement)
              - 'LANG=en_US.UTF-8'
              - 'LC_ALL=en_US.UTF-8'
              # NOTE: this should force this container to re-run on each
              # update (scale-out, etc.)
              - list_join:
                  - ''
                  - - 'TRIPLEO_DEPLOY_IDENTIFIER='
                    - {get_param: DeployIdentifier}
      host_prep_tasks:
        - name: create persistent directories
          file:
            path: "{{ item.path }}"
            state: directory
            setype: "{{ item.setype }}"
          with_items:
            - path: /var/lib/rabbitmq
              setype: svirt_sandbox_file_t
            - path: /var/log/containers/rabbitmq
              setype: svirt_sandbox_file_t
            - path: /var/log/rabbitmq
              setype: svirt_sandbox_file_t
        - name: rabbitmq logs readme
          copy:
            dest: /var/log/rabbitmq/readme.txt
            content: |
              Log files from rabbitmq containers can be found under
              /var/log/containers/rabbitmq.
          ignore_errors: true
        - name: stop the Erlang port mapper on the host and make sure it cannot bind to the port used by container
          shell: |
            echo 'export ERL_EPMD_ADDRESS=127.0.0.1' > /etc/rabbitmq/rabbitmq-env.conf
            echo 'export ERL_EPMD_PORT=4370' >> /etc/rabbitmq/rabbitmq-env.conf
            for pid in $(pgrep epmd --ns 1 --nslist pid); do kill $pid; done
      metadata_settings:
        get_attr: [RabbitMQServiceBase, role_data, metadata_settings]
      deploy_steps_tasks:
        - name: RabbitMQ tag container image for pacemaker
          when: step|int == 1
          import_role:
            name: tripleo-container-tag
          vars:
            container_image: {get_param: ContainerRabbitmqImage}
            container_image_latest: *rabbitmq_image_pcmklatest

      update_tasks:
        - name: Rabbit fetch and retag container image for pacemaker
          when: step|int == 2
          # Anchored: upgrade_tasks reuses this task list at step 3.
          block: &rabbitmq_fetch_retag_container_tasks
            - name: Get container rabbitmq image
              set_fact:
                rabbitmq_image: {get_param: ContainerRabbitmqImage}
                rabbitmq_image_latest: *rabbitmq_image_pcmklatest
            - name: Pull latest rabbitmq images
              command: "{{container_cli}} pull {{rabbitmq_image}}"
            - name: Get previous rabbitmq image id
              shell: "{{container_cli}} inspect --format '{{'{{'}}.Id{{'}}'}}' {{rabbitmq_image_latest}}"
              register: old_rabbitmq_image_id
              # A failing inspect must not abort the play; the guards below
              # compare the captured stdout instead.
              failed_when: false
            - name: Get new rabbitmq image id
              shell: "{{container_cli}} inspect --format '{{'{{'}}.Id{{'}}'}}' {{rabbitmq_image}}"
              register: new_rabbitmq_image_id
            - name: Retag pcmklatest to latest rabbitmq image
              include_role:
                name: tripleo-container-tag
              vars:
                container_image: "{{rabbitmq_image}}"
                container_image_latest: "{{rabbitmq_image_latest}}"
              when:
                - old_rabbitmq_image_id.stdout != new_rabbitmq_image_id.stdout
            - block:
                - name: Get a list of container using rabbitmq image
                  shell: "{{container_cli}} ps -a -q -f 'ancestor={{old_rabbitmq_image_id.stdout}}'"
                  register: rabbitmq_containers_to_destroy
                # It will be recreated with the deploy step.
                - name: Remove any container using the same rabbitmq image
                  shell: "{{container_cli}} rm -fv {{item}}"
                  with_items: "{{ rabbitmq_containers_to_destroy.stdout_lines }}"
                - name: Remove previous rabbitmq images
                  shell: "{{container_cli}} rmi -f {{old_rabbitmq_image_id.stdout}}"
              # Only clean up when a previous image was found and it differs
              # from the newly pulled one.
              when:
                - old_rabbitmq_image_id.stdout != ''
                - old_rabbitmq_image_id.stdout != new_rabbitmq_image_id.stdout

      upgrade_tasks:
        - name: Prepare switch of rabbitmq image name
          when:
            - step|int == 0
          block:
            - name: Get rabbitmq image id currently used by pacemaker
              shell: "pcs resource config rabbitmq-bundle | grep -Eo 'image=[^ ]+' | awk -F= '{print $2;}'"
              register: rabbitmq_image_current_res
              failed_when: false
            - name: Image facts for rabbitmq
              set_fact:
                rabbitmq_image_latest: *rabbitmq_image_pcmklatest
                rabbitmq_image_current: "{{rabbitmq_image_current_res.stdout}}"
            - name: Prepare the switch to new rabbitmq container image name in pacemaker
              block:
                - name: Temporarily tag the current rabbitmq image id with the upgraded image name
                  import_role:
                    name: tripleo-container-tag
                  vars:
                    container_image: "{{rabbitmq_image_current}}"
                    container_image_latest: "{{rabbitmq_image_latest}}"
                    pull_image: false
                  when:
                    - rabbitmq_image_current != ''
                    - rabbitmq_image_current != rabbitmq_image_latest
            - name: Check rabbitmq cluster resource status
              shell: pcs resource config rabbitmq-bundle
              failed_when: false
              register: rabbitmq_pcs_res_result
            - name: Set fact rabbitmq_pcs_res
              set_fact:
                # failed_when: false above means the registered result is
                # never marked failed, so test the return code directly.
                rabbitmq_pcs_res: "{{ rabbitmq_pcs_res_result.rc == 0 }}"
            - name: set is_rabbitmq_bootstrap_node fact
              tags: common
              set_fact:
                is_rabbitmq_bootstrap_node: "{{ rabbitmq_short_bootstrap_node_name|lower == ansible_hostname|lower }}"
        - name: Update rabbitmq-bundle pcs resource bundle for new container image
          # Runs on the bootstrap node only, and only when the bundle exists
          # and is not already pointing at the pcmklatest image name.
          when:
            - step|int == 1
            - is_rabbitmq_bootstrap_node|bool
            - rabbitmq_pcs_res|bool
            - rabbitmq_image_current != rabbitmq_image_latest
          block:
            - name: Disable the rabbitmq cluster resource before container upgrade
              pacemaker_resource:
                resource: rabbitmq-bundle
                state: disable
                wait_for_resource: true
              register: output
              retries: 5
              until: output.rc == 0
            - name: Move rabbitmq logging to /var/log/containers
              block:
                - name: Check rabbitmq logging configuration in pacemaker
                  command: cibadmin --query --xpath "//storage-mapping[@id='rabbitmq-log']"
                  ignore_errors: true
                  register: rabbitmq_logs_moved
                - name: Add a bind mount for logging in the rabbitmq bundle
                  # rc == 6 means the configuration doesn't exist in the CIB
                  when: rabbitmq_logs_moved.rc == 6
                  command: pcs resource bundle update rabbitmq-bundle storage-map add id=rabbitmq-log source-dir=/var/log/containers/rabbitmq target-dir=/var/log/rabbitmq options=rw
            - name: Update the rabbitmq bundle to use the new container image name
              command: "pcs resource bundle update rabbitmq-bundle container image={{rabbitmq_image_latest}}"
            - name: Enable the rabbitmq cluster resource
              pacemaker_resource:
                resource: rabbitmq-bundle
                state: enable
                wait_for_resource: true
              register: output
              retries: 5
              until: output.rc == 0
        - name: Create hiera data to upgrade rabbitmq in a stepwise manner.
          when:
            - step|int == 1
          block:
            - name: set rabbitmq upgrade node facts in a single-node environment
              set_fact:
                rabbitmq_short_node_names_upgraded: "{{ rabbitmq_short_node_names }}"
                cacheable: false
              when: groups['rabbitmq'] | length <= 1
            - name: set rabbitmq upgrade node facts from the limit option
              # Accumulates the short names of the nodes listed in
              # ansible_limit, one loop iteration per rabbitmq node.
              set_fact:
                rabbitmq_short_node_names_upgraded: "{{ rabbitmq_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: false
              when:
                - groups['rabbitmq'] | length > 1
                - item.split('.')[0] in ansible_limit.split(',')
              loop: "{{ rabbitmq_short_node_names }}"

            - debug:
                msg: "Prepare rabbitmq upgrade for {{ rabbitmq_short_node_names_upgraded }}"
            - fail:
                msg: >
                  You can't upgrade rabbitmq without staged
                  upgrade. You need to use the limit option in order
                  to do so.
              when: >-
                rabbitmq_short_node_names_upgraded is not defined or
                rabbitmq_short_node_names_upgraded | length == 0

            - name: add the rabbitmq short name to hiera data for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: rabbitmq_short_node_names_override
                tripleo_upgrade_value: "{{rabbitmq_short_node_names_upgraded}}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: rabbitmq_short_node_names_override
              # Once every node is in the upgraded list the override is no
              # longer needed.
              when: rabbitmq_short_node_names_upgraded | length == rabbitmq_short_node_names | length
        - name: Retag the pacemaker image if containerized
          when:
            - step|int == 3
          block: *rabbitmq_fetch_retag_container_tasks

      fast_forward_upgrade_tasks:
        - when:
            - step|int == 0
            - release == 'ocata'
            - is_bootstrap_node|bool
          block:
            - name: Check cluster resource status of rabbitmq
              pacemaker_resource:
                # Resource name is taken from the base service's role_data
                # (presumably 'rabbitmq' -- confirm in rabbitmq-container-puppet.yaml).
                resource: {get_attr: [RabbitMQServiceBase, role_data, service_name]}
                state: show
                check_mode: false
              ignore_errors: true
              register: rabbitmq_res_result
            - name: Set fact rabbitmq_res
              set_fact:
                rabbitmq_res: "{{ rabbitmq_res_result.rc == 0 }}"
        - name: Disable the rabbitmq cluster resource
          pacemaker_resource:
            resource: {get_attr: [RabbitMQServiceBase, role_data, service_name]}
            state: disable
            wait_for_resource: true
          register: rabbitmq_output
          retries: 5
          until: rabbitmq_output.rc == 0
          when:
            - step|int == 2
            - release == 'ocata'
            - is_bootstrap_node|bool
            - rabbitmq_res|bool
|