tripleo-heat-templates/deployment/ovn/ovn-dbs-pacemaker-puppet.yaml
Kamil Sambor 780b057461 Add possibility to set ovndbs monitor interval
Add the possibility to configure the OVN DBs monitor interval
in tht via OVNDBSPacemakerMonitorInterval (default 30s).
Under load, the monitor can create extra stress and, since the
timeout has already been bumped, it makes sense to raise
this interval to a higher value as a trade-off
between detecting a failure quickly and stressing the service.

Depends-On: https://review.opendev.org/#/c/779918/
Change-Id: Id836676826f6e7c97ef8e3d665ab3e467ad055ba
(cherry picked from commit 9ece6f97af)
2021-03-11 08:20:39 +00:00
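
The monitor intervals introduced here can be tuned per deployment via parameter_defaults in a custom environment file. A minimal sketch, using the slave/master parameter names defined in this template; the values shown are illustrative only, not recommendations:

parameter_defaults:
  # Less frequent monitoring of the OVN DBs bundle reduces load on the service.
  OVNDBSPacemakerMonitorIntervalSlave: 30
  OVNDBSPacemakerMonitorIntervalMaster: 10
  # Monitor timeout, already raised by the change this one depends on.
  OVNDBSPacemakerTimeout: 60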


heat_template_version: rocky
description: >
OpenStack containerized OVN DBs service managed by pacemaker
parameters:
ContainerOvnDbsImage:
description: The OVN DBs container image.
type: string
ContainerOvnDbsConfigImage:
description: The OVN DBs configuration container image.
type: string
ClusterCommonTag:
default: false
description: When set to false, a pacemaker service is configured
to use a floating tag for its container image name,
e.g. 'REGISTRY/NAMESPACE/IMAGENAME:pcmklatest'. When
set to true, the service uses a floating prefix as
well, e.g. 'cluster.common.tag/IMAGENAME:pcmklatest'.
type: boolean
ClusterFullTag:
default: false
description: When set to true, the pacemaker service uses a fully
constant tag for its container image name, e.g.
'cluster.common.tag/SERVICENAME:pcmklatest'.
type: boolean
EndpointMap:
default: {}
description: Mapping of service endpoint -> protocol. Typically set
via parameter_defaults in the resource registry.
type: json
ServiceData:
default: {}
description: Dictionary packing service data
type: json
ServiceNetMap:
default: {}
description: Mapping of service_name -> network name. Typically set
via parameter_defaults in the resource registry. This
mapping overrides those in ServiceNetMapDefaults.
type: json
DefaultPasswords:
default: {}
type: json
RoleName:
default: ''
description: Role name on which the service is applied
type: string
RoleParameters:
default: {}
description: Parameters specific to the role
type: json
OVNNorthboundServerPort:
description: Port of the OVN Northbound DB server
type: number
default: 6641
OVNSouthboundServerPort:
description: Port of the OVN Southbound DB server
type: number
default: 6642
EnableLoadBalancer:
default: true
description: Whether to deploy a LoadBalancer; set to false when an external load balancer is used.
type: boolean
ConfigDebug:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
ContainerCli:
type: string
default: 'podman'
description: CLI tool used to manage containers.
constraints:
- allowed_values: ['docker', 'podman']
DeployIdentifier:
default: ''
type: string
description: >
Setting this to a unique value will re-run any deployment tasks which
perform configuration on a Heat stack-update.
EnableInternalTLS:
type: boolean
default: false
InternalTLSCAFile:
default: '/etc/ipa/ca.crt'
type: string
description: Specifies the default CA cert to use if TLS is used for
services in the internal network.
OVNDBSPacemakerTimeout:
description: Timeout for the monitor operation of the OVN DBs resource, in seconds.
type: number
default: 60
OVNDBSReplicationInterval:
description: Probe interval for ovsdb-server. It configures the probe
interval of the connection used when ovsdb-server is
in backup mode and connects to the active ovsdb-server for replication.
type: number
default: 60000
CertificateKeySize:
type: string
default: '2048'
description: Specifies the private key size used when creating the
certificate.
OvnDBSCertificateKeySize:
type: string
default: ''
description: Override the private key size used when creating the
certificate for this service
OVNDBSPacemakerMonitorIntervalSlave:
description: Monitor interval for the OVN DBs resource in slave state, in seconds.
type: number
default: 30
OVNDBSPacemakerMonitorIntervalMaster:
description: Monitor interval for the OVN DBs resource in master state, in seconds.
type: number
default: 10
conditions:
puppet_debug_enabled: {get_param: ConfigDebug}
docker_enabled: {equals: [{get_param: ContainerCli}, 'docker']}
internal_tls_enabled: {equals: [{get_param: EnableInternalTLS}, true]}
common_tag_enabled: {equals: [{get_param: ClusterCommonTag}, true]}
common_tag_full: {equals: [{get_param: ClusterFullTag}, true]}
use_external_load_balancer: {equals: [{get_param: EnableLoadBalancer}, false]}
key_size_override_unset: {equals: [{get_param: OvnDBSCertificateKeySize}, '']}
resources:
ContainersCommon:
type: ../containers-common.yaml
OVNDbsBase:
type: ./ovn-dbs-container-puppet.yaml
properties:
EndpointMap: {get_param: EndpointMap}
ServiceData: {get_param: ServiceData}
ServiceNetMap: {get_param: ServiceNetMap}
DefaultPasswords: {get_param: DefaultPasswords}
RoleName: {get_param: RoleName}
RoleParameters: {get_param: RoleParameters}
OVNNorthboundServerPort: {get_param: OVNNorthboundServerPort}
OVNSouthboundServerPort: {get_param: OVNSouthboundServerPort}
outputs:
role_data:
description: Role data for the OVN Dbs HA role.
value:
service_name: ovn_dbs
firewall_rules:
'121 OVN DB server ports':
proto: 'tcp'
dport:
# Control port for pcmk remote bundle
- 3125
- {get_param: OVNNorthboundServerPort}
- {get_param: OVNSouthboundServerPort}
config_settings:
map_merge:
- get_attr: [OVNDbsBase, role_data, config_settings]
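# The &ovn_dbs_image_pcmklatest anchor below computes the image name used by
# the pacemaker bundle: with ClusterFullTag it is the constant
# cluster.common.tag/ovn-northd:pcmklatest, otherwise the tag of
# ContainerOvnDbsImage is rewritten to :pcmklatest (and, with ClusterCommonTag,
# its registry/namespace prefix to cluster.common.tag/).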
- tripleo::profile::pacemaker::ovn_dbs_bundle::ovn_dbs_docker_image: &ovn_dbs_image_pcmklatest
if:
- common_tag_full
- "cluster.common.tag/ovn-northd:pcmklatest"
- yaql:
data:
if:
- common_tag_enabled
- yaql:
data: {get_param: ContainerOvnDbsImage}
expression: concat("cluster.common.tag/", $.data.rightSplit(separator => "/", maxSplits => 1)[1])
- {get_param: ContainerOvnDbsImage}
expression: concat($.data.rightSplit(separator => ":", maxSplits => 1)[0], ":pcmklatest")
- tripleo::profile::pacemaker::ovn_dbs_bundle::nb_db_port: {get_param: OVNNorthboundServerPort}
- tripleo::profile::pacemaker::ovn_dbs_bundle::sb_db_port: {get_param: OVNSouthboundServerPort}
- tripleo::profile::pacemaker::ovn_dbs_bundle::container_backend: {get_param: ContainerCli}
- tripleo::profile::pacemaker::ovn_dbs_bundle::dbs_timeout: {get_param: OVNDBSPacemakerTimeout}
- tripleo::profile::pacemaker::ovn_dbs_bundle::replication_probe_interval: {get_param: OVNDBSReplicationInterval}
- tripleo::profile::pacemaker::ovn_dbs_bundle::force_ocf: true
- tripleo::profile::pacemaker::ovn_dbs_bundle::monitor_interval_slave: {get_param: OVNDBSPacemakerMonitorIntervalSlave}
- tripleo::profile::pacemaker::ovn_dbs_bundle::monitor_interval_master: {get_param: OVNDBSPacemakerMonitorIntervalMaster}
- tripleo::haproxy::ovn_dbs_manage_lb:
if:
- use_external_load_balancer
- true
- false
- tripleo::profile::pacemaker::ovn_dbs_bundle::listen_on_master_ip_only:
if:
- use_external_load_balancer
- 'no'
- 'yes'
- if:
- internal_tls_enabled
- generate_service_certificates: true
tripleo::profile::pacemaker::ovn_dbs_bundle::ca_file:
get_param: InternalTLSCAFile
tripleo::profile::base::neutron::agents::ovn::protocol: 'ssl'
tripleo::profile::pacemaker::ovn_dbs_bundle::enable_internal_tls: true
ovn_dbs_certificate_specs:
service_certificate: '/etc/pki/tls/certs/ovn_dbs.crt'
service_key: '/etc/pki/tls/private/ovn_dbs.key'
hostname:
str_replace:
template: "%{hiera('fqdn_NETWORK')}"
params:
NETWORK: {get_param: [ServiceNetMap, OvnDbsNetwork]}
principal:
str_replace:
template: "ovn_dbs/%{hiera('fqdn_NETWORK')}"
params:
NETWORK: {get_param: [ServiceNetMap, OvnDbsNetwork]}
key_size:
if:
- key_size_override_unset
- {get_param: CertificateKeySize}
- {get_param: OvnDBSCertificateKeySize}
- {}
service_config_settings: {}
# BEGIN DOCKER SETTINGS
puppet_config:
config_volume: 'ovn_dbs'
puppet_tags: 'exec'
step_config: ''
config_image: &ovn_dbs_config_image {get_param: ContainerOvnDbsConfigImage}
kolla_config:
/var/lib/kolla/config_files/ovn_dbs.json:
command: /usr/sbin/pacemaker_remoted
config_files:
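# Creating /etc/libqb/force-filesystem-sockets (an empty file copied from
# /dev/null) forces libqb to use filesystem sockets rather than abstract
# sockets, which pacemaker_remoted needs when running inside a container.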
- dest: /etc/libqb/force-filesystem-sockets
source: /dev/null
owner: root
perm: '0644'
- source: "/var/lib/kolla/config_files/src/*"
dest: "/"
merge: true
preserve_properties: true
optional: true
container_config_scripts: {get_attr: [ContainersCommon, container_config_scripts]}
metadata_settings:
if:
- internal_tls_enabled
- - service: ovn_dbs
network: {get_param: [ServiceNetMap, OvnDbsNetwork]}
type: vip
- service: ovn_dbs
network: {get_param: [ServiceNetMap, OvnDbsNetwork]}
type: node
- null
docker_config:
step_3:
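# Short-lived helper container (detach: false) that runs
# container_puppet_apply.sh on the host network at step 3 to create or
# update the ovn-dbs-bundle pacemaker resource.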
ovn_dbs_init_bundle:
start_order: 0
detach: false
net: host
ipc: host
user: root
config_volume: 'ovn_dbs_init_bundle'
command: # '/container_puppet_apply.sh "STEP" "TAGS" "CONFIG" "DEBUG"'
list_concat:
- - '/container_puppet_apply.sh'
- '3'
- 'ovn_dbs_remove_old_cruft'
- 'include tripleo::profile::pacemaker::ovn_dbs_bundle'
- if:
- puppet_debug_enabled
- - '--debug'
- - ''
image: *ovn_dbs_config_image
volumes:
list_concat:
- {get_attr: [ContainersCommon, container_puppet_apply_volumes]}
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- if:
- internal_tls_enabled
-
- /etc/pki/tls/certs/ovn_dbs.crt:/etc/pki/tls/certs/ovn_dbs.crt:ro
- /etc/pki/tls/private/ovn_dbs.key:/etc/pki/tls/private/ovn_dbs.key:ro
- null
environment:
# NOTE: this should force this container to re-run on each
# update (scale-out, etc.)
TRIPLEO_DEPLOY_IDENTIFIER: {get_param: DeployIdentifier}
host_prep_tasks:
- name: create persistent directories
file:
path: "{{ item.path }}"
state: directory
setype: "{{ item.setype }}"
mode: "{{ item.mode|default(omit) }}"
with_items:
- { 'path': /var/log/containers/openvswitch, 'setype': container_file_t, 'mode': '0750' }
- { 'path': /var/lib/openvswitch/ovn, 'setype': container_file_t }
deploy_steps_tasks:
- name: OVN DBS tag container image for pacemaker
when: step|int == 1
import_role:
name: tripleo_container_tag
vars:
container_image: {get_param: ContainerOvnDbsImage}
container_image_latest: *ovn_dbs_image_pcmklatest
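# Step 3 applies the ovn_dbs pacemaker puppet profile through the
# tripleo_ha_wrapper role; the same block is reused by post_update_tasks
# below with tripleo_ha_wrapper_minor_update set to true.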
- name: OVNDbs HA Wrappers Step
when: step|int == 3
block: &ovn_dbs_puppet_bundle
- name: Ovn dbs puppet bundle
import_role:
name: tripleo_ha_wrapper
vars:
tripleo_ha_wrapper_service_name: ovn_dbs
tripleo_ha_wrapper_resource_name: ovndbs_servers
tripleo_ha_wrapper_bundle_name: ovn-dbs-bundle
tripleo_ha_wrapper_resource_state: Slave Master
tripleo_ha_wrapper_puppet_config_volume: ovn_dbs
tripleo_ha_wrapper_puppet_execute: 'include ::tripleo::profile::base::pacemaker; include ::tripleo::profile::pacemaker::ovn_dbs_bundle'
tripleo_ha_wrapper_puppet_tags: 'pacemaker::resource::bundle,pacemaker::property,pacemaker::resource::ip,pacemaker::resource::ocf,pacemaker::constraint::order,pacemaker::constraint::colocation'
tripleo_ha_wrapper_puppet_debug: {get_param: ConfigDebug}
update_tasks:
- name: Tear-down non-HA ovn-dbs containers
when:
- step|int == 1
block: &ovn_dbs_teardown_nonha
- name: Remove non-HA ovn-dbs containers
include_role:
name: tripleo_container_rm
vars:
tripleo_container_cli: "{{ container_cli }}"
tripleo_containers_to_rm:
- ovn_north_db_server
- ovn_south_db_server
- ovn_northd
# When a schema change happens, the newer slaves don't connect
# back to the older master and end up timing out. So we clean
# up the error here until we get a fix for
# https://bugzilla.redhat.com/show_bug.cgi?id=1759974
- name: Clear ovndb cluster pacemaker error
shell: "pcs resource cleanup ovn-dbs-bundle"
when:
- step|int == 1
# Then we ban the resource on this node. It has no effect on
# the first two controllers, but when we reach the last one
# it avoids a cut in the control plane, as the master gets chosen
# from one of the updated, stopped OVN instances. Those are in an
# error state, which is why we need the cleanup just before.
- name: Ban ovndb resource on the current node.
shell: "pcs resource ban ovn-dbs-bundle $(hostname | cut -d. -f1)"
when:
- step|int == 1
- name: ovn-dbs fetch and retag container image for pacemaker
when:
- step|int == 3
block: &ovn_dbs_fetch_retag_container_tasks
- name: Pull latest ovn-dbs images
command: "{{container_cli}} pull {{ovn_dbs_image}}"
register: result
retries: 3
delay: 3
until: result.rc == 0
- name: Get previous ovn_dbs image id
shell: "{{container_cli}} inspect --format '{{'{{'}}.Id{{'}}'}}' {{ovn_dbs_image_latest}}"
register: old_ovn_dbs_image_id
failed_when: false
- name: Get new ovn_dbs image id
shell: "{{container_cli}} inspect --format '{{'{{'}}.Id{{'}}'}}' {{ovn_dbs_image}}"
register: new_ovn_dbs_image_id
- name: Retag pcmklatest to latest ovn_dbs image
include_role:
name: tripleo_container_tag
vars:
container_image: "{{ovn_dbs_image}}"
container_image_latest: "{{ovn_dbs_image_latest}}"
when:
- old_ovn_dbs_image_id.stdout != new_ovn_dbs_image_id.stdout
vars:
ovn_dbs_image: {get_param: ContainerOvnDbsImage}
ovn_dbs_image_latest: *ovn_dbs_image_pcmklatest
# We remove any leftover error and remove the ban.
- name: Ensure the cluster converge back even in case of schema change
shell: "pcs resource cleanup ovn-dbs-bundle"
when:
- step|int == 5
- name: Remove the ban
shell: "pcs resource clear ovn-dbs-bundle"
when:
- step|int == 5
# When ovn-dbs-bundle support was added, we didn't tag the ovn-dbs image
# with pcmklatest. So, when update is run for the first time we need to
# update the ovn-dbs-bundle resource to use the 'pcmklatest' tagged image.
# See https://bugzilla.redhat.com/show_bug.cgi?id=1586132.
# Step 3 (see above) takes care of tagging the image.
- name: Update ovn-dbs-bundle resource to use pcmklatest tag image if not used
when:
- step|int == 5
- ovn_dbs_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower
block:
- name: Get the present image used by ovn-dbs-bundle
shell: "pcs resource config ovn-dbs-bundle | grep -Eo 'image=[^ ]+' | awk -F= '{print $2;}'"
register: ovn_dbs_current_image
- block:
- name: Update the ovn-dbs-bundle to use the new container image name
command: "pcs resource bundle update ovn-dbs-bundle container image={{ovn_dbs_image_latest}}"
when:
- ovn_dbs_current_image.stdout != ovn_dbs_image_latest
vars:
ovn_dbs_image_latest: *ovn_dbs_image_pcmklatest
post_update_tasks:
- name: Ovn dbs bundle post update
when: step|int == 1
block: *ovn_dbs_puppet_bundle
vars:
tripleo_ha_wrapper_minor_update: true
upgrade_tasks:
- name: Tear-down non-HA ovn-dbs container
when:
- step|int == 0
block: *ovn_dbs_teardown_nonha
- name: Prepare switch of ovn-dbs image name
when:
- step|int == 0
block:
- name: Get ovn-dbs image id currently used by pacemaker
shell: "pcs resource config ovn-dbs-bundle | grep -Eo 'image=[^ ]+' | awk -F= '{print $2;}'"
register: ovn_dbs_image_current_res
failed_when: false
- name: ovn-dbs image facts
set_fact:
ovn_dbs_image: {get_param: ContainerOvnDbsImage}
ovn_dbs_image_latest: *ovn_dbs_image_pcmklatest
ovn_dbs_image_current: "{{ovn_dbs_image_current_res.stdout}}"
- name: Temporarily tag the current ovn_dbs image id with the upgraded image name
import_role:
name: tripleo_container_tag
vars:
container_image: "{{ovn_dbs_image_current}}"
container_image_latest: "{{ovn_dbs_image_latest}}"
pull_image: false
when:
- ovn_dbs_image_current != ''
- ovn_dbs_image_current != ovn_dbs_image_latest
# During an OS Upgrade, the cluster may not exist so we use
# the shell module instead.
# TODO(odyssey4me):
# Fix the pacemaker_resource module to handle the exception
# for a non-existent cluster more gracefully.
- name: Check ovn-dbs-bundle cluster resource status
shell: pcs resource config ovn-dbs-bundle
failed_when: false
changed_when: false
register: ovn_dbs_pcs_result
- name: Set fact ovn_dbs_pcs_res
set_fact:
ovn_dbs_pcs_res: "{{ ovn_dbs_pcs_result.rc == 0 }}"
- name: Update ovn_dbs pcs resource bundle for new container image
when:
- step|int == 1
- ovn_dbs_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower
- ovn_dbs_pcs_res|bool
- ovn_dbs_image_current != ovn_dbs_image_latest
block:
- name: Disable the ovn-dbs-bundle cluster resource before container upgrade
pacemaker_resource:
resource: ovn-dbs-bundle
state: disable
wait_for_resource: true
register: output
retries: 5
until: output.rc == 0
- name: pcs resource bundle update ovn-dbs for new container image name
command: "pcs resource bundle update ovn-dbs-bundle container image={{ovn_dbs_image_latest}}"
- name: Enable the ovn-dbs-bundle cluster resource
pacemaker_resource:
resource: ovn-dbs-bundle
state: enable
wait_for_resource: true
register: output
retries: 5
until: output.rc == 0
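# Staged-upgrade bookkeeping: gather the short names of the nodes already
# upgraded (all nodes in a single-node environment, otherwise those selected
# with the --limit option), publish them as the
# ovn_dbs_short_node_names_override hiera key, and remove that key once
# every ovn_dbs node has been upgraded.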
- name: Create hiera data to upgrade ovn_dbs in a stepwise manner.
when:
- step|int == 1
- cluster_recreate|bool
block:
- name: set ovn_dbs upgrade node facts in a single-node environment
set_fact:
ovn_dbs_short_node_names_upgraded: "{{ ovn_dbs_short_node_names }}"
cacheable: no
when: groups['ovn_dbs'] | length <= 1
- name: set ovn_dbs upgrade node facts from the limit option
set_fact:
ovn_dbs_short_node_names_upgraded: "{{ ovn_dbs_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
cacheable: no
when:
- groups['ovn_dbs'] | length > 1
- item.split('.')[0] in ansible_limit.split(':')
loop: "{{ ovn_dbs_short_node_names | default([]) }}"
- fail:
msg: >
You can't upgrade ovn_dbs without staged
upgrade. You need to use the limit option in order
to do so.
when: >-
ovn_dbs_short_node_names_upgraded is not defined or
ovn_dbs_short_node_names_upgraded | length == 0
- debug:
msg: "Prepare ovn_dbs upgrade for {{ ovn_dbs_short_node_names_upgraded }}"
- name: remove ovn_dbs init container on upgrade-scaleup to force re-init
include_role:
name: tripleo_container_rm
vars:
tripleo_containers_to_rm:
- ovn_dbs_init_bundle
when:
- ovn_dbs_short_node_names_upgraded | length > 1
- name: add the ovn_dbs short name to hiera data for the upgrade.
include_role:
name: tripleo_upgrade_hiera
tasks_from: set.yml
vars:
tripleo_upgrade_key: ovn_dbs_short_node_names_override
tripleo_upgrade_value: "{{ovn_dbs_short_node_names_upgraded}}"
- name: remove the extra hiera data needed for the upgrade.
include_role:
name: tripleo_upgrade_hiera
tasks_from: remove.yml
vars:
tripleo_upgrade_key: ovn_dbs_short_node_names_override
when: ovn_dbs_short_node_names_upgraded | length == ovn_dbs_short_node_names | length
- name: Retag the pacemaker image if containerized
when:
- step|int == 3
block: *ovn_dbs_fetch_retag_container_tasks
post_upgrade_tasks:
- name: Restart ovn-dbs service (pacemaker)
when:
- step|int == 1
- ovn_dbs_short_bootstrap_node_name|lower == ansible_facts['hostname']|lower
pacemaker_resource:
resource: ovn-dbs-bundle
state: restart
wait_for_resource: true