Merge "tripleo_container_manage: remove systemd healthchecks"

This commit is contained in:
Zuul 2020-04-23 16:56:16 +00:00 committed by Gerrit Code Review
commit b5d0de37fa
11 changed files with 234 additions and 135 deletions

View File

@ -117,6 +117,43 @@ Roles variables
| | | Must be a list. e.g. [0,3] |
+------------------------------------------------+-----------------------------+----------------------------+
Healthchecks
~~~~~~~~~~~~
Previously, the container healthcheck was implemented by a systemd timer which
would run ``podman exec`` to determine if a given container was healthy.
Now, we are using the native healthcheck interface in Podman, which is easier
to integrate with and consume.
To check if a container (e.g. keystone) is healthy, run the following command:
.. code-block:: bash
$ sudo podman healthcheck run keystone
The return code should be 0 and "healthy" should be printed as the output.
One can also use the ``podman inspect keystone`` output to figure out that
the healthcheck is periodically running and healthy:
.. code-block:: bash
"Healthcheck": {
"Status": "healthy",
"FailingStreak": 0,
"Log": [
{
"Start": "2020-04-14T18:48:57.272180578Z",
"End": "2020-04-14T18:48:57.806659104Z",
"ExitCode": 0,
"Output": ""
},
(...)
]
}
Debug
~~~~~

View File

@ -0,0 +1,7 @@
---
features:
- |
Stop using systemd to manage container healthchecks and use native podman
healthchecks which are easier to consume than systemd.
The tripleo_container_manage role will take care of cleaning up the old
systemd healthchecks if they exist.

View File

@ -1055,7 +1055,7 @@ class PodmanModuleParams:
return c
def addparam_healthcheck(self, c):
return c + ['--healthcheck', self.params['healthcheck']]
return c + ['--healthcheck-command', self.params['healthcheck']]
def addparam_healthcheck_interval(self, c):
return c + ['--healthcheck-interval',
@ -1280,6 +1280,7 @@ class PodmanDefaults:
"env_host": False,
"etc_hosts": {},
"group_add": [],
"healthcheck": "",
"ipc": "",
"kernelmemory": "0",
"log_driver": "k8s-file",
@ -1519,6 +1520,18 @@ class PodmanContainerDiff:
after = self.params['group_add']
return self._diff_update_and_compare('group_add', before, after)
# Healthcheck is only defined in container config if a healthcheck
# was configured; otherwise the config key isn't part of the config.
def diffparam_healthcheck(self):
    """Compare the container's current healthcheck command with the requested one.

    The current command is read from ``self.info['config']['healthcheck']``
    and the requested one from ``self.params['healthcheck']``. The result of
    ``self._diff_update_and_compare('healthcheck', before, after)`` is
    returned unchanged.
    """
    before = ''
    healthcheck = self.info['config'].get('healthcheck') or {}
    # The "test" key is expected to be a list of 2 items where the first one
    # is "CMD-SHELL" and the second one is the actual healthcheck command.
    # Only index into it when that second item really exists, so that a
    # shorter (or missing) list is treated as "no command configured"
    # instead of raising IndexError/KeyError.
    test = healthcheck.get('test') or []
    if len(test) > 1:
        before = test[1]
    # An empty requested healthcheck falls back to the current value, so no
    # difference is reported when the parameter was not set.
    after = self.params['healthcheck'] or before
    return self._diff_update_and_compare('healthcheck', before, after)
# Because of hostname is random generated, this parameter has partial idempotency only.
def diffparam_hostname(self):
before = self.info['config']['hostname']

View File

@ -15,6 +15,46 @@
# under the License.
# Migration fixture play: deploy only the fedora container with healthchecks
# disabled in the role, then enable the legacy systemd healthcheck timer by
# hand so that a later run has an old-style healthcheck to deal with.
- name: Create fedora container from /tmp/container-configs with old healthcheck
  become: true
  hosts: all
  gather_facts: false
  vars:
    tripleo_container_manage_config: '/tmp/container-configs'
    tripleo_container_manage_healthcheck_disabled: true
    tripleo_container_manage_debug: true
    tripleo_container_manage_config_patterns: 'fedora.json'
    tripleo_container_manage_systemd_order: true
  tasks:
    - include_role:
        name: tripleo_container_manage
  post_tasks:
    - name: Verify that Fedora container was created correctly and manually create old healthcheck for migration testing
      when:
        - not ansible_check_mode|bool
      block:
        # Reproduce what was done before to create and enable healthchecks
        - name: "Enable and start systemd timers"
          systemd:
            state: started
            name: "tripleo_fedora_healthcheck.timer"
            enabled: true
            daemon_reload: false
        - name: "Add systemd requires for healthchecks"
          command: "systemctl add-requires tripleo_fedora.service tripleo_fedora_healthcheck.timer"
        # Check that migration is ready to be tested
        - name: Check for fedora container
          command: podman container exists fedora
          # Read-only probe: never report a change.
          changed_when: false
        - name: Check if tripleo_fedora systemd healthcheck service is active
          command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer
          register: tripleo_fedora_healthcheck_active_result
          # Read-only probe: never report a change.
          changed_when: false
        - name: Assert that tripleo_fedora systemd healthcheck service is active
          assert:
            that:
              - tripleo_fedora_healthcheck_active_result.rc == 0
            fail_msg: 'tripleo_fedora systemd healthcheck service is not active'
            success_msg: 'tripleo_fedora systemd healthcheck service is active'
- name: Create all containers from /tmp/container-configs
become: true
hosts: all
@ -53,30 +93,12 @@
- tripleo_fedora_active_result.rc == 0
fail_msg: 'tripleo_fedora systemd service is not active'
success_msg: 'tripleo_fedora systemd service is active'
- name: Check if tripleo_fedora systemd healthcheck service is active
command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer
register: tripleo_fedora_healthcheck_active_result
- name: Assert that tripleo_fedora systemd healthcheck service is active
- name: Check if tripleo_fedora healthcheck is active and healthy
assert:
that:
- tripleo_fedora_healthcheck_active_result.rc == 0
fail_msg: 'tripleo_fedora systemd healthcheck service is not active'
success_msg: 'tripleo_fedora systemd healthcheck service is active'
- name: Check if tripleo_fedora_bis has systemd service
stat:
path: /etc/systemd/system/tripleo_fedora_bis.service
register: stat_tripleo_fedora_bis_systemd
- name: Check if tripleo_fedora_bis has systemd healthcheck timer
stat:
path: /etc/systemd/system/tripleo_fedora_bis_healthcheck.timer
register: stat_tripleo_fedora_bis_systemd_timer
- name: Assert that tripleo_fedora_bis has no systemd integration
assert:
that:
- not stat_tripleo_fedora_bis_systemd.stat.exists
- not stat_tripleo_fedora_bis_systemd_timer.stat.exists
fail_msg: 'tripleo_fedora_bis has systemd service'
success_msg: 'tripleo_fedora_bis has no systemd service'
- "'healthy' in fedora_infos.containers.0.State.Healthcheck.Status"
fail_msg: 'fedora container healthcheck is not healthy'
success_msg: 'fedora container healthcheck is healthy'
- name: Verify that Fedora bis container was created correctly
block:
- name: Check for fedora_bis container
@ -180,15 +202,12 @@
- tripleo_fedora_active_result.rc == 0
fail_msg: 'tripleo_fedora systemd service is not active'
success_msg: 'tripleo_fedora systemd service is active'
- name: Check if tripleo_fedora systemd healthcheck service is active
command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer
register: tripleo_fedora_healthcheck_active_result
- name: Assert that tripleo_fedora systemd healthcheck service is active
- name: Check if tripleo_fedora healthcheck is active and healthy
assert:
that:
- tripleo_fedora_healthcheck_active_result.rc == 0
fail_msg: 'tripleo_fedora systemd healthcheck service is not active'
success_msg: 'tripleo_fedora systemd healthcheck service is active'
- "'healthy' in fedora_infos.containers.0.State.Healthcheck.Status"
fail_msg: 'fedora container healthcheck is not healthy'
success_msg: 'fedora container healthcheck is healthy'
- name: Check for fedora_bis container
command: podman container exists fedora_bis
- name: Check for fedora_three container
@ -212,6 +231,10 @@
block:
- name: Check for fedora container
command: podman container exists fedora
- name: Gather facts about fedora container
podman_container_info:
name: fedora
register: fedora_infos
- name: Check if tripleo_fedora systemd service is active
command: systemctl is-active --quiet tripleo_fedora
register: tripleo_fedora_active_result
@ -221,15 +244,12 @@
- tripleo_fedora_active_result.rc == 0
fail_msg: 'tripleo_fedora systemd service is not active'
success_msg: 'tripleo_fedora systemd service is active'
- name: Check if tripleo_fedora systemd healthcheck service is active
command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer
register: tripleo_fedora_healthcheck_active_result
- name: Assert that tripleo_fedora systemd healthcheck service is active
- name: Check if tripleo_fedora healthcheck is active and healthy
assert:
that:
- tripleo_fedora_healthcheck_active_result.rc == 0
fail_msg: 'tripleo_fedora systemd healthcheck service is not active'
success_msg: 'tripleo_fedora systemd healthcheck service is active'
- "'healthy' in fedora_infos.containers.0.State.Healthcheck.Status"
fail_msg: 'fedora container healthcheck is not healthy'
success_msg: 'fedora container healthcheck is healthy'
- name: Check for fedora_bis container
command: podman container exists fedora_bis
- name: Check for fedora_three container
@ -264,10 +284,6 @@
command: systemctl is-active --quiet tripleo_fedora
register: tripleo_fedora_active_result
failed_when: tripleo_fedora_active_result.rc == 0
- name: Check if tripleo_fedora systemd healthcheck service is still active
command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer
register: tripleo_fedora_healthcheck_active_result
failed_when: tripleo_fedora_healthcheck_active_result.rc == 0
- name: Check for fedora_bis container
command: podman container exists fedora_bis
- name: Check for fedora_three container

View File

@ -17,6 +17,7 @@
- name: Prepare
hosts: all
become: true
roles:
- role: test_deps
test_deps_extra_packages:
@ -34,6 +35,7 @@
"net": "host",
"command": "sleep 3600",
"restart": "always",
"check_interval": "500s",
"healthcheck": { "test": "echo test" }
}
dest: '/tmp/container-configs/fedora.json'
@ -43,8 +45,7 @@
{
"image": "fedora:latest",
"net": "host",
"command": "sleep 3600",
"healthcheck": { "test": "echo test" }
"command": "sleep 3600"
}
dest: '/tmp/container-configs/fedora_bis.json'
- name: Create a third configuration file for a fedora container
@ -56,3 +57,34 @@
"command": "sleep 3600"
}
dest: '/tmp/container-configs/fedora_three.json'
# Fixture: hand-written copy of the legacy systemd healthcheck unit for the
# fedora container, used to exercise the migration/cleanup path.
# ExecStart runs the healthcheck test configured for the container
# ("echo test"); it must not carry any leftover Jinja delimiters, which
# systemd would pass to the command as a literal argument.
- name: Create old healthcheck service for fedora container
  copy:
    content: |
      [Unit]
      Description=tripleo_fedora healthcheck
      Requisite=tripleo_fedora.service
      [Service]
      Type=oneshot
      ExecStart=/usr/bin/podman exec --user root fedora echo test
      [Install]
      WantedBy=multi-user.target
    dest: '/etc/systemd/system/tripleo_fedora_healthcheck.service'
    mode: '0644'
    owner: root
    group: root
# Fixture: legacy systemd timer unit for the fedora container healthcheck,
# written verbatim to /etc/systemd/system for migration testing.
- name: Create old healthcheck timer for fedora container
  copy:
    dest: '/etc/systemd/system/tripleo_fedora_healthcheck.timer'
    owner: root
    group: root
    mode: '0644'
    content: |
      [Unit]
      Description=tripleo_fedora container healthcheck
      PartOf=tripleo_fedora.service
      [Timer]
      OnActiveSec=120
      OnUnitActiveSec=60
      RandomizedDelaySec=5
      [Install]
      WantedBy=timers.target

View File

@ -0,0 +1,48 @@
---
# Copyright 2020 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Remove the legacy systemd-based healthcheck for one container.
# Expects `container_systemd_healthcheck_name` to hold the container name.

# Stop the timer first so nothing triggers the healthcheck unit while its
# files are being removed.
- name: "Stop and disable systemd timer for {{ container_systemd_healthcheck_name }}"
  systemd:
    state: stopped
    name: "tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.timer"
    enabled: false
    daemon_reload: false

- name: "Remove systemd healthcheck files for {{ container_systemd_healthcheck_name }}"
  file:
    path: "{{ container_systemd_healthcheck_file }}"
    state: absent
  loop:
    - "/etc/systemd/system/tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.service"
    - "/etc/systemd/system/tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.timer"
  loop_control:
    loop_var: container_systemd_healthcheck_file

- name: Force systemd to re-read config after healthcheck removals
  systemd:
    daemon_reload: true

# Verification: the timer must no longer be active (rc == 0 means it still is).
- name: "Check if {{ container_systemd_healthcheck_name }} healthcheck is not running"
  command: "systemctl is-active --quiet tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.timer"
  register: tripleo_healthcheck_result
  # Read-only probe: never report a change.
  changed_when: false
  failed_when:
    - tripleo_healthcheck_result.rc == 0

# Verification: the container's own service must still be active afterwards.
- name: "Check if {{ container_systemd_healthcheck_name }} service is running and healthy"
  command: "systemctl is-active --quiet tripleo_{{ container_systemd_healthcheck_name }}.service"
  register: tripleo_service_result
  # Read-only probe: never report a change.
  changed_when: false
  failed_when:
    - tripleo_service_result.rc != 0

View File

@ -47,6 +47,12 @@
env_file: "{{ lookup('dict', container_data).value.env_file | default(omit) }}"
etc_hosts: "{{ lookup('dict', container_data).value.extra_hosts | default({}) }}"
group_add: "{{ lookup('dict', container_data).value.group_add | default(omit) }}"
healthcheck: >-
{{ (not tripleo_container_manage_healthcheck_disabled | bool) |
ternary((lookup('dict', container_data).value.healthcheck.test|default(omit)), omit) }}
healthcheck_interval: >-
{{ lookup('dict', container_data).value.check_interval|default('60s')
if lookup('dict', container_data).value.healthcheck is defined else omit }}
hostname: "{{ lookup('dict', container_data).value.hostname | default(omit) }}"
image: "{{ lookup('dict', container_data).value.image }}"
interactive: "{{ lookup('dict', container_data).value.interactive | default(false) }}"

View File

@ -0,0 +1,25 @@
---
# Copyright 2020 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Look for the legacy healthcheck unit file of the container named by
# `container_systemd_healthcheck_name`.
- name: "Check if systemd healthcheck exists for {{ container_systemd_healthcheck_name }}"
  register: container_systemd_healthcheck_stat
  stat:
    path: "/etc/systemd/system/tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.service"

# Only run the cleanup tasks when the legacy unit file was found.
- name: "Cleanup systemd healthcheck for {{ container_systemd_healthcheck_name }}"
  include_tasks: podman/cleanup_healthcheck.yml
  when: container_systemd_healthcheck_stat.stat.exists | bool

View File

@ -18,13 +18,6 @@
set_fact:
container_config: "{{ all_containers_hash | dict_to_list | haskey(attribute='restart', value=['always','unless-stopped'], any=True) | default([]) }}"
- name: Set container_config_healthcheck fact
set_fact:
# Using intersect to prevent a service which isn't controlled by systemd
# but has healthcheck in its configuration (by mistake)
# See https://bugs.launchpad.net/tripleo/+bug/1873249
container_config_healthcheck: "{{ all_containers_hash | dict_to_list | haskey(attribute='healthcheck') | intersect(container_config) | default([]) }}"
- name: "Manage systemd files"
no_log: "{{ not tripleo_container_manage_debug }}"
block:
@ -35,6 +28,11 @@
loop: "{{ container_config }}"
loop_control:
loop_var: container_data_requires
# For every container in `container_config`, run the stat/cleanup tasks for a
# legacy systemd healthcheck. Skipped when healthchecks are disabled in the role.
# Uses include_tasks with explicit vars rather than the legacy
# `include: file key=value` form.
- name: "Cleanup systemd healthchecks"
  when:
    - not tripleo_container_manage_healthcheck_disabled
  include_tasks: podman/stat_healthcheck.yml
  vars:
    container_systemd_healthcheck_name: "{{ lookup('dict', item).key }}"
  loop: "{{ container_config }}"
- name: "Create systemd services files"
template:
src: systemd-service.j2
@ -46,50 +44,12 @@
loop: "{{ container_config }}"
loop_control:
loop_var: container_data_unit
- name: "Create systemd healthcheck files"
when:
- not tripleo_container_manage_healthcheck_disabled
- (container_config_healthcheck | length) > 0
block:
- name: "Create systemd unit files healthchecks"
template:
src: systemd-healthcheck.j2
dest: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_healthcheck).key }}_healthcheck.service"
mode: '0644'
owner: root
group: root
register: systemd_healthcheck
loop: "{{ container_config_healthcheck }}"
loop_control:
loop_var: container_data_healthcheck
- name: "Create systemd timers for healthchecks"
template:
src: systemd-timer.j2
dest: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_timer).key }}_healthcheck.timer"
mode: '0644'
owner: root
group: root
register: systemd_timer
loop: "{{ container_config_healthcheck }}"
loop_control:
loop_var: container_data_timer
- name: Create fact for container_systemd_changes
set_fact:
container_systemd_changes: >-
{{ ((systemd_file|default([])|get_changed_async_task_names) + (systemd_healthcheck|default([])|get_changed_async_task_names) +
(systemd_timer|default([])|get_changed_async_task_names)|unique) }}
- name: "Force systemd daemon reload if a systemd file changed"
systemd:
daemon_reload: true
when:
- (container_systemd_changes | length) > 0
- name: Create fact for container_systemd_restart_list
set_fact:
container_systemd_services_restart_list: "{{ (systemd_file|get_changed_async_task_names(extra=containers_changed|default([]))) }}"
container_systemd_timers_restart_list: "{{ (systemd_timer|get_changed_async_task_names(extra=containers_changed|default([]))) }}"
- (systemd_file|get_changed_async_task_names|length) > 0
- name: "Start or restart systemd services"
systemd:
@ -98,36 +58,10 @@
name: "tripleo_{{ container_sysd_name }}.service"
enabled: true
daemon_reload: false
loop: "{{ container_systemd_services_restart_list }}"
loop: "{{ (systemd_file|get_changed_async_task_names(extra=containers_changed|default([]))) }}"
loop_control:
loop_var: container_sysd_name
register: systemd_service_enable
until: systemd_service_enable.status.Result == "success"
retries: 5
delay: 5
- name: "Enable and start systemd timers"
systemd:
# Restart the timer if it was already running
state: restarted
name: "tripleo_{{ container_timer_name }}_healthcheck.timer"
enabled: true
daemon_reload: false
loop: "{{ container_systemd_timers_restart_list }}"
loop_control:
loop_var: container_timer_name
when:
- not tripleo_container_manage_healthcheck_disabled
register: systemd_healthcheck_enable
until: systemd_healthcheck_enable.status.Result == "success"
retries: 5
delay: 5
- name: "Add systemd requires for healthchecks"
command: "systemctl add-requires tripleo_{{ container_requires_timer_name }}.service tripleo_{{ container_requires_timer_name }}_healthcheck.timer"
loop: "{{ container_systemd_timers_restart_list }}"
loop_control:
loop_var: container_requires_timer_name
when:
- not tripleo_container_manage_healthcheck_disabled

View File

@ -1,10 +0,0 @@
[Unit]
Description=tripleo_{{ lookup('dict', container_data_healthcheck).key }} healthcheck
After=tripleo-container-shutdown.service tripleo_{{ lookup('dict', container_data_healthcheck).key }}.service
Requisite=tripleo_{{ lookup('dict', container_data_healthcheck).key }}.service
[Service]
Type=oneshot
ExecStart=/usr/bin/podman exec --user root {{ lookup('dict', container_data_healthcheck).key }} {{ lookup('dict', container_data_healthcheck).value.healthcheck.test }}
SyslogIdentifier=healthcheck_{{ lookup('dict', container_data_healthcheck).key }}
[Install]
WantedBy=multi-user.target

View File

@ -1,9 +0,0 @@
[Unit]
Description=tripleo_{{ lookup('dict', container_data_timer).key }} container healthcheck
PartOf=tripleo_{{ lookup('dict', container_data_timer).key }}.service
[Timer]
OnActiveSec=120
OnUnitActiveSec={{ lookup('dict', container_data_timer).value.check_interval | default(60) }}
RandomizedDelaySec={{ 45 if lookup('dict', container_data_timer).value.check_interval is not defined else (lookup('dict', container_data_timer).value.check_interval * 3 / 4) | int | abs }}
[Install]
WantedBy=timers.target