diff --git a/doc/source/roles/role-tripleo_container_manage.rst b/doc/source/roles/role-tripleo_container_manage.rst index 0e7825534..62c23609d 100644 --- a/doc/source/roles/role-tripleo_container_manage.rst +++ b/doc/source/roles/role-tripleo_container_manage.rst @@ -117,6 +117,43 @@ Roles variables | | | Must be a list. e.g. [0,3] | +------------------------------------------------+-----------------------------+----------------------------+ +Healthchecks +~~~~~~~~~~~~ + +Previously, the container healthcheck was implemented by a systemd timer which +would run ``podman exec`` to determine if a given container was healthy. +Now, we are using the native healthcheck interface in Podman, which is easier +to integrate with and consume. + +We are now using the native healthcheck interface in Podman, which is easier to +integrate with and consume. + +To check if a container (e.g. keystone) is healthy, run the following command: + +.. code-block:: bash + + $ sudo podman healthcheck run keystone + +The return code should be 0 and "healthy" should be printed as the output. +One can also use the ``podman inspect keystone`` output to figure out that +the healthcheck is periodically running and healthy: + +.. code-block:: bash + + "Healthcheck": { + "Status": "healthy", + "FailingStreak": 0, + "Log": [ + { + "Start": "2020-04-14T18:48:57.272180578Z", + "End": "2020-04-14T18:48:57.806659104Z", + "ExitCode": 0, + "Output": "" + }, + (...) + ] + } + Debug ~~~~~ diff --git a/releasenotes/notes/podman_heathcheck_v2-b3d4db0aeaf03519.yaml b/releasenotes/notes/podman_heathcheck_v2-b3d4db0aeaf03519.yaml new file mode 100644 index 000000000..0112314e5 --- /dev/null +++ b/releasenotes/notes/podman_heathcheck_v2-b3d4db0aeaf03519.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Stop using systemd to manage container healthchecks and use native podman + healthchecks which are easier to consume than systemd. 
+ The tripleo_container_manage role will take care of cleaning up the old + systemd healthchecks if they exist. diff --git a/tripleo_ansible/ansible_plugins/modules/podman_container.py b/tripleo_ansible/ansible_plugins/modules/podman_container.py index a42d3c8c8..b9083abf7 100644 --- a/tripleo_ansible/ansible_plugins/modules/podman_container.py +++ b/tripleo_ansible/ansible_plugins/modules/podman_container.py @@ -1055,7 +1055,7 @@ class PodmanModuleParams: return c def addparam_healthcheck(self, c): - return c + ['--healthcheck', self.params['healthcheck']] + return c + ['--healthcheck-command', self.params['healthcheck']] def addparam_healthcheck_interval(self, c): return c + ['--healthcheck-interval', @@ -1280,6 +1280,7 @@ class PodmanDefaults: "env_host": False, "etc_hosts": {}, "group_add": [], + "healthcheck": "", "ipc": "", "kernelmemory": "0", "log_driver": "k8s-file", @@ -1519,6 +1520,18 @@ class PodmanContainerDiff: after = self.params['group_add'] return self._diff_update_and_compare('group_add', before, after) + # Healthcheck is only defined in container config if a healthcheck + # was configured; otherwise the config key isn't part of the config. + def diffparam_healthcheck(self): + if 'healthcheck' in self.info['config']: + # the "test" key is a list of 2 items where the first one is + # "CMD-SHELL" and the second one is the actual healthcheck command. + before = self.info['config']['healthcheck']['test'][1] + else: + before = '' + after = self.params['healthcheck'] or before + return self._diff_update_and_compare('healthcheck', before, after) + # Because of hostname is random generated, this parameter has partial idempotency only. 
def diffparam_hostname(self): before = self.info['config']['hostname'] diff --git a/tripleo_ansible/roles/tripleo_container_manage/molecule/default/converge.yml b/tripleo_ansible/roles/tripleo_container_manage/molecule/default/converge.yml index 8052eb174..8b2e7e4a0 100644 --- a/tripleo_ansible/roles/tripleo_container_manage/molecule/default/converge.yml +++ b/tripleo_ansible/roles/tripleo_container_manage/molecule/default/converge.yml @@ -15,6 +15,46 @@ # under the License. +- name: Create fedora container from /tmp/container-configs with old healthcheck + become: true + hosts: all + gather_facts: false + vars: + tripleo_container_manage_config: '/tmp/container-configs' + tripleo_container_manage_healthcheck_disabled: true + tripleo_container_manage_debug: true + tripleo_container_manage_config_patterns: 'fedora.json' + tripleo_container_manage_systemd_order: true + tasks: + - include_role: + name: tripleo_container_manage + post_tasks: + - name: Verify that Fedora container was created correctly and manually create old healthcheck for migration testing + when: + - not ansible_check_mode|bool + block: + # Reproduce what was done before to create and enable healthchecks + - name: "Enable and start systemd timers" + systemd: + state: started + name: "tripleo_fedora_healthcheck.timer" + enabled: true + daemon_reload: false + - name: "Add systemd requires for healthchecks" + command: "systemctl add-requires tripleo_fedora.service tripleo_fedora_healthcheck.timer" + # Check that migration is ready to be tested + - name: Check for fedora container + command: podman container exists fedora + - name: Check if tripleo_fedora systemd healthcheck service is active + command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer + register: tripleo_fedora_healthcheck_active_result + - name: Assert that tripleo_fedora systemd healthcheck service is active + assert: + that: + - tripleo_fedora_healthcheck_active_result.rc == 0 + fail_msg: 'tripleo_fedora systemd 
healthcheck service is not active' + success_msg: 'tripleo_fedora systemd healthcheck service is active' + - name: Create all containers from /tmp/container-configs become: true hosts: all @@ -53,30 +93,12 @@ - tripleo_fedora_active_result.rc == 0 fail_msg: 'tripleo_fedora systemd service is not active' success_msg: 'tripleo_fedora systemd service is active' - - name: Check if tripleo_fedora systemd healthcheck service is active - command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer - register: tripleo_fedora_healthcheck_active_result - - name: Assert that tripleo_fedora systemd healthcheck service is active + - name: Check if tripleo_fedora healthcheck is active and healthy assert: that: - - tripleo_fedora_healthcheck_active_result.rc == 0 - fail_msg: 'tripleo_fedora systemd healthcheck service is not active' - success_msg: 'tripleo_fedora systemd healthcheck service is active' - - name: Check if tripleo_fedora_bis has systemd service - stat: - path: /etc/systemd/system/tripleo_fedora_bis.service - register: stat_tripleo_fedora_bis_systemd - - name: Check if tripleo_fedora_bis has systemd healthcheck timer - stat: - path: /etc/systemd/system/tripleo_fedora_bis_healthcheck.timer - register: stat_tripleo_fedora_bis_systemd_timer - - name: Assert that tripleo_fedora_bis has no systemd integration - assert: - that: - - not stat_tripleo_fedora_bis_systemd.stat.exists - - not stat_tripleo_fedora_bis_systemd_timer.stat.exists - fail_msg: 'tripleo_fedora_bis has systemd service' - success_msg: 'tripleo_fedora_bis has no systemd service' + - "'healthy' in fedora_infos.containers.0.State.Healthcheck.Status" + fail_msg: 'fedora container healthcheck is not healthy' + success_msg: 'fedora container healthcheck is healthy' - name: Verify that Fedora bis container was created correctly block: - name: Check for fedora_bis container @@ -180,15 +202,12 @@ - tripleo_fedora_active_result.rc == 0 fail_msg: 'tripleo_fedora systemd service is not active' success_msg: 
'tripleo_fedora systemd service is active' - - name: Check if tripleo_fedora systemd healthcheck service is active - command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer - register: tripleo_fedora_healthcheck_active_result - - name: Assert that tripleo_fedora systemd healthcheck service is active + - name: Check if tripleo_fedora healthcheck is active and healthy assert: that: - - tripleo_fedora_healthcheck_active_result.rc == 0 - fail_msg: 'tripleo_fedora systemd healthcheck service is not active' - success_msg: 'tripleo_fedora systemd healthcheck service is active' + - "'healthy' in fedora_infos.containers.0.State.Healthcheck.Status" + fail_msg: 'fedora container healthcheck is not healthy' + success_msg: 'fedora container healthcheck is healthy' - name: Check for fedora_bis container command: podman container exists fedora_bis - name: Check for fedora_three container @@ -212,6 +231,10 @@ block: - name: Check for fedora container command: podman container exists fedora + - name: Gather facts about fedora container + podman_container_info: + name: fedora + register: fedora_infos - name: Check if tripleo_fedora systemd service is active command: systemctl is-active --quiet tripleo_fedora register: tripleo_fedora_active_result @@ -221,15 +244,12 @@ - tripleo_fedora_active_result.rc == 0 fail_msg: 'tripleo_fedora systemd service is not active' success_msg: 'tripleo_fedora systemd service is active' - - name: Check if tripleo_fedora systemd healthcheck service is active - command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer - register: tripleo_fedora_healthcheck_active_result - - name: Assert that tripleo_fedora systemd healthcheck service is active + - name: Check if tripleo_fedora healthcheck is active and healthy assert: that: - - tripleo_fedora_healthcheck_active_result.rc == 0 - fail_msg: 'tripleo_fedora systemd healthcheck service is not active' - success_msg: 'tripleo_fedora systemd healthcheck service is active' + - "'healthy' 
in fedora_infos.containers.0.State.Healthcheck.Status" + fail_msg: 'fedora container healthcheck is not healthy' + success_msg: 'fedora container healthcheck is healthy' - name: Check for fedora_bis container command: podman container exists fedora_bis - name: Check for fedora_three container @@ -264,10 +284,6 @@ command: systemctl is-active --quiet tripleo_fedora register: tripleo_fedora_active_result failed_when: tripleo_fedora_active_result.rc == 0 - - name: Check if tripleo_fedora systemd healthcheck service is still active - command: systemctl is-active --quiet tripleo_fedora_healthcheck.timer - register: tripleo_fedora_healthcheck_active_result - failed_when: tripleo_fedora_healthcheck_active_result.rc == 0 - name: Check for fedora_bis container command: podman container exists fedora_bis - name: Check for fedora_three container diff --git a/tripleo_ansible/roles/tripleo_container_manage/molecule/default/prepare.yml b/tripleo_ansible/roles/tripleo_container_manage/molecule/default/prepare.yml index a7c4d4528..531eed45d 100644 --- a/tripleo_ansible/roles/tripleo_container_manage/molecule/default/prepare.yml +++ b/tripleo_ansible/roles/tripleo_container_manage/molecule/default/prepare.yml @@ -17,6 +17,7 @@ - name: Prepare hosts: all + become: true roles: - role: test_deps test_deps_extra_packages: @@ -34,6 +35,7 @@ "net": "host", "command": "sleep 3600", "restart": "always", + "check_interval": "500s", "healthcheck": { "test": "echo test" } } dest: '/tmp/container-configs/fedora.json' @@ -43,8 +45,7 @@ { "image": "fedora:latest", "net": "host", - "command": "sleep 3600", - "healthcheck": { "test": "echo test" } + "command": "sleep 3600" } dest: '/tmp/container-configs/fedora_bis.json' - name: Create a third configuration file for a fedora container @@ -56,3 +57,34 @@ "command": "sleep 3600" } dest: '/tmp/container-configs/fedora_three.json' + - name: Create old healthcheck service for fedora container + copy: + content: | + [Unit] + Description=tripleo_fedora 
healthcheck + Requisite=tripleo_fedora.service + [Service] + Type=oneshot + ExecStart=/usr/bin/podman exec --user root fedora sleep 3600 }} + [Install] + WantedBy=multi-user.target + dest: '/etc/systemd/system/tripleo_fedora_healthcheck.service' + mode: '0644' + owner: root + group: root + - name: Create old healthcheck timer for fedora container + copy: + content: | + [Unit] + Description=tripleo_fedora container healthcheck + PartOf=tripleo_fedora.service + [Timer] + OnActiveSec=120 + OnUnitActiveSec=60 + RandomizedDelaySec=5 + [Install] + WantedBy=timers.target + dest: '/etc/systemd/system/tripleo_fedora_healthcheck.timer' + mode: '0644' + owner: root + group: root diff --git a/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/cleanup_healthcheck.yml b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/cleanup_healthcheck.yml new file mode 100644 index 000000000..ad32d5533 --- /dev/null +++ b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/cleanup_healthcheck.yml @@ -0,0 +1,48 @@ +--- +# Copyright 2020 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +- name: "Stop and disable systemd timer for {{ container_systemd_healthcheck_name }}" + systemd: + state: stopped + name: "tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.timer" + enabled: false + daemon_reload: false + +- name: "Remove systemd healthcheck files for {{ container_systemd_healthcheck_name }}" + file: + path: "{{ container_systemd_healthcheck_file }}" + state: absent + loop: + - "/etc/systemd/system/tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.service" + - "/etc/systemd/system/tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.timer" + loop_control: + loop_var: container_systemd_healthcheck_file + +- name: Force systemd to re-read config after healthcheck removals + systemd: + daemon_reload: true + +- name: "Check if {{ container_systemd_healthcheck_name }} healthcheck is not running" + command: "systemctl is-active --quiet tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.timer" + register: tripleo_healthcheck_result + failed_when: + - tripleo_healthcheck_result.rc == 0 + +- name: "Check if {{ container_systemd_healthcheck_name }} service is running and healthy" + command: "systemctl is-active --quiet tripleo_{{ container_systemd_healthcheck_name }}.service" + register: tripleo_service_result + failed_when: + - tripleo_service_result.rc != 0 diff --git a/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/create.yml b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/create.yml index 92ef2e041..1c5b343d1 100644 --- a/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/create.yml +++ b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/create.yml @@ -47,6 +47,12 @@ env_file: "{{ lookup('dict', container_data).value.env_file | default(omit) }}" etc_hosts: "{{ lookup('dict', container_data).value.extra_hosts | default({}) }}" group_add: "{{ lookup('dict', container_data).value.group_add | default(omit) }}" + healthcheck: >- + {{ (not 
tripleo_container_manage_healthcheck_disabled | bool) | + ternary((lookup('dict', container_data).value.healthcheck.test|default(omit)), omit) }} + healthcheck_interval: >- + {{ lookup('dict', container_data).value.check_interval|default('60s') + if lookup('dict', container_data).value.healthcheck is defined else omit }} hostname: "{{ lookup('dict', container_data).value.hostname | default(omit) }}" image: "{{ lookup('dict', container_data).value.image }}" interactive: "{{ lookup('dict', container_data).value.interactive | default(false) }}" diff --git a/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/stat_healthcheck.yml b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/stat_healthcheck.yml new file mode 100644 index 000000000..85e9d2e84 --- /dev/null +++ b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/stat_healthcheck.yml @@ -0,0 +1,25 @@ +--- +# Copyright 2020 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +- name: "Check if systemd healthcheck exists for {{ container_systemd_healthcheck_name }}" + stat: + path: "/etc/systemd/system/tripleo_{{ container_systemd_healthcheck_name }}_healthcheck.service" + register: container_systemd_healthcheck_stat + +- name: "Cleanup systemd healthcheck for {{ container_systemd_healthcheck_name }}" + when: + - (container_systemd_healthcheck_stat.stat.exists|bool) + include_tasks: podman/cleanup_healthcheck.yml diff --git a/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/systemd.yml b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/systemd.yml index 09d12f342..bd7109f39 100644 --- a/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/systemd.yml +++ b/tripleo_ansible/roles/tripleo_container_manage/tasks/podman/systemd.yml @@ -18,13 +18,6 @@ set_fact: container_config: "{{ all_containers_hash | dict_to_list | haskey(attribute='restart', value=['always','unless-stopped'], any=True) | default([]) }}" -- name: Set container_config_healthcheck fact - set_fact: - # Using intersect to prevent a service which isn't controlled by systemd - # but has healthcheck in its configuration (by mistake) - # See https://bugs.launchpad.net/tripleo/+bug/1873249 - container_config_healthcheck: "{{ all_containers_hash | dict_to_list | haskey(attribute='healthcheck') | intersect(container_config) | default([]) }}" - - name: "Manage systemd files" no_log: "{{ not tripleo_container_manage_debug }}" block: @@ -35,6 +28,11 @@ loop: "{{ container_config }}" loop_control: loop_var: container_data_requires + - name: "Cleanup systemd healthchecks" + when: + - not tripleo_container_manage_healthcheck_disabled + include: podman/stat_healthcheck.yml container_systemd_healthcheck_name="{{ lookup('dict', item).key }}" + loop: "{{ container_config }}" - name: "Create systemd services files" template: src: systemd-service.j2 @@ -46,50 +44,12 @@ loop: "{{ container_config }}" loop_control: loop_var: container_data_unit - - name: "Create 
systemd healthcheck files" - when: - - not tripleo_container_manage_healthcheck_disabled - - (container_config_healthcheck | length) > 0 - block: - - name: "Create systemd unit files healthchecks" - template: - src: systemd-healthcheck.j2 - dest: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_healthcheck).key }}_healthcheck.service" - mode: '0644' - owner: root - group: root - register: systemd_healthcheck - loop: "{{ container_config_healthcheck }}" - loop_control: - loop_var: container_data_healthcheck - - name: "Create systemd timers for healthchecks" - template: - src: systemd-timer.j2 - dest: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_timer).key }}_healthcheck.timer" - mode: '0644' - owner: root - group: root - register: systemd_timer - loop: "{{ container_config_healthcheck }}" - loop_control: - loop_var: container_data_timer - -- name: Create fact for container_systemd_changes - set_fact: - container_systemd_changes: >- - {{ ((systemd_file|default([])|get_changed_async_task_names) + (systemd_healthcheck|default([])|get_changed_async_task_names) + - (systemd_timer|default([])|get_changed_async_task_names)|unique) }} - name: "Force systemd daemon reload if a systemd file changed" systemd: daemon_reload: true when: - - (container_systemd_changes | length) > 0 - -- name: Create fact for container_systemd_restart_list - set_fact: - container_systemd_services_restart_list: "{{ (systemd_file|get_changed_async_task_names(extra=containers_changed|default([]))) }}" - container_systemd_timers_restart_list: "{{ (systemd_timer|get_changed_async_task_names(extra=containers_changed|default([]))) }}" + - (systemd_file|get_changed_async_task_names|length) > 0 - name: "Start or restart systemd services" systemd: @@ -98,36 +58,10 @@ name: "tripleo_{{ container_sysd_name }}.service" enabled: true daemon_reload: false - loop: "{{ container_systemd_services_restart_list }}" + loop: "{{ 
(systemd_file|get_changed_async_task_names(extra=containers_changed|default([]))) }}" loop_control: loop_var: container_sysd_name register: systemd_service_enable until: systemd_service_enable.status.Result == "success" retries: 5 delay: 5 - -- name: "Enable and start systemd timers" - systemd: - # Restart the timer if it was already running - state: restarted - name: "tripleo_{{ container_timer_name }}_healthcheck.timer" - enabled: true - daemon_reload: false - loop: "{{ container_systemd_timers_restart_list }}" - loop_control: - loop_var: container_timer_name - when: - - not tripleo_container_manage_healthcheck_disabled - register: systemd_healthcheck_enable - until: systemd_healthcheck_enable.status.Result == "success" - retries: 5 - delay: 5 - - -- name: "Add systemd requires for healthchecks" - command: "systemctl add-requires tripleo_{{ container_requires_timer_name }}.service tripleo_{{ container_requires_timer_name }}_healthcheck.timer" - loop: "{{ container_systemd_timers_restart_list }}" - loop_control: - loop_var: container_requires_timer_name - when: - - not tripleo_container_manage_healthcheck_disabled diff --git a/tripleo_ansible/roles/tripleo_container_manage/templates/systemd-healthcheck.j2 b/tripleo_ansible/roles/tripleo_container_manage/templates/systemd-healthcheck.j2 deleted file mode 100644 index f530e63dc..000000000 --- a/tripleo_ansible/roles/tripleo_container_manage/templates/systemd-healthcheck.j2 +++ /dev/null @@ -1,10 +0,0 @@ -[Unit] -Description=tripleo_{{ lookup('dict', container_data_healthcheck).key }} healthcheck -After=tripleo-container-shutdown.service tripleo_{{ lookup('dict', container_data_healthcheck).key }}.service -Requisite=tripleo_{{ lookup('dict', container_data_healthcheck).key }}.service -[Service] -Type=oneshot -ExecStart=/usr/bin/podman exec --user root {{ lookup('dict', container_data_healthcheck).key }} {{ lookup('dict', container_data_healthcheck).value.healthcheck.test }} -SyslogIdentifier=healthcheck_{{ 
lookup('dict', container_data_healthcheck).key }} -[Install] -WantedBy=multi-user.target diff --git a/tripleo_ansible/roles/tripleo_container_manage/templates/systemd-timer.j2 b/tripleo_ansible/roles/tripleo_container_manage/templates/systemd-timer.j2 deleted file mode 100644 index 399569fef..000000000 --- a/tripleo_ansible/roles/tripleo_container_manage/templates/systemd-timer.j2 +++ /dev/null @@ -1,9 +0,0 @@ -[Unit] -Description=tripleo_{{ lookup('dict', container_data_timer).key }} container healthcheck -PartOf=tripleo_{{ lookup('dict', container_data_timer).key }}.service -[Timer] -OnActiveSec=120 -OnUnitActiveSec={{ lookup('dict', container_data_timer).value.check_interval | default(60) }} -RandomizedDelaySec={{ 45 if lookup('dict', container_data_timer).value.check_interval is not defined else (lookup('dict', container_data_timer).value.check_interval * 3 / 4) | int | abs }} -[Install] -WantedBy=timers.target