[train-squash] tripleo_container_manage: backport systemd optimizations

This is a combination of 2 commits to avoid regressions.
(see https://bugs.launchpad.net/tripleo/+bug/1873249)

This is the 1st commit message:

tripleo_container_manage: optimize systemd services/healthcheck bootstrap

Separate the creation of systemd files from the service restarts so we
don't call systemd too many times, which makes the deployment faster.

It also uses a new filter that reads registered task data to figure out
which systemd files changed and therefore which containers need a restart.

Change-Id: I16596a5b262642a678a8b8b123384fc387f69c70
(cherry picked from commit 761e5cbdd5)

This is the commit message #2:

tripleo_container_manage: add safeguard against wrong healthcheck config

If a container config mistakenly has a healthcheck but no systemd
restart policy, we don't want to manage the healthcheck because it
requires its service to be created.

To prevent that situation, we'll create the healthchecks only if they
are already part of the systemd services list that was created earlier.
For that, we're using the intersect() filter, which returns the
intersection of 2 lists (systemd services and healthchecks to
create).

Adding molecule coverage to test this scenario.

Closes-Bug: #1873249
Change-Id: Id5cc784bae597def0648f07d28b6463b387d2212
(cherry picked from commit 04f16051cc)
This commit is contained in:
Emilien Macchi 2020-04-08 16:10:23 -04:00
parent 0019bb66a3
commit 40669aeee3
10 changed files with 187 additions and 74 deletions

View File

@ -43,7 +43,8 @@ class FilterModule(object):
'get_domain_id': self.get_domain_id,
'get_changed_containers': self.get_changed_containers,
'get_failed_containers': self.get_failed_containers,
'recursive_get_key_from_dict': self.recursive_get_key_from_dict
'recursive_get_key_from_dict': self.recursive_get_key_from_dict,
'get_changed_async_task_names': self.get_changed_async_task_names
}
def subsort(self, dict_to_sort, attribute, null_value=0):
@ -408,3 +409,19 @@ class FilterModule(object):
# not contain failed or finished keys.
continue
return failed
def get_changed_async_task_names(self, data, extra=None):
    """Return a list of ansible resources that changed.

    This filter takes the data registered by a task (data) and returns
    the names of the resources that changed. Every entry of
    data['results'] is expected to hold its loop item as a dictionary
    keyed by resource name; the item is looked up under the key named by
    the entry's 'ansible_loop_var' ('item' when that key is absent).

    :param data: dictionary registered from a task; only its 'results'
                 list is read.
    :param extra: optional list of names that are always included when
                  they show up in a loop item, even if the entry did not
                  change.
    :returns: list of resource names in the order of data['results'];
              the list is not de-duplicated. An empty list is returned
              when data has no 'results' key.
    """
    # Use None rather than a shared mutable [] as the default value.
    forced = [] if extra is None else extra
    # Without a 'results' list there is nothing to inspect; return an empty
    # list instead of failing with a KeyError.
    if 'results' not in data:
        return []
    changed_names = []
    for result in data['results']:
        loop_var = result.get('ansible_loop_var', 'item')
        is_changed = bool(result.get('changed'))
        # Only the keys (resource names) of the loop item matter here.
        changed_names.extend(
            name for name in result[loop_var]
            if is_changed or name in forced
        )
    return changed_names

View File

@ -62,6 +62,21 @@
- tripleo_fedora_healthcheck_active_result.rc == 0
fail_msg: 'tripleo_fedora systemd healthcheck service is not active'
success_msg: 'tripleo_fedora systemd healthcheck service is active'
# The next three tasks verify that tripleo_fedora_bis got no systemd
# integration: both unit paths are stat'ed and then asserted to be absent.
# NOTE(review): presumably this covers a container config that declares a
# healthcheck without a systemd restart policy - confirm against the
# fedora_bis test configuration.
- name: Check if tripleo_fedora_bis has systemd service
stat:
path: /etc/systemd/system/tripleo_fedora_bis.service
register: stat_tripleo_fedora_bis_systemd
- name: Check if tripleo_fedora_bis has systemd healthcheck timer
stat:
path: /etc/systemd/system/tripleo_fedora_bis_healthcheck.timer
register: stat_tripleo_fedora_bis_systemd_timer
# Fails if either the service unit or the healthcheck timer file exists.
- name: Assert that tripleo_fedora_bis has no systemd integration
assert:
that:
- not stat_tripleo_fedora_bis_systemd.stat.exists
- not stat_tripleo_fedora_bis_systemd_timer.stat.exists
fail_msg: 'tripleo_fedora_bis has systemd service'
success_msg: 'tripleo_fedora_bis has no systemd service'
- name: Verify that Fedora bis container was created correctly
block:
- name: Check for fedora_bis container

View File

@ -43,7 +43,8 @@
{
"image": "fedora:latest",
"net": "host",
"command": "sleep 3600"
"command": "sleep 3600",
"healthcheck": { "test": "echo test" }
}
dest: '/tmp/container-configs/fedora_bis.json'
- name: Create a third configuration file for a fedora container

View File

@ -19,15 +19,3 @@
- name: Manage containers asynchronously
include_tasks: podman/create.yml
# We don't want to use async for the systemd tasks or we can have startup
# errors when systemd has to deal with multiple services trying to start
# at the same time. It is more reliable to start them in serial.
- name: Manage container systemd services and healthchecks in serial
include_tasks: podman/systemd.yml
# systemd doesn't have the equivalent of docker unless-stopped.
# Let's force 'always' so containers aren't restarted when stopped by
# systemd, but restarted when in failure.
loop: "{{ batched_container_data | haskey(attribute='restart', value=['always','unless-stopped'], any=True) }}"
loop_control:
loop_var: container_config

View File

@ -26,3 +26,6 @@
loop: "{{ data | batch(tripleo_container_manage_concurrency) | list }}"
loop_control:
loop_var: batched_container_data
# Include the systemd service/healthcheck management tasks. The task name
# is a plain (unquoted) scalar, so it must not end with a stray double quote
# or that quote becomes part of the displayed name.
- name: Manage container systemd services and healthchecks for start_order {{ order }}
  include_tasks: podman/systemd.yml

View File

@ -14,70 +14,111 @@
# License for the specific language governing permissions and limitations
# under the License.
- name: Set container_name and container_sysd facts
- name: Set container_config fact
set_fact:
container_sysd_name: "{{ lookup('dict', container_config).key }}"
container_sysd_data: "{{ lookup('dict', container_config).value }}"
container_config: "{{ data | list | haskey(attribute='restart', value=['always','unless-stopped'], any=True) | default([]) }}"
- name: "Start systemd service for {{ container_sysd_name }}"
- name: Set container_config_healthcheck fact
set_fact:
# Using intersect to prevent a service which isn't controlled by systemd
# but has healthcheck in its configuration (by mistake)
# See https://bugs.launchpad.net/tripleo/+bug/1873249
container_config_healthcheck: "{{ data | list | haskey(attribute='healthcheck') | intersect(container_config) | default([]) }}"
- name: "Manage systemd files"
no_log: "{{ not tripleo_container_manage_debug }}"
block:
- name: "Remove trailing .requires for {{ container_sysd_name }}"
- name: "Remove trailing .requires"
file:
path: "/etc/systemd/system/tripleo_{{ container_sysd_name }}.requires"
path: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_requires).key }}.requires"
state: absent
- name: "Create systemd unit file for {{ container_sysd_name }} service"
loop: "{{ container_config }}"
loop_control:
loop_var: container_data_requires
- name: "Create systemd services files"
template:
src: systemd-service.j2
dest: "/etc/systemd/system/tripleo_{{ container_sysd_name }}.service"
dest: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_unit).key }}.service"
mode: '0644'
owner: root
group: root
register: systemd_file
- name: "Enable and start systemd service for {{ container_sysd_name }}"
systemd:
# Restart the service if it was already running
state: restarted
name: "tripleo_{{ container_sysd_name }}.service"
enabled: true
daemon_reload: true
when:
- systemd_file is changed or container_sysd_name in containers_changed
- name: "Manage systemd healthcheck for {{ container_sysd_name }}"
loop: "{{ container_config }}"
loop_control:
loop_var: container_data_unit
- name: "Create systemd healthcheck files"
when:
- not tripleo_container_manage_healthcheck_disabled
- container_sysd_data.healthcheck is defined
- (container_config_healthcheck | length) > 0
block:
- name: "Create systemd unit file for {{ container_sysd_name }} healthcheck"
- name: "Create systemd unit files healthchecks"
template:
src: systemd-healthcheck.j2
dest: "/etc/systemd/system/tripleo_{{ container_sysd_name }}_healthcheck.service"
dest: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_healthcheck).key }}_healthcheck.service"
mode: '0644'
owner: root
group: root
register: systemd_healthcheck
- name: "Create systemd timer for {{ container_sysd_name }} healthcheck"
loop: "{{ container_config_healthcheck }}"
loop_control:
loop_var: container_data_healthcheck
- name: "Create systemd timers for healthchecks"
template:
src: systemd-timer.j2
dest: "/etc/systemd/system/tripleo_{{ container_sysd_name }}_healthcheck.timer"
dest: "/etc/systemd/system/tripleo_{{ lookup('dict', container_data_timer).key }}_healthcheck.timer"
mode: '0644'
owner: root
group: root
register: systemd_timer
- name: "Enable and start systemd timer for {{ container_sysd_name }}"
systemd:
# Restart the timer if it was already running
state: restarted
name: "tripleo_{{ container_sysd_name }}_healthcheck.timer"
enabled: true
daemon_reload: true
when:
- systemd_healthcheck.changed or systemd_timer.changed
- name: "Add systemd require for {{ container_sysd_name }} healthcheck"
command: "systemctl add-requires tripleo_{{ container_sysd_name }}.service tripleo_{{ container_sysd_name }}_healthcheck.timer"
when:
- systemd_healthcheck.changed or systemd_timer.changed
- name: "Force systemd daemon reload"
systemd:
daemon_reload: true
when:
- systemd_healthcheck.changed or systemd_timer.changed
loop: "{{ container_config_healthcheck }}"
loop_control:
loop_var: container_data_timer
# Collect the names of every container whose service, healthcheck or timer
# unit file changed.
- name: Create fact for container_systemd_changes
  set_fact:
    # Jinja2 filters bind tighter than "+", so "unique" has to be applied to
    # the parenthesised concatenation; a trailing "|unique" on the last
    # operand would only de-duplicate that operand.
    container_systemd_changes: >-
      {{ ((systemd_file | get_changed_async_task_names) +
          (systemd_healthcheck | get_changed_async_task_names) +
          (systemd_timer | get_changed_async_task_names)) | unique }}
# Reload systemd once, and only when container_systemd_changes is not empty.
- name: "Force systemd daemon reload if a systemd file changed"
systemd:
daemon_reload: true
when:
- (container_systemd_changes | length) > 0
# Build the two lists looped over by the restart tasks: names taken from
# the registered systemd_file / systemd_timer results, with
# containers_changed (empty list when undefined) passed as "extra" to the
# get_changed_async_task_names filter.
# Note that the facts set here are container_systemd_services_restart_list
# and container_systemd_timers_restart_list, not the name used in the title.
- name: Create fact for container_systemd_restart_list
set_fact:
container_systemd_services_restart_list: "{{ (systemd_file|get_changed_async_task_names(extra=containers_changed|default([]))) }}"
container_systemd_timers_restart_list: "{{ (systemd_timer|get_changed_async_task_names(extra=containers_changed|default([]))) }}"
# Enable and restart tripleo_<name>.service for every name in
# container_systemd_services_restart_list. daemon_reload is false for
# each item of this loop.
- name: "Start or restart systemd services"
systemd:
# Restart the service if it was already running
state: restarted
name: "tripleo_{{ container_sysd_name }}.service"
enabled: true
daemon_reload: false
loop: "{{ container_systemd_services_restart_list }}"
loop_control:
loop_var: container_sysd_name
# Enable and restart tripleo_<name>_healthcheck.timer for every name in
# container_systemd_timers_restart_list, unless healthchecks are disabled.
- name: "Enable and start systemd timers"
systemd:
# Restart the timer if it was already running
state: restarted
name: "tripleo_{{ container_timer_name }}_healthcheck.timer"
enabled: true
daemon_reload: false
loop: "{{ container_systemd_timers_restart_list }}"
loop_control:
loop_var: container_timer_name
when:
- not tripleo_container_manage_healthcheck_disabled
# For the same list of names, run "systemctl add-requires" so that each
# tripleo_<name>.service requires its tripleo_<name>_healthcheck.timer.
# Skipped when healthchecks are disabled.
- name: "Add systemd requires for healthchecks"
command: "systemctl add-requires tripleo_{{ container_requires_timer_name }}.service tripleo_{{ container_requires_timer_name }}_healthcheck.timer"
loop: "{{ container_systemd_timers_restart_list }}"
loop_control:
loop_var: container_requires_timer_name
when:
- not tripleo_container_manage_healthcheck_disabled

View File

@ -1,10 +1,10 @@
[Unit]
Description=tripleo_{{ container_sysd_name }} healthcheck
After=tripleo-container-shutdown.service tripleo_{{ container_sysd_name }}.service
Requisite=tripleo_{{ container_sysd_name }}.service
Description=tripleo_{{ lookup('dict', container_data_healthcheck).key }} healthcheck
After=tripleo-container-shutdown.service tripleo_{{ lookup('dict', container_data_healthcheck).key }}.service
Requisite=tripleo_{{ lookup('dict', container_data_healthcheck).key }}.service
[Service]
Type=oneshot
ExecStart=/usr/bin/podman exec --user root {{ container_sysd_name }} {{ container_sysd_data.healthcheck.test }}
SyslogIdentifier=healthcheck_{{ container_sysd_name }}
ExecStart=/usr/bin/podman exec --user root {{ lookup('dict', container_data_healthcheck).key }} {{ lookup('dict', container_data_healthcheck).value.healthcheck.test }}
SyslogIdentifier=healthcheck_{{ lookup('dict', container_data_healthcheck).key }}
[Install]
WantedBy=multi-user.target

View File

@ -1,21 +1,21 @@
[Unit]
Description={{ container_sysd_name }} container
Description={{ lookup('dict', container_data_unit).key }} container
After=tripleo-container-shutdown.service
Wants={{ container_sysd_data.depends_on | default([]) | join(',') }}
Wants={{ lookup('dict', container_data_unit).value.depends_on | default([]) | join(',') }}
[Service]
Restart=always
{% if container_sysd_data.depends_on is defined and (container_sysd_data.depends_on | length > 0) and podman_drop_in | default('false') %}
ExecStart=/usr/libexec/tripleo-start-podman-container {{ container_sysd_name }}
{% if lookup('dict', container_data_unit).value.depends_on is defined and (lookup('dict', container_data_unit).value.depends_on | length > 0) and podman_drop_in | default('false') %}
ExecStart=/usr/libexec/tripleo-start-podman-container {{ lookup('dict', container_data_unit).key }}
{% else %}
ExecStart=/usr/bin/podman start {{ container_sysd_name }}
ExecStart=/usr/bin/podman start {{ lookup('dict', container_data_unit).key }}
{% endif %}
ExecReload=/usr/bin/podman kill --signal HUP {{ container_sysd_name }}
ExecStop=/usr/bin/podman stop -t {{ container_sysd_data.stop_grace_period | default(10) | int }} {{ container_sysd_name }}
ExecReload=/usr/bin/podman kill --signal HUP {{ lookup('dict', container_data_unit).key }}
ExecStop=/usr/bin/podman stop -t {{ lookup('dict', container_data_unit).value.stop_grace_period | default(10) | int }} {{ lookup('dict', container_data_unit).key }}
KillMode=none
Type=forking
PIDFile=/var/run/{{ container_sysd_name }}.pid
{% if container_sysd_data.systemd_exec_flags is defined %}
{% for s_flag, s_value in container_sysd_data.systemd_exec_flags.items() %}
PIDFile=/var/run/{{ lookup('dict', container_data_unit).key }}.pid
{% if lookup('dict', container_data_unit).value.systemd_exec_flags is defined %}
{% for s_flag, s_value in lookup('dict', container_data_unit).value.systemd_exec_flags.items() %}
{{ s_flag }}={{ s_value }}
{% endfor %}
{% endif %}

View File

@ -1,9 +1,9 @@
[Unit]
Description=tripleo_{{ container_sysd_name }} container healthcheck
PartOf=tripleo_{{ container_sysd_name }}.service
Description=tripleo_{{ lookup('dict', container_data_timer).key }} container healthcheck
PartOf=tripleo_{{ lookup('dict', container_data_timer).key }}.service
[Timer]
OnActiveSec=120
OnUnitActiveSec={{ container_sysd_data.check_interval | default(60) }}
RandomizedDelaySec={{ 45 if container_sysd_data.check_interval is not defined else (container_sysd_data.check_interval * 3 / 4) | int | abs }}
OnUnitActiveSec={{ lookup('dict', container_data_timer).value.check_interval | default(60) }}
RandomizedDelaySec={{ 45 if lookup('dict', container_data_timer).value.check_interval is not defined else (lookup('dict', container_data_timer).value.check_interval * 3 / 4) | int | abs }}
[Install]
WantedBy=timers.target

View File

@ -884,3 +884,51 @@ class TestHelperFilters(tests_base.TestCase):
expected_list = ['memcached', 'mysql']
result = self.filters.get_failed_containers(data)
self.assertEqual(result, expected_list)
def test_get_changed_async_task_names(self):
    """Changed entries and entries named in ``extra`` are both returned."""

    def loop_result(name, changed, loop_var=None):
        # Build one registered loop result; without an explicit loop
        # variable the item is stored under the "item" key.
        entry = {"changed": changed, "failed": False}
        if loop_var is not None:
            entry["ansible_loop_var"] = loop_var
        entry[loop_var or "item"] = {name: {"config": "foo"}}
        return entry

    data = {
        'results': [
            loop_result('keystone', False, loop_var="systemd_loop"),
            loop_result('mysql', False, loop_var="systemd_loop"),
            loop_result('haproxy', True, loop_var="systemd_loop"),
            loop_result('memcached', True),
        ],
    }
    # mysql did not change but is forced in through ``extra``;
    # keystone neither changed nor is listed, so it is left out.
    changed_names = self.filters.get_changed_async_task_names(
        data=data, extra=['mysql'])
    self.assertEqual(changed_names, ['mysql', 'haproxy', 'memcached'])