From c2edd31a3108ddcfe640f1acd7cbde09e93df661 Mon Sep 17 00:00:00 2001 From: Emilien Macchi Date: Mon, 29 Jun 2020 08:52:54 -0400 Subject: [PATCH] Introduce an Action Plugin to fetch container infos Instead of running a bunch of tasks to figure out what container commands have been run, which ones did not terminate after 5 minutes, and which ones failed or finished with a wrong exit code, we now have an action plugin that will do it faster and with better logging. It is faster because it reduces the number of tasks. Better logging is provided: all errors are now displayed during a run and the task fails at the end. Check mode is supported. Re-using tripleo_container_manage role for molecule testing. Co-Authored-By: Alex Schultz Co-Authored-By: Kevin Carter Change-Id: Ie7f8c9cceaf9540d7d33a9bb5f33258c46185e77 (cherry picked from commit 19774d0be436034ff2d5510d9cea2abbee296af1) --- .../action/container_status.py | 334 ++++++++++++++++++ .../molecule/default/playbook.yml | 2 +- .../tasks/podman/check_exit_code.yml | 51 --- .../tasks/podman/create.yml | 47 +-- .../tasks/podman/get_commands_create.yml | 29 -- zuul.d/molecule.yaml | 1 + 6 files changed, 347 insertions(+), 117 deletions(-) create mode 100644 tripleo_ansible/ansible_plugins/action/container_status.py delete mode 100644 tripleo_ansible/roles/tripleo-container-manage/tasks/podman/check_exit_code.yml delete mode 100644 tripleo_ansible/roles/tripleo-container-manage/tasks/podman/get_commands_create.yml diff --git a/tripleo_ansible/ansible_plugins/action/container_status.py b/tripleo_ansible/ansible_plugins/action/container_status.py new file mode 100644 index 000000000..43e6d9ff6 --- /dev/null +++ b/tripleo_ansible/ansible_plugins/action/container_status.py @@ -0,0 +1,334 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright 2020 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from __future__ import absolute_import, division, print_function +__metaclass__ = type + +import copy +import tenacity +import yaml + +from ansible.errors import AnsibleActionFail +from ansible.plugins.action import ActionBase +from ansible.utils.display import Display + +DISPLAY = Display() + +# Default delay/retries used to fetch containers status and wait for them to be +# finished. +DELAY = 10 +RETRIES = 30 +TIMEOUT = DELAY * RETRIES + +ANSIBLE_METADATA = { + 'metadata_version': '1.1', + 'status': ['preview'], + 'supported_by': 'community' +} + +DOCUMENTATION = """ +module: container_status +author: + - "TripleO team" +version_added: '2.9' +short_description: Check and report containers status +notes: [] +description: + - For each container that isn't an exec or a container supposed to be + controlled by systemd, we expect it to terminate with a return code. + This module will check that code and make sure it's correct. If not, it + will report the failure for easier debug. +requirements: + - None +options: + container_async_results: + description: + - Async results of a podman_container invocation. + type: list + container_data: + description: + - List of dictionaries which have the container configurations. + type: list + valid_exit_codes: + description: + - List of valid container exit codes. + default: [] + type: list + debug: + description: + - Whether or not debug is enabled. 
+    default: False
+    type: bool
+"""
+EXAMPLES = """
+- name: Check containers status
+  container_status:
+    container_async_results: "{{ create_async_poll_results.results }}"
+    container_data:
+      - keystone:
+          image: docker.io/keystone
+      - mysql_bootstrap:
+          image: docker.io/mysql
+    valid_exit_codes:
+      - 0
+      - 2
+"""
+RETURN = """
+changed_containers:
+  description: List of containers which changed.
+  returned: always
+  type: list
+  sample:
+    - keystone
+    - mysql
+commands:
+  description: List of container cli commands that would be run.
+  returned: always
+  type: list
+  sample:
+    - podman rm -f keystone
+    - podman run keystone
+"""
+
+
+class ActionModule(ActionBase):
+    """Action plugin checking and reporting the status of containers.
+
+    It inspects the async results of the podman_container invocations and,
+    outside of check mode, waits for the containers which are expected to
+    terminate and validates their exit code. All the errors are displayed
+    and the task fails once, at the end.
+    """
+
+    # Options accepted by the plugin, parsed from DOCUMENTATION. An option
+    # documented without a "default" key is considered required.
+    _VALID_ARGS = yaml.safe_load(DOCUMENTATION)['options']
+
+    def _get_args(self):
+        """Return the task arguments validated against DOCUMENTATION.
+
+        :returns: Dictionary of option names and their values.
+        :raises AnsibleActionFail: when a required option is not set.
+        """
+        missing = []
+        args = {}
+
+        for option, vals in self._VALID_ARGS.items():
+            if 'default' in vals:
+                # Copy the default so a caller can never mutate the value
+                # shared at class level through _VALID_ARGS.
+                args[option] = self._task.args.get(
+                    option, copy.deepcopy(vals['default']))
+            elif self._task.args.get(option) is None:
+                missing.append(option)
+            else:
+                args[option] = self._task.args[option]
+
+        if missing:
+            raise AnsibleActionFail('Missing required parameters: {}'.format(
+                ', '.join(missing)))
+        return args
+
+    def _get_containers_to_check(self, data):
+        """Return a list of containers that we need to check.
+
+        Given some container_data, figure out what containers terminate with
+        a return code so later we can check that code.
+
+        :param data: List of dictionaries, each one mapping a container name
+                     to its configuration.
+        :returns: List of container names that need to be checked.
+        """
+        containers = []
+        for container in data:
+            for name, values in container.items():
+                if 'action' in values or 'restart' in values:
+                    continue
+                if 'image' in values:
+                    # We assume that container configs that don't have a
+                    # restart policy nor action (used for podman exec) but
+                    # have an image set, will run something and then exit
+                    # with a return code.
+                    containers.append(name)
+        if self.debug and containers:
+            DISPLAY.display('These containers are supposed to terminate with '
+                            'a valid exit code and will be checked: '
+                            '{}'.format(containers))
+        return containers
+
+    def _get_create_commands(self, results):
+        """Return a list of commands that were executed by container tool.
+
+        :param results: Ansible task results.
+        :returns commands: List of commands.
+        """
+        commands = []
+        for item in results:
+            if item.get('changed'):
+                # A changed item without recorded actions contributes no
+                # command instead of aborting the report with a KeyError.
+                commands.extend(item.get('podman_actions', []))
+        return commands
+
+    def _is_container_running(self, container):
+        """Return True if a container has Running State.
+
+        :params container: Dictionary for container infos.
+        :returns running: Boolean of container running status.
+        """
+        # "State" can be absent or null; both mean "not running" here.
+        state = container.get('State') or {}
+        return state.get('Running', False)
+
+    def _get_container_infos(self, containers, task_vars):
+        """Return container infos.
+
+        :params containers: List of containers.
+        :params task_vars: Dictionary of Ansible tasks variables.
+        :returns container_results: List of container infos.
+        :raises AnsibleActionFail: when podman_container_info does not
+                                   return any "containers" key.
+        """
+        tvars = copy.deepcopy(task_vars)
+        result = self._execute_module(
+            module_name='podman_container_info',
+            module_args=dict(name=containers),
+            task_vars=tvars
+        )
+        if 'containers' not in result:
+            # Report the module output rather than a bare KeyError.
+            raise AnsibleActionFail(
+                'Failed to get container infos for {}: {}'.format(
+                    containers, result.get('msg', result)))
+        return list(result['containers'])
+
+    @tenacity.retry(
+        reraise=True,
+        stop=tenacity.stop_after_attempt(RETRIES),
+        wait=tenacity.wait_fixed(DELAY)
+    )
+    def _fetch_container_state(self, containers, task_vars):
+        """Return container states of finished containers with retries.
+
+        The call is retried RETRIES times, waiting DELAY seconds between
+        attempts, for as long as one of the containers is still running.
+
+        :params containers: List of containers.
+        :params task_vars: Dictionary of Ansible tasks variables.
+        :returns container_results: List of container infos.
+        :raises AnsibleActionFail: when a container is still running after
+                                   the last attempt.
+        """
+        containers_results = self._get_container_infos(containers, task_vars)
+        for container in containers_results:
+            name = container.get('Name')
+            if self._is_container_running(container):
+                raise AnsibleActionFail('Container {} has not finished yet, '
+                                        'retrying...'.format(name))
+        return containers_results
+
+    def _check_container_state(self, containers, exit_codes, task_vars):
+        """Return a tuple of running and failed containers.
+
+        :params containers: List of containers to check.
+        :params exit_codes: List of valid exit codes.
+        :params task_vars: Dictionary of Ansible tasks variables.
+        :returns running, failed: Tuple of lists.
+        """
+        running = []
+        failed = []
+        try:
+            # When every container finished on time the infos returned here
+            # are final, there is no need to run the module again.
+            containers_results = self._fetch_container_state(containers,
+                                                             task_vars)
+        except AnsibleActionFail:
+            # We fail at the end with all the other infos
+            if self.debug:
+                DISPLAY.display('One or more containers did not finish on '
+                                'time, the failure will be reported later.')
+            containers_results = self._get_container_infos(containers,
+                                                           task_vars)
+        for container in containers_results:
+            container_name = container.get('Name')
+            container_state = container.get('State') or {}
+            if self._is_container_running(container):
+                running.append(container_name)
+            elif container_state.get('ExitCode') not in exit_codes:
+                failed.append(container_name)
+        return (running, failed)
+
+    def _check_errors_in_ansible_async_results(self, results):
+        """Get a tuple with changed and failed containers.
+
+        :param results: Ansible results from "Check podman create status"
+        :returns: Tuple of containers that changed or failed
+        """
+        changed = []
+        failed = []
+        # if Ansible is run in check mode, the async_results items will
+        # not contain failed or finished keys, so there is nothing to parse.
+        if self._play_context.check_mode:
+            return (changed, failed)
+        for item in results:
+            async_result_item = item['create_async_result_item']
+            if item['changed']:
+                # container_data is keyed by container name.
+                changed.extend(async_result_item['container_data'])
+            if (item['failed'] or not item['finished']
+                    or async_result_item.get('stderr', '') != ''):
+                failed.extend(async_result_item['container_data'])
+        return (changed, failed)
+
+    def run(self, tmp=None, task_vars=None):
+        """Check the containers and return what changed.
+
+        :param tmp: Deprecated Ansible parameter, unused.
+        :param task_vars: Dictionary of Ansible tasks variables.
+        :returns: Task result with the "changed_containers", "commands" and
+                  "changed" keys set.
+        :raises AnsibleActionFail: when a required argument is missing or
+                                   when at least one container failed, is
+                                   still running or could not be created.
+        """
+        self._supports_check_mode = True
+        self.changed = False
+        self.changed_containers = []
+        running = []
+        failed = []
+
+        if task_vars is None:
+            task_vars = dict()
+        result = super(ActionModule, self).run(tmp, task_vars)
+        del tmp
+        # parse args
+        args = self._get_args()
+
+        async_results = args['container_async_results']
+        container_data = args['container_data']
+        valid_exit_codes = args['valid_exit_codes']
+        self.debug = args['debug']
+        check_mode = self._play_context.check_mode
+
+        containers_to_check = self._get_containers_to_check(container_data)
+
+        # Check that the containers which are supposed to finish have
+        # actually finished and also terminated with the right exit code.
+        # Nothing is started in check mode, so there is nothing to wait for.
+        if not check_mode and valid_exit_codes and containers_to_check:
+            (running, failed) = self._check_container_state(
+                containers_to_check,
+                valid_exit_codes,
+                task_vars)
+
+        # Check the Ansible async results for containers which:
+        # - reported a changed resource (podman_container created or updated
+        #   a container); they are returned as self.changed_containers.
+        # - reported a failed resource (podman_container failed to create
+        #   the container); they are returned as async_failed.
+        # - didn't finish on time; they are returned as async_failed too.
+        (self.changed_containers, async_failed) = (
+            self._check_errors_in_ansible_async_results(async_results))
+
+        if failed:
+            DISPLAY.error('Container(s) which finished with wrong return code'
+                          ': {}'.format(failed))
+        if async_failed:
+            DISPLAY.error('Container(s) which failed to be created by '
+                          'podman_container module: {}'.format(async_failed))
+        if running:
+            # TIMEOUT is DELAY * RETRIES, which is a number of seconds.
+            DISPLAY.error('Container(s) which did not finish after {} '
+                          'seconds: {}'.format(TIMEOUT, running))
+        # Drop the duplicates while keeping the order of first appearance.
+        total_errors = list(dict.fromkeys(failed + async_failed + running))
+        if total_errors:
+            raise AnsibleActionFail('Failed container(s): {}, check logs in '
+                                    '/var/log/containers/'
+                                    'stdouts/'.format(total_errors))
+
+        container_commands = self._get_create_commands(async_results)
+        if container_commands and (check_mode or self.debug):
+            for cmd in container_commands:
+                DISPLAY.display(cmd)
+
+        if container_commands:
+            self.changed = True
+
+        result['changed_containers'] = self.changed_containers
+        result['commands'] = container_commands
+        result['changed'] = self.changed
+        return result
diff --git a/tripleo_ansible/roles/tripleo-container-manage/molecule/default/playbook.yml b/tripleo_ansible/roles/tripleo-container-manage/molecule/default/playbook.yml index bb9bbb408..e6642f6f3 100644 --- a/tripleo_ansible/roles/tripleo-container-manage/molecule/default/playbook.yml +++ b/tripleo_ansible/roles/tripleo-container-manage/molecule/default/playbook.yml @@ -290,7 +290,7 @@ { "image": "fedora:rawhide", "net": "host", - "command": "sleep 3600" + "command": "sleep 10" } dest: '/tmp/container-configs/fedora_bis.json' - include_role: diff --git a/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/check_exit_code.yml b/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/check_exit_code.yml deleted file mode 100644 index 83a30f889..000000000 --- a/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/check_exit_code.yml +++ 
/dev/null @@ -1,51 +0,0 @@ ---- -# Copyright 2020 Red Hat, Inc. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -- name: "Wait for containers to be exit" - podman_container_info: - name: "{{ containers_with_exit_code }}" - register: podman_containers_infos - until: ( podman_containers_infos.containers | selectattr('State.Running', 'equalto', True) |list|length ) == 0 - # Retry 30 times every 10 seconds so we wait 5 min in total - retries: 30 - delay: 10 - # We need to ignore the failures since later we print some debug. - # We can't use "rescue" here because the debug tasks use - # "podman_containers_infos". 
- failed_when: false - no_log: "{{ not tripleo_container_manage_debug }}" - -- name: Create a list of containers which didn't exit - set_fact: - running_containers: >- - {{ podman_containers_infos.containers | - selectattr('State.Running', 'equalto', True) | map(attribute='Name') | list }} - -- name: Create a list of containers with bad Exit Codes - set_fact: - broken_containers: >- - {{ podman_containers_infos.containers | - rejectattr('State.ExitCode', 'in', tripleo_container_manage_valid_exit_code) | map(attribute='Name') | list }} - -- name: "Print running containers" - fail: - msg: "Container(s) which are still running after 5 min: {{ running_containers }}, check logs in /var/log/containers/stdouts/" - when: running_containers|length != 0 - -- name: "Print failing containers" - fail: - msg: "Container(s) with bad ExitCode: {{ broken_containers }}, check logs in /var/log/containers/stdouts/" - when: broken_containers|length != 0 diff --git a/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/create.yml b/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/create.yml index 1cdc34b04..dcc1b85b7 100644 --- a/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/create.yml +++ b/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/create.yml @@ -96,46 +96,21 @@ when: - not ansible_check_mode|bool +- name: Check containers status + container_status: + container_async_results: "{{ create_async_poll_results.results }}" + container_data: "{{ batched_container_data }}" + valid_exit_codes: "{{ tripleo_container_manage_valid_exit_code }}" + debug: "{{ tripleo_container_manage_debug | bool }}" + register: container_status_results + - name: "Create fact for containers which changed" set_fact: # List of containers which have changed (created or updated) - containers_changed: "{{ create_async_poll_results.results | get_changed_containers | default([]) }}" + containers_changed: "{{ container_status_results.changed_containers | default([]) }}" -- 
name: "Create fact for containers which failed" +- name: "Append the list of all podman commands that are run for containers with changes" set_fact: - # List of containers which returned an error when creating or updating them - containers_failed: "{{ create_async_poll_results.results | get_failed_containers | default([]) }}" - -- name: "Create fact for containers which require rc check" - set_fact: - # List of containers which would terminate with a return code that needs to be valid. - # We assume that container configs that don't have a restart policy nor action - # (used for podman exec) will run something and then exit with a return code. - containers_to_check: >- - {{ batched_container_data | haskey(attribute='image', excluded_keys=['action', 'restart']) | - list_of_keys | default([]) | difference(containers_failed) }} - -- name: Print the containers that failed to start - fail: - msg: "{{ containers_failed }} failed to start, check logs in /var/log/containers/stdouts/" - when: - - containers_failed|length != 0 - -- name: Block for container commands - include_tasks: podman/get_commands_create.yml + all_containers_commands: "{{ container_status_results.commands | default([]) + (all_containers_commands | default([]) | list) }}" when: - ansible_check_mode|bool - -- name: "Print the list of containers which changed" - debug: - var: containers_changed - when: tripleo_container_manage_debug | bool - -- name: "Block for container exit codes" - when: - - not ansible_check_mode|bool - - tripleo_container_manage_valid_exit_code|length != 0 - - containers_to_check|length != 0 - include_tasks: podman/check_exit_code.yml - vars: - containers_with_exit_code: "{{ containers_to_check }}" diff --git a/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/get_commands_create.yml b/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/get_commands_create.yml deleted file mode 100644 index 8673f6fe6..000000000 --- 
a/tripleo_ansible/roles/tripleo-container-manage/tasks/podman/get_commands_create.yml +++ /dev/null @@ -1,29 +0,0 @@ ---- -# Copyright 2020 Red Hat, Inc. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -- name: "Create a list of podman commands that are run for containers with changes" - set_fact: - containers_commands: >- - {{ create_async_results.results | selectattr('changed', 'equalto', true) | - map(attribute='podman_actions') | default([]) | list }} - -- name: "Print the list of commands that are run for containers with changes" - debug: - var: containers_commands - -- name: "Append the list of all podman commands that are run for containers with changes" - set_fact: - all_containers_commands: "{{ containers_commands|default([], true) + (all_containers_commands | default([]) | list) }}" diff --git a/zuul.d/molecule.yaml b/zuul.d/molecule.yaml index 614dae9ec..44fed781d 100644 --- a/zuul.d/molecule.yaml +++ b/zuul.d/molecule.yaml @@ -353,6 +353,7 @@ files: - ^tripleo_ansible/roles/tripleo-container-manage/.* - ^tripleo_ansible/roles/tripleo-container-rm/.* + - ^tripleo_ansible/ansible_plugins/action/container_status.py$ - ^tripleo_ansible/ansible_plugins/filter/helpers.py$ - ^tripleo_ansible/ansible_plugins/modules/container_config_data.py$ - ^tripleo_ansible/ansible_plugins/modules/container_puppet_config.py$