Introduce an Action Plugin to fetch container infos
Instead of running a bunch of tasks to figure out which container
commands have been run, which ones did not terminate after 5 minutes,
and which ones failed or finished with a wrong exit code, we now have an
action plugin that will do it faster and with better logging.
Faster because it reduces the number of tasks.
Better logging is provided: all errors found during a run are now
displayed and the task fails once at the end.
Supporting check-mode.
Re-using tripleo_container_manage role for molecule testing.
Co-Authored-By: Alex Schultz <aschultz@redhat.com>
Co-Authored-By: Kevin Carter <kecarter@redhat.com>
Change-Id: Ie7f8c9cceaf9540d7d33a9bb5f33258c46185e77
(cherry picked from commit 19774d0be4
)
This commit is contained in:
parent
b2a34778b8
commit
c2edd31a31
|
@ -0,0 +1,334 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright 2020 Red Hat, Inc.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
__metaclass__ = type
|
||||
|
||||
import copy
|
||||
import tenacity
|
||||
import yaml
|
||||
|
||||
from ansible.errors import AnsibleActionFail
|
||||
from ansible.plugins.action import ActionBase
|
||||
from ansible.utils.display import Display
|
||||
|
||||
# Display handle used by this plugin to print messages and errors.
DISPLAY = Display()
|
||||
|
||||
# Default delay/retries used to fetch containers status and wait for them to be
|
||||
# finished.
|
||||
DELAY = 10  # wait between two attempts (handed to tenacity.wait_fixed, i.e. seconds)
|
||||
RETRIES = 30  # maximum number of attempts (handed to tenacity.stop_after_attempt)
|
||||
TIMEOUT = DELAY * RETRIES  # total wait, same unit as DELAY (300 seconds = 5 minutes)
|
||||
|
||||
# Ansible module metadata for this plugin.
ANSIBLE_METADATA = {
|
||||
'metadata_version': '1.1',
|
||||
'status': ['preview'],
|
||||
'supported_by': 'community'
|
||||
}
|
||||
|
||||
# Module documentation in Ansible's YAML format. The 'options' mapping is also
# parsed at class-definition time to build the list of supported arguments:
# an option without a 'default' key is treated as required by the plugin.
DOCUMENTATION = """
module: container_status
author:
  - "TripleO team"
version_added: '2.9'
short_description: Check and report containers status
notes: []
description:
  - For each container that isn't an exec or a container supposed to be
    controlled by systemd, we expect it to terminate with a return code.
    This module will check that code and make sure it's correct. If not, it
    will report the failure for easier debug.
requirements:
  - None
options:
  container_async_results:
    description:
      - Async results of a podman_container invocation.
    required: true
    type: list
  container_data:
    description:
      - List of dictionaries which have the container configurations.
    required: true
    type: list
  valid_exit_codes:
    description:
      - List of valid container exit codes.
    default: []
    type: list
  debug:
    description:
      - Whether or not debug is enabled.
    default: False
    type: bool
"""
|
||||
# Usage example. The task key must match the module name declared in
# DOCUMENTATION (container_status).
EXAMPLES = """
- name: Check containers status
  container_status:
    container_async_results: "{{ create_async_poll_results.results }}"
    container_data:
      - keystone:
          image: docker.io/keystone
      - mysql_bootstrap:
          image: docker.io/mysql
    valid_exit_codes:
      - 0
      - 2
"""
|
||||
# Description of the values returned by the plugin, in Ansible's YAML format.
RETURN = """
changed_containers:
  description: List of containers which changed.
  returned: always
  type: list
  sample:
    - keystone
    - mysql
commands:
  description: List of container cli commands that would be run.
  returned: always
  type: list
  sample:
    - podman rm -f keystone
    - podman run keystone
"""
|
||||
|
||||
|
||||
class ActionModule(ActionBase):
    """Action plugin for container status.

    The plugin looks at the async results of the container creation tasks
    and at the state of the containers that are expected to terminate. It
    displays every error it finds and fails once, at the end of the run.
    """

    # Supported options, parsed from the module documentation. Options that
    # have no 'default' key are treated as required by _get_args().
    _VALID_ARGS = yaml.safe_load(DOCUMENTATION)['options']

    def _get_args(self):
        """Return the task arguments with the documented defaults applied.

        :returns: Dictionary mapping each supported option to its value.
        :raises AnsibleActionFail: If an option without default is missing
                                   or set to None.
        """
        missing = []
        args = {}

        for option, vals in self._VALID_ARGS.items():
            if 'default' in vals:
                args[option] = self._task.args.get(option, vals['default'])
                continue
            value = self._task.args.get(option)
            if value is None:
                missing.append(option)
            else:
                args[option] = value

        if missing:
            raise AnsibleActionFail('Missing required parameters: {}'.format(
                ', '.join(missing)))
        return args

    def _get_containers_to_check(self, data):
        """Return a list of containers that we need to check.

        Given some container_data, figure out what containers terminate with
        a return code so later we can check that code.

        :param data: List of dictionaries of container data, each one
                     mapping a container name to its configuration.
        :returns: List of containers that need to be checked.
        """
        containers = []
        for container in data:
            for name, values in container.items():
                if 'action' in values or 'restart' in values:
                    continue
                if 'image' in values:
                    # We assume that container configs that don't have a
                    # restart policy nor action (used for podman exec) but
                    # have an image set, will run something and then exit
                    # with a return code.
                    containers.append(name)
        if self.debug and containers:
            DISPLAY.display('These containers are supposed to terminate with '
                            'a valid exit code and will be checked: '
                            '{}'.format(containers))
        return containers

    def _get_create_commands(self, results):
        """Return a list of commands that were executed by container tool.

        Items that did not change, or that carry no 'podman_actions', do not
        contribute any command.

        :param results: Ansible task results.
        :returns commands: List of commands.
        """
        commands = []
        for item in results:
            if item.get('changed'):
                commands.extend(item.get('podman_actions') or [])
        return commands

    def _is_container_running(self, container):
        """Return True if a container has Running State.

        :params container: Dictionary for container infos.
        :returns running: Boolean of container running status.
        """
        # 'State' may be absent or null; both mean "not running".
        state = container.get('State') or {}
        return state.get('Running', False)

    def _get_container_infos(self, containers, task_vars):
        """Return container infos.

        :params containers: List of containers.
        :params task_vars: Dictionary of Ansible tasks variables.
        :returns container_results: List of container infos.
        :raises AnsibleActionFail: If the module result does not contain
                                   any 'containers' entry.
        """
        # The module runs against a copy so task_vars is left untouched.
        tvars = copy.deepcopy(task_vars)
        result = self._execute_module(
            module_name='podman_container_info',
            module_args=dict(name=containers),
            task_vars=tvars
        )
        if 'containers' not in result:
            # Report the module output instead of an opaque KeyError.
            raise AnsibleActionFail(
                'Failed to fetch container infos for {}: {}'.format(
                    containers, result.get('msg', result)))
        return list(result['containers'])

    @tenacity.retry(
        reraise=True,
        stop=tenacity.stop_after_attempt(RETRIES),
        wait=tenacity.wait_fixed(DELAY)
    )
    def _fetch_container_state(self, containers, task_vars):
        """Return container states of finished containers with retries.

        The call is retried (RETRIES attempts, DELAY between attempts) for
        as long as it raises; the last exception is re-raised.

        :params containers: List of containers.
        :params task_vars: Dictionary of Ansible tasks variables.
        :returns container_results: List of container infos.
        :raises AnsibleActionFail: If a container is still running.
        """
        containers_results = self._get_container_infos(containers, task_vars)
        for container in containers_results:
            if self._is_container_running(container):
                raise AnsibleActionFail('Container {} has not finished yet, '
                                        'retrying...'.format(
                                            container.get('Name')))
        return containers_results

    def _check_container_state(self, containers, exit_codes, task_vars):
        """Return a tuple of running and failed containers.

        :params containers: List of containers to check.
        :params exit_codes: List of valid exit codes.
        :params task_vars: Dictionary of Ansible tasks variables.
        :returns running, failed: Tuple of lists.
        """
        running = []
        failed = []
        try:
            self._fetch_container_state(containers, task_vars)
        except AnsibleActionFail:
            # We fail at the end with all the other infos
            if self.debug:
                DISPLAY.display('One or more containers did not finish on '
                                'time, the failure will be reported later.')
        # Fetch the final state once more to sort out what is still running
        # from what finished with an unexpected exit code.
        containers_results = self._get_container_infos(containers, task_vars)
        for container in containers_results:
            container_name = container.get('Name')
            # A container without state has no exit code and is reported
            # as failed rather than crashing on a None state.
            container_state = container.get('State') or {}
            if self._is_container_running(container):
                running.append(container_name)
            elif container_state.get('ExitCode') not in exit_codes:
                failed.append(container_name)
        return (running, failed)

    def _check_errors_in_ansible_async_results(self, results):
        """Get a tuple with changed and failed containers.

        :param results: Ansible results from "Check podman create status"
        :returns: Tuple of containers that changed or failed
        """
        changed = []
        failed = []
        # if Ansible is run in check mode, the async_results items will
        # not contain failed or finished keys.
        if self._play_context.check_mode:
            return (changed, failed)
        for item in results:
            async_result_item = item['create_async_result_item']
            if item['changed']:
                changed.extend(async_result_item['container_data'])
            if (item['failed'] or not item['finished']
                    or async_result_item.get('stderr', '') != ''):
                failed.extend(async_result_item['container_data'])
        return (changed, failed)

    def run(self, tmp=None, task_vars=None):
        """Check the containers and report what changed or failed.

        :param tmp: Unused, kept for the action plugin interface.
        :param task_vars: Dictionary of Ansible tasks variables.
        :returns: Ansible result with 'changed', 'changed_containers' and
                  'commands' set.
        :raises AnsibleActionFail: If a required argument is missing or if
                                   at least one container is in error.
        """
        self._supports_check_mode = True
        self.changed = False
        self.changed_containers = []
        running = []
        failed = []

        if task_vars is None:
            task_vars = dict()
        result = super(ActionModule, self).run(tmp, task_vars)
        del tmp
        # parse args
        args = self._get_args()

        async_results = args['container_async_results']
        container_data = args['container_data']
        valid_exit_codes = args['valid_exit_codes']
        self.debug = args['debug']

        containers_to_check = self._get_containers_to_check(container_data)

        # Check that the containers which are supposed to finish have
        # actually finished and also terminated with the right exit code.
        if len(valid_exit_codes) > 0 and len(containers_to_check) > 0:
            (running, failed) = self._check_container_state(
                containers_to_check,
                valid_exit_codes,
                task_vars)

        # Check the Ansible async results for containers which:
        # - reported a changed resource (podman_container created or updated
        #   a container) and return it as self.changed_containers.
        # - reported a failed resource (podman_container failed to create
        #   the container) and return it as async_failed.
        # - didn't finish on time and return it as async_failed.
        (self.changed_containers, async_failed) = (
            self._check_errors_in_ansible_async_results(async_results))

        if failed:
            DISPLAY.error('Container(s) which finished with wrong return code'
                          ': {}'.format(failed))
        if async_failed:
            DISPLAY.error('Container(s) which failed to be created by '
                          'podman_container module: {}'.format(async_failed))
        if running:
            # TIMEOUT is DELAY * RETRIES, which is a number of seconds.
            DISPLAY.error('Container(s) which did not finish after {} '
                          'seconds: {}'.format(TIMEOUT, running))
        # Sorted so the same failures always produce the same message.
        total_errors = sorted(set(failed + async_failed + running))
        if total_errors:
            raise AnsibleActionFail('Failed container(s): {}, check logs in '
                                    '/var/log/containers/'
                                    'stdouts/'.format(total_errors))

        container_commands = self._get_create_commands(async_results)
        if container_commands:
            self.changed = True
            if self._play_context.check_mode or self.debug:
                for cmd in container_commands:
                    DISPLAY.display(cmd)

        result['changed_containers'] = self.changed_containers
        result['commands'] = container_commands
        result['changed'] = self.changed
        return result
|
|
@ -290,7 +290,7 @@
|
|||
{
|
||||
"image": "fedora:rawhide",
|
||||
"net": "host",
|
||||
"command": "sleep 3600"
|
||||
"command": "sleep 10"
|
||||
}
|
||||
dest: '/tmp/container-configs/fedora_bis.json'
|
||||
- include_role:
|
||||
|
|
|
@ -1,51 +0,0 @@
|
|||
---
|
||||
# Copyright 2020 Red Hat, Inc.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
- name: "Wait for containers to be exit"
|
||||
podman_container_info:
|
||||
name: "{{ containers_with_exit_code }}"
|
||||
register: podman_containers_infos
|
||||
until: ( podman_containers_infos.containers | selectattr('State.Running', 'equalto', True) |list|length ) == 0
|
||||
# Retry 30 times every 10 seconds so we wait 5 min in total
|
||||
retries: 30
|
||||
delay: 10
|
||||
# We need to ignore the failures since later we print some debug.
|
||||
# We can't use "rescue" here because the debug tasks use
|
||||
# "podman_containers_infos".
|
||||
failed_when: false
|
||||
no_log: "{{ not tripleo_container_manage_debug }}"
|
||||
|
||||
- name: Create a list of containers which didn't exit
|
||||
set_fact:
|
||||
running_containers: >-
|
||||
{{ podman_containers_infos.containers |
|
||||
selectattr('State.Running', 'equalto', True) | map(attribute='Name') | list }}
|
||||
|
||||
- name: Create a list of containers with bad Exit Codes
|
||||
set_fact:
|
||||
broken_containers: >-
|
||||
{{ podman_containers_infos.containers |
|
||||
rejectattr('State.ExitCode', 'in', tripleo_container_manage_valid_exit_code) | map(attribute='Name') | list }}
|
||||
|
||||
- name: "Print running containers"
|
||||
fail:
|
||||
msg: "Container(s) which are still running after 5 min: {{ running_containers }}, check logs in /var/log/containers/stdouts/"
|
||||
when: running_containers|length != 0
|
||||
|
||||
- name: "Print failing containers"
|
||||
fail:
|
||||
msg: "Container(s) with bad ExitCode: {{ broken_containers }}, check logs in /var/log/containers/stdouts/"
|
||||
when: broken_containers|length != 0
|
|
@ -96,46 +96,21 @@
|
|||
when:
|
||||
- not ansible_check_mode|bool
|
||||
|
||||
- name: Check containers status
|
||||
container_status:
|
||||
container_async_results: "{{ create_async_poll_results.results }}"
|
||||
container_data: "{{ batched_container_data }}"
|
||||
valid_exit_codes: "{{ tripleo_container_manage_valid_exit_code }}"
|
||||
debug: "{{ tripleo_container_manage_debug | bool }}"
|
||||
register: container_status_results
|
||||
|
||||
- name: "Create fact for containers which changed"
|
||||
set_fact:
|
||||
# List of containers which have changed (created or updated)
|
||||
containers_changed: "{{ create_async_poll_results.results | get_changed_containers | default([]) }}"
|
||||
containers_changed: "{{ container_status_results.changed_containers | default([]) }}"
|
||||
|
||||
- name: "Create fact for containers which failed"
|
||||
- name: "Append the list of all podman commands that are run for containers with changes"
|
||||
set_fact:
|
||||
# List of containers which returned an error when creating or updating them
|
||||
containers_failed: "{{ create_async_poll_results.results | get_failed_containers | default([]) }}"
|
||||
|
||||
- name: "Create fact for containers which require rc check"
|
||||
set_fact:
|
||||
# List of containers which would terminate with a return code that needs to be valid.
|
||||
# We assume that container configs that don't have a restart policy nor action
|
||||
# (used for podman exec) will run something and then exit with a return code.
|
||||
containers_to_check: >-
|
||||
{{ batched_container_data | haskey(attribute='image', excluded_keys=['action', 'restart']) |
|
||||
list_of_keys | default([]) | difference(containers_failed) }}
|
||||
|
||||
- name: Print the containers that failed to start
|
||||
fail:
|
||||
msg: "{{ containers_failed }} failed to start, check logs in /var/log/containers/stdouts/"
|
||||
when:
|
||||
- containers_failed|length != 0
|
||||
|
||||
- name: Block for container commands
|
||||
include_tasks: podman/get_commands_create.yml
|
||||
all_containers_commands: "{{ container_status_results.commands | default([]) + (all_containers_commands | default([]) | list) }}"
|
||||
when:
|
||||
- ansible_check_mode|bool
|
||||
|
||||
- name: "Print the list of containers which changed"
|
||||
debug:
|
||||
var: containers_changed
|
||||
when: tripleo_container_manage_debug | bool
|
||||
|
||||
- name: "Block for container exit codes"
|
||||
when:
|
||||
- not ansible_check_mode|bool
|
||||
- tripleo_container_manage_valid_exit_code|length != 0
|
||||
- containers_to_check|length != 0
|
||||
include_tasks: podman/check_exit_code.yml
|
||||
vars:
|
||||
containers_with_exit_code: "{{ containers_to_check }}"
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
---
|
||||
# Copyright 2020 Red Hat, Inc.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
- name: "Create a list of podman commands that are run for containers with changes"
|
||||
set_fact:
|
||||
containers_commands: >-
|
||||
{{ create_async_results.results | selectattr('changed', 'equalto', true) |
|
||||
map(attribute='podman_actions') | default([]) | list }}
|
||||
|
||||
- name: "Print the list of commands that are run for containers with changes"
|
||||
debug:
|
||||
var: containers_commands
|
||||
|
||||
- name: "Append the list of all podman commands that are run for containers with changes"
|
||||
set_fact:
|
||||
all_containers_commands: "{{ containers_commands|default([], true) + (all_containers_commands | default([]) | list) }}"
|
|
@ -353,6 +353,7 @@
|
|||
files:
|
||||
- ^tripleo_ansible/roles/tripleo-container-manage/.*
|
||||
- ^tripleo_ansible/roles/tripleo-container-rm/.*
|
||||
- ^tripleo_ansible/ansible_plugins/action/container_status.py$
|
||||
- ^tripleo_ansible/ansible_plugins/filter/helpers.py$
|
||||
- ^tripleo_ansible/ansible_plugins/modules/container_config_data.py$
|
||||
- ^tripleo_ansible/ansible_plugins/modules/container_puppet_config.py$
|
||||
|
|
Loading…
Reference in New Issue