Fix unreachable nodes detection
According to the docs, Ansible is expected to return 3 if there were unreachable nodes. Zuul reacts to this and retries the job. However, Ansible actually returns 4 in this case [1], which is the code for a parse error (and a parse error doesn't make sense to retry). To work around that, add a simple callback plugin that gets notified if there are unreachable nodes and writes the information about which nodes failed to a special file beneath job-output.txt. The executor can then detect this and react properly. Furthermore, this file records which nodes failed, and it gets uploaded to the log server too if it exists. [1] https://github.com/ansible/ansible/issues/19720 Change-Id: I6d609835dba18b50dcbe883d01c6229ce4e38d91
This commit is contained in:
parent
f7f8ea617a
commit
30cb2b1484
|
@ -5,6 +5,7 @@
|
|||
command: ansible-playbook src/git.openstack.org/openstack-infra/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml
|
||||
environment:
|
||||
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
|
||||
ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
|
||||
- name: Run ansible playbook that should fail
|
||||
|
@ -13,6 +14,7 @@
|
|||
failed_when: "failed_results.rc != 2"
|
||||
environment:
|
||||
ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
ZUUL_JOBDIR: "{{ ansible_user_dir}}"
|
||||
ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json"
|
||||
|
||||
- name: Validate output - setupvar
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Jobs that encountered unreachable nodes are now correctly detected and
|
||||
retried.
|
|
@ -18,6 +18,7 @@
|
|||
|
||||
- job:
|
||||
name: no-log-unreachable
|
||||
attempts: 1
|
||||
run: playbooks/no-log-unreachable.yaml
|
||||
|
||||
- project:
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
- hosts: localhost
|
||||
gather_facts: no
|
||||
tasks:
|
||||
- name: Test
|
||||
debug:
|
||||
msg: Test
|
28
tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml
vendored
Normal file
28
tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml
vendored
Normal file
|
@ -0,0 +1,28 @@
|
|||
- hosts: localhost
|
||||
gather_facts: no
|
||||
tasks:
|
||||
- name: Add a fake host
|
||||
add_host:
|
||||
hostname: fake
|
||||
ansible_host: notexisting.example.notexisting
|
||||
|
||||
- hosts: fake
|
||||
gather_facts: no
|
||||
tasks:
|
||||
- name: Run a lineinfile task
|
||||
vars:
|
||||
logins:
|
||||
- machine: foo
|
||||
login: bar
|
||||
password: my-very-secret-password-1
|
||||
- machine: two
|
||||
login: three
|
||||
password: my-very-secret-password-2
|
||||
lineinfile:
|
||||
path: /tmp/.netrc
|
||||
mode: 0600
|
||||
create: true
|
||||
insertafter: EOF
|
||||
line: "machine {{ item.machine }} login {{ item.login }} password {{ item.password }}"
|
||||
with_items: "{{ logins }}"
|
||||
no_log: true
|
|
@ -0,0 +1,35 @@
|
|||
- pipeline:
|
||||
name: check
|
||||
manager: independent
|
||||
post-review: true
|
||||
trigger:
|
||||
gerrit:
|
||||
- event: patchset-created
|
||||
success:
|
||||
gerrit:
|
||||
Verified: 1
|
||||
failure:
|
||||
gerrit:
|
||||
Verified: -1
|
||||
|
||||
- job:
|
||||
name: base
|
||||
parent: null
|
||||
|
||||
- job:
|
||||
name: pre-unreachable
|
||||
attempts: 2
|
||||
pre-run:
|
||||
- playbooks/unreachable.yaml
|
||||
run: playbooks/run.yaml
|
||||
|
||||
- job:
|
||||
name: run-unreachable
|
||||
attempts: 2
|
||||
run: playbooks/unreachable.yaml
|
||||
|
||||
- project:
|
||||
check:
|
||||
jobs:
|
||||
- pre-unreachable
|
||||
- run-unreachable
|
|
@ -0,0 +1,6 @@
|
|||
- tenant:
|
||||
name: tenant-one
|
||||
source:
|
||||
gerrit:
|
||||
config-projects:
|
||||
- org/project
|
|
@ -4165,6 +4165,40 @@ class TestNoLog(AnsibleZuulTestCase):
|
|||
self.assertNotIn('my-very-secret-password-2', text_log)
|
||||
|
||||
|
||||
class TestUnreachable(AnsibleZuulTestCase):
    """Verify that jobs hitting unreachable nodes are detected and retried.

    Uses the ansible-unreachable fixture config, whose jobs target a fake
    host that can never be reached.
    """

    tenant_config_file = 'config/ansible-unreachable/main.yaml'

    def _get_file(self, build, path):
        # Read a file relative to the build's (kept) job directory.
        full_path = os.path.join(build.jobdir.root, path)
        with open(full_path) as f:
            return f.read()

    def test_unreachable(self):
        self.wait_timeout = 120

        # Output extra ansible info so we might see errors.
        self.executor_server.verbose = True
        # Keep the jobdir so we can inspect the unreachable-nodes file.
        self.executor_server.keep_jobdir = True

        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()

        # Jobs with unreachable nodes are retried until the attempt limit,
        # so the result reported on the change must be RETRY_LIMIT.
        self.assertIn('RETRY_LIMIT', A.messages[0])
        self.assertHistory([
            dict(name='pre-unreachable', result=None, changes='1,1'),
            dict(name='pre-unreachable', result=None, changes='1,1'),
            dict(name='run-unreachable', result=None, changes='1,1'),
            dict(name='run-unreachable', result=None, changes='1,1'),
        ], ordered=False)

        # The callback plugin must have recorded the unreachable host name.
        unreachable_log = self._get_file(
            self.history[0], '.ansible/nodes.unreachable')
        self.assertEqual('fake\n', unreachable_log)
|
||||
|
||||
|
||||
class TestJobPause(AnsibleZuulTestCase):
|
||||
tenant_config_file = 'config/job-pause/main.yaml'
|
||||
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
# Copyright 2018 BMW Carit GmbH
|
||||
#
|
||||
# Zuul is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Zuul is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with Zuul.  If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# This is not needed in python3 - but it is needed in python2 because there
|
||||
# is a json module in ansible.plugins.callback and python2 gets confused.
|
||||
# Easy local testing with ansible-playbook is handy when hacking on zuul_stream
|
||||
# so just put in the __future__ statement.
|
||||
from __future__ import absolute_import
|
||||
|
||||
import os
|
||||
|
||||
from ansible.plugins.callback import default
|
||||
|
||||
|
||||
class CallbackModule(default.CallbackModule):
    """Record hosts that became unreachable during a playbook run.

    Appends each unreachable host name (at most once) to
    ``$ZUUL_JOBDIR/.ansible/nodes.unreachable`` so the executor can
    detect the condition and retry the job.
    """

    CALLBACK_VERSION = 2.0
    # aggregate means we can be loaded and not be the stdout plugin
    CALLBACK_TYPE = 'aggregate'
    CALLBACK_NAME = 'zuul_unreachable'

    def __init__(self):
        super(CallbackModule, self).__init__()
        self.output_path = os.path.join(
            os.environ['ZUUL_JOBDIR'], '.ansible', 'nodes.unreachable')
        # Hosts already written, so each appears at most once in the file.
        self.unreachable_hosts = set()

    def v2_runner_on_unreachable(self, result):
        host = result._host.get_name()
        if host in self.unreachable_hosts:
            return
        self.unreachable_hosts.add(host)
        with open(self.output_path, 'a') as f:
            f.write('%s\n' % host)
|
|
@ -370,6 +370,8 @@ class JobDir(object):
|
|||
self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache')
|
||||
os.makedirs(self.fact_cache)
|
||||
self.control_path = os.path.join(self.ansible_cache_root, 'cp')
|
||||
self.job_unreachable_file = os.path.join(self.ansible_cache_root,
|
||||
'nodes.unreachable')
|
||||
os.makedirs(self.control_path)
|
||||
localhost_facts = os.path.join(self.fact_cache, 'localhost')
|
||||
# NOTE(pabelanger): We do not want to leak zuul-executor facts to other
|
||||
|
@ -1775,7 +1777,12 @@ class AnsibleJob(object):
|
|||
|
||||
if timeout and watchdog.timed_out:
|
||||
return (self.RESULT_TIMED_OUT, None)
|
||||
if ret == 3:
|
||||
# Note: Unlike documented ansible currently wrongly returns 4 on
|
||||
# unreachable so we have the zuul_unreachable callback module that
|
||||
# creates the file nodes.unreachable in case there were
|
||||
# unreachable nodes. This can be removed once ansible returns a
|
||||
# distinct value for unreachable.
|
||||
if ret == 3 or os.path.exists(self.jobdir.job_unreachable_file):
|
||||
# AnsibleHostUnreachable: We had a network issue connecting to
|
||||
# our zuul-worker.
|
||||
return (self.RESULT_UNREACHABLE, None)
|
||||
|
|
Loading…
Reference in New Issue