Fix unreachable nodes detection

According to the docs, Ansible is expected to return 3 if there were
unreachable nodes. Zuul reacts to this and retries the job. However,
Ansible actually returns 4 in this case [1], which is the code for a
parse error (and retrying a parse error doesn't make sense).

To work around that, add a simple callback plugin that gets notified
about unreachable nodes and writes the names of the failed nodes to a
special file alongside the job-output.txt. The executor can then
detect this file and react properly. Further, if the file exists it
gets uploaded to the log server too, so the information about which
nodes failed is preserved in the logs.

[1] https://github.com/ansible/ansible/issues/19720

Change-Id: I6d609835dba18b50dcbe883d01c6229ce4e38d91
Tobias Henkel 2018-09-15 15:26:01 +02:00
parent f7f8ea617a
commit 30cb2b1484
10 changed files with 170 additions and 1 deletions
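
For context, the ambiguity this change works around, as a minimal sketch (not part of the commit; the playbook path is illustrative and this assumes ansible-playbook is installed):

# Sketch: the ansible-playbook exit code alone cannot distinguish
# unreachable hosts from parse errors. Per the docs, 3 should mean
# "unreachable", but current Ansible returns 4 (see [1] above), which
# is also the documented parse-error code.
import subprocess

ret = subprocess.call(['ansible-playbook', 'some-playbook.yaml'])
if ret == 3:
    print('documented unreachable code - never seen in practice')
elif ret == 4:
    print('parse error OR unreachable hosts - ambiguous, unsafe to retry')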

View File

@@ -5,6 +5,7 @@
    command: ansible-playbook src/git.openstack.org/openstack-infra/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml
    environment:
      ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir }}/logging.json"
      ZUUL_JOBDIR: "{{ ansible_user_dir }}"
      ARA_LOG_CONFIG: "{{ ansible_user_dir }}/logging.json"
  - name: Run ansible playbook that should fail
@@ -13,6 +14,7 @@
    failed_when: "failed_results.rc != 2"
    environment:
      ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir }}/logging.json"
      ZUUL_JOBDIR: "{{ ansible_user_dir }}"
      ARA_LOG_CONFIG: "{{ ansible_user_dir }}/logging.json"
  - name: Validate output - setupvar

View File

@@ -0,0 +1,5 @@
---
fixes:
  - |
    Jobs that encountered unreachable nodes are now correctly detected and
    retried.

View File

@@ -18,6 +18,7 @@
- job:
    name: no-log-unreachable
    attempts: 1
    run: playbooks/no-log-unreachable.yaml
- project:

View File

@@ -0,0 +1,6 @@
- hosts: localhost
  gather_facts: no
  tasks:
    - name: Test
      debug:
        msg: Test

View File

@@ -0,0 +1,28 @@
- hosts: localhost
  gather_facts: no
  tasks:
    - name: Add a fake host
      add_host:
        hostname: fake
        ansible_host: notexisting.example.notexisting

- hosts: fake
  gather_facts: no
  tasks:
    - name: Run a lineinfile task
      vars:
        logins:
          - machine: foo
            login: bar
            password: my-very-secret-password-1
          - machine: two
            login: three
            password: my-very-secret-password-2
      lineinfile:
        path: /tmp/.netrc
        mode: 0600
        create: true
        insertafter: EOF
        line: "machine {{ item.machine }} login {{ item.login }} password {{ item.password }}"
      with_items: "{{ logins }}"
      no_log: true

View File

@@ -0,0 +1,35 @@
- pipeline:
    name: check
    manager: independent
    post-review: true
    trigger:
      gerrit:
        - event: patchset-created
    success:
      gerrit:
        Verified: 1
    failure:
      gerrit:
        Verified: -1

- job:
    name: base
    parent: null

- job:
    name: pre-unreachable
    attempts: 2
    pre-run:
      - playbooks/unreachable.yaml
    run: playbooks/run.yaml

- job:
    name: run-unreachable
    attempts: 2
    run: playbooks/unreachable.yaml

- project:
    check:
      jobs:
        - pre-unreachable
        - run-unreachable

View File

@@ -0,0 +1,6 @@
- tenant:
    name: tenant-one
    source:
      gerrit:
        config-projects:
          - org/project

View File

@@ -4165,6 +4165,40 @@ class TestNoLog(AnsibleZuulTestCase):
        self.assertNotIn('my-very-secret-password-2', text_log)


class TestUnreachable(AnsibleZuulTestCase):
    tenant_config_file = 'config/ansible-unreachable/main.yaml'

    def _get_file(self, build, path):
        p = os.path.join(build.jobdir.root, path)
        with open(p) as f:
            return f.read()

    def test_unreachable(self):
        self.wait_timeout = 120

        # Output extra ansible info so we might see errors.
        self.executor_server.verbose = True
        self.executor_server.keep_jobdir = True
        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
        self.waitUntilSettled()

        # The result must be retry limit because jobs with unreachable nodes
        # will be retried.
        self.assertIn('RETRY_LIMIT', A.messages[0])
        self.assertHistory([
            dict(name='pre-unreachable', result=None, changes='1,1'),
            dict(name='pre-unreachable', result=None, changes='1,1'),
            dict(name='run-unreachable', result=None, changes='1,1'),
            dict(name='run-unreachable', result=None, changes='1,1'),
        ], ordered=False)

        unreachable_log = self._get_file(self.history[0],
                                         '.ansible/nodes.unreachable')
        self.assertEqual('fake\n', unreachable_log)


class TestJobPause(AnsibleZuulTestCase):
    tenant_config_file = 'config/job-pause/main.yaml'

View File

@@ -0,0 +1,45 @@
# Copyright 2018 BMW Carit GmbH
#
# Zuul is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Zuul is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ansible.  If not, see <http://www.gnu.org/licenses/>.

# This is not needed in python3 - but it is needed in python2 because there
# is a json module in ansible.plugins.callback and python2 gets confused.
# Easy local testing with ansible-playbook is handy when hacking on zuul_stream
# so just put in the __future__ statement.
from __future__ import absolute_import

import os

from ansible.plugins.callback import default


class CallbackModule(default.CallbackModule):

    CALLBACK_VERSION = 2.0
    # aggregate means we can be loaded and not be the stdout plugin
    CALLBACK_TYPE = 'aggregate'
    CALLBACK_NAME = 'zuul_unreachable'

    def __init__(self):
        super(CallbackModule, self).__init__()
        self.output_path = os.path.join(
            os.environ['ZUUL_JOBDIR'], '.ansible', 'nodes.unreachable')
        self.unreachable_hosts = set()

    def v2_runner_on_unreachable(self, result):
        host = result._host.get_name()
        if host not in self.unreachable_hosts:
            self.unreachable_hosts.add(host)
            with open(self.output_path, 'a') as f:
                f.write('%s\n' % host)
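
The plugin needs nothing from Zuul beyond the ZUUL_JOBDIR environment variable; it is picked up through Ansible's normal callback loading. A rough way to exercise it locally (a sketch, not part of the commit: it assumes the plugin sits in zuul/ansible/callback, that this Ansible version auto-enables callbacks found on the callback path, and that unreachable.yaml is the fixture playbook above):

# Hypothetical local run of the callback plugin.
import os
import subprocess
import tempfile

jobdir = tempfile.mkdtemp()
# The plugin appends to $ZUUL_JOBDIR/.ansible/nodes.unreachable, so the
# .ansible directory has to exist (the executor's JobDir normally makes it).
os.makedirs(os.path.join(jobdir, '.ansible'))
env = dict(os.environ,
           ZUUL_JOBDIR=jobdir,
           ANSIBLE_CALLBACK_PLUGINS='zuul/ansible/callback')
subprocess.call(['ansible-playbook', 'unreachable.yaml'], env=env)

with open(os.path.join(jobdir, '.ansible', 'nodes.unreachable')) as f:
    print(f.read())  # expected: 'fake\n', matching the unit test above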

View File

@@ -370,6 +370,8 @@ class JobDir(object):
        self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache')
        os.makedirs(self.fact_cache)
        self.control_path = os.path.join(self.ansible_cache_root, 'cp')
        self.job_unreachable_file = os.path.join(self.ansible_cache_root,
                                                 'nodes.unreachable')
        os.makedirs(self.control_path)
        localhost_facts = os.path.join(self.fact_cache, 'localhost')
        # NOTE(pabelanger): We do not want to leak zuul-executor facts to other
@@ -1775,7 +1777,12 @@ class AnsibleJob(object):
        if timeout and watchdog.timed_out:
            return (self.RESULT_TIMED_OUT, None)
        if ret == 3:
        # Note: Unlike documented, Ansible currently wrongly returns 4 on
        # unreachable hosts, so we have the zuul_unreachable callback module
        # that creates the file nodes.unreachable in case there were
        # unreachable nodes. This can be removed once Ansible returns a
        # distinct value for unreachable.
        if ret == 3 or os.path.exists(self.jobdir.job_unreachable_file):
            # AnsibleHostUnreachable: We had a network issue connecting to
            # our zuul-worker.
            return (self.RESULT_UNREACHABLE, None)
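
Condensed, the detection now has two triggers; a sketch of the combined logic (the function wrapper is illustrative, the names mirror the code above):

import os

def playbook_hit_unreachable(ret, unreachable_file):
    # ret == 3 is the documented exit code for unreachable hosts; the
    # sentinel file written by the zuul_unreachable callback covers
    # Ansible versions that return 4 instead.
    return ret == 3 or os.path.exists(unreachable_file)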