From 30cb2b1484ef2d11947ced38a8bde58a5cbf7f0a Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Sat, 15 Sep 2018 15:26:01 +0200 Subject: [PATCH] Fix unreachable nodes detection According to the docs ansible is expected to return 3 if there were unreachable nodes. Zuul reacts on this and retries the job. However ansible really returns 4 in this case [1] which actually is the code for a parse error (which doesn't make sense to retry). To work around that add a simple callback plugin that gets notified if there are unreachable nodes and write the information about which nodes failed to a special file beneath the job-output.txt. The executor can detect this and react properly then. Further we have the information about which nodes failed in this file which gets uploaded to the log server too if it exists. [1] https://github.com/ansible/ansible/issues/19720 Change-Id: I6d609835dba18b50dcbe883d01c6229ce4e38d91 --- playbooks/zuul-stream/functional.yaml | 2 + .../notes/unreachable-60ce0305416a5e8c.yaml | 5 +++ .../ansible-no-log/git/org_project/zuul.yaml | 1 + .../git/org_project/playbooks/run.yaml | 6 +++ .../org_project/playbooks/unreachable.yaml | 28 ++++++++++++ .../git/org_project/zuul.yaml | 35 +++++++++++++++ .../config/ansible-unreachable/main.yaml | 6 +++ tests/unit/test_v3.py | 34 ++++++++++++++ zuul/ansible/callback/zuul_unreachable.py | 45 +++++++++++++++++++ zuul/executor/server.py | 9 +++- 10 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/unreachable-60ce0305416a5e8c.yaml create mode 100644 tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/run.yaml create mode 100644 tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml create mode 100644 tests/fixtures/config/ansible-unreachable/git/org_project/zuul.yaml create mode 100644 tests/fixtures/config/ansible-unreachable/main.yaml create mode 100644 zuul/ansible/callback/zuul_unreachable.py diff --git a/playbooks/zuul-stream/functional.yaml b/playbooks/zuul-stream/functional.yaml index 779a10278f..a9535cd9fb 100644 --- a/playbooks/zuul-stream/functional.yaml +++ b/playbooks/zuul-stream/functional.yaml @@ -5,6 +5,7 @@ command: ansible-playbook src/git.openstack.org/openstack-infra/zuul/playbooks/zuul-stream/fixtures/test-stream.yaml environment: ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" + ZUUL_JOBDIR: "{{ ansible_user_dir}}" ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" - name: Run ansible playbook that should fail @@ -13,6 +14,7 @@ failed_when: "failed_results.rc != 2" environment: ZUUL_JOB_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" + ZUUL_JOBDIR: "{{ ansible_user_dir}}" ARA_LOG_CONFIG: "{{ ansible_user_dir}}/logging.json" - name: Validate output - setupvar diff --git a/releasenotes/notes/unreachable-60ce0305416a5e8c.yaml b/releasenotes/notes/unreachable-60ce0305416a5e8c.yaml new file mode 100644 index 0000000000..8ae96af034 --- /dev/null +++ b/releasenotes/notes/unreachable-60ce0305416a5e8c.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Jobs that encountered unreachable nodes are now correctly detected and + retried. diff --git a/tests/fixtures/config/ansible-no-log/git/org_project/zuul.yaml b/tests/fixtures/config/ansible-no-log/git/org_project/zuul.yaml index 38501b21fa..99da5ed064 100644 --- a/tests/fixtures/config/ansible-no-log/git/org_project/zuul.yaml +++ b/tests/fixtures/config/ansible-no-log/git/org_project/zuul.yaml @@ -18,6 +18,7 @@ - job: name: no-log-unreachable + attempts: 1 run: playbooks/no-log-unreachable.yaml - project: diff --git a/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/run.yaml b/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/run.yaml new file mode 100644 index 0000000000..e768adf6a3 --- /dev/null +++ b/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/run.yaml @@ -0,0 +1,6 @@ +- hosts: localhost + gather_facts: no + tasks: + - name: Test + debug: + msg: Test diff --git a/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml b/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml new file mode 100644 index 0000000000..3d3ddf5830 --- /dev/null +++ b/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/unreachable.yaml @@ -0,0 +1,28 @@ +- hosts: localhost + gather_facts: no + tasks: + - name: Add a fake host + add_host: + hostname: fake + ansible_host: notexisting.example.notexisting + +- hosts: fake + gather_facts: no + tasks: + - name: Run a lineinfile task + vars: + logins: + - machine: foo + login: bar + password: my-very-secret-password-1 + - machine: two + login: three + password: my-very-secret-password-2 + lineinfile: + path: /tmp/.netrc + mode: 0600 + create: true + insertafter: EOF + line: "machine {{ item.machine }} login {{ item.login }} password {{ item.password }}" + with_items: "{{ logins }}" + no_log: true diff --git a/tests/fixtures/config/ansible-unreachable/git/org_project/zuul.yaml b/tests/fixtures/config/ansible-unreachable/git/org_project/zuul.yaml new file mode 100644 index 0000000000..f567e77834 --- /dev/null +++ b/tests/fixtures/config/ansible-unreachable/git/org_project/zuul.yaml @@ -0,0 +1,35 @@ +- pipeline: + name: check + manager: independent + post-review: true + trigger: + gerrit: + - event: patchset-created + success: + gerrit: + Verified: 1 + failure: + gerrit: + Verified: -1 + +- job: + name: base + parent: null + +- job: + name: pre-unreachable + attempts: 2 + pre-run: + - playbooks/unreachable.yaml + run: playbooks/run.yaml + +- job: + name: run-unreachable + attempts: 2 + run: playbooks/unreachable.yaml + +- project: + check: + jobs: + - pre-unreachable + - run-unreachable diff --git a/tests/fixtures/config/ansible-unreachable/main.yaml b/tests/fixtures/config/ansible-unreachable/main.yaml new file mode 100644 index 0000000000..73787886f0 --- /dev/null +++ b/tests/fixtures/config/ansible-unreachable/main.yaml @@ -0,0 +1,6 @@ +- tenant: + name: tenant-one + source: + gerrit: + config-projects: + - org/project diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index d871a2d80b..d92ef77280 100644 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -4165,6 +4165,40 @@ class TestNoLog(AnsibleZuulTestCase): self.assertNotIn('my-very-secret-password-2', text_log) +class TestUnreachable(AnsibleZuulTestCase): + tenant_config_file = 'config/ansible-unreachable/main.yaml' + + def _get_file(self, build, path): + p = os.path.join(build.jobdir.root, path) + with open(p) as f: + return f.read() + + def test_unreachable(self): + self.wait_timeout = 120 + + # Output extra ansible info so we might see errors. + self.executor_server.verbose = True + self.executor_server.keep_jobdir = True + + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + # The result must be retry limit because jobs with unreachable nodes + # will be retried. + self.assertIn('RETRY_LIMIT', A.messages[0]) + self.assertHistory([ + dict(name='pre-unreachable', result=None, changes='1,1'), + dict(name='pre-unreachable', result=None, changes='1,1'), + dict(name='run-unreachable', result=None, changes='1,1'), + dict(name='run-unreachable', result=None, changes='1,1'), + ], ordered=False) + unreachable_log = self._get_file(self.history[0], + '.ansible/nodes.unreachable') + self.assertEqual('fake\n', unreachable_log) + + class TestJobPause(AnsibleZuulTestCase): tenant_config_file = 'config/job-pause/main.yaml' diff --git a/zuul/ansible/callback/zuul_unreachable.py b/zuul/ansible/callback/zuul_unreachable.py new file mode 100644 index 0000000000..8c7b85f430 --- /dev/null +++ b/zuul/ansible/callback/zuul_unreachable.py @@ -0,0 +1,45 @@ +# Copyright 2018 BMW Carit GmbH +# +# Zuul is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Zuul is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Ansible. If not, see . + +# This is not needed in python3 - but it is needed in python2 because there +# is a json module in ansible.plugins.callback and python2 gets confused. +# Easy local testing with ansible-playbook is handy when hacking on zuul_stream +# so just put in the __future__ statement. +from __future__ import absolute_import + +import os + +from ansible.plugins.callback import default + + +class CallbackModule(default.CallbackModule): + + CALLBACK_VERSION = 2.0 + # aggregate means we can be loaded and not be the stdout plugin + CALLBACK_TYPE = 'aggregate' + CALLBACK_NAME = 'zuul_unreachable' + + def __init__(self): + super(CallbackModule, self).__init__() + self.output_path = os.path.join( + os.environ['ZUUL_JOBDIR'], '.ansible', 'nodes.unreachable') + self.unreachable_hosts = set() + + def v2_runner_on_unreachable(self, result): + host = result._host.get_name() + if host not in self.unreachable_hosts: + self.unreachable_hosts.add(host) + with open(self.output_path, 'a') as f: + f.write('%s\n' % host) diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 75b6e474d0..1955fde5d1 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -370,6 +370,8 @@ class JobDir(object): self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache') os.makedirs(self.fact_cache) self.control_path = os.path.join(self.ansible_cache_root, 'cp') + self.job_unreachable_file = os.path.join(self.ansible_cache_root, + 'nodes.unreachable') os.makedirs(self.control_path) localhost_facts = os.path.join(self.fact_cache, 'localhost') # NOTE(pabelanger): We do not want to leak zuul-executor facts to other @@ -1775,7 +1777,12 @@ class AnsibleJob(object): if timeout and watchdog.timed_out: return (self.RESULT_TIMED_OUT, None) - if ret == 3: + # Note: Unlike documented ansible currently wrongly returns 4 on + # unreachable so we have the zuul_unreachable callback module that + # creates the file job-output.unreachable in case there were + # unreachable nodes. This can be removed once ansible returns a + # distinct value for unreachable. + if ret == 3 or os.path.exists(self.jobdir.job_unreachable_file): # AnsibleHostUnreachable: We had a network issue connecting to # our zuul-worker. return (self.RESULT_UNREACHABLE, None)