diff --git a/doc/source/job-content.rst b/doc/source/job-content.rst index 414db621cd..281421421c 100644 --- a/doc/source/job-content.rst +++ b/doc/source/job-content.rst @@ -1222,6 +1222,30 @@ For example the following would skip retrying the build: .. _build_status: +Ansible Groups +-------------- + +Ansible host groups may be configured via the job's :attr:`nodeset`. +In addition to these, Zuul automatically creates a group named +`zuul_unreachable`. It is always present, and is empty when the job +starts. If any playbook encounters an unreachable host, that host is +added to the group for all subsequent playbooks. This can be used to +avoid executing certain post-run playbook steps on hosts that are +already known to be unreachable. For example, to avoid copying logs +from a remote host, a play might look something like: + +.. code-block:: yaml + + - hosts: all:!zuul_unreachable + gather_facts: no + tasks: + - name: Copy logs + ... + +The group name `zuul_unreachable` is reserved by Zuul and will +automatically override any identically named group defined by the +nodeset. + Build Status ------------ diff --git a/releasenotes/notes/zuul_unreachable-6eff96f19b12c113.yaml b/releasenotes/notes/zuul_unreachable-6eff96f19b12c113.yaml new file mode 100644 index 0000000000..68364a479d --- /dev/null +++ b/releasenotes/notes/zuul_unreachable-6eff96f19b12c113.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + When a remote host in a job is found to be unreachable, Zuul will + automatically add it to a new group named `zuul_unreachable` for + all subsequent playbooks. This can be used to avoid running + certain post-run steps on hosts already known to be unreachable. 
diff --git a/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/post.yaml b/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/post.yaml index 3d5c8e7bce..7d3abea77f 100644 --- a/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/post.yaml +++ b/tests/fixtures/config/ansible-unreachable/git/org_project/playbooks/post.yaml @@ -1,3 +1,17 @@ +- hosts: all:!zuul_unreachable + gather_facts: no + tasks: + - name: Output debug statement + debug: + msg: "This host is not unreachable: {{ inventory_hostname }}" + +- hosts: zuul_unreachable + gather_facts: no + tasks: + - name: Output debug statement + debug: + msg: "This host is unreachable: {{ inventory_hostname }}" + - hosts: localhost gather_facts: no tasks: diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py index 1b7895b220..6565650053 100644 --- a/tests/unit/test_v3.py +++ b/tests/unit/test_v3.py @@ -8141,6 +8141,14 @@ class TestUnreachable(AnsibleZuulTestCase): will_retry = f.readline() expect_retry = build.name not in retried_builds self.assertEqual(str(expect_retry), will_retry) + output_path = os.path.join(build.jobdir.root, + 'work/logs/job-output.txt') + with open(output_path) as f: + job_output = f.read() + self.log.debug(job_output) + self.assertNotIn("This host is not unreachable", job_output) + self.assertIn("This host is unreachable: fake", job_output) + retried_builds.add(build.name) conn = self.scheds.first.sched.sql.connection diff --git a/zuul/executor/server.py b/zuul/executor/server.py index b5e78ba292..6e39ecf945 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -919,7 +919,8 @@ def is_group_var_set(name, host, nodeset, job): return False -def make_inventory_dict(nodes, nodeset, hostvars, remove_keys=None): +def make_inventory_dict(nodes, nodeset, hostvars, unreachable_nodes, + remove_keys=None): hosts = {} for node in nodes: node_hostvars = hostvars[node['name']].copy() @@ -940,13 +941,11 @@ def make_inventory_dict(nodes, 
nodeset, hostvars, remove_keys=None): 'all': { 'hosts': hosts, 'vars': all_hostvars, + 'children': {}, } } for group in nodeset.getGroups(): - if 'children' not in inventory['all']: - inventory['all']['children'] = dict() - group_hosts = {} for node_name in group.nodes: group_hosts[node_name] = None @@ -956,6 +955,11 @@ def make_inventory_dict(nodes, nodeset, hostvars, remove_keys=None): 'hosts': group_hosts, }}) + inventory['all']['children'].update({ + 'zuul_unreachable': { + 'hosts': {n: None for n in unreachable_nodes} + }}) + return inventory @@ -1068,6 +1072,7 @@ class AnsibleJob(object): self.frozen_hostvars = {} # The zuul.* vars self.debug_zuul_vars = {} + self.unreachable_nodes = set() self.waiting_for_semaphores = False try: max_attempts = self.arguments["zuul"]["max_attempts"] @@ -2671,11 +2676,23 @@ class AnsibleJob(object): self.original_hostvars[host['name']]) self.original_hostvars[host['name']]['unsafe_vars'] = unsafe + def updateUnreachableHosts(self): + # Load the unreachable file and update our running scoreboard + # of unreachable hosts. + try: + with open(self.jobdir.job_unreachable_file) as f: + for node in filter(None, map(str.strip, f)): + self.log.debug("Noting %s as unreachable", node) + self.unreachable_nodes.add(node) + except Exception: + self.log.exception("Error updating unreachable hosts:") + def writeDebugInventory(self): # This file is unused by Zuul, but the base jobs copy it to logs # for debugging, so let's continue to put something there. 
inventory = make_inventory_dict( - self.host_list, self.nodeset, self.original_hostvars) + self.host_list, self.nodeset, self.original_hostvars, + self.unreachable_nodes) inventory['all']['vars']['zuul'] = self.debug_zuul_vars with open(self.jobdir.inventory, 'w') as inventory_yaml: @@ -2701,6 +2718,7 @@ class AnsibleJob(object): def writeInventory(self, jobdir_playbook, hostvars): inventory = make_inventory_dict( self.host_list, self.nodeset, hostvars, + self.unreachable_nodes, remove_keys=jobdir_playbook.secrets_keys) with open(jobdir_playbook.inventory, 'w') as inventory_yaml: @@ -3003,7 +3021,7 @@ class AnsibleJob(object): return (self.RESULT_TIMED_OUT, None) # Note: Unlike documented ansible currently wrongly returns 4 on # unreachable so we have the zuul_unreachable callback module that - # creates the file job-output.unreachable in case there were + # creates the file nodes.unreachable in case there were # unreachable nodes. This can be removed once ansible returns a # distinct value for unreachable. # TODO: Investigate whether the unreachable callback can be @@ -3012,6 +3030,7 @@ class AnsibleJob(object): if ret == 3 or os.path.exists(self.jobdir.job_unreachable_file): # AnsibleHostUnreachable: We had a network issue connecting to # our zuul-worker. + self.updateUnreachableHosts() return (self.RESULT_UNREACHABLE, None) elif ret == -9: # Received abort request.