From 85c70008d6a059c82e1bcd28053b392fb39aa6f8 Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Sun, 20 Jan 2019 10:50:39 +0100 Subject: [PATCH] Retry jobs on winrm send_input failed In some cases winrm tasks fail with the message 'winrm send_input failed'. This error is unrelated to the jobs and is caused by some underlying problems with the winrm connection or powershell on the node. Unfortunately ansible treats this failure as a task failure rather than as an unreachable host and thus returns with its standard error code 2 for failed tasks, so zuul doesn't catch this and retry the job. However, this error can easily be spotted by checking the ansible output for the string 'FATAL ERROR DURING FILE TRANSFER', which is unique to this failure in ansible. By catching this and returning RESULT_UNREACHABLE, we can automatically retry these jobs. Change-Id: Iceba265e9783a96b8f50dfcf269468ddee1df810 --- zuul/executor/server.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 5eca20cebe..05c8729b4a 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -1947,6 +1947,15 @@ class AnsibleJob(object): job_output.write("{now} | {line}\n".format( now=datetime.datetime.now(), line=line.decode('utf-8').rstrip())) + elif ret == 2: + # This is a workaround to detect winrm connection failures that are + # not detected by ansible. These can be detected if the string + # 'FATAL ERROR DURING FILE TRANSFER' is in the ansible output. + # In this case we should treat the host as unreachable and retry + # the job. + for line in syntax_buffer: + if b'FATAL ERROR DURING FILE TRANSFER' in line: + return self.RESULT_UNREACHABLE, None return (self.RESULT_NORMAL, ret)