Increase merge retries and delays in between

GitHub occasionally denies merges because some internal background
process didn't finish necessary work for it. This happens regularly in
our deployment. Currently zuul retries the merge after two seconds and
then fails and triggers a gate reset. However we see frequent
occasions where another retry or longer wait times would have made it
to succeed. Further we also observed that GitHub sometimes lied about
the result of a merge operation by doing the merge successfully but
responding with an error.

In order to handle those issues more robust retry more often and sleep
slightly longer between the retries. Further after getting an error we
still need to check if the PR is merged in case it succeeded 'in the
background'.

Change-Id: I45a08f32a243d0a0bd1492fa0d244f81e351772a
This commit is contained in:
Tobias Henkel 2020-10-05 15:47:40 +02:00 committed by Simon Westphahl
parent 493c13cb77
commit 7f360b4f69
No known key found for this signature in database
6 changed files with 82 additions and 8 deletions

View File

@ -309,6 +309,7 @@ class PullRequest(ObjectType):
isDraft = Boolean()
reviewDecision = String()
mergeable = String()
merged = Boolean()
reviewThreads = Field(PullRequestReviewThreads,
first=Int(), after=String())
@ -321,6 +322,9 @@ class PullRequest(ObjectType):
def resolve_mergeable(parent, info):
return "MERGEABLE" if parent.mergeable else "CONFLICTING"
def resolve_merged(parent, info):
return parent.is_merged
def resolve_reviewThreads(parent, info, first, after=None):
if after is None:
after = '0'

View File

@ -759,7 +759,7 @@ class TestGithubDriver(ZuulTestCase):
# pipeline does not merge the pull request
# merge failed on 405 Method Not Allowed error - twice
self.fake_github.merge_not_allowed_count = 2
self.fake_github.merge_not_allowed_count = 5
D = self.fake_github.openFakePullRequest('org/project', 'master', 'D')
self.fake_github.emitEvent(D.getCommentAddedEvent('merge me'))
self.waitUntilSettled()
@ -771,6 +771,26 @@ class TestGithubDriver(ZuulTestCase):
'Pull request merge failed: Merge not allowed '
'because of fake reason')
@simple_layout('layouts/merging-github.yaml', driver='github')
@okay_tracebacks('Unknown merge failure',
'Method not allowed')
def test_report_pull_merge_error_success(self):
A = self.fake_github.openFakePullRequest('org/project', 'master', 'A')
# Fail all merge attempts
self.fake_github.merge_failure = True
self.fake_github.merge_not_allowed_count = 5
with mock.patch(
"tests.fake_graphql.PullRequest.resolve_merged"
) as merged_mock:
# Report change as merged despite the merge failure.
merged_mock.return_value = True
self.fake_github.emitEvent(A.getCommentAddedEvent('merge me'))
self.waitUntilSettled()
# Checking A.is_merged doesn't work here as we did not call
# A.setMerged(). Instead make sure that no error was reported.
self.assertEqual(len(A.comments), 0)
@simple_layout('layouts/merging-github.yaml', driver='github')
def test_report_pull_merge_message_reviewed_by(self):
# pipeline merges the pull request on success

View File

@ -2096,6 +2096,10 @@ class GithubConnection(ZKChangeCacheMixin, ZKBranchCacheMixin, BaseConnection):
return True
def isMerged(self, change, event):
github = self.getGithubClient(change.project.name, zuul_event_id=event)
return self.graphql_client.merged(github, change)
def getPullBySha(self, sha, project_name, event):
log = get_annotated_logger(self.log, event)

View File

@ -226,7 +226,8 @@ class GithubReporter(BaseReporter):
message = self._formatMergeMessage(change, merge_mode)
merge_mode = self.merge_modes[merge_mode]
for i in [1, 2]:
max_retries = 5
for i in range(1, max_retries + 1):
try:
self.connection.mergePull(project, pr_number, message, sha=sha,
method=merge_mode,
@ -235,13 +236,30 @@ class GithubReporter(BaseReporter):
is_merged=True)
return
except MergeFailure as e:
log.exception('Merge attempt of change %s %s/2 failed.',
change, i, exc_info=True)
log.exception('Merge attempt of change %s %s/%s failed.',
change, i, max_retries, exc_info=True)
error_message = str(e)
if i == 1:
time.sleep(2)
log.warning('Merge of change %s failed after 2 attempts, giving up',
change)
if i < max_retries:
time.sleep(5)
# Sometimes GitHub lies about the merge status and reports
# merge failed but in fact did the merge successfully. To
# account for that re-fetch the PR and check if it's still
# unmerged.
try:
if self.connection.isMerged(change, item.event):
# The change has been merged despite we got an
# error back from GitHub
log.warning(
'Merge attempt of change %s failed but change '
'is merged. Treating as success.', change)
self.connection.updateChangeAttributes(
change, is_merged=True)
return
except Exception:
log.exception('Error while checking for merged '
'change %s', change)
log.warning('Merge of change %s failed after %s attempts, giving up',
change, max_retries)
raise MergeFailure(error_message)
def addReview(self, item, change):

View File

@ -44,6 +44,7 @@ class GraphQLClient:
'canmerge-page-threads',
'branch-protection',
'branch-protection-inner',
'merged',
]
for query_name in query_names:
f = importlib.resources.files('zuul').joinpath(
@ -277,3 +278,19 @@ class GraphQLClient:
log = get_annotated_logger(self.log, zuul_event_id)
return self._fetch_branch_protection(log, github, project,
zuul_event_id)
def merged(self, github, change):
owner, repo = change.project.name.split('/')
variables = {
'zuul_query': 'merged', # used for logging
'owner': owner,
'repo': repo,
'pull': change.number,
}
query = self.queries['merged']
query = self._prepare_query(query, variables)
response = github.session.post(self.url, json=query)
response.raise_for_status()
return nested_get(response.json(), 'data', 'repository', 'pullRequest',
'merged')

View File

@ -0,0 +1,11 @@
query canMergeData(
$owner: String!
$repo: String!
$pull: Int!
) {
repository(owner: $owner, name: $repo) {
pullRequest(number: $pull) {
merged
}
}
}