Fix race involving job request locks
It's possible for the following sequence to occur (prefixed by thread ids): 2> process job request cache update 1> finish job 1> set job request state to complete 1> unlock job request 1> delete job request 1> delete job request lock 2> get cached list of running jobs for lostRequests, start examining job 2> check if the job is unlocked (this will re-create the lock dir and return true) 2> attempt to set job request state to complete (this will raise JobRequestNotFound) 2> bail This leaves a lock node laying around. We have a cleanup process that will eventually remove it in production, but it's existence can cause the clean-state checks at the end of unit tests to fail. To correct this: a) Try to avoid re-creating the lock (though this is not possible in all cases) b) If we encounter JobRequestNotFound error in the cleanup, attempt to delete the job nonetheless (so that we re-delete the lock dir) The remove method is also made entirely idemptotent to support this. Change-Id: I49ad5c38a3c6cbaf0962e805b6c228e36b97a3d2
This commit is contained in:
parent
9a27c447c1
commit
97a76de403
|
@ -238,6 +238,9 @@ class ExecutorClient(object):
|
|||
self.executor_api.update(build_request)
|
||||
except JobRequestNotFound as e:
|
||||
self.log.warning("Could not complete build: %s", str(e))
|
||||
# In case we re-created the lock directory, still remove
|
||||
# the request for the side effect of removing the lock.
|
||||
self.executor_api.remove(build_request)
|
||||
return
|
||||
except BadVersionError:
|
||||
# There could be a race condition:
|
||||
|
|
|
@ -144,6 +144,9 @@ class MergeClient(object):
|
|||
self.merger_api.remove(merge_request)
|
||||
except JobRequestNotFound as e:
|
||||
self.log.warning("Could not complete merge: %s", str(e))
|
||||
# In case we re-created the lock directory, still remove
|
||||
# the request for the side effect of removing the lock.
|
||||
self.merger_api.remove(merge_request)
|
||||
return
|
||||
except BadVersionError:
|
||||
# There could be a race condition:
|
||||
|
|
|
@ -317,7 +317,10 @@ class JobRequestQueue(ZooKeeperSimpleBase):
|
|||
except NoNodeError:
|
||||
# Nothing to do if the node is already deleted
|
||||
pass
|
||||
self.clearParams(request)
|
||||
try:
|
||||
self.clearParams(request)
|
||||
except NoNodeError:
|
||||
pass
|
||||
try:
|
||||
# Delete the lock parent node as well
|
||||
path = "/".join([self.LOCK_ROOT, request.uuid])
|
||||
|
@ -421,6 +424,8 @@ class JobRequestQueue(ZooKeeperSimpleBase):
|
|||
|
||||
def isLocked(self, request):
|
||||
path = "/".join([self.LOCK_ROOT, request.uuid])
|
||||
if not self.kazoo_client.exists(path):
|
||||
return False
|
||||
lock = SessionAwareLock(self.kazoo_client, path)
|
||||
is_locked = len(lock.contenders()) > 0
|
||||
return is_locked
|
||||
|
|
Loading…
Reference in New Issue