Improve exception handling around lost requests

An unexpected exception raised while resetting a lost request left
the request locked, so it would never be processed again. Catch and
log such exceptions so that the request is always unlocked afterwards.

Also, raise a ZKLockException when the ZooKeeper node we have been asked
to lock no longer exists (the resulting NoNodeError was the unhandled
exception that caused the lost request handling to fail).

Change-Id: Ie3e91714edc482b7b4fb99d7992cae999b1b7026
This commit is contained in:
David Shrewsbury 2017-10-09 10:44:31 -04:00
parent c739eec853
commit 8dc91bb752
2 changed files with 23 additions and 1 deletions

View File

@ -395,7 +395,12 @@ class CleanupWorker(BaseCleanupWorker):
except exceptions.ZKLockException:
continue
self._resetLostRequest(zk_conn, req)
try:
self._resetLostRequest(zk_conn, req)
except Exception:
self.log.exception("Error resetting lost request %s:",
req.id)
zk_conn.unlockNodeRequest(req)
def _cleanupNodeRequestLocks(self):

View File

@ -625,6 +625,9 @@ class ZooKeeper(object):
except kze.LockTimeout:
raise npe.TimeoutException(
"Timeout trying to acquire lock %s" % lock_path)
except kze.NoNodeError:
have_lock = False
self.log.error("Image build not found for locking: %s", image)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
@ -642,6 +645,10 @@ class ZooKeeper(object):
except kze.LockTimeout:
raise npe.TimeoutException(
"Timeout trying to acquire lock %s" % lock_path)
except kze.NoNodeError:
have_lock = False
self.log.error("Image build number not found for locking: %s, %s",
build_number, image)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
@ -659,6 +666,10 @@ class ZooKeeper(object):
except kze.LockTimeout:
raise npe.TimeoutException(
"Timeout trying to acquire lock %s" % lock_path)
except kze.NoNodeError:
have_lock = False
self.log.error("Image upload not found for locking: %s, %s, %s",
build_number, provider, image)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
@ -1436,6 +1447,9 @@ class ZooKeeper(object):
except kze.LockTimeout:
raise npe.TimeoutException(
"Timeout trying to acquire lock %s" % path)
except kze.NoNodeError:
have_lock = False
self.log.error("Request not found for locking: %s", request)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
@ -1483,6 +1497,9 @@ class ZooKeeper(object):
except kze.LockTimeout:
raise npe.TimeoutException(
"Timeout trying to acquire lock %s" % path)
except kze.NoNodeError:
have_lock = False
self.log.error("Node not found for locking: %s", node)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.