Handle concurrent modification during change cache delete

The following race was observed:

1) Several hours before the error, an event caused a change
   to be queried and added to the cache.
2) The change was enqueued in a pipeline for a while and
   therefore stayed in the relevant set.
3) The change was removed from the pipelines.
4) A cache prune process started shortly before the error and
   calculated the relevant set (the change was not in this set)
   and also the set of changes last modified more than an hour
   ago (the change was in this set).  This combination meant the
   entry was subject to pruning.
5) The cache cleanup began slowly deleting changes (this takes
   about 3 minutes).
6) An event arrived for the change.  Gerrit was queried and the
   updated change was inserted into the cache.
7) The cache cleanup method got around to deleting the change
   from the cache.
8) Subsequent queue processors could not find the change in the
   cache and raised an exception.

Or, in fewer words: the change was updated between the time we
decided to delete it and the deletion itself.

The kazoo delete method takes a version argument and fails with
BadVersionError if the znode it would delete is at a different
version than specified.  If we remember the version of the cache
entry at the moment we decide to delete it, we can avoid the race
by ensuring that the znode has not been updated since that
decision.  This change implements that.
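
For illustration, a minimal sketch of this mechanism against a bare
kazoo client (the connection string and the '/demo' znode are
hypothetical stand-ins for the real cache paths):

from kazoo.client import KazooClient
from kazoo.exceptions import BadVersionError

zk = KazooClient(hosts='localhost:2181')
zk.start()

zk.create('/demo', b'v0')
# Remember the znode version at the moment we decide to delete.
stat = zk.exists('/demo')

# Another actor updates the znode after our decision; ZooKeeper
# bumps its version.
zk.set('/demo', b'v1')

try:
    # The versioned delete is rejected because the znode no longer
    # matches the remembered version.
    zk.delete('/demo', version=stat.version)
except BadVersionError:
    # The entry was updated since our decision; leave it alone.
    pass

zk.stop()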

The 'recursive' parameter is removed since kazoo ignores the
version argument on the recursive path, which causes the version
check to always pass.  There are no children under the cache entry,
so recursion isn't necessary.  It was likely only added to simplify
the case where we delete a node that has already been deleted
(NoNodeError).  To account for that, we now handle that exception
explicitly.
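
For context, kazoo only forwards the version to ZooKeeper on the
non-recursive path; paraphrased (not the verbatim kazoo source),
the relevant dispatch looks roughly like this:

# Rough paraphrase of kazoo.client.KazooClient.delete:
def delete(self, path, version=-1, recursive=False):
    if recursive:
        # The recursive branch never consults 'version', so a
        # recursive delete can never raise BadVersionError.
        return self._delete_recursive(path)
    return self.delete_async(path, version).get()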

Change-Id: Ica840225fd52585a29452c80d90a4aa5e7763c8a
commit bfe5a4a935 (parent 0d9e083952)
Author: James E. Blair
Date: 2021-10-18 13:58:30 -07:00
3 changed files with 54 additions and 6 deletions


@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    A bug with the change cache cleanup routine which could have
+    caused items to be stuck in pipelines without running jobs has
+    been corrected.


@@ -1315,6 +1315,30 @@ class TestChangeCache(ZooKeeperBaseTestCase):
         invalid_key = ChangeKey('conn', 'project', 'change', 'invalid', '1')
         self.cache.delete(invalid_key)
 
+    def test_concurrent_delete(self):
+        change = DummyChange("project", {"foo": "bar"})
+        key = ChangeKey('conn', 'project', 'change', 'foo', '1')
+        self.cache.set(key, change)
+        old_version = change.cache_version
+        # Simulate someone updating the cache after we decided to
+        # delete the entry
+        self.cache.set(key, change, old_version)
+        self.assertNotEqual(old_version, change.cache_version)
+        self.cache.delete(key, old_version)
+        # The change should still be in the cache
+        self.assertIsNotNone(self.cache.get(key))
+
+    def test_prune(self):
+        change1 = DummyChange("project", {"foo": "bar"})
+        change2 = DummyChange("project", {"foo": "baz"})
+        key1 = ChangeKey('conn', 'project', 'change', 'foo', '1')
+        key2 = ChangeKey('conn', 'project', 'change', 'foo', '2')
+        self.cache.set(key1, change1)
+        self.cache.set(key2, change2)
+        self.cache.prune([key1], max_age=0)
+        self.assertIsNotNone(self.cache.get(key1))
+        self.assertIsNone(self.cache.get(key2))
+
     def test_concurrent_update(self):
         change = DummyChange("project", {"foo": "bar"})
         key = ChangeKey('conn', 'project', 'change', 'foo', '1')


@@ -201,11 +201,21 @@ class AbstractChangeCache(ZooKeeperSimpleBase, Iterable, abc.ABC):
 
     def prune(self, relevant, max_age=3600):  # 1h
         cutoff_time = time.time() - max_age
-        outdated = {c.cache_stat.key for c in list(self._change_cache.values())
-                    if c.cache_stat.last_modified < cutoff_time}
-        to_prune = outdated - set(relevant)
+        outdated_versions = dict()
+        for c in list(self._change_cache.values()):
+            # Assign to a local variable so all 3 values we use are
+            # consistent in case the cache_stat is updated during this
+            # loop.
+            cache_stat = c.cache_stat
+            if cache_stat.last_modified >= cutoff_time:
+                # This entry isn't old enough to delete yet
+                continue
+            # Save the version we examined so we can make sure to only
+            # delete that version.
+            outdated_versions[cache_stat.key] = cache_stat.version
+        to_prune = set(outdated_versions.keys()) - set(relevant)
         for key in to_prune:
-            self.delete(key)
+            self.delete(key, outdated_versions[key])
 
     def cleanup(self):
         valid_uuids = {c.cache_stat.uuid
@@ -347,12 +357,20 @@ class AbstractChangeCache(ZooKeeperSimpleBase, Iterable, abc.ABC):
         change = self.get(key)
         return change
 
-    def delete(self, key):
+    def delete(self, key, version=-1):
         cache_path = self._cachePath(key._hash)
         # Only delete the cache entry and NOT the data node in order to
         # prevent race conditions with other consumers. The stale data
         # nodes will be removed by the periodic cleanup.
-        self.kazoo_client.delete(cache_path, recursive=True)
+        try:
+            self.kazoo_client.delete(cache_path, version)
+        except BadVersionError:
+            # Someone else may have written a new entry since we
+            # decided to delete this, so we should no longer delete
+            # the entry.
+            return
+        except NoNodeError:
+            pass
         with contextlib.suppress(KeyError):
             del self._change_cache[key._hash]