Handle concurrent modification during change cache delete

The following race was observed:

1) Several hours before the error, an event caused a change
   to be queried and added to the cache.
2) The change was enqueued in a pipeline for a while and
   therefore stayed in the relevant set.
3) The change was removed from the pipelines.
4) A cache prune process started shortly before the error and
   calculated the relevant set (the change was not in this set)
   and also the set of changes last modified more than an hour
   ago (the change was in this set).  This combination meant the
   entry was subject to pruning.
5) The cache cleanup began slowly deleting changes (this takes
   about 3 minutes).
6) An event arrived for the change.  Gerrit was queried and the
   updated change was inserted into the cache.
7) The cache cleanup method got around to deleting the change
   from the cache.
8) Subsequent queue processors could not find the change in the
   cache and raised an exception.

Or, in fewer words: the change was updated between the time we
decided to delete it and the deletion itself.

The kazoo delete method takes a version argument and fails with
BadVersionError if the znode it would delete is at a different
version than specified.  If we remember the version of the cache
entry at the moment we decide to delete it, we can avoid the race
by ensuring that the znode has not been updated since that
decision.  This change implements that.
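
For illustration, a minimal sketch of this mechanism against a bare
kazoo client (the connection string and the '/demo' znode are
hypothetical stand-ins for the real cache paths):

from kazoo.client import KazooClient
from kazoo.exceptions import BadVersionError

zk = KazooClient(hosts='localhost:2181')
zk.start()

zk.create('/demo', b'v0')
# Remember the znode version at the moment we decide to delete.
stat = zk.exists('/demo')

# Another actor updates the znode after our decision; ZooKeeper
# bumps its version.
zk.set('/demo', b'v1')

try:
    # The versioned delete is rejected because the znode no longer
    # matches the remembered version.
    zk.delete('/demo', version=stat.version)
except BadVersionError:
    # The entry was updated since our decision; leave it alone.
    pass

zk.stop()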

The 'recursive' parameter is removed since kazoo ignores the
version argument on the recursive path, which causes the version
check to always pass.  There are no children under the cache entry,
so recursion isn't necessary.  It was likely only added to simplify
the case where we delete a node that has already been deleted
(NoNodeError).  To account for that, we now handle that exception
explicitly.
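
For context, kazoo only forwards the version to ZooKeeper on the
non-recursive path; paraphrased (not the verbatim kazoo source),
the relevant dispatch looks roughly like this:

# Rough paraphrase of kazoo.client.KazooClient.delete:
def delete(self, path, version=-1, recursive=False):
    if recursive:
        # The recursive branch never consults 'version', so a
        # recursive delete can never raise BadVersionError.
        return self._delete_recursive(path)
    return self.delete_async(path, version).get()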

Change-Id: Ica840225fd52585a29452c80d90a4aa5e7763c8a
commit bfe5a4a935 (parent 0d9e083952)
Author: James E. Blair
Date: 2021-10-18 13:58:30 -07:00
3 changed files with 54 additions and 6 deletions


@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    A bug with the change cache cleanup routine which could have
+    caused items to be stuck in pipelines without running jobs has
+    been corrected.


@@ -1315,6 +1315,30 @@ class TestChangeCache(ZooKeeperBaseTestCase):
         invalid_key = ChangeKey('conn', 'project', 'change', 'invalid', '1')
         self.cache.delete(invalid_key)
 
+    def test_concurrent_delete(self):
+        change = DummyChange("project", {"foo": "bar"})
+        key = ChangeKey('conn', 'project', 'change', 'foo', '1')
+        self.cache.set(key, change)
+        old_version = change.cache_version
+        # Simulate someone updating the cache after we decided to
+        # delete the entry
+        self.cache.set(key, change, old_version)
+        self.assertNotEqual(old_version, change.cache_version)
+        self.cache.delete(key, old_version)
+        # The change should still be in the cache
+        self.assertIsNotNone(self.cache.get(key))
+
+    def test_prune(self):
+        change1 = DummyChange("project", {"foo": "bar"})
+        change2 = DummyChange("project", {"foo": "baz"})
+        key1 = ChangeKey('conn', 'project', 'change', 'foo', '1')
+        key2 = ChangeKey('conn', 'project', 'change', 'foo', '2')
+        self.cache.set(key1, change1)
+        self.cache.set(key2, change2)
+        self.cache.prune([key1], max_age=0)
+        self.assertIsNotNone(self.cache.get(key1))
+        self.assertIsNone(self.cache.get(key2))
+
     def test_concurrent_update(self):
         change = DummyChange("project", {"foo": "bar"})
         key = ChangeKey('conn', 'project', 'change', 'foo', '1')


@@ -201,11 +201,21 @@ class AbstractChangeCache(ZooKeeperSimpleBase, Iterable, abc.ABC):
 
     def prune(self, relevant, max_age=3600):  # 1h
         cutoff_time = time.time() - max_age
-        outdated = {c.cache_stat.key for c in list(self._change_cache.values())
-                    if c.cache_stat.last_modified < cutoff_time}
-        to_prune = outdated - set(relevant)
+        outdated_versions = dict()
+        for c in list(self._change_cache.values()):
+            # Assign to a local variable so all 3 values we use are
+            # consistent in case the cache_stat is updated during this
+            # loop.
+            cache_stat = c.cache_stat
+            if cache_stat.last_modified >= cutoff_time:
+                # This entry isn't old enough to delete yet
+                continue
+            # Save the version we examined so we can make sure to only
+            # delete that version.
+            outdated_versions[cache_stat.key] = cache_stat.version
+        to_prune = set(outdated_versions.keys()) - set(relevant)
         for key in to_prune:
-            self.delete(key)
+            self.delete(key, outdated_versions[key])
 
     def cleanup(self):
         valid_uuids = {c.cache_stat.uuid
@@ -347,12 +357,20 @@ class AbstractChangeCache(ZooKeeperSimpleBase, Iterable, abc.ABC):
         change = self.get(key)
         return change
 
-    def delete(self, key):
+    def delete(self, key, version=-1):
         cache_path = self._cachePath(key._hash)
         # Only delete the cache entry and NOT the data node in order to
         # prevent race conditions with other consumers. The stale data
         # nodes will be removed by the periodic cleanup.
-        self.kazoo_client.delete(cache_path, recursive=True)
+        try:
+            self.kazoo_client.delete(cache_path, version)
+        except BadVersionError:
+            # Someone else may have written a new entry since we
+            # decided to delete this, so we should no longer delete
+            # the entry.
+            return
+        except NoNodeError:
+            pass
         with contextlib.suppress(KeyError):
             del self._change_cache[key._hash]