From bfe5a4a93524e1b534851f3d04c4f1ad7d44eec7 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Mon, 18 Oct 2021 13:58:30 -0700
Subject: [PATCH] Handle concurrent modification during change cache delete

The following race was observed:

1) Several hours before the error, an event caused a change to be
   queried and added to the cache.
2) The change was enqueued in a pipeline for a while and therefore
   stayed in the relevant set.
3) The change was removed from the pipelines.
4) A cache prune process started shortly before the error and
   calculated the relevant set (the change was not in this set) and
   also the changes that were last modified > 1 hour ago (the change
   was in this set).  This combination means the entry is subject to
   pruning.
5) The cache cleanup starts slowly deleting changes (this takes about
   3 minutes).
6) An event arrives for the change.  Gerrit is queried and the updated
   change is inserted into the cache.
7) The cache cleanup method gets around to deleting the change from
   the cache.
8) Subsequent queue processes can't find the change in the cache and
   raise an exception.

Or, in fewer words, the change was updated between the decision time
for the deletion and the deletion itself.

The kazoo delete method takes a version argument which will alert us
if the znode it would delete is of a different version than specified.
If we remember the version of the cache entry from when we decide to
delete it, we can avoid the race by ensuring that the deleted znode
hasn't been updated since our decision.

This change implements that.

The 'recursive' parameter is removed since it causes the version check
to always pass.  There are no children under the cache entry, so it's
not necessary.  It was likely only added to simplify the case where we
delete a node which is already deleted (NoNodeError).  To account for
that, we handle that exception explicitly.

Change-Id: Ica840225fd52585a29452c80d90a4aa5e7763c8a
---
 ...ange-cache-prune-fix-eac72e164927c028.yaml |  6 ++++
 tests/unit/test_zk.py                         | 24 +++++++++++++++
 zuul/zk/change_cache.py                       | 30 +++++++++++++++----
 3 files changed, 54 insertions(+), 6 deletions(-)
 create mode 100644 releasenotes/notes/change-cache-prune-fix-eac72e164927c028.yaml

diff --git a/releasenotes/notes/change-cache-prune-fix-eac72e164927c028.yaml b/releasenotes/notes/change-cache-prune-fix-eac72e164927c028.yaml
new file mode 100644
index 0000000000..bb683071ad
--- /dev/null
+++ b/releasenotes/notes/change-cache-prune-fix-eac72e164927c028.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    A bug with the change cache cleanup routine which could have
+    caused items to be stuck in pipelines without running jobs has
+    been corrected.
diff --git a/tests/unit/test_zk.py b/tests/unit/test_zk.py
index c3c23eb422..97e8b130c2 100644
--- a/tests/unit/test_zk.py
+++ b/tests/unit/test_zk.py
@@ -1315,6 +1315,30 @@ class TestChangeCache(ZooKeeperBaseTestCase):
         invalid_key = ChangeKey('conn', 'project', 'change', 'invalid', '1')
         self.cache.delete(invalid_key)
 
+    def test_concurrent_delete(self):
+        change = DummyChange("project", {"foo": "bar"})
+        key = ChangeKey('conn', 'project', 'change', 'foo', '1')
+        self.cache.set(key, change)
+        old_version = change.cache_version
+        # Simulate someone updating the cache after we decided to
+        # delete the entry
+        self.cache.set(key, change, old_version)
+        self.assertNotEqual(old_version, change.cache_version)
+        self.cache.delete(key, old_version)
+        # The change should still be in the cache
+        self.assertIsNotNone(self.cache.get(key))
+
+    def test_prune(self):
+        change1 = DummyChange("project", {"foo": "bar"})
+        change2 = DummyChange("project", {"foo": "baz"})
+        key1 = ChangeKey('conn', 'project', 'change', 'foo', '1')
+        key2 = ChangeKey('conn', 'project', 'change', 'foo', '2')
+        self.cache.set(key1, change1)
+        self.cache.set(key2, change2)
+        self.cache.prune([key1], max_age=0)
+        self.assertIsNotNone(self.cache.get(key1))
+        self.assertIsNone(self.cache.get(key2))
+
     def test_concurrent_update(self):
         change = DummyChange("project", {"foo": "bar"})
         key = ChangeKey('conn', 'project', 'change', 'foo', '1')
diff --git a/zuul/zk/change_cache.py b/zuul/zk/change_cache.py
index 69f806ebce..3d01b7bf06 100644
--- a/zuul/zk/change_cache.py
+++ b/zuul/zk/change_cache.py
@@ -201,11 +201,21 @@ class AbstractChangeCache(ZooKeeperSimpleBase, Iterable, abc.ABC):
 
     def prune(self, relevant, max_age=3600):  # 1h
         cutoff_time = time.time() - max_age
-        outdated = {c.cache_stat.key for c in list(self._change_cache.values())
-                    if c.cache_stat.last_modified < cutoff_time}
-        to_prune = outdated - set(relevant)
+        outdated_versions = dict()
+        for c in list(self._change_cache.values()):
+            # Assign to a local variable so all 3 values we use are
+            # consistent in case the cache_stat is updated during this
+            # loop.
+            cache_stat = c.cache_stat
+            if cache_stat.last_modified >= cutoff_time:
+                # This entry isn't old enough to delete yet
+                continue
+            # Save the version we examined so we can make sure to only
+            # delete that version.
+            outdated_versions[cache_stat.key] = cache_stat.version
+        to_prune = set(outdated_versions.keys()) - set(relevant)
         for key in to_prune:
-            self.delete(key)
+            self.delete(key, outdated_versions[key])
 
     def cleanup(self):
         valid_uuids = {c.cache_stat.uuid
@@ -347,12 +357,20 @@ class AbstractChangeCache(ZooKeeperSimpleBase, Iterable, abc.ABC):
         change = self.get(key)
         return change
 
-    def delete(self, key):
+    def delete(self, key, version=-1):
         cache_path = self._cachePath(key._hash)
         # Only delete the cache entry and NOT the data node in order to
        # prevent race conditions with other consumers. The stale data
         # nodes will be removed by the periodic cleanup.
-        self.kazoo_client.delete(cache_path, recursive=True)
+        try:
+            self.kazoo_client.delete(cache_path, version)
+        except BadVersionError:
+            # Someone else may have written a new entry since we
+            # decided to delete this, so we should no longer delete
+            # the entry.
+            return
+        except NoNodeError:
+            pass
         with contextlib.suppress(KeyError):
             del self._change_cache[key._hash]
 
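For readers unfamiliar with ZooKeeper's conditional operations, the
standalone sketch below (not part of the patch) shows the
version-checked delete pattern the fix relies on, using kazoo
directly.  The host, znode path, and payloads are invented for this
illustration only.

# Standalone illustration of the version-checked delete that closes
# the prune race described in the commit message.  The host, path and
# payloads are made up for this sketch.
from kazoo.client import KazooClient
from kazoo.exceptions import BadVersionError, NoNodeError

client = KazooClient(hosts="127.0.0.1:2181")
client.start()

path = "/sketch/cache/entry"
client.create(path, b"original", makepath=True)

# Decision time: remember the version of the entry we intend to prune.
_, stat = client.get(path)
decided_version = stat.version

# Meanwhile another actor refreshes the entry (step 6 of the race).
client.set(path, b"updated")

# Deletion time: the remembered version no longer matches, so ZooKeeper
# refuses the delete instead of silently discarding the fresh entry.
try:
    client.delete(path, version=decided_version)
except BadVersionError:
    print("entry changed since the prune decision; leaving it in place")
except NoNodeError:
    pass  # already gone; nothing to do

client.stop()

Passing version=-1 (kazoo's default) skips the check entirely, which is
why the patched delete() keeps -1 as its default for callers that do
not care about concurrent updates.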