From c531adacaeb5f853bcc43be24572140b12325d78 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 30 Jan 2024 14:18:56 -0800 Subject: [PATCH] Add --keep-config-cache option to delete-state command The circular dependency refactor will require deleting all of the pipeline states as well as the event queues from ZK while zuul is offline during the upgrade. This is fairly close to the existing "delete-state" command, except that we can keep the config cache. Doing so will allow for a faster recovery since we won't need to issue all of the cat jobs again in order to fetch file contents. To facilitate this, we add a "--keep-config-cache" argument to the "delete-state" command which will then remove everything under /zuul except /zuul/config. Also, speed up both operations by implementing a fast recursive delete method which sends async delete ops depth first and only checks their results at the end (as opposed to the standard kazoo delete which checks each operation at once). This is added without a release note since it's not widely useful and the upcoming change which requires its use will have a release note with usage instructions. Change-Id: I4db43e00a73f5e5b796261ffe7236ed906e6b421 --- tests/unit/test_client.py | 40 +++++++++++++++++++++++++++++++++++++++ zuul/cmd/client.py | 28 +++++++++++++++++++++------ zuul/zk/__init__.py | 19 +++++++++++++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 9ccfda2d1d..5aaf186913 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -358,6 +358,46 @@ class TestOfflineZKOperations(ZuulTestCase): self.zk_client.disconnect() + def test_delete_state_keep_config_cache(self): + # Shut everything down (as much as possible) to reduce + # logspam and errors. + ZuulTestCase.shutdown(self) + + # Re-start the client connection because we need one for the + # test. + self.zk_client = ZooKeeperClient.fromConfig(self.config) + self.zk_client.connect() + + config_file = os.path.join(self.test_root, 'zuul.conf') + with open(config_file, 'w') as f: + self.config.write(f) + + # Save a copy of the things we keep in ZK + old_keys = self.getZKTree('/keystorage') + old_config = self.getZKTree('/zuul/config') + + p = subprocess.Popen( + [os.path.join(sys.prefix, 'bin/zuul-admin'), + '-c', config_file, + 'delete-state', '--keep-config-cache', + ], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + out, _ = p.communicate(b'yes\n') + self.log.debug(out.decode('utf8')) + + # Make sure the keys are still around + new_keys = self.getZKTree('/keystorage') + self.assertEqual(new_keys, old_keys) + new_config = self.getZKTree('/zuul/config') + self.assertEqual(new_config, old_config) + + # Make sure we really deleted everything + children = self.zk_client.client.get_children('/zuul') + self.assertEqual(['config'], children) + + self.zk_client.disconnect() + class TestOnlineZKOperations(ZuulTestCase): tenant_config_file = 'config/single-tenant/main.yaml' diff --git a/zuul/cmd/client.py b/zuul/cmd/client.py index 63742d2965..448b56b46b 100755 --- a/zuul/cmd/client.py +++ b/zuul/cmd/client.py @@ -42,6 +42,8 @@ from zuul.zk.locks import tenant_read_lock, pipeline_lock from zuul.zk.zkobject import ZKContext from zuul.zk.components import COMPONENT_REGISTRY +from kazoo.exceptions import NoNodeError + def parse_cutoff(now, before, older_than): if before and not older_than: @@ -496,6 +498,9 @@ class Client(zuul.cmd.ZuulApp): ZooKeeper; it will not remove private keys or Nodepool data.''')) cmd_delete_state.set_defaults(command='delete-state') + cmd_delete_state.add_argument( + '--keep-config-cache', action='store_true', + help='keep config cache') cmd_delete_state.set_defaults(func=self.delete_state) cmd_delete_pipeline_state = subparsers.add_parser( @@ -1014,8 +1019,22 @@ class Client(zuul.cmd.ZuulApp): zk_client.connect() confirm = input("Are you sure you want to delete " "all ephemeral data from ZooKeeper? (yes/no) ") - if confirm.strip().lower() == 'yes': - zk_client.client.delete('/zuul', recursive=True) + if confirm.strip().lower() != 'yes': + print("Aborting") + sys.exit(1) + if self.args.keep_config_cache: + try: + children = zk_client.client.get_children('/zuul') + except NoNodeError: + children = [] + for child in children: + if child == 'config': + continue + path = f'/zuul/{child}' + self.log.debug("Deleting %s", path) + zk_client.fastRecursiveDelete(path) + else: + zk_client.fastRecursiveDelete('/zuul') sys.exit(0) def delete_pipeline_state(self): @@ -1028,16 +1047,13 @@ class Client(zuul.cmd.ZuulApp): safe_tenant = urllib.parse.quote_plus(args.tenant) safe_pipeline = urllib.parse.quote_plus(args.pipeline) COMPONENT_REGISTRY.create(zk_client) - self.log.info('get tenant') with tenant_read_lock(zk_client, args.tenant): path = f'/zuul/tenant/{safe_tenant}/pipeline/{safe_pipeline}' - self.log.info('get pipe') pipeline = Pipeline(args.tenant, args.pipeline) with pipeline_lock( zk_client, args.tenant, args.pipeline ) as plock: - self.log.info('got locks') - zk_client.client.delete(path, recursive=True) + zk_client.fastRecursiveDelete(path) with ZKContext(zk_client, plock, None, self.log) as context: pipeline.state = PipelineState.new( context, _path=path, layout_uuid=None) diff --git a/zuul/zk/__init__.py b/zuul/zk/__init__.py index d4d5a05e4b..3a7c917b5e 100644 --- a/zuul/zk/__init__.py +++ b/zuul/zk/__init__.py @@ -224,6 +224,25 @@ class ZooKeeperClient(object): zstat = self.client.set("/zuul/ltime", b"") return zstat.last_modified_transaction_id + def _fastRecursiveDelete(self, path, results): + try: + children = self.client.get_children(path) + except NoNodeError: + return + if children: + for child in children: + self._fastRecursiveDelete(f'{path}/{child}', results) + results.append(self.client.delete_async(path, -1)) + + def fastRecursiveDelete(self, path): + results = [] + self._fastRecursiveDelete(path, results) + for res in results: + try: + res.get() + except NoNodeError: + pass + class ZooKeeperSimpleBase(metaclass=ABCMeta): """Base class for stateless Zookeeper interaction."""