From 3815cce7aadc6dee64824a1407d8c33171732809 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Tue, 1 Aug 2023 13:48:59 -0700
Subject: [PATCH] Change image ID from int sequence to UUID

When we export and import image data (for backup/restore purposes),
we need to reset the ZK sequence counter for image builds in order
to avoid collisions.  The only way we can do that is to create and
then delete a large number of znodes.  Some sites (including OpenDev)
have sequence numbers that are in the hundreds of thousands.

To avoid this time-consuming operation (which is only intended to be
run to restore from backup -- when operators are already under
additional stress!), this change switches the build IDs from integer
sequences to UUIDs.  This avoids the problem with collisions after
import (at least, to the degree that UUIDs avoid collisions).

The actual change is fairly simple, but many unit tests need to be
updated.

Since the change is user-visible in the command output (image lists,
etc), a release note is added.

A related change which updates all of the textual references of
build "number" to build "id" follows this one for clarity and ease
of review.

Change-Id: Ie7c68b094bc9733914a808756eeee8b62f696713
---
 nodepool/builder.py                      |  11 +-
 nodepool/tests/__init__.py               |  21 +++-
 nodepool/tests/unit/test_builder.py      | 107 ++++++++----------
 nodepool/tests/unit/test_commands.py     |  52 ++++-----
 nodepool/tests/unit/test_webapp.py       |  12 +-
 nodepool/tests/unit/test_zk.py           |   2 +-
 nodepool/zk/zookeeper.py                 |  35 +++---
 .../notes/buildid-4131ffe6f4cea09f.yaml  |   8 ++
 8 files changed, 128 insertions(+), 120 deletions(-)
 create mode 100644 releasenotes/notes/buildid-4131ffe6f4cea09f.yaml

diff --git a/nodepool/builder.py b/nodepool/builder.py
index 2fdd15eec..10227275e 100644
--- a/nodepool/builder.py
+++ b/nodepool/builder.py
@@ -390,7 +390,7 @@ class CleanupWorker(BaseWorker):
                 self.log.exception("Exception cleaning up local build %s:",
                                    local_build)
 
-    LOCAL_BUILDS_RE = re.compile(r'^(.*)-(\d+)\.(?:\w+)$')
+    LOCAL_BUILDS_RE = re.compile(r'^(.*)-([abcdef\d]+)\.(?:\w+)$')
 
     def _getLocalBuilds(self):
         seen = set()
@@ -628,13 +628,14 @@ class BuildWorker(BaseWorker):
             return
         log_dir = self._getBuildLogRoot(name)
         keep = max(self._config.build_log_retention, 1)
-        existing = sorted(os.listdir(log_dir))
+        existing = os.listdir(log_dir)
         existing = [f for f in existing if f.startswith(name)]
+        existing = [os.path.join(log_dir, f) for f in existing]
+        existing = sorted(existing, key=os.path.getmtime)
         delete = existing[:0 - keep]
         for f in delete:
-            fp = os.path.join(log_dir, f)
-            self.log.info("Deleting old build log %s" % (fp,))
-            os.unlink(fp)
+            self.log.info("Deleting old build log %s", f)
+            os.unlink(f)
 
     def _getBuildLog(self, name, build_id):
         log_dir = self._getBuildLogRoot(name)
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 97540ff24..2bc7a72dc 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -516,21 +516,32 @@ class DBTestCase(BaseTestCase):
                     break
             self.wait_for_threads()
 
-    def waitForBuild(self, image_name, build_id, states=None,
-                     check_files=True):
+    def waitForBuild(self, image_name, states=None, check_files=True,
+                     ignore_list=None):
         if states is None:
             states = (zk.READY,)
 
-        base = "-".join([image_name, build_id])
+        if ignore_list:
+            ignore_list = [b.id for b in ignore_list]
+        else:
+            ignore_list = []
 
         for _ in iterate_timeout(ONE_MINUTE, Exception,
                                  "Image build record to reach state",
                                  interval=1):
             self.wait_for_threads()
-            build = self.zk.getBuild(image_name, 
build_id) - if build and build.state in states: + build = None + for b in self.zk.getBuilds(image_name): + if b.state not in states: + continue + if b.id in ignore_list: + continue + build = b + if build: break + base = "-".join([image_name, build.id]) + # We should only expect a dib manifest with a successful build. while check_files and build.state == zk.READY: self.wait_for_threads() diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py index 66deae3a7..25f88e3e2 100644 --- a/nodepool/tests/unit/test_builder.py +++ b/nodepool/tests/unit/test_builder.py @@ -230,10 +230,10 @@ class TestNodePoolBuilder(tests.DBTestCase): configfile = self.setup_config('node_two_image.yaml') self.useBuilder(configfile) self.waitForImage('fake-provider', 'fake-image') - self.waitForImage('fake-provider', 'fake-image2') + image2 = self.waitForImage('fake-provider', 'fake-image2') self.replace_config(configfile, 'node_two_image_remove.yaml') self.waitForImageDeletion('fake-provider', 'fake-image2') - self.waitForBuildDeletion('fake-image2', '0000000001') + self.waitForBuildDeletion('fake-image2', image2.build_id) def test_image_removal_two_builders(self): # This tests the gap between building and uploading an image. @@ -246,7 +246,7 @@ class TestNodePoolBuilder(tests.DBTestCase): configfile1 = self.setup_config('builder_image1.yaml') builder1 = self.useBuilder(configfile1) self.waitForImage('fake-provider1', 'fake-image1') - self.waitForBuild('fake-image1', '0000000001') + self.waitForBuild('fake-image1') for worker in builder1._upload_workers: worker.shutdown() worker.join() @@ -260,7 +260,7 @@ class TestNodePoolBuilder(tests.DBTestCase): self.waitForImage('fake-provider2', 'fake-image2') # Don't check files because the image path switched to the # second builder; we're really only interested in ZK. - self.waitForBuild('fake-image1', '0000000001', check_files=False) + self.waitForBuild('fake-image1', check_files=False) def test_image_removal_dib_deletes_first(self): # Break cloud image deleting @@ -285,13 +285,13 @@ class TestNodePoolBuilder(tests.DBTestCase): 'DIB disk files did not delete first'): self.wait_for_threads() files = builder.DibImageFile.from_image_id( - self._config_images_dir.path, 'fake-image2-0000000001') + self._config_images_dir.path, f'fake-image2-{img.build_id}') if not files: break # Check image is still in fake-provider cloud img.state = zk.DELETING self.assertEqual( - self.zk.getImageUpload('fake-image2', '0000000001', + self.zk.getImageUpload('fake-image2', img.build_id, 'fake-provider', '0000000001'), img) @@ -301,7 +301,7 @@ class TestNodePoolBuilder(tests.DBTestCase): # Check image is removed from cloud and zk self.waitForImageDeletion('fake-provider', 'fake-image2', match=img) # Check build is removed from zk - self.waitForBuildDeletion('fake-image2', '0000000001') + self.waitForBuildDeletion('fake-image2', img.build_id) def test_image_rebuild_age(self): self._test_image_rebuild_age() @@ -309,40 +309,40 @@ class TestNodePoolBuilder(tests.DBTestCase): def _test_image_rebuild_age(self, expire=86400): configfile = self.setup_config('node.yaml') self.useBuilder(configfile) - build = self.waitForBuild('fake-image', '0000000001') + build1 = self.waitForBuild('fake-image') log_path1 = os.path.join(self._config_build_log_dir.path, - 'fake-image-0000000001.log') + f'fake-image-{build1.id}.log') self.assertTrue(os.path.exists(log_path1)) image = self.waitForImage('fake-provider', 'fake-image') # Expire rebuild-age (default: 1day) to force a new build. 
-        build.state_time -= expire
+        build1.state_time -= expire
         with self.zk.imageBuildLock('fake-image', blocking=True, timeout=1):
-            self.zk.storeBuild('fake-image', build, '0000000001')
-        self.waitForBuild('fake-image', '0000000002')
+            self.zk.storeBuild('fake-image', build1, build1.id)
+        build2 = self.waitForBuild('fake-image', ignore_list=[build1])
         log_path2 = os.path.join(self._config_build_log_dir.path,
-                                 'fake-image-0000000002.log')
+                                 f'fake-image-{build2.id}.log')
         self.assertTrue(os.path.exists(log_path2))
         self.waitForImage('fake-provider', 'fake-image', [image])
 
         builds = self.zk.getBuilds('fake-image', zk.READY)
         self.assertEqual(len(builds), 2)
 
-        return (build, image)
+        return (build1, build2, image)
 
     def test_image_rotation(self):
         # Expire rebuild-age (2days), to avoid problems when expiring 2 images.
-        self._test_image_rebuild_age(expire=172800)
-        build = self.waitForBuild('fake-image', '0000000002')
+        build1, build2, image = self._test_image_rebuild_age(expire=172800)
 
         # Expire rebuild-age (default: 1day) to force a new build.
-        build.state_time -= 86400
+        build2.state_time -= 86400
         with self.zk.imageBuildLock('fake-image', blocking=True, timeout=1):
-            self.zk.storeBuild('fake-image', build, '0000000002')
-        self.waitForBuildDeletion('fake-image', '0000000001')
-        self.waitForBuild('fake-image', '0000000003')
+            self.zk.storeBuild('fake-image', build2, build2.id)
+        self.waitForBuildDeletion('fake-image', build1.id)
+        build3 = self.waitForBuild('fake-image',
+                                   ignore_list=[build1, build2])
 
         log_path1 = os.path.join(self._config_build_log_dir.path,
-                                 'fake-image-0000000001.log')
+                                 f'fake-image-{build1.id}.log')
         log_path2 = os.path.join(self._config_build_log_dir.path,
-                                 'fake-image-0000000002.log')
+                                 f'fake-image-{build2.id}.log')
         log_path3 = os.path.join(self._config_build_log_dir.path,
-                                 'fake-image-0000000003.log')
+                                 f'fake-image-{build3.id}.log')
         # Our log retention is set to 1, so the first log should be deleted.
         self.assertFalse(os.path.exists(log_path1))
         self.assertTrue(os.path.exists(log_path2))
@@ -361,16 +361,15 @@ class TestNodePoolBuilder(tests.DBTestCase):
         # total of 2 diskimages on disk at all times.
 
         # Expire rebuild-age (2days), to avoid problems when expiring 2 images.
-        build001, image001 = self._test_image_rebuild_age(expire=172800)
-        build002 = self.waitForBuild('fake-image', '0000000002')
+        build1, build2, image1 = self._test_image_rebuild_age(expire=172800)
 
         # Make sure 2nd diskimage build was uploaded.
-        image002 = self.waitForImage('fake-provider', 'fake-image', [image001])
-        self.assertEqual(image002.build_id, '0000000002')
+        image2 = self.waitForImage('fake-provider', 'fake-image',
+                                   ignore_list=[image1])
 
         # Delete external name / id so we can test exception handlers.
         upload = self.zk.getUploads(
-            'fake-image', '0000000001', 'fake-provider', zk.READY)[0]
+            'fake-image', build1.id, 'fake-provider', zk.READY)[0]
         upload.external_name = None
         upload.external_id = None
         with self.zk.imageUploadLock(upload.image_name, upload.build_id,
                                      upload.provider_name, blocking=True,
                                      timeout=1):
             self.zk.storeImageUpload(upload.image_name, upload.build_id,
                                      upload.provider_name, upload, upload.id)
 
         # Expire rebuild-age (default: 1day) to force a new build.
- build002.state_time -= 86400 + build2.state_time -= 86400 with self.zk.imageBuildLock('fake-image', blocking=True, timeout=1): - self.zk.storeBuild('fake-image', build002, '0000000002') - self.waitForBuildDeletion('fake-image', '0000000001') + self.zk.storeBuild('fake-image', build2, build2.id) + self.waitForBuildDeletion('fake-image', build1.id) # Make sure fake-image for fake-provider is removed from zookeeper. upload = self.zk.getUploads( - 'fake-image', '0000000001', 'fake-provider') + 'fake-image', build1.id, 'fake-provider') self.assertEqual(len(upload), 0) - self.waitForBuild('fake-image', '0000000003') + build3 = self.waitForBuild('fake-image', + ignore_list=[build1, build2]) # Ensure we only have 2 builds on disk. builds = self.zk.getBuilds('fake-image', zk.READY) self.assertEqual(len(builds), 2) # Make sure 3rd diskimage build was uploaded. - image003 = self.waitForImage( - 'fake-provider', 'fake-image', [image001, image002]) - self.assertEqual(image003.build_id, '0000000003') + image3 = self.waitForImage( + 'fake-provider', 'fake-image', ignore_list=[image1, image2]) + self.assertEqual(image3.build_id, build3.id) def test_cleanup_hard_upload_fails(self): configfile = self.setup_config('node.yaml') self.useBuilder(configfile) - self.waitForImage('fake-provider', 'fake-image') + image = self.waitForImage('fake-provider', 'fake-image') - upload = self.zk.getUploads('fake-image', '0000000001', + upload = self.zk.getUploads('fake-image', image.build_id, 'fake-provider', zk.READY)[0] # Store a new ZK node as UPLOADING to represent a hard fail @@ -433,16 +433,8 @@ class TestNodePoolBuilder(tests.DBTestCase): found = False for _ in iterate_timeout(10, Exception, 'image builds to fail', 0.1): builds = self.zk.getBuilds('fake-image') - for build in builds: - # Lexicographical order - if build and build.id > '0000000001': - # We know we've built more than one image and we know - # they have all failed. We can't check if they have - # failed directly because they may be cleaned up. - found = build.id - break - time.sleep(0.1) - if found: + if builds: + found = builds[0].id break # Now replace the config with a valid config and check that the image @@ -460,7 +452,7 @@ class TestNodePoolBuilder(tests.DBTestCase): def test_diskimage_build_only(self): configfile = self.setup_config('node_diskimage_only.yaml') self.useBuilder(configfile) - build_tar = self.waitForBuild('fake-image', '0000000001') + build_tar = self.waitForBuild('fake-image') self.assertEqual(build_tar._formats, ['tar']) self.assertReportedStat('nodepool.dib_image_build.' 
@@ -476,9 +468,8 @@ class TestNodePoolBuilder(tests.DBTestCase): def test_diskimage_build_formats(self): configfile = self.setup_config('node_diskimage_formats.yaml') self.useBuilder(configfile) - build_default = self.waitForBuild('fake-image-default-format', - '0000000001') - build_vhd = self.waitForBuild('fake-image-vhd', '0000000001') + build_default = self.waitForBuild('fake-image-default-format') + build_vhd = self.waitForBuild('fake-image-vhd') self.assertEqual(build_default._formats, ['qcow2']) self.assertEqual(build_vhd._formats, ['vhd']) @@ -491,15 +482,15 @@ class TestNodePoolBuilder(tests.DBTestCase): def test_diskimage_build_parents(self): configfile = self.setup_config('node_diskimage_parents.yaml') self.useBuilder(configfile) - self.waitForBuild('parent-image-1', '0000000001') - self.waitForBuild('parent-image-2', '0000000001') + self.waitForBuild('parent-image-1') + self.waitForBuild('parent-image-2') @mock.patch('select.poll') def test_diskimage_build_timeout(self, mock_poll): configfile = self.setup_config('diskimage_build_timeout.yaml') builder.BUILD_PROCESS_POLL_TIMEOUT = 500 self.useBuilder(configfile, cleanup_interval=0) - self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,)) + self.waitForBuild('fake-image', states=(zk.FAILED,)) def test_session_loss_during_build(self): configfile = self.setup_config('node.yaml') @@ -513,7 +504,7 @@ class TestNodePoolBuilder(tests.DBTestCase): # Disable cleanup thread to verify builder cleans up after itself bldr = self.useBuilder(configfile, cleanup_interval=0) - self.waitForBuild('fake-image', '0000000001', states=(zk.BUILDING,)) + self.waitForBuild('fake-image', states=(zk.BUILDING,)) # The build should now be paused just before writing out any DIB files. # Mock the next call to storeBuild() which is supposed to be the update @@ -587,9 +578,9 @@ class TestNodePoolBuilder(tests.DBTestCase): def test_post_upload_hook(self): configfile = self.setup_config('node_upload_hook.yaml') bldr = self.useBuilder(configfile) - self.waitForImage('fake-provider', 'fake-image') + image = self.waitForImage('fake-provider', 'fake-image') images_dir = bldr._config.images_dir post_file = os.path.join( - images_dir, 'fake-image-0000000001.qcow2.post') + images_dir, f'fake-image-{image.build_id}.qcow2.post') self.assertTrue(os.path.exists(post_file), 'Post hook file exists') diff --git a/nodepool/tests/unit/test_commands.py b/nodepool/tests/unit/test_commands.py index f38e9ca1b..16d994a6a 100644 --- a/nodepool/tests/unit/test_commands.py +++ b/nodepool/tests/unit/test_commands.py @@ -254,10 +254,10 @@ class TestNodepoolCMD(tests.DBTestCase): self.patch_argv('-c', configfile, 'dib-image-delete', 'fake-image-%s' % (builds[0].id)) nodepoolcmd.main() - self.waitForBuildDeletion('fake-image', '0000000001') + self.waitForBuildDeletion('fake-image', builds[0].id) # Check that fake-image-0000000001 doesn't exist self.assert_listed( - configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0) + configfile, ['dib-image-list'], 0, f'fake-image-{builds[0].id}', 0) def test_dib_image_delete_custom_image_creation(self): # Test deletion of images without a .d manifest folder @@ -274,22 +274,22 @@ class TestNodepoolCMD(tests.DBTestCase): builds = self.zk.getMostRecentBuilds(1, 'fake-image', zk.READY) # Delete manifest folder to simulate custom image creation shutil.rmtree(os.path.join(builder._config.images_dir, - 'fake-image-0000000001.d')) + f'fake-image-{builds[0].id}.d')) # But ensure image is still present self.assertTrue( 
os.path.exists(os.path.join(builder._config.images_dir, - 'fake-image-0000000001.qcow2'))) + f'fake-image-{builds[0].id}.qcow2'))) # Delete the image self.patch_argv('-c', configfile, 'dib-image-delete', 'fake-image-%s' % (builds[0].id)) nodepoolcmd.main() - self.waitForBuildDeletion('fake-image', '0000000001') + self.waitForBuildDeletion('fake-image', builds[0].id) # Check that fake-image-0000000001 doesn't exist self.assert_listed( - configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0) + configfile, ['dib-image-list'], 0, f'fake-image-{builds[0].id}', 0) self.assertFalse( os.path.exists(os.path.join(builder._config.images_dir, - 'fake-image-0000000001.qcow2'))) + f'fake-image-{builds[0].id}.qcow2'))) def test_dib_image_delete_two_builders(self): # This tests deleting an image when its builder is offline @@ -316,19 +316,20 @@ class TestNodepoolCMD(tests.DBTestCase): nodepoolcmd.main() # 4. Verify builder2 deleted the ZK records, but image is still on disk - self.waitForBuildDeletion('fake-image', '0000000001') + self.waitForBuildDeletion('fake-image', builds[0].id) self.assertTrue( os.path.exists(os.path.join(builder1._config.images_dir, - 'fake-image-0000000001.d'))) + f'fake-image-{builds[0].id}.d'))) self.assertFalse( os.path.exists(os.path.join(builder2._config.images_dir, - 'fake-image-0000000001.d'))) + f'fake-image-{builds[0].id}.d'))) # 5. Start builder1 and verify it deletes image on disk builder1 = self.useBuilder(configfile1) for _ in iterate_timeout(30, AssertionError, 'image file delete'): - if not os.path.exists(os.path.join(builder1._config.images_dir, - 'fake-image-0000000001.d')): + if not os.path.exists( + os.path.join(builder1._config.images_dir, + f'fake-image-{builds[0].id}.d')): break def test_delete(self): @@ -650,15 +651,15 @@ class TestNodepoolCMD(tests.DBTestCase): self.startPool(pool) self.waitForNodes('fake-label') - build = self.waitForBuild('fake-image', '0000000001') + build = self.waitForBuild('fake-image') # Delete the first build so that we have a hole in our # numbering. This lets us validate that we reconstruct the # sequence state correctly. build.state = zk.DELETING with self.zk.imageBuildLock('fake-image', blocking=True, timeout=1): - self.zk.storeBuild('fake-image', build, '0000000001') - self.waitForBuildDeletion('fake-image', '0000000001') - self.waitForBuild('fake-image', '0000000002') + self.zk.storeBuild('fake-image', build, build.id) + self.waitForBuildDeletion('fake-image', build.id) + build2 = self.waitForBuild('fake-image', ignore_list=[build]) pool.stop() for worker in builder._upload_workers: @@ -668,7 +669,7 @@ class TestNodepoolCMD(tests.DBTestCase): # Save a copy of the data in ZK old_data = self.getZKTree('/nodepool/images') # We aren't backing up the lock data - old_data.pop('/nodepool/images/fake-image/builds/0000000002' + old_data.pop(f'/nodepool/images/fake-image/builds/{build2.id}' '/providers/fake-provider/images/lock') old_data.pop('/nodepool/images/fake-image/builds/lock') @@ -692,7 +693,7 @@ class TestNodepoolCMD(tests.DBTestCase): # First test a new upload of the existing image and make sure # it uses the correct sequence number. 
upload = self.waitForUpload('fake-provider', 'fake-image', - '0000000002', '0000000001') + build2.id, '0000000001') upload.state = zk.DELETING with self.zk.imageUploadLock(upload.image_name, upload.build_id, upload.provider_name, blocking=True, @@ -702,15 +703,12 @@ class TestNodepoolCMD(tests.DBTestCase): # We skip at least one number because upload lock is a sequence # node too (this is why builds and uploads start at 1 instead of 0). upload = self.waitForUpload('fake-provider', 'fake-image', - '0000000002', '0000000003') + build2.id, '0000000003') - # Now build a new image and make sure it uses the correct - # sequence number. - build = self.waitForBuild('fake-image', '0000000002') + # Now build a new image and make sure it uses a different id. + build2 = self.waitForBuild('fake-image', ignore_list=[build]) # Expire rebuild-age (default: 1day) to force a new build. - build.state_time -= 86400 + build2.state_time -= 86400 with self.zk.imageBuildLock('fake-image', blocking=True, timeout=1): - self.zk.storeBuild('fake-image', build, '0000000002') - # We skip at least one number because build lock is a sequence - # node too (this is why builds and uploads start at 1 instead of 0). - self.waitForBuild('fake-image', '0000000004') + self.zk.storeBuild('fake-image', build2, build2.id) + self.waitForBuild('fake-image', ignore_list=[build, build2]) diff --git a/nodepool/tests/unit/test_webapp.py b/nodepool/tests/unit/test_webapp.py index bcb01d6e4..b7aa47f30 100644 --- a/nodepool/tests/unit/test_webapp.py +++ b/nodepool/tests/unit/test_webapp.py @@ -80,7 +80,7 @@ class TestWebApp(tests.DBTestCase): # cache update self.useBuilder(configfile) - self.waitForImage('fake-provider', 'fake-image') + image = self.waitForImage('fake-provider', 'fake-image') self.waitForNodes('fake-label') req = request.Request( @@ -90,7 +90,7 @@ class TestWebApp(tests.DBTestCase): self.assertEqual(f.info().get('Content-Type'), 'text/plain; charset=UTF-8') data = f.read() - self.assertIn("| 0000000001 | fake-image | ready |", + self.assertIn(f"| {image.build_id} | fake-image | ready |", data.decode('utf8')) def test_image_list_json(self): @@ -105,7 +105,7 @@ class TestWebApp(tests.DBTestCase): # cache update self.useBuilder(configfile) - self.waitForImage('fake-provider', 'fake-image') + image = self.waitForImage('fake-provider', 'fake-image') self.waitForNodes('fake-label') req = request.Request( @@ -116,7 +116,7 @@ class TestWebApp(tests.DBTestCase): 'application/json') data = f.read() objs = json.loads(data.decode('utf8')) - self.assertDictContainsSubset({'id': '0000000001', + self.assertDictContainsSubset({'id': image.build_id, 'image': 'fake-image', 'provider': 'fake-provider', 'state': 'ready'}, objs[0]) @@ -133,7 +133,7 @@ class TestWebApp(tests.DBTestCase): # cache update self.useBuilder(configfile) - self.waitForImage('fake-provider', 'fake-image') + image = self.waitForImage('fake-provider', 'fake-image') self.waitForNodes('fake-label') req = request.Request( @@ -146,7 +146,7 @@ class TestWebApp(tests.DBTestCase): objs = json.loads(data.decode('utf8')) # make sure this is valid json and has some of the # non-changing keys - self.assertDictContainsSubset({'id': 'fake-image-0000000001', + self.assertDictContainsSubset({'id': f'fake-image-{image.build_id}', 'formats': ['qcow2'], 'state': 'ready'}, objs[0]) diff --git a/nodepool/tests/unit/test_zk.py b/nodepool/tests/unit/test_zk.py index fbbc8409d..988bd081e 100644 --- a/nodepool/tests/unit/test_zk.py +++ b/nodepool/tests/unit/test_zk.py @@ -262,7 +262,7 @@ class 
TestZooKeeper(tests.DBTestCase): image = "ubuntu-trusty" b1 = self.zk.storeBuild(image, zk.ImageBuild()) b2 = self.zk.storeBuild(image, zk.ImageBuild()) - self.assertLess(int(b1), int(b2)) + self.assertNotEqual(b1, b2) def test_store_and_get_build(self): image = "ubuntu-trusty" diff --git a/nodepool/zk/zookeeper.py b/nodepool/zk/zookeeper.py index 49372c5e3..4a6b8a76b 100644 --- a/nodepool/zk/zookeeper.py +++ b/nodepool/zk/zookeeper.py @@ -1857,12 +1857,12 @@ class ZooKeeper(ZooKeeperBase): build_path = self._imageBuildsPath(image) + "/" if build_number is None: - path = self.kazoo_client.create( - build_path, + build_number = uuid.uuid4().hex + path = build_path + build_number + self.kazoo_client.create( + path, value=build_data.serialize(), - sequence=True, makepath=True) - build_number = path.split("/")[-1] else: path = build_path + build_number self.kazoo_client.set(path, build_data.serialize()) @@ -2960,34 +2960,33 @@ class ZooKeeper(ZooKeeperBase): # We do some extra work to ensure that the sequence numbers # don't collide. ZK sequence numbers are stored in the parent # node and ZK isn't smart enough to avoid collisions if there - # are missing entries. So if we restore build 1, and then the - # builder later wants to create a new build, it will attempt - # to create build 1, and fail since the node already exists. + # are missing entries. So if we restore upload 1, and then the + # builder later wants to create a new upload, it will attempt + # to create upload 1, and fail since the node already exists. # # NB: The behavior is slightly different for sequence number 1 # vs others; if 2 is the lowest number, then ZK will create # node 0 and 1 before colliding with 2. This is further # complicated in the nodepool context since we create lock - # entries under the build/upload znodes which also seem to + # entries under the upload znodes which also seem to # have an effect on the counter. # # Regardless, if we pre-create sequence nodes up to our - # highest node numbers for builds and uploads, we are - # guaranteed that the next sequence node created will be - # greater. So we look at all the sequence nodes in our import - # data set and pre-create sequence nodes up to that number. + # highest node numbers for uploads, we are guaranteed that the + # next sequence node created will be greater. So we look at + # all the sequence nodes in our import data set and pre-create + # sequence nodes up to that number. + # + # Build ids are not affected since they are not sequence nodes + # (though they used to be). highest_num = {} - # 0 1 2 3 4 5 - # /nodepool/images/fake-image/builds/0000000002/ + # 0 1 2 3 4 5 + # /nodepool/images/fake-image/builds/UUID/ # 6 7 8 9 # providers/fake-provider/images/0000000001 for path, data in import_data.items(): parts = path.split('/') - if len(parts) == 6: - key = '/'.join(parts[:5]) - num = int(parts[5]) - highest_num[key] = max(highest_num.get(key, num), num) if len(parts) == 10: key = '/'.join(parts[:9]) num = int(parts[9]) diff --git a/releasenotes/notes/buildid-4131ffe6f4cea09f.yaml b/releasenotes/notes/buildid-4131ffe6f4cea09f.yaml new file mode 100644 index 000000000..26d4488cd --- /dev/null +++ b/releasenotes/notes/buildid-4131ffe6f4cea09f.yaml @@ -0,0 +1,8 @@ +--- +upgrade: + - | + New diskimage build ids are now in UUID format instead of integer + sequences with leading zeroes. This facilitates faster + restoration of images from backup data on systems with large + numbers of image builds. 
Existing builds will continue to use the + integer format and can coexist with the new format.
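
The heart of the change is the storeBuild() hunk above: a new build record
now mints its own id with uuid.uuid4().hex and is created at a fixed path,
rather than letting ZooKeeper assign the next sequence number. A minimal
stand-in sketch of that branching, using a plain dict in place of
kazoo/ZooKeeper (FakeBuildStore and store_build are illustrative names,
not nodepool API):

import uuid

class FakeBuildStore:
    def __init__(self):
        self.znodes = {}

    def store_build(self, image, build_data, build_number=None):
        build_path = f'/nodepool/images/{image}/builds/'
        if build_number is None:
            # Mint the id client-side instead of using a ZK sequence node.
            build_number = uuid.uuid4().hex
            path = build_path + build_number
            # kazoo_client.create(): fails if the znode already exists.
            assert path not in self.znodes
            self.znodes[path] = build_data
        else:
            # kazoo_client.set(): update an existing build record in place.
            self.znodes[build_path + build_number] = build_data
        return build_number

store = FakeBuildStore()
b1 = store.store_build('fake-image', {'state': 'building'})
b2 = store.store_build('fake-image', {'state': 'building'})
assert b1 != b2  # mirrors test_zk.py: assertNotEqual replaces assertLess
store.store_build('fake-image', {'state': 'ready'}, build_number=b1)

Because a restored UUID id can never race a counter, the import path in
zookeeper.py only has to pre-seed sequence nodes for uploads, which remain
sequential.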
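The builder.py hunks are the operational consequence: hex ids still match
the widened LOCAL_BUILDS_RE (so old zero-padded ids and new UUIDs can
coexist on disk, as the release note says), but they no longer sort
chronologically, so build-log pruning switches from lexicographic filename
order to mtime. A runnable sketch under those assumptions
(prune_build_logs is a hypothetical helper, not the builder's method):

import os
import re
import tempfile
import time
import uuid

# Widened pattern from builder.py: matches zero-padded sequence ids and
# hex UUID ids alike, so both generations of local files are recognized.
LOCAL_BUILDS_RE = re.compile(r'^(.*)-([abcdef\d]+)\.(?:\w+)$')

def prune_build_logs(log_dir, name, keep=1):
    # Sort by mtime, not filename: UUID ids carry no chronological order.
    existing = [f for f in os.listdir(log_dir) if f.startswith(name)]
    existing = [os.path.join(log_dir, f) for f in existing]
    existing = sorted(existing, key=os.path.getmtime)
    for path in existing[:0 - max(keep, 1)]:
        os.unlink(path)

if __name__ == '__main__':
    assert LOCAL_BUILDS_RE.match('fake-image-0000000001.qcow2')  # old style
    assert LOCAL_BUILDS_RE.match(f'fake-image-{uuid.uuid4().hex}.qcow2')
    log_dir = tempfile.mkdtemp()
    for _ in range(3):
        log_name = f'fake-image-{uuid.uuid4().hex}.log'
        open(os.path.join(log_dir, log_name), 'w').close()
        time.sleep(0.01)  # distinct mtimes so the sort is deterministic
    prune_build_logs(log_dir, 'fake-image', keep=1)
    assert len(os.listdir(log_dir)) == 1  # only the newest log survives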