Use image cache when launching nodes

We consult ZooKeeper to determine the most recent image upload
when we decide whether we should accept or decline a request.  If
we accept the request, we also consult it again for the same
information when we start building the node.  In both cases, we
can use the cache to avoid what may potentially be (especially in
the case of a large number of images or uploads) quite a lot of
ZK requests.  Our cache should be almost up to date (typically
milliseconds, or at the worst, seconds behind), and the worst
case is equivalent to what would happen if an image build took
just a few seconds longer.  The tradeoff is worth it.

Similarly, when we create min-ready requests, we can also consult
the cache.

With those 3 changes, all references to getMostRecentImageUpload
in Nodepool use the cache.

The original un-cached method is kept as well, because there are
an enormous number of references to it in the unit tests and they
don't have caching enabled.

Change-Id: Iac1ff8adfbdb8eb9a286929a59cf07cd0b4ac7ad
This commit is contained in:
James E. Blair 2023-03-14 15:28:39 -07:00
parent 70f143690d
commit 5033d399c4
4 changed files with 37 additions and 23 deletions

View File

@ -99,7 +99,7 @@ class StateMachineNodeLauncher(stats.StatsReporter):
diskimage = self.provider_config.diskimages[ diskimage = self.provider_config.diskimages[
label.diskimage.name] label.diskimage.name]
cloud_image = self.zk.getMostRecentImageUpload( cloud_image = self.zk.getMostRecentImageUpload(
diskimage.name, self.provider_config.name) diskimage.name, self.provider_config.name, cached=True)
if not cloud_image: if not cloud_image:
raise exceptions.LaunchNodepoolException( raise exceptions.LaunchNodepoolException(
@ -374,7 +374,7 @@ class StateMachineHandler(NodeRequestHandler):
else: else:
if not self.zk.getMostRecentImageUpload( if not self.zk.getMostRecentImageUpload(
self.pool.labels[label].diskimage.name, self.pool.labels[label].diskimage.name,
self.provider.name): self.provider.name, cached=True):
return False return False
return True return True

View File

@ -1176,7 +1176,8 @@ class NodePool(threading.Thread):
for pool_label in pool.labels.values(): for pool_label in pool.labels.values():
if pool_label.diskimage: if pool_label.diskimage:
if self.zk.getMostRecentImageUpload( if self.zk.getMostRecentImageUpload(
pool_label.diskimage.name, pool.provider.name): pool_label.diskimage.name, pool.provider.name,
cached=True):
return True return True
else: else:
manager = self.getProviderManager(pool.provider.name) manager = self.getProviderManager(pool.provider.name)

View File

@ -599,7 +599,7 @@ class TestDriverAws(tests.DBTestCase):
def test_aws_diskimage_snapshot(self): def test_aws_diskimage_snapshot(self):
configfile = self.setup_config('aws/diskimage.yaml') configfile = self.setup_config('aws/diskimage.yaml')
self.useBuilder(configfile) builder = self.useBuilder(configfile)
image = self.waitForImage('ec2-us-west-2', 'fake-image') image = self.waitForImage('ec2-us-west-2', 'fake-image')
self.assertEqual(image.username, 'zuul') self.assertEqual(image.username, 'zuul')

View File

@ -1388,39 +1388,52 @@ class ZooKeeper(ZooKeeperBase):
return uploads[:count] return uploads[:count]
def getMostRecentImageUpload(self, image, provider, def getMostRecentImageUpload(self, image, provider,
state=READY): state=READY, cached=False):
''' '''
Retrieve the most recent image upload data with the given state. Retrieve the most recent image upload data with the given state.
:param str image: The image name. :param str image: The image name.
:param str provider: The provider name owning the image. :param str provider: The provider name owning the image.
:param str state: The image upload state to match on. :param str state: The image upload state to match on.
:param bool cached: Whether to use cached data.
:returns: An ImageUpload object matching the given state, or :returns: An ImageUpload object matching the given state, or
None if there is no recent upload. None if there is no recent upload.
''' '''
recent_data = None
for build_number in self.getBuildNumbers(image):
path = self._imageUploadPath(image, build_number, provider)
try:
uploads = self.kazoo_client.get_children(path)
except kze.NoNodeError:
uploads = [] uploads = []
for upload in uploads: if cached:
if upload == 'lock': # skip the upload lock node uploads = self.getCachedImageUploads()
else:
for build_number in self.getBuildNumbers(image):
path = self._imageUploadPath(image, build_number, provider)
try:
upload_numbers = self.kazoo_client.get_children(path)
except kze.NoNodeError:
upload_numbers = []
for upload_number in upload_numbers:
if upload_number == 'lock': # skip the upload lock node
continue continue
data = self.getImageUpload( data = self.getImageUpload(
image, build_number, provider, upload) image, build_number, provider, upload_number)
if not data or data.state != state: if not data or data.state != state:
continue continue
elif (recent_data is None or uploads.append(data)
recent_data.state_time < data.state_time):
recent_data = data
return recent_data recent_upload = None
for upload in uploads:
if upload.image_name != image:
continue
if upload.provider_name != provider:
continue
if upload.state != state:
continue
if (recent_upload is None or
recent_upload.state_time < upload.state_time):
recent_upload = upload
return recent_upload
def storeImageUpload(self, image, build_number, provider, image_data, def storeImageUpload(self, image, build_number, provider, image_data,
upload_number=None): upload_number=None):