Improve image build checking with many diskimages

We've seen in production a huge delay between image builds that have
been requested until the build has actually been started. This was
despite several free builders.

Profiling showed that all of our builders are extremely busy with
reloading and comparing their config [1]. This is with a ~2MB sized
nodepool.yaml and accordingly many diskimages.

The reason for this bottleneck is that after every image we check
we're reloading the config and re-comparing it with the currently
active config. This config checking currently effectively grows
quadratically with the number of diskimages (more iterations and
slower config check).

To solve this problem we should perform this check less often.

[1] Yappi data:
name                                  ncall  tsub      ttot      tavg
..er/__init__.py:852 AwsLabel.__eq__  468..  144.6263  252.2575  0.000054
..odepool/config.py:276 Label.__eq__  1962   0.015242  251.5035  0.128187
..py:866 AwsProviderDiskImage.__ne__  234..  3.761890  86.93310  0.000037
../nodepool/config.py:425 loadConfig  4      0.000625  22.47684  5.619211
..-packages/yaml/__init__.py:74 load  20     0.128956  21.65417  1.082708

Change-Id: I786daa20ca428039a44d14b1e389d4d3fd62a735
This commit is contained in:
Tobias Henkel 2023-03-13 16:08:28 +01:00
parent 0e7de19664
commit 155f337fbf
No known key found for this signature in database
GPG Key ID: 03750DEC158E5FA2
1 changed files with 19 additions and 4 deletions

View File

@ -610,6 +610,7 @@ class BuildWorker(BaseWorker):
self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name)
self.name = 'BuildWorker.%s' % name
self._lost_zk_connection = False
self._last_config_check = None
zk.client.on_connection_lost_listeners.append(self._onConnectionLost)
def _onConnectionLost(self):
@ -640,20 +641,33 @@ class BuildWorker(BaseWorker):
log_dir = self._getBuildLogRoot(name)
return os.path.join(log_dir, '%s-%s.log' % (name, build_id))
def _checkConfigRecent(self):
if self._last_config_check:
# Assume config is recent enough if we've checked it in the last 60 seconds
if time.monotonic() - self._last_config_check < 60:
return True
self._last_config_check = time.monotonic()
start = time.perf_counter()
new_config = self._readConfig()
end = time.perf_counter()
self.log.debug('Read config in %s seconds', end - start)
return new_config == self._config
def _checkForScheduledImageUpdates(self):
'''
Check every DIB image to see if it has aged out and needs rebuilt.
'''
self.log.debug('Checking for scheduled image updates')
for diskimage in self._config.diskimages.values():
# Check if we've been told to shutdown
# or if ZK connection is suspended
if not self._running or self._zk.suspended or self._zk.lost:
return
try:
new_config = self._readConfig()
if new_config != self._config:
if not self._checkConfigRecent():
# If our config isn't up to date then return and start
# over with a new config load.
self.log.debug('Config changed, skipping _checkForScheduledImageUpdates')
return
self._checkImageForScheduledImageUpdates(diskimage)
except Exception:
@ -757,16 +771,17 @@ class BuildWorker(BaseWorker):
'''
Query ZooKeeper for any manual image build requests.
'''
self.log.debug('Checking for manual image build requests')
for diskimage in self._config.diskimages.values():
# Check if we've been told to shutdown
# or if ZK connection is suspended
if not self._running or self._zk.suspended or self._zk.lost:
return
try:
new_config = self._readConfig()
if new_config != self._config:
if not self._checkConfigRecent():
# If our config isn't up to date then return and start
# over with a new config load.
self.log.debug('Config changed, skipping _checkForManualBuildRequest')
return
self._checkImageForManualBuildRequest(diskimage)
except Exception: