Improve image build checking with many diskimages
We've seen in production a huge delay between image builds that have been requested until the build has actually been started. This was despite several free builders. Profiling showed that all of our builders are extremely busy with reloading and comparing their config [1]. This is with a ~2MB sized nodepool.yaml and accordingly many diskimages. The reason for this bottleneck is that after every image we check we're reloading the config and re-comparing it with the currently active config. This config checking currently effectively grows quadratically with the number of diskimages (more iterations and slower config check). To solve this problem we should perform this check less often. [1] Yappi data: name ncall tsub ttot tavg ..er/__init__.py:852 AwsLabel.__eq__ 468.. 144.6263 252.2575 0.000054 ..odepool/config.py:276 Label.__eq__ 1962 0.015242 251.5035 0.128187 ..py:866 AwsProviderDiskImage.__ne__ 234.. 3.761890 86.93310 0.000037 ../nodepool/config.py:425 loadConfig 4 0.000625 22.47684 5.619211 ..-packages/yaml/__init__.py:74 load 20 0.128956 21.65417 1.082708 Change-Id: I786daa20ca428039a44d14b1e389d4d3fd62a735
This commit is contained in:
parent
0e7de19664
commit
155f337fbf
|
@ -610,6 +610,7 @@ class BuildWorker(BaseWorker):
|
|||
self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name)
|
||||
self.name = 'BuildWorker.%s' % name
|
||||
self._lost_zk_connection = False
|
||||
self._last_config_check = None
|
||||
zk.client.on_connection_lost_listeners.append(self._onConnectionLost)
|
||||
|
||||
def _onConnectionLost(self):
|
||||
|
@ -640,20 +641,33 @@ class BuildWorker(BaseWorker):
|
|||
log_dir = self._getBuildLogRoot(name)
|
||||
return os.path.join(log_dir, '%s-%s.log' % (name, build_id))
|
||||
|
||||
def _checkConfigRecent(self):
|
||||
if self._last_config_check:
|
||||
# Assume config is recent enough if we've checked it in the last 60 seconds
|
||||
if time.monotonic() - self._last_config_check < 60:
|
||||
return True
|
||||
self._last_config_check = time.monotonic()
|
||||
start = time.perf_counter()
|
||||
new_config = self._readConfig()
|
||||
end = time.perf_counter()
|
||||
self.log.debug('Read config in %s seconds', end - start)
|
||||
return new_config == self._config
|
||||
|
||||
def _checkForScheduledImageUpdates(self):
|
||||
'''
|
||||
Check every DIB image to see if it has aged out and needs rebuilt.
|
||||
'''
|
||||
self.log.debug('Checking for scheduled image updates')
|
||||
for diskimage in self._config.diskimages.values():
|
||||
# Check if we've been told to shutdown
|
||||
# or if ZK connection is suspended
|
||||
if not self._running or self._zk.suspended or self._zk.lost:
|
||||
return
|
||||
try:
|
||||
new_config = self._readConfig()
|
||||
if new_config != self._config:
|
||||
if not self._checkConfigRecent():
|
||||
# If our config isn't up to date then return and start
|
||||
# over with a new config load.
|
||||
self.log.debug('Config changed, skipping _checkForScheduledImageUpdates')
|
||||
return
|
||||
self._checkImageForScheduledImageUpdates(diskimage)
|
||||
except Exception:
|
||||
|
@ -757,16 +771,17 @@ class BuildWorker(BaseWorker):
|
|||
'''
|
||||
Query ZooKeeper for any manual image build requests.
|
||||
'''
|
||||
self.log.debug('Checking for manual image build requests')
|
||||
for diskimage in self._config.diskimages.values():
|
||||
# Check if we've been told to shutdown
|
||||
# or if ZK connection is suspended
|
||||
if not self._running or self._zk.suspended or self._zk.lost:
|
||||
return
|
||||
try:
|
||||
new_config = self._readConfig()
|
||||
if new_config != self._config:
|
||||
if not self._checkConfigRecent():
|
||||
# If our config isn't up to date then return and start
|
||||
# over with a new config load.
|
||||
self.log.debug('Config changed, skipping _checkForManualBuildRequest')
|
||||
return
|
||||
self._checkImageForManualBuildRequest(diskimage)
|
||||
except Exception:
|
||||
|
|
Loading…
Reference in New Issue