From 890ea4975eaafa147fd7c87dbfcdd179d3e0d7b8 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 29 Jan 2019 11:50:08 -0500 Subject: [PATCH] Revert "Revert "Add a timeout for the image build"" This reverts commit ccf40a462a4805c7162bc34434cfe718cd96c448. The previous version would not work properly when daemonized because there was no stdout. This version maintains stdout and uses select/poll with non-blocking stdout to capture the output to a log file. Depends-On: https://review.openstack.org/634266 Change-Id: I7f0617b91e071294fe6051d14475ead1d7df56b7 --- doc/source/configuration.rst | 6 ++ nodepool/builder.py | 87 ++++++++++++++++--- nodepool/cmd/config_validator.py | 1 + nodepool/config.py | 5 +- nodepool/tests/__init__.py | 23 +++-- .../tests/fixtures/config_validate/good.yaml | 1 + .../fixtures/diskimage_build_timeout.yaml | 27 ++++++ nodepool/tests/unit/test_builder.py | 8 ++ .../notes/build-timeout-bb68a1fd24f97a10.yaml | 6 ++ 9 files changed, 142 insertions(+), 22 deletions(-) create mode 100644 nodepool/tests/fixtures/diskimage_build_timeout.yaml create mode 100644 releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index 34dcdd890..ba442c0a4 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -277,6 +277,12 @@ Options Specifies the distro to be used as a base image to build the image using diskimage-builder. + .. attr:: build-timeout + :type: int + + How long (in seconds) to wait for the diskimage build before giving up. + The default is 8 hours. + .. attr:: elements :type: list diff --git a/nodepool/builder.py b/nodepool/builder.py index dce709e9e..196e6e052 100755 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -13,8 +13,10 @@ # License for the specific language governing permissions and limitations # under the License. +import fcntl import logging import os +import select import shutil import socket import subprocess @@ -42,6 +44,9 @@ SUSPEND_WAIT_TIME = 30 # so just hardcode it for all qcow2 building DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'" +# DIB process polling timeout, in milliseconds +BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000 + class DibImageFile(object): ''' @@ -750,6 +755,7 @@ class BuildWorker(BaseWorker): self.log.info('Logging to %s' % (log_fn,)) start_time = time.monotonic() + try: p = subprocess.Popen( shlex.split(cmd), @@ -761,17 +767,73 @@ class BuildWorker(BaseWorker): "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) ) - with open(log_fn, 'wb') as log: - while True: - ln = p.stdout.readline() - log.write(ln) - log.flush() - if not ln: - break + # Make subprocess stdout non-blocking + fd = p.stdout.fileno() + fl = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK) - rc = p.wait() - m = "Exit code: %s\n" % rc - log.write(m.encode('utf8')) + # Poll subprocess stdout for readability + r, w, e = select.select([fd], [], [], 0) + bitmask = (select.POLLIN | select.POLLHUP) + poll = select.poll() + poll.register(fd, bitmask) + + rc = None + did_timeout = False + subprocess_done = False + + def buildDidTimeout(): + build_time = time.monotonic() - start_time + if build_time > diskimage.build_timeout: + return True + return False + + with open(log_fn, 'wb') as log: + + # While the subprocess is running, we will loop through stdout + # events. If we can read data, write that out to the log file. + # If the subprocess completes, set the flag so we can exit and + # write the return code. + # + # We check for build timeouts in two places: after we have read + # all available data in the buffer (which will cause an early exit + # of the poll loop), and after a poll timeout. If we did not have + # a check after the reads, we would have to have a poll timeout + # to occur to check for a build timeout, which may not happen if + # there is constantly data available for reading. + + while not subprocess_done: + for fd, event in poll.poll(BUILD_PROCESS_POLL_TIMEOUT): + + # Data available for reading + if event & select.POLLIN: + data = p.stdout.read(1024) + while data: + log.write(data) + log.flush() + data = p.stdout.read(1024) + if buildDidTimeout(): + break + + # Subprocess complete + elif event & select.POLLHUP: + subprocess_done = True + rc = p.wait() + + if not subprocess_done: + if buildDidTimeout(): + did_timeout = True + rc = 1 + self.log.error( + "Build timeout for image %s, build %s (log: %s)", + diskimage.name, build_id, log_fn) + p.kill() + break + + # Subprocess finished, write return code + if not did_timeout: + m = "Exit code: %s\n" % rc + log.write(m.encode('utf8')) # It's possible the connection to the ZK cluster could have been # interrupted during the build. If so, wait for it to return. @@ -796,9 +858,10 @@ class BuildWorker(BaseWorker): self.log.info("ZooKeeper lost while building %s" % diskimage.name) self._zk.resetLostFlag() build_data.state = zk.FAILED - elif p.returncode: + elif p.returncode or did_timeout: self.log.info( - "DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) + "DIB failed creating %s (%s) (timeout=%s)" % ( + diskimage.name, p.returncode, did_timeout)) build_data.state = zk.FAILED else: self.log.info("DIB image %s is built" % diskimage.name) diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py index c3dde57d7..f2646af34 100644 --- a/nodepool/cmd/config_validator.py +++ b/nodepool/cmd/config_validator.py @@ -44,6 +44,7 @@ class ConfigValidator: 'rebuild-age': int, 'env-vars': {str: str}, 'username': str, + 'build-timeout': int, } webapp = { diff --git a/nodepool/config.py b/nodepool/config.py index 2b5e4ebd4..7f09961c7 100755 --- a/nodepool/config.py +++ b/nodepool/config.py @@ -118,6 +118,7 @@ class Config(ConfigValue): d.image_types = set(diskimage.get('formats', [])) d.pause = bool(diskimage.get('pause', False)) d.username = diskimage.get('username', 'zuul') + d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60)) self.diskimages[d.name] = d def setSecureDiskimageEnv(self, diskimages, secure_config_path): @@ -179,6 +180,7 @@ class DiskImage(ConfigValue): self.image_types = None self.pause = False self.username = None + self.build_timeout = None def __eq__(self, other): if isinstance(other, DiskImage): @@ -189,7 +191,8 @@ class DiskImage(ConfigValue): other.env_vars == self.env_vars and other.image_types == self.image_types and other.pause == self.pause and - other.username == self.username) + other.username == self.username and + other.build_timeout == self.build_timeout) return False def __repr__(self): diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index 6cf67ff3d..e4c84a7d1 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -411,20 +411,25 @@ class DBTestCase(BaseTestCase): time.sleep(1) self.wait_for_threads() - def waitForBuild(self, image_name, build_id): + def waitForBuild(self, image_name, build_id, states=None): + if states is None: + states = (zk.READY,) + base = "-".join([image_name, build_id]) - while True: - self.wait_for_threads() - files = builder.DibImageFile.from_image_id( - self._config_images_dir.path, base) - if files: - break - time.sleep(1) while True: self.wait_for_threads() build = self.zk.getBuild(image_name, build_id) - if build and build.state == zk.READY: + if build and build.state in states: + break + time.sleep(1) + + # We should only expect a dib manifest with a successful build. + while build.state == zk.READY: + self.wait_for_threads() + files = builder.DibImageFile.from_image_id( + self._config_images_dir.path, base) + if files: break time.sleep(1) diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml index 9aba1340b..7e81657b6 100644 --- a/nodepool/tests/fixtures/config_validate/good.yaml +++ b/nodepool/tests/fixtures/config_validate/good.yaml @@ -152,6 +152,7 @@ diskimages: - cache-devstack release: trusty rebuild-age: 3600 + build-timeout: 3600 env-vars: TMPDIR: /opt/dib_tmp DIB_IMAGE_CACHE: /opt/dib_cache diff --git a/nodepool/tests/fixtures/diskimage_build_timeout.yaml b/nodepool/tests/fixtures/diskimage_build_timeout.yaml new file mode 100644 index 000000000..b133181f3 --- /dev/null +++ b/nodepool/tests/fixtures/diskimage_build_timeout.yaml @@ -0,0 +1,27 @@ +elements-dir: . +images-dir: '{images_dir}' +build-log-dir: '{build_log_dir}' + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +labels: [] + +providers: [] + +diskimages: + - name: fake-image + build-timeout: 2 + formats: + - tar + elements: + - fedora + - vm + release: 21 + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py index c992bf7aa..1379ec0f8 100644 --- a/nodepool/tests/unit/test_builder.py +++ b/nodepool/tests/unit/test_builder.py @@ -16,6 +16,7 @@ import os import uuid import fixtures +import mock from nodepool import builder, exceptions, tests from nodepool.driver.fake import provider as fakeprovider @@ -335,3 +336,10 @@ class TestNodePoolBuilder(tests.DBTestCase): self.assertEqual(build_default._formats, ['qcow2']) self.assertEqual(build_vhd._formats, ['vhd']) + + @mock.patch('select.poll') + def test_diskimage_build_timeout(self, mock_poll): + configfile = self.setup_config('diskimage_build_timeout.yaml') + builder.BUILD_PROCESS_POLL_TIMEOUT = 500 + self.useBuilder(configfile, cleanup_interval=0) + self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,)) diff --git a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml new file mode 100644 index 000000000..6901903db --- /dev/null +++ b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + A new option (build-timeout) has been added to the builder diskimage + configuration to control how long the builder should wait for image + builds before giving up. The default is 8 hours.