Revert "Revert "Add a timeout for the image build""

This reverts commit ccf40a462a.

The previous version would not work properly when daemonized
because there was no stdout. This version maintains stdout and
uses select/poll with non-blocking stdout to capture the output
to a log file.

Depends-On: https://review.openstack.org/634266

Change-Id: I7f0617b91e071294fe6051d14475ead1d7df56b7
This commit is contained in:
David Shrewsbury 2019-01-29 11:50:08 -05:00
parent acf038aef4
commit 890ea4975e
9 changed files with 142 additions and 22 deletions

View File

@ -277,6 +277,12 @@ Options
Specifies the distro to be used as a base image to build the image using Specifies the distro to be used as a base image to build the image using
diskimage-builder. diskimage-builder.
.. attr:: build-timeout
:type: int
How long (in seconds) to wait for the diskimage build before giving up.
The default is 8 hours.
.. attr:: elements .. attr:: elements
:type: list :type: list

View File

@ -13,8 +13,10 @@
# License for the specific language governing permissions and limitations # License for the specific language governing permissions and limitations
# under the License. # under the License.
import fcntl
import logging import logging
import os import os
import select
import shutil import shutil
import socket import socket
import subprocess import subprocess
@ -42,6 +44,9 @@ SUSPEND_WAIT_TIME = 30
# so just hardcode it for all qcow2 building # so just hardcode it for all qcow2 building
DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'" DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'"
# DIB process polling timeout, in milliseconds
BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000
class DibImageFile(object): class DibImageFile(object):
''' '''
@ -750,6 +755,7 @@ class BuildWorker(BaseWorker):
self.log.info('Logging to %s' % (log_fn,)) self.log.info('Logging to %s' % (log_fn,))
start_time = time.monotonic() start_time = time.monotonic()
try: try:
p = subprocess.Popen( p = subprocess.Popen(
shlex.split(cmd), shlex.split(cmd),
@ -761,17 +767,73 @@ class BuildWorker(BaseWorker):
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
) )
with open(log_fn, 'wb') as log: # Make subprocess stdout non-blocking
while True: fd = p.stdout.fileno()
ln = p.stdout.readline() fl = fcntl.fcntl(fd, fcntl.F_GETFL)
log.write(ln) fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
log.flush()
if not ln:
break
rc = p.wait() # Poll subprocess stdout for readability
m = "Exit code: %s\n" % rc r, w, e = select.select([fd], [], [], 0)
log.write(m.encode('utf8')) bitmask = (select.POLLIN | select.POLLHUP)
poll = select.poll()
poll.register(fd, bitmask)
rc = None
did_timeout = False
subprocess_done = False
def buildDidTimeout():
build_time = time.monotonic() - start_time
if build_time > diskimage.build_timeout:
return True
return False
with open(log_fn, 'wb') as log:
# While the subprocess is running, we will loop through stdout
# events. If we can read data, write that out to the log file.
# If the subprocess completes, set the flag so we can exit and
# write the return code.
#
# We check for build timeouts in two places: after we have read
# all available data in the buffer (which will cause an early exit
# of the poll loop), and after a poll timeout. If we did not have
# a check after the reads, we would have to have a poll timeout
# to occur to check for a build timeout, which may not happen if
# there is constantly data available for reading.
while not subprocess_done:
for fd, event in poll.poll(BUILD_PROCESS_POLL_TIMEOUT):
# Data available for reading
if event & select.POLLIN:
data = p.stdout.read(1024)
while data:
log.write(data)
log.flush()
data = p.stdout.read(1024)
if buildDidTimeout():
break
# Subprocess complete
elif event & select.POLLHUP:
subprocess_done = True
rc = p.wait()
if not subprocess_done:
if buildDidTimeout():
did_timeout = True
rc = 1
self.log.error(
"Build timeout for image %s, build %s (log: %s)",
diskimage.name, build_id, log_fn)
p.kill()
break
# Subprocess finished, write return code
if not did_timeout:
m = "Exit code: %s\n" % rc
log.write(m.encode('utf8'))
# It's possible the connection to the ZK cluster could have been # It's possible the connection to the ZK cluster could have been
# interrupted during the build. If so, wait for it to return. # interrupted during the build. If so, wait for it to return.
@ -796,9 +858,10 @@ class BuildWorker(BaseWorker):
self.log.info("ZooKeeper lost while building %s" % diskimage.name) self.log.info("ZooKeeper lost while building %s" % diskimage.name)
self._zk.resetLostFlag() self._zk.resetLostFlag()
build_data.state = zk.FAILED build_data.state = zk.FAILED
elif p.returncode: elif p.returncode or did_timeout:
self.log.info( self.log.info(
"DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) "DIB failed creating %s (%s) (timeout=%s)" % (
diskimage.name, p.returncode, did_timeout))
build_data.state = zk.FAILED build_data.state = zk.FAILED
else: else:
self.log.info("DIB image %s is built" % diskimage.name) self.log.info("DIB image %s is built" % diskimage.name)

View File

@ -44,6 +44,7 @@ class ConfigValidator:
'rebuild-age': int, 'rebuild-age': int,
'env-vars': {str: str}, 'env-vars': {str: str},
'username': str, 'username': str,
'build-timeout': int,
} }
webapp = { webapp = {

View File

@ -118,6 +118,7 @@ class Config(ConfigValue):
d.image_types = set(diskimage.get('formats', [])) d.image_types = set(diskimage.get('formats', []))
d.pause = bool(diskimage.get('pause', False)) d.pause = bool(diskimage.get('pause', False))
d.username = diskimage.get('username', 'zuul') d.username = diskimage.get('username', 'zuul')
d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
self.diskimages[d.name] = d self.diskimages[d.name] = d
def setSecureDiskimageEnv(self, diskimages, secure_config_path): def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
self.image_types = None self.image_types = None
self.pause = False self.pause = False
self.username = None self.username = None
self.build_timeout = None
def __eq__(self, other): def __eq__(self, other):
if isinstance(other, DiskImage): if isinstance(other, DiskImage):
@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
other.env_vars == self.env_vars and other.env_vars == self.env_vars and
other.image_types == self.image_types and other.image_types == self.image_types and
other.pause == self.pause and other.pause == self.pause and
other.username == self.username) other.username == self.username and
other.build_timeout == self.build_timeout)
return False return False
def __repr__(self): def __repr__(self):

View File

@ -411,20 +411,25 @@ class DBTestCase(BaseTestCase):
time.sleep(1) time.sleep(1)
self.wait_for_threads() self.wait_for_threads()
def waitForBuild(self, image_name, build_id): def waitForBuild(self, image_name, build_id, states=None):
if states is None:
states = (zk.READY,)
base = "-".join([image_name, build_id]) base = "-".join([image_name, build_id])
while True:
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if files:
break
time.sleep(1)
while True: while True:
self.wait_for_threads() self.wait_for_threads()
build = self.zk.getBuild(image_name, build_id) build = self.zk.getBuild(image_name, build_id)
if build and build.state == zk.READY: if build and build.state in states:
break
time.sleep(1)
# We should only expect a dib manifest with a successful build.
while build.state == zk.READY:
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if files:
break break
time.sleep(1) time.sleep(1)

View File

@ -152,6 +152,7 @@ diskimages:
- cache-devstack - cache-devstack
release: trusty release: trusty
rebuild-age: 3600 rebuild-age: 3600
build-timeout: 3600
env-vars: env-vars:
TMPDIR: /opt/dib_tmp TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache DIB_IMAGE_CACHE: /opt/dib_cache

View File

@ -0,0 +1,27 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
labels: []
providers: []
diskimages:
- name: fake-image
build-timeout: 2
formats:
- tar
elements:
- fedora
- vm
release: 21
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@ -16,6 +16,7 @@
import os import os
import uuid import uuid
import fixtures import fixtures
import mock
from nodepool import builder, exceptions, tests from nodepool import builder, exceptions, tests
from nodepool.driver.fake import provider as fakeprovider from nodepool.driver.fake import provider as fakeprovider
@ -335,3 +336,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
self.assertEqual(build_default._formats, ['qcow2']) self.assertEqual(build_default._formats, ['qcow2'])
self.assertEqual(build_vhd._formats, ['vhd']) self.assertEqual(build_vhd._formats, ['vhd'])
@mock.patch('select.poll')
def test_diskimage_build_timeout(self, mock_poll):
configfile = self.setup_config('diskimage_build_timeout.yaml')
builder.BUILD_PROCESS_POLL_TIMEOUT = 500
self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))

View File

@ -0,0 +1,6 @@
---
features:
- |
A new option (build-timeout) has been added to the builder diskimage
configuration to control how long the builder should wait for image
builds before giving up. The default is 8 hours.