Browse Source

Add a timeout for the image build

A builder thread can wedge if the build process wedges. Add a timeout
to the subprocess. Since it was the call to readline() that would block,
we change the process to have DIB write directly to the log. This allows
us to set a timeout in the Popen.wait() call. And we kill the dib
subprocess, as well.

The timeout value can be controlled in the diskimage configuration and
defaults to 8 hours.

Change-Id: I188e8a74dc39b55a4b50ade5c1a96832fea76a7d
tags/3.5.0
David Shrewsbury 10 months ago
parent
commit
7225354ec0
10 changed files with 147 additions and 49 deletions
  1. +6
    -0
      doc/source/configuration.rst
  2. +28
    -17
      nodepool/builder.py
  3. +1
    -0
      nodepool/cmd/config_validator.py
  4. +4
    -1
      nodepool/config.py
  5. +12
    -7
      nodepool/tests/__init__.py
  6. +54
    -24
      nodepool/tests/fake-image-create
  7. +1
    -0
      nodepool/tests/fixtures/config_validate/good.yaml
  8. +26
    -0
      nodepool/tests/fixtures/diskimage_build_timeout.yaml
  9. +9
    -0
      nodepool/tests/unit/test_builder.py
  10. +6
    -0
      releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml

+ 6
- 0
doc/source/configuration.rst View File

@@ -277,6 +277,12 @@ Options
Specifies the distro to be used as a base image to build the image using
diskimage-builder.

.. attr:: build-timeout
:type: int

How long (in seconds) to wait for the diskimage build before giving up.
The default is 8 hours.

.. attr:: elements
:type: list


+ 28
- 17
nodepool/builder.py View File

@@ -739,21 +739,28 @@ class BuildWorker(BaseWorker):
if 'qcow2' in img_types:
qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS

cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' %
log_fn = self._getBuildLog(diskimage.name, build_id)

cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
(self.dib_cmd, img_types, qemu_img_options, filename,
img_elements))
log_fn, img_elements))

self._pruneBuildLogs(diskimage.name)
log_fn = self._getBuildLog(diskimage.name, build_id)

self.log.info('Running %s' % (cmd,))
self.log.info('Logging to %s' % (log_fn,))

start_time = time.monotonic()

# We used to use readline() on stdout to output the lines to the
# build log. Unfortunately, this would block as long as the process
# ran (with no easy way to timeout the read) and wedge the builder.
# Now we use --logfile option to the dib command and set a timeout
# on the wait() call to prevent the wedge.
did_timeout = False
try:
p = subprocess.Popen(
shlex.split(cmd),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
env=env)
except OSError as e:
@@ -761,17 +768,20 @@ class BuildWorker(BaseWorker):
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
)

with open(log_fn, 'wb') as log:
while True:
ln = p.stdout.readline()
log.write(ln)
log.flush()
if not ln:
break

rc = p.wait()
m = "Exit code: %s\n" % rc
log.write(m.encode('utf8'))
try:
rc = p.wait(timeout=diskimage.build_timeout)
except subprocess.TimeoutExpired:
p.kill()
did_timeout = True
rc = 1
self.log.error(
"Build timeout for image %s, build %s (log: %s)",
diskimage.name, build_id, log_fn)
else:
# Append return code to dib's log file
with open(log_fn, 'ab') as log:
m = "Exit code: %s\n" % rc
log.write(m.encode('utf8'))

# It's possible the connection to the ZK cluster could have been
# interrupted during the build. If so, wait for it to return.
@@ -796,9 +806,10 @@ class BuildWorker(BaseWorker):
self.log.info("ZooKeeper lost while building %s" % diskimage.name)
self._zk.resetLostFlag()
build_data.state = zk.FAILED
elif p.returncode:
elif p.returncode or did_timeout:
self.log.info(
"DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
"DIB failed creating %s (%s) (timeout=%s)" % (
diskimage.name, p.returncode, did_timeout))
build_data.state = zk.FAILED
else:
self.log.info("DIB image %s is built" % diskimage.name)

+ 1
- 0
nodepool/cmd/config_validator.py View File

@@ -44,6 +44,7 @@ class ConfigValidator:
'rebuild-age': int,
'env-vars': {str: str},
'username': str,
'build-timeout': int,
}

webapp = {

+ 4
- 1
nodepool/config.py View File

@@ -118,6 +118,7 @@ class Config(ConfigValue):
d.image_types = set(diskimage.get('formats', []))
d.pause = bool(diskimage.get('pause', False))
d.username = diskimage.get('username', 'zuul')
d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
self.diskimages[d.name] = d

def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
self.image_types = None
self.pause = False
self.username = None
self.build_timeout = None

def __eq__(self, other):
if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
other.env_vars == self.env_vars and
other.image_types == self.image_types and
other.pause == self.pause and
other.username == self.username)
other.username == self.username and
other.build_timeout == self.build_timeout)
return False

def __repr__(self):

+ 12
- 7
nodepool/tests/__init__.py View File

@@ -410,20 +410,25 @@ class DBTestCase(BaseTestCase):
time.sleep(1)
self.wait_for_threads()

def waitForBuild(self, image_name, build_id):
def waitForBuild(self, image_name, build_id, states=None):
if states is None:
states = (zk.READY,)

base = "-".join([image_name, build_id])

while True:
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if files:
build = self.zk.getBuild(image_name, build_id)
if build and build.state in states:
break
time.sleep(1)

while True:
# We should only expect a dib manifest with a successful build.
while build.state == zk.READY:
self.wait_for_threads()
build = self.zk.getBuild(image_name, build_id)
if build and build.state == zk.READY:
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if files:
break
time.sleep(1)


+ 54
- 24
nodepool/tests/fake-image-create View File

@@ -1,10 +1,49 @@
#!/bin/bash

outfile=
outtypes=("qcow2")

all_args=$*
logfile=
checksum=
no_tmpfs=
qemu_img_options=
x=

TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
if [ $? -ne 0 ]; then
echo "Invalid option"
exit 1
fi
eval set -- "$TEMP"
while true ; do
case "$1" in
--checksum)
checksum=1; shift 1;;
--no-tmpfs)
no_tmpfs=1; shift 1;;
--qemu-img-options)
qemu_img_options=$2; shift 2;;
--logfile)
logfile=$2; shift 2;;
-o) outfile=$2; shift 2;;
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
-x) x=1; shift;;
--) shift ; break ;;
*) echo "Unknown option : $1"; exit 1;;
esac
done

# If --logfile was given, direct stdout to it, as well
if [ ! -z "$logfile" ]; then
exec > >(tee -a ${logfile})
fi

echo "*** fake-image-create: start"

echo "arguments:"
echo "----"
echo $*
echo "$all_args"
echo "----"

if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
exit 1
fi

outfile=
outtypes=("qcow2")

TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@")
if [ $? -ne 0 ]; then
echo "Invalid option"
exit 1
if [ ! -z "$logfile" ]; then
echo " -> logfile: $logfile"
fi
if [ ! -z "$checksum" ]; then
echo " -> set --checksum"
fi
if [ ! -z "$no_tmpfs" ]; then
echo " -> set --no-tmpfs"
fi
if [ ! -z "$qemu_img_options" ]; then
echo " -> qemu-img-options: $qemu_img_options"
fi
if [ ! -z "$x" ]; then
echo " -> debugging enabled"
fi
eval set -- "$TEMP"
while true ; do
case "$1" in
--checksum)
echo " -> set --checksum"; shift 1;;
--no-tmpfs)
echo " -> set --no-tmpfs"; shift 1;;
--qemu-img-options)
echo " -> qemu-img-options: $2"; shift 2;;
-o) outfile=$2; shift 2;;
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
-x) echo " -> debugging enabled"; shift;;
--) shift ; break ;;
*) echo "Unknown option : $1"; exit 1;;
esac
done

if [ -z "$outfile" ]; then
echo "No output file specified."

+ 1
- 0
nodepool/tests/fixtures/config_validate/good.yaml View File

@@ -130,6 +130,7 @@ diskimages:
- cache-devstack
release: trusty
rebuild-age: 3600
build-timeout: 3600
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache

+ 26
- 0
nodepool/tests/fixtures/diskimage_build_timeout.yaml View File

@@ -0,0 +1,26 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'

zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}

labels: []

providers: []

diskimages:
- name: fake-image
formats:
- tar
elements:
- fedora
- vm
release: 21
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

+ 9
- 0
nodepool/tests/unit/test_builder.py View File

@@ -16,6 +16,8 @@
import os
import uuid
import fixtures
import mock
import subprocess

from nodepool import builder, exceptions, tests
from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase):

self.assertEqual(build_default._formats, ['qcow2'])
self.assertEqual(build_vhd._formats, ['vhd'])

@mock.patch.object(subprocess.Popen, 'wait')
def test_diskimage_build_timeout(self, mock_wait):
mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
configfile = self.setup_config('diskimage_build_timeout.yaml')
self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))

+ 6
- 0
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml View File

@@ -0,0 +1,6 @@
---
features:
- |
A new option (build-timeout) has been added to the builder diskimage
configuration to control how long the builder should wait for image
builds before giving up. The default is 8 hours.

Loading…
Cancel
Save