Merge "Revert "Add a timeout for the image build""

This commit is contained in:
Zuul 2019-01-25 22:37:34 +00:00 committed by Gerrit Code Review
commit f2c155821c
10 changed files with 50 additions and 148 deletions

View File

@ -277,12 +277,6 @@ Options
Specifies the distro to be used as a base image to build the image using Specifies the distro to be used as a base image to build the image using
diskimage-builder. diskimage-builder.
.. attr:: build-timeout
:type: int
How long (in seconds) to wait for the diskimage build before giving up.
The default is 8 hours.
.. attr:: elements .. attr:: elements
:type: list :type: list

View File

@ -739,28 +739,21 @@ class BuildWorker(BaseWorker):
if 'qcow2' in img_types: if 'qcow2' in img_types:
qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
log_fn = self._getBuildLog(diskimage.name, build_id) cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' %
cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
(self.dib_cmd, img_types, qemu_img_options, filename, (self.dib_cmd, img_types, qemu_img_options, filename,
log_fn, img_elements)) img_elements))
self._pruneBuildLogs(diskimage.name) self._pruneBuildLogs(diskimage.name)
log_fn = self._getBuildLog(diskimage.name, build_id)
self.log.info('Running %s' % (cmd,)) self.log.info('Running %s' % (cmd,))
self.log.info('Logging to %s' % (log_fn,)) self.log.info('Logging to %s' % (log_fn,))
start_time = time.monotonic() start_time = time.monotonic()
# We used to use readline() on stdout to output the lines to the
# build log. Unfortunately, this would block as long as the process
# ran (with no easy way to timeout the read) and wedge the builder.
# Now we use --logfile option to the dib command and set a timeout
# on the wait() call to prevent the wedge.
did_timeout = False
try: try:
p = subprocess.Popen( p = subprocess.Popen(
shlex.split(cmd), shlex.split(cmd),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, stderr=subprocess.STDOUT,
env=env) env=env)
except OSError as e: except OSError as e:
@ -768,20 +761,17 @@ class BuildWorker(BaseWorker):
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
) )
try: with open(log_fn, 'wb') as log:
rc = p.wait(timeout=diskimage.build_timeout) while True:
except subprocess.TimeoutExpired: ln = p.stdout.readline()
p.kill() log.write(ln)
did_timeout = True log.flush()
rc = 1 if not ln:
self.log.error( break
"Build timeout for image %s, build %s (log: %s)",
diskimage.name, build_id, log_fn) rc = p.wait()
else: m = "Exit code: %s\n" % rc
# Append return code to dib's log file log.write(m.encode('utf8'))
with open(log_fn, 'ab') as log:
m = "Exit code: %s\n" % rc
log.write(m.encode('utf8'))
# It's possible the connection to the ZK cluster could have been # It's possible the connection to the ZK cluster could have been
# interrupted during the build. If so, wait for it to return. # interrupted during the build. If so, wait for it to return.
@ -806,10 +796,9 @@ class BuildWorker(BaseWorker):
self.log.info("ZooKeeper lost while building %s" % diskimage.name) self.log.info("ZooKeeper lost while building %s" % diskimage.name)
self._zk.resetLostFlag() self._zk.resetLostFlag()
build_data.state = zk.FAILED build_data.state = zk.FAILED
elif p.returncode or did_timeout: elif p.returncode:
self.log.info( self.log.info(
"DIB failed creating %s (%s) (timeout=%s)" % ( "DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
diskimage.name, p.returncode, did_timeout))
build_data.state = zk.FAILED build_data.state = zk.FAILED
else: else:
self.log.info("DIB image %s is built" % diskimage.name) self.log.info("DIB image %s is built" % diskimage.name)

View File

@ -44,7 +44,6 @@ class ConfigValidator:
'rebuild-age': int, 'rebuild-age': int,
'env-vars': {str: str}, 'env-vars': {str: str},
'username': str, 'username': str,
'build-timeout': int,
} }
webapp = { webapp = {

View File

@ -118,7 +118,6 @@ class Config(ConfigValue):
d.image_types = set(diskimage.get('formats', [])) d.image_types = set(diskimage.get('formats', []))
d.pause = bool(diskimage.get('pause', False)) d.pause = bool(diskimage.get('pause', False))
d.username = diskimage.get('username', 'zuul') d.username = diskimage.get('username', 'zuul')
d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
self.diskimages[d.name] = d self.diskimages[d.name] = d
def setSecureDiskimageEnv(self, diskimages, secure_config_path): def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@ -180,7 +179,6 @@ class DiskImage(ConfigValue):
self.image_types = None self.image_types = None
self.pause = False self.pause = False
self.username = None self.username = None
self.build_timeout = None
def __eq__(self, other): def __eq__(self, other):
if isinstance(other, DiskImage): if isinstance(other, DiskImage):
@ -191,8 +189,7 @@ class DiskImage(ConfigValue):
other.env_vars == self.env_vars and other.env_vars == self.env_vars and
other.image_types == self.image_types and other.image_types == self.image_types and
other.pause == self.pause and other.pause == self.pause and
other.username == self.username and other.username == self.username)
other.build_timeout == self.build_timeout)
return False return False
def __repr__(self): def __repr__(self):

View File

@ -411,21 +411,9 @@ class DBTestCase(BaseTestCase):
time.sleep(1) time.sleep(1)
self.wait_for_threads() self.wait_for_threads()
def waitForBuild(self, image_name, build_id, states=None): def waitForBuild(self, image_name, build_id):
if states is None:
states = (zk.READY,)
base = "-".join([image_name, build_id]) base = "-".join([image_name, build_id])
while True: while True:
self.wait_for_threads()
build = self.zk.getBuild(image_name, build_id)
if build and build.state in states:
break
time.sleep(1)
# We should only expect a dib manifest with a successful build.
while build.state == zk.READY:
self.wait_for_threads() self.wait_for_threads()
files = builder.DibImageFile.from_image_id( files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base) self._config_images_dir.path, base)
@ -433,6 +421,13 @@ class DBTestCase(BaseTestCase):
break break
time.sleep(1) time.sleep(1)
while True:
self.wait_for_threads()
build = self.zk.getBuild(image_name, build_id)
if build and build.state == zk.READY:
break
time.sleep(1)
self.wait_for_threads() self.wait_for_threads()
return build return build

View File

@ -1,49 +1,10 @@
#!/bin/bash #!/bin/bash
outfile=
outtypes=("qcow2")
all_args=$*
logfile=
checksum=
no_tmpfs=
qemu_img_options=
x=
TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
if [ $? -ne 0 ]; then
echo "Invalid option"
exit 1
fi
eval set -- "$TEMP"
while true ; do
case "$1" in
--checksum)
checksum=1; shift 1;;
--no-tmpfs)
no_tmpfs=1; shift 1;;
--qemu-img-options)
qemu_img_options=$2; shift 2;;
--logfile)
logfile=$2; shift 2;;
-o) outfile=$2; shift 2;;
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
-x) x=1; shift;;
--) shift ; break ;;
*) echo "Unknown option : $1"; exit 1;;
esac
done
# If --logfile was given, direct stdout to it, as well
if [ ! -z "$logfile" ]; then
exec > >(tee -a ${logfile})
fi
echo "*** fake-image-create: start" echo "*** fake-image-create: start"
echo "arguments:" echo "arguments:"
echo "----" echo "----"
echo "$all_args" echo $*
echo "----" echo "----"
if [[ "${SHOULD_FAIL}" == 'true' ]]; then if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@ -77,21 +38,30 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
exit 1 exit 1
fi fi
if [ ! -z "$logfile" ]; then outfile=
echo " -> logfile: $logfile" outtypes=("qcow2")
fi
if [ ! -z "$checksum" ]; then TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@")
echo " -> set --checksum" if [ $? -ne 0 ]; then
fi echo "Invalid option"
if [ ! -z "$no_tmpfs" ]; then exit 1
echo " -> set --no-tmpfs"
fi
if [ ! -z "$qemu_img_options" ]; then
echo " -> qemu-img-options: $qemu_img_options"
fi
if [ ! -z "$x" ]; then
echo " -> debugging enabled"
fi fi
eval set -- "$TEMP"
while true ; do
case "$1" in
--checksum)
echo " -> set --checksum"; shift 1;;
--no-tmpfs)
echo " -> set --no-tmpfs"; shift 1;;
--qemu-img-options)
echo " -> qemu-img-options: $2"; shift 2;;
-o) outfile=$2; shift 2;;
-t) IFS="," read -a outtypes <<< "$2"; shift 2;;
-x) echo " -> debugging enabled"; shift;;
--) shift ; break ;;
*) echo "Unknown option : $1"; exit 1;;
esac
done
if [ -z "$outfile" ]; then if [ -z "$outfile" ]; then
echo "No output file specified." echo "No output file specified."

View File

@ -152,7 +152,6 @@ diskimages:
- cache-devstack - cache-devstack
release: trusty release: trusty
rebuild-age: 3600 rebuild-age: 3600
build-timeout: 3600
env-vars: env-vars:
TMPDIR: /opt/dib_tmp TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache DIB_IMAGE_CACHE: /opt/dib_cache

View File

@ -1,26 +0,0 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
labels: []
providers: []
diskimages:
- name: fake-image
formats:
- tar
elements:
- fedora
- vm
release: 21
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@ -16,8 +16,6 @@
import os import os
import uuid import uuid
import fixtures import fixtures
import mock
import subprocess
from nodepool import builder, exceptions, tests from nodepool import builder, exceptions, tests
from nodepool.driver.fake import provider as fakeprovider from nodepool.driver.fake import provider as fakeprovider
@ -337,10 +335,3 @@ class TestNodePoolBuilder(tests.DBTestCase):
self.assertEqual(build_default._formats, ['qcow2']) self.assertEqual(build_default._formats, ['qcow2'])
self.assertEqual(build_vhd._formats, ['vhd']) self.assertEqual(build_vhd._formats, ['vhd'])
@mock.patch.object(subprocess.Popen, 'wait')
def test_diskimage_build_timeout(self, mock_wait):
mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
configfile = self.setup_config('diskimage_build_timeout.yaml')
self.useBuilder(configfile, cleanup_interval=0)
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))

View File

@ -1,6 +0,0 @@
---
features:
- |
A new option (build-timeout) has been added to the builder diskimage
configuration to control how long the builder should wait for image
builds before giving up. The default is 8 hours.