@@ -277,6 +277,12 @@ Options | |||
Specifies the distro to be used as a base image to build the image using | |||
diskimage-builder. | |||
.. attr:: build-timeout | |||
:type: int | |||
How long (in seconds) to wait for the diskimage build before giving up. | |||
The default is 8 hours. | |||
.. attr:: elements | |||
:type: list | |||
@@ -13,8 +13,10 @@ | |||
# License for the specific language governing permissions and limitations | |||
# under the License. | |||
import fcntl | |||
import logging | |||
import os | |||
import select | |||
import shutil | |||
import socket | |||
import subprocess | |||
@@ -42,6 +44,9 @@ SUSPEND_WAIT_TIME = 30 | |||
# so just hardcode it for all qcow2 building | |||
DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'" | |||
# DIB process polling timeout, in milliseconds | |||
BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000 | |||
class DibImageFile(object): | |||
''' | |||
@@ -750,6 +755,7 @@ class BuildWorker(BaseWorker): | |||
self.log.info('Logging to %s' % (log_fn,)) | |||
start_time = time.monotonic() | |||
try: | |||
p = subprocess.Popen( | |||
shlex.split(cmd), | |||
@@ -761,17 +767,73 @@ class BuildWorker(BaseWorker): | |||
"Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror) | |||
) | |||
# Make subprocess stdout non-blocking | |||
fd = p.stdout.fileno() | |||
fl = fcntl.fcntl(fd, fcntl.F_GETFL) | |||
fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK) | |||
# Poll subprocess stdout for readability | |||
r, w, e = select.select([fd], [], [], 0) | |||
bitmask = (select.POLLIN | select.POLLHUP) | |||
poll = select.poll() | |||
poll.register(fd, bitmask) | |||
rc = None | |||
did_timeout = False | |||
subprocess_done = False | |||
def buildDidTimeout(): | |||
build_time = time.monotonic() - start_time | |||
if build_time > diskimage.build_timeout: | |||
return True | |||
return False | |||
with open(log_fn, 'wb') as log: | |||
while True: | |||
ln = p.stdout.readline() | |||
log.write(ln) | |||
log.flush() | |||
if not ln: | |||
break | |||
rc = p.wait() | |||
m = "Exit code: %s\n" % rc | |||
log.write(m.encode('utf8')) | |||
# While the subprocess is running, we will loop through stdout | |||
# events. If we can read data, write that out to the log file. | |||
# If the subprocess completes, set the flag so we can exit and | |||
# write the return code. | |||
# | |||
# We check for build timeouts in two places: after we have read | |||
# all available data in the buffer (which will cause an early exit | |||
# of the poll loop), and after a poll timeout. If we did not have | |||
# a check after the reads, we would have to have a poll timeout | |||
# to occur to check for a build timeout, which may not happen if | |||
# there is constantly data available for reading. | |||
while not subprocess_done: | |||
for fd, event in poll.poll(BUILD_PROCESS_POLL_TIMEOUT): | |||
# Data available for reading | |||
if event & select.POLLIN: | |||
data = p.stdout.read(1024) | |||
while data: | |||
log.write(data) | |||
log.flush() | |||
data = p.stdout.read(1024) | |||
if buildDidTimeout(): | |||
break | |||
# Subprocess complete | |||
elif event & select.POLLHUP: | |||
subprocess_done = True | |||
rc = p.wait() | |||
if not subprocess_done: | |||
if buildDidTimeout(): | |||
did_timeout = True | |||
rc = 1 | |||
self.log.error( | |||
"Build timeout for image %s, build %s (log: %s)", | |||
diskimage.name, build_id, log_fn) | |||
p.kill() | |||
break | |||
# Subprocess finished, write return code | |||
if not did_timeout: | |||
m = "Exit code: %s\n" % rc | |||
log.write(m.encode('utf8')) | |||
# It's possible the connection to the ZK cluster could have been | |||
# interrupted during the build. If so, wait for it to return. | |||
@@ -796,9 +858,10 @@ class BuildWorker(BaseWorker): | |||
self.log.info("ZooKeeper lost while building %s" % diskimage.name) | |||
self._zk.resetLostFlag() | |||
build_data.state = zk.FAILED | |||
elif p.returncode: | |||
elif p.returncode or did_timeout: | |||
self.log.info( | |||
"DIB failed creating %s (%s)" % (diskimage.name, p.returncode)) | |||
"DIB failed creating %s (%s) (timeout=%s)" % ( | |||
diskimage.name, p.returncode, did_timeout)) | |||
build_data.state = zk.FAILED | |||
else: | |||
self.log.info("DIB image %s is built" % diskimage.name) |
@@ -44,6 +44,7 @@ class ConfigValidator: | |||
'rebuild-age': int, | |||
'env-vars': {str: str}, | |||
'username': str, | |||
'build-timeout': int, | |||
} | |||
webapp = { |
@@ -118,6 +118,7 @@ class Config(ConfigValue): | |||
d.image_types = set(diskimage.get('formats', [])) | |||
d.pause = bool(diskimage.get('pause', False)) | |||
d.username = diskimage.get('username', 'zuul') | |||
d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60)) | |||
self.diskimages[d.name] = d | |||
def setSecureDiskimageEnv(self, diskimages, secure_config_path): | |||
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue): | |||
self.image_types = None | |||
self.pause = False | |||
self.username = None | |||
self.build_timeout = None | |||
def __eq__(self, other): | |||
if isinstance(other, DiskImage): | |||
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue): | |||
other.env_vars == self.env_vars and | |||
other.image_types == self.image_types and | |||
other.pause == self.pause and | |||
other.username == self.username) | |||
other.username == self.username and | |||
other.build_timeout == self.build_timeout) | |||
return False | |||
def __repr__(self): |
@@ -414,20 +414,25 @@ class DBTestCase(BaseTestCase): | |||
time.sleep(1) | |||
self.wait_for_threads() | |||
def waitForBuild(self, image_name, build_id): | |||
def waitForBuild(self, image_name, build_id, states=None): | |||
if states is None: | |||
states = (zk.READY,) | |||
base = "-".join([image_name, build_id]) | |||
while True: | |||
self.wait_for_threads() | |||
files = builder.DibImageFile.from_image_id( | |||
self._config_images_dir.path, base) | |||
if files: | |||
build = self.zk.getBuild(image_name, build_id) | |||
if build and build.state in states: | |||
break | |||
time.sleep(1) | |||
while True: | |||
# We should only expect a dib manifest with a successful build. | |||
while build.state == zk.READY: | |||
self.wait_for_threads() | |||
build = self.zk.getBuild(image_name, build_id) | |||
if build and build.state == zk.READY: | |||
files = builder.DibImageFile.from_image_id( | |||
self._config_images_dir.path, base) | |||
if files: | |||
break | |||
time.sleep(1) | |||
@@ -174,6 +174,7 @@ diskimages: | |||
- cache-devstack | |||
release: trusty | |||
rebuild-age: 3600 | |||
build-timeout: 3600 | |||
env-vars: | |||
TMPDIR: /opt/dib_tmp | |||
DIB_IMAGE_CACHE: /opt/dib_cache |
@@ -0,0 +1,27 @@ | |||
elements-dir: . | |||
images-dir: '{images_dir}' | |||
build-log-dir: '{build_log_dir}' | |||
zookeeper-servers: | |||
- host: {zookeeper_host} | |||
port: {zookeeper_port} | |||
chroot: {zookeeper_chroot} | |||
labels: [] | |||
providers: [] | |||
diskimages: | |||
- name: fake-image | |||
build-timeout: 2 | |||
formats: | |||
- tar | |||
elements: | |||
- fedora | |||
- vm | |||
release: 21 | |||
env-vars: | |||
TMPDIR: /opt/dib_tmp | |||
DIB_IMAGE_CACHE: /opt/dib_cache | |||
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ | |||
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 |
@@ -16,6 +16,7 @@ | |||
import os | |||
import uuid | |||
import fixtures | |||
import mock | |||
from nodepool import builder, exceptions, tests | |||
from nodepool.driver.fake import provider as fakeprovider | |||
@@ -335,3 +336,10 @@ class TestNodePoolBuilder(tests.DBTestCase): | |||
self.assertEqual(build_default._formats, ['qcow2']) | |||
self.assertEqual(build_vhd._formats, ['vhd']) | |||
@mock.patch('select.poll') | |||
def test_diskimage_build_timeout(self, mock_poll): | |||
configfile = self.setup_config('diskimage_build_timeout.yaml') | |||
builder.BUILD_PROCESS_POLL_TIMEOUT = 500 | |||
self.useBuilder(configfile, cleanup_interval=0) | |||
self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,)) |
@@ -0,0 +1,6 @@ | |||
--- | |||
features: | |||
- | | |||
A new option (build-timeout) has been added to the builder diskimage | |||
configuration to control how long the builder should wait for image | |||
builds before giving up. The default is 8 hours. |