Browse Source

Add a timeout for the image build

A builder thread can wedge if the build process wedges. Add a timeout
to the subprocess. Since it was the call to readline() that would block,
we change the process to have DIB write directly to the log. This allows
us to set a timeout in the Popen.wait() call. And we kill the dib
subprocess, as well.

The timeout value can be controlled in the diskimage configuration and
defaults to 8 hours.

Change-Id: I188e8a74dc39b55a4b50ade5c1a96832fea76a7d
tags/3.5.0
David Shrewsbury 7 months ago
parent
commit
7225354ec0

+ 6
- 0
doc/source/configuration.rst View File

@@ -277,6 +277,12 @@ Options
277 277
       Specifies the distro to be used as a base image to build the image using
278 278
       diskimage-builder.
279 279
 
280
+   .. attr:: build-timeout
281
+      :type: int
282
+
283
+      How long (in seconds) to wait for the diskimage build before giving up.
284
+      The default is 8 hours.
285
+
280 286
    .. attr:: elements
281 287
       :type: list
282 288
 

+ 28
- 17
nodepool/builder.py View File

@@ -739,21 +739,28 @@ class BuildWorker(BaseWorker):
739 739
         if 'qcow2' in img_types:
740 740
             qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
741 741
 
742
-        cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' %
742
+        log_fn = self._getBuildLog(diskimage.name, build_id)
743
+
744
+        cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
743 745
                (self.dib_cmd, img_types, qemu_img_options, filename,
744
-                img_elements))
746
+                log_fn, img_elements))
745 747
 
746 748
         self._pruneBuildLogs(diskimage.name)
747
-        log_fn = self._getBuildLog(diskimage.name, build_id)
748 749
 
749 750
         self.log.info('Running %s' % (cmd,))
750 751
         self.log.info('Logging to %s' % (log_fn,))
751 752
 
752 753
         start_time = time.monotonic()
754
+
755
+        # We used to use readline() on stdout to output the lines to the
756
+        # build log. Unfortunately, this would block as long as the process
757
+        # ran (with no easy way to timeout the read) and wedge the builder.
758
+        # Now we use --logfile option to the dib command and set a timeout
759
+        # on the wait() call to prevent the wedge.
760
+        did_timeout = False
753 761
         try:
754 762
             p = subprocess.Popen(
755 763
                 shlex.split(cmd),
756
-                stdout=subprocess.PIPE,
757 764
                 stderr=subprocess.STDOUT,
758 765
                 env=env)
759 766
         except OSError as e:
@@ -761,17 +768,20 @@ class BuildWorker(BaseWorker):
761 768
                 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
762 769
             )
763 770
 
764
-        with open(log_fn, 'wb') as log:
765
-            while True:
766
-                ln = p.stdout.readline()
767
-                log.write(ln)
768
-                log.flush()
769
-                if not ln:
770
-                    break
771
-
772
-            rc = p.wait()
773
-            m = "Exit code: %s\n" % rc
774
-            log.write(m.encode('utf8'))
771
+        try:
772
+            rc = p.wait(timeout=diskimage.build_timeout)
773
+        except subprocess.TimeoutExpired:
774
+            p.kill()
775
+            did_timeout = True
776
+            rc = 1
777
+            self.log.error(
778
+                "Build timeout for image %s, build %s (log: %s)",
779
+                diskimage.name, build_id, log_fn)
780
+        else:
781
+            # Append return code to dib's log file
782
+            with open(log_fn, 'ab') as log:
783
+                m = "Exit code: %s\n" % rc
784
+                log.write(m.encode('utf8'))
775 785
 
776 786
         # It's possible the connection to the ZK cluster could have been
777 787
         # interrupted during the build. If so, wait for it to return.
@@ -796,9 +806,10 @@ class BuildWorker(BaseWorker):
796 806
             self.log.info("ZooKeeper lost while building %s" % diskimage.name)
797 807
             self._zk.resetLostFlag()
798 808
             build_data.state = zk.FAILED
799
-        elif p.returncode:
809
+        elif p.returncode or did_timeout:
800 810
             self.log.info(
801
-                "DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
811
+                "DIB failed creating %s (%s) (timeout=%s)" % (
812
+                    diskimage.name, p.returncode, did_timeout))
802 813
             build_data.state = zk.FAILED
803 814
         else:
804 815
             self.log.info("DIB image %s is built" % diskimage.name)

+ 1
- 0
nodepool/cmd/config_validator.py View File

@@ -44,6 +44,7 @@ class ConfigValidator:
44 44
             'rebuild-age': int,
45 45
             'env-vars': {str: str},
46 46
             'username': str,
47
+            'build-timeout': int,
47 48
         }
48 49
 
49 50
         webapp = {

+ 4
- 1
nodepool/config.py View File

@@ -118,6 +118,7 @@ class Config(ConfigValue):
118 118
             d.image_types = set(diskimage.get('formats', []))
119 119
             d.pause = bool(diskimage.get('pause', False))
120 120
             d.username = diskimage.get('username', 'zuul')
121
+            d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
121 122
             self.diskimages[d.name] = d
122 123
 
123 124
     def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
179 180
         self.image_types = None
180 181
         self.pause = False
181 182
         self.username = None
183
+        self.build_timeout = None
182 184
 
183 185
     def __eq__(self, other):
184 186
         if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
189 191
                     other.env_vars == self.env_vars and
190 192
                     other.image_types == self.image_types and
191 193
                     other.pause == self.pause and
192
-                    other.username == self.username)
194
+                    other.username == self.username and
195
+                    other.build_timeout == self.build_timeout)
193 196
         return False
194 197
 
195 198
     def __repr__(self):

+ 12
- 7
nodepool/tests/__init__.py View File

@@ -410,20 +410,25 @@ class DBTestCase(BaseTestCase):
410 410
             time.sleep(1)
411 411
         self.wait_for_threads()
412 412
 
413
-    def waitForBuild(self, image_name, build_id):
413
+    def waitForBuild(self, image_name, build_id, states=None):
414
+        if states is None:
415
+            states = (zk.READY,)
416
+
414 417
         base = "-".join([image_name, build_id])
418
+
415 419
         while True:
416 420
             self.wait_for_threads()
417
-            files = builder.DibImageFile.from_image_id(
418
-                self._config_images_dir.path, base)
419
-            if files:
421
+            build = self.zk.getBuild(image_name, build_id)
422
+            if build and build.state in states:
420 423
                 break
421 424
             time.sleep(1)
422 425
 
423
-        while True:
426
+        # We should only expect a dib manifest with a successful build.
427
+        while build.state == zk.READY:
424 428
             self.wait_for_threads()
425
-            build = self.zk.getBuild(image_name, build_id)
426
-            if build and build.state == zk.READY:
429
+            files = builder.DibImageFile.from_image_id(
430
+                self._config_images_dir.path, base)
431
+            if files:
427 432
                 break
428 433
             time.sleep(1)
429 434
 

+ 54
- 24
nodepool/tests/fake-image-create View File

@@ -1,10 +1,49 @@
1 1
 #!/bin/bash
2 2
 
3
+outfile=
4
+outtypes=("qcow2")
5
+
6
+all_args=$*
7
+logfile=
8
+checksum=
9
+no_tmpfs=
10
+qemu_img_options=
11
+x=
12
+
13
+TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
14
+if [ $? -ne 0 ]; then
15
+    echo "Invalid option"
16
+    exit 1
17
+fi
18
+eval set -- "$TEMP"
19
+while true ; do
20
+    case "$1" in
21
+        --checksum)
22
+            checksum=1; shift 1;;
23
+        --no-tmpfs)
24
+            no_tmpfs=1; shift 1;;
25
+        --qemu-img-options)
26
+            qemu_img_options=$2; shift 2;;
27
+        --logfile)
28
+            logfile=$2; shift 2;;
29
+        -o) outfile=$2; shift 2;;
30
+        -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
31
+        -x) x=1; shift;;
32
+        --) shift ; break ;;
33
+        *) echo "Unknown option : $1"; exit 1;;
34
+    esac
35
+done
36
+
37
+# If --logfile was given, direct stdout to it, as well
38
+if [ ! -z "$logfile" ]; then
39
+    exec > >(tee -a ${logfile})
40
+fi
41
+
3 42
 echo "*** fake-image-create: start"
4 43
 
5 44
 echo "arguments:"
6 45
 echo "----"
7
-echo $*
46
+echo "$all_args"
8 47
 echo "----"
9 48
 
10 49
 if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
38 77
     exit 1
39 78
 fi
40 79
 
41
-outfile=
42
-outtypes=("qcow2")
43
-
44
-TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@")
45
-if [ $? -ne 0 ]; then
46
-    echo "Invalid option"
47
-    exit 1
80
+if [ ! -z "$logfile" ]; then
81
+    echo " -> logfile: $logfile"
82
+fi
83
+if [ ! -z "$checksum" ]; then
84
+    echo " -> set --checksum"
85
+fi
86
+if [ ! -z "$no_tmpfs" ]; then
87
+    echo " -> set --no-tmpfs"
88
+fi
89
+if [ ! -z "$qemu_img_options" ]; then
90
+    echo " -> qemu-img-options: $qemu_img_options"
91
+fi
92
+if [ ! -z "$x" ]; then
93
+    echo " -> debugging enabled"
48 94
 fi
49
-eval set -- "$TEMP"
50
-while true ; do
51
-    case "$1" in
52
-        --checksum)
53
-            echo " -> set --checksum"; shift 1;;
54
-        --no-tmpfs)
55
-            echo " -> set --no-tmpfs"; shift 1;;
56
-        --qemu-img-options)
57
-            echo " -> qemu-img-options: $2"; shift 2;;
58
-        -o) outfile=$2; shift 2;;
59
-        -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
60
-        -x) echo " -> debugging enabled"; shift;;
61
-        --) shift ; break ;;
62
-        *) echo "Unknown option : $1"; exit 1;;
63
-    esac
64
-done
65 95
 
66 96
 if [ -z "$outfile" ]; then
67 97
     echo "No output file specified."

+ 1
- 0
nodepool/tests/fixtures/config_validate/good.yaml View File

@@ -130,6 +130,7 @@ diskimages:
130 130
       - cache-devstack
131 131
     release: trusty
132 132
     rebuild-age: 3600
133
+    build-timeout: 3600
133 134
     env-vars:
134 135
       TMPDIR: /opt/dib_tmp
135 136
       DIB_IMAGE_CACHE: /opt/dib_cache

+ 26
- 0
nodepool/tests/fixtures/diskimage_build_timeout.yaml View File

@@ -0,0 +1,26 @@
1
+elements-dir: .
2
+images-dir: '{images_dir}'
3
+build-log-dir: '{build_log_dir}'
4
+
5
+zookeeper-servers:
6
+  - host: {zookeeper_host}
7
+    port: {zookeeper_port}
8
+    chroot: {zookeeper_chroot}
9
+
10
+labels: []
11
+
12
+providers: []
13
+
14
+diskimages:
15
+  - name: fake-image
16
+    formats:
17
+      - tar
18
+    elements:
19
+      - fedora
20
+      - vm
21
+    release: 21
22
+    env-vars:
23
+      TMPDIR: /opt/dib_tmp
24
+      DIB_IMAGE_CACHE: /opt/dib_cache
25
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
26
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

+ 9
- 0
nodepool/tests/unit/test_builder.py View File

@@ -16,6 +16,8 @@
16 16
 import os
17 17
 import uuid
18 18
 import fixtures
19
+import mock
20
+import subprocess
19 21
 
20 22
 from nodepool import builder, exceptions, tests
21 23
 from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
335 337
 
336 338
         self.assertEqual(build_default._formats, ['qcow2'])
337 339
         self.assertEqual(build_vhd._formats, ['vhd'])
340
+
341
+    @mock.patch.object(subprocess.Popen, 'wait')
342
+    def test_diskimage_build_timeout(self, mock_wait):
343
+        mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
344
+        configfile = self.setup_config('diskimage_build_timeout.yaml')
345
+        self.useBuilder(configfile, cleanup_interval=0)
346
+        self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))

+ 6
- 0
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml View File

@@ -0,0 +1,6 @@
1
+---
2
+features:
3
+  - |
4
+    A new option (build-timeout) has been added to the builder diskimage
5
+    configuration to control how long the builder should wait for image
6
+    builds before giving up. The default is 8 hours.

Loading…
Cancel
Save