Browse Source

Merge "Add a timeout for the image build"

tags/3.5.0
Zuul 5 months ago
parent
commit
26c57ee5a9

+ 6
- 0
doc/source/configuration.rst View File

@@ -277,6 +277,12 @@ Options
277 277
       Specifies the distro to be used as a base image to build the image using
278 278
       diskimage-builder.
279 279
 
280
+   .. attr:: build-timeout
281
+      :type: int
282
+
283
+      How long (in seconds) to wait for the diskimage build before giving up.
284
+      The default is 8 hours.
285
+
280 286
    .. attr:: elements
281 287
       :type: list
282 288
 

+ 28
- 17
nodepool/builder.py View File

@@ -739,21 +739,28 @@ class BuildWorker(BaseWorker):
739 739
         if 'qcow2' in img_types:
740 740
             qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
741 741
 
742
-        cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' %
742
+        log_fn = self._getBuildLog(diskimage.name, build_id)
743
+
744
+        cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
743 745
                (self.dib_cmd, img_types, qemu_img_options, filename,
744
-                img_elements))
746
+                log_fn, img_elements))
745 747
 
746 748
         self._pruneBuildLogs(diskimage.name)
747
-        log_fn = self._getBuildLog(diskimage.name, build_id)
748 749
 
749 750
         self.log.info('Running %s' % (cmd,))
750 751
         self.log.info('Logging to %s' % (log_fn,))
751 752
 
752 753
         start_time = time.monotonic()
754
+
755
+        # We used to use readline() on stdout to output the lines to the
756
+        # build log. Unfortunately, this would block as long as the process
757
+        # ran (with no easy way to timeout the read) and wedge the builder.
758
+        # Now we use --logfile option to the dib command and set a timeout
759
+        # on the wait() call to prevent the wedge.
760
+        did_timeout = False
753 761
         try:
754 762
             p = subprocess.Popen(
755 763
                 shlex.split(cmd),
756
-                stdout=subprocess.PIPE,
757 764
                 stderr=subprocess.STDOUT,
758 765
                 env=env)
759 766
         except OSError as e:
@@ -761,17 +768,20 @@ class BuildWorker(BaseWorker):
761 768
                 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
762 769
             )
763 770
 
764
-        with open(log_fn, 'wb') as log:
765
-            while True:
766
-                ln = p.stdout.readline()
767
-                log.write(ln)
768
-                log.flush()
769
-                if not ln:
770
-                    break
771
-
772
-            rc = p.wait()
773
-            m = "Exit code: %s\n" % rc
774
-            log.write(m.encode('utf8'))
771
+        try:
772
+            rc = p.wait(timeout=diskimage.build_timeout)
773
+        except subprocess.TimeoutExpired:
774
+            p.kill()
775
+            did_timeout = True
776
+            rc = 1
777
+            self.log.error(
778
+                "Build timeout for image %s, build %s (log: %s)",
779
+                diskimage.name, build_id, log_fn)
780
+        else:
781
+            # Append return code to dib's log file
782
+            with open(log_fn, 'ab') as log:
783
+                m = "Exit code: %s\n" % rc
784
+                log.write(m.encode('utf8'))
775 785
 
776 786
         # It's possible the connection to the ZK cluster could have been
777 787
         # interrupted during the build. If so, wait for it to return.
@@ -796,9 +806,10 @@ class BuildWorker(BaseWorker):
796 806
             self.log.info("ZooKeeper lost while building %s" % diskimage.name)
797 807
             self._zk.resetLostFlag()
798 808
             build_data.state = zk.FAILED
799
-        elif p.returncode:
809
+        elif p.returncode or did_timeout:
800 810
             self.log.info(
801
-                "DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
811
+                "DIB failed creating %s (%s) (timeout=%s)" % (
812
+                    diskimage.name, p.returncode, did_timeout))
802 813
             build_data.state = zk.FAILED
803 814
         else:
804 815
             self.log.info("DIB image %s is built" % diskimage.name)

+ 1
- 0
nodepool/cmd/config_validator.py View File

@@ -44,6 +44,7 @@ class ConfigValidator:
44 44
             'rebuild-age': int,
45 45
             'env-vars': {str: str},
46 46
             'username': str,
47
+            'build-timeout': int,
47 48
         }
48 49
 
49 50
         webapp = {

+ 4
- 1
nodepool/config.py View File

@@ -118,6 +118,7 @@ class Config(ConfigValue):
118 118
             d.image_types = set(diskimage.get('formats', []))
119 119
             d.pause = bool(diskimage.get('pause', False))
120 120
             d.username = diskimage.get('username', 'zuul')
121
+            d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
121 122
             self.diskimages[d.name] = d
122 123
 
123 124
     def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
179 180
         self.image_types = None
180 181
         self.pause = False
181 182
         self.username = None
183
+        self.build_timeout = None
182 184
 
183 185
     def __eq__(self, other):
184 186
         if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
189 191
                     other.env_vars == self.env_vars and
190 192
                     other.image_types == self.image_types and
191 193
                     other.pause == self.pause and
192
-                    other.username == self.username)
194
+                    other.username == self.username and
195
+                    other.build_timeout == self.build_timeout)
193 196
         return False
194 197
 
195 198
     def __repr__(self):

+ 12
- 7
nodepool/tests/__init__.py View File

@@ -411,20 +411,25 @@ class DBTestCase(BaseTestCase):
411 411
             time.sleep(1)
412 412
         self.wait_for_threads()
413 413
 
414
-    def waitForBuild(self, image_name, build_id):
414
+    def waitForBuild(self, image_name, build_id, states=None):
415
+        if states is None:
416
+            states = (zk.READY,)
417
+
415 418
         base = "-".join([image_name, build_id])
419
+
416 420
         while True:
417 421
             self.wait_for_threads()
418
-            files = builder.DibImageFile.from_image_id(
419
-                self._config_images_dir.path, base)
420
-            if files:
422
+            build = self.zk.getBuild(image_name, build_id)
423
+            if build and build.state in states:
421 424
                 break
422 425
             time.sleep(1)
423 426
 
424
-        while True:
427
+        # We should only expect a dib manifest with a successful build.
428
+        while build.state == zk.READY:
425 429
             self.wait_for_threads()
426
-            build = self.zk.getBuild(image_name, build_id)
427
-            if build and build.state == zk.READY:
430
+            files = builder.DibImageFile.from_image_id(
431
+                self._config_images_dir.path, base)
432
+            if files:
428 433
                 break
429 434
             time.sleep(1)
430 435
 

+ 54
- 24
nodepool/tests/fake-image-create View File

@@ -1,10 +1,49 @@
1 1
 #!/bin/bash
2 2
 
3
+outfile=
4
+outtypes=("qcow2")
5
+
6
+all_args=$*
7
+logfile=
8
+checksum=
9
+no_tmpfs=
10
+qemu_img_options=
11
+x=
12
+
13
+TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
14
+if [ $? -ne 0 ]; then
15
+    echo "Invalid option"
16
+    exit 1
17
+fi
18
+eval set -- "$TEMP"
19
+while true ; do
20
+    case "$1" in
21
+        --checksum)
22
+            checksum=1; shift 1;;
23
+        --no-tmpfs)
24
+            no_tmpfs=1; shift 1;;
25
+        --qemu-img-options)
26
+            qemu_img_options=$2; shift 2;;
27
+        --logfile)
28
+            logfile=$2; shift 2;;
29
+        -o) outfile=$2; shift 2;;
30
+        -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
31
+        -x) x=1; shift;;
32
+        --) shift ; break ;;
33
+        *) echo "Unknown option : $1"; exit 1;;
34
+    esac
35
+done
36
+
37
+# If --logfile was given, direct stdout to it, as well
38
+if [ ! -z "$logfile" ]; then
39
+    exec > >(tee -a ${logfile})
40
+fi
41
+
3 42
 echo "*** fake-image-create: start"
4 43
 
5 44
 echo "arguments:"
6 45
 echo "----"
7
-echo $*
46
+echo "$all_args"
8 47
 echo "----"
9 48
 
10 49
 if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@@ -38,30 +77,21 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
38 77
     exit 1
39 78
 fi
40 79
 
41
-outfile=
42
-outtypes=("qcow2")
43
-
44
-TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@")
45
-if [ $? -ne 0 ]; then
46
-    echo "Invalid option"
47
-    exit 1
80
+if [ ! -z "$logfile" ]; then
81
+    echo " -> logfile: $logfile"
82
+fi
83
+if [ ! -z "$checksum" ]; then
84
+    echo " -> set --checksum"
85
+fi
86
+if [ ! -z "$no_tmpfs" ]; then
87
+    echo " -> set --no-tmpfs"
88
+fi
89
+if [ ! -z "$qemu_img_options" ]; then
90
+    echo " -> qemu-img-options: $qemu_img_options"
91
+fi
92
+if [ ! -z "$x" ]; then
93
+    echo " -> debugging enabled"
48 94
 fi
49
-eval set -- "$TEMP"
50
-while true ; do
51
-    case "$1" in
52
-        --checksum)
53
-            echo " -> set --checksum"; shift 1;;
54
-        --no-tmpfs)
55
-            echo " -> set --no-tmpfs"; shift 1;;
56
-        --qemu-img-options)
57
-            echo " -> qemu-img-options: $2"; shift 2;;
58
-        -o) outfile=$2; shift 2;;
59
-        -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
60
-        -x) echo " -> debugging enabled"; shift;;
61
-        --) shift ; break ;;
62
-        *) echo "Unknown option : $1"; exit 1;;
63
-    esac
64
-done
65 95
 
66 96
 if [ -z "$outfile" ]; then
67 97
     echo "No output file specified."

+ 1
- 0
nodepool/tests/fixtures/config_validate/good.yaml View File

@@ -152,6 +152,7 @@ diskimages:
152 152
       - cache-devstack
153 153
     release: trusty
154 154
     rebuild-age: 3600
155
+    build-timeout: 3600
155 156
     env-vars:
156 157
       TMPDIR: /opt/dib_tmp
157 158
       DIB_IMAGE_CACHE: /opt/dib_cache

+ 26
- 0
nodepool/tests/fixtures/diskimage_build_timeout.yaml View File

@@ -0,0 +1,26 @@
1
+elements-dir: .
2
+images-dir: '{images_dir}'
3
+build-log-dir: '{build_log_dir}'
4
+
5
+zookeeper-servers:
6
+  - host: {zookeeper_host}
7
+    port: {zookeeper_port}
8
+    chroot: {zookeeper_chroot}
9
+
10
+labels: []
11
+
12
+providers: []
13
+
14
+diskimages:
15
+  - name: fake-image
16
+    formats:
17
+      - tar
18
+    elements:
19
+      - fedora
20
+      - vm
21
+    release: 21
22
+    env-vars:
23
+      TMPDIR: /opt/dib_tmp
24
+      DIB_IMAGE_CACHE: /opt/dib_cache
25
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
26
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

+ 9
- 0
nodepool/tests/unit/test_builder.py View File

@@ -16,6 +16,8 @@
16 16
 import os
17 17
 import uuid
18 18
 import fixtures
19
+import mock
20
+import subprocess
19 21
 
20 22
 from nodepool import builder, exceptions, tests
21 23
 from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +337,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
335 337
 
336 338
         self.assertEqual(build_default._formats, ['qcow2'])
337 339
         self.assertEqual(build_vhd._formats, ['vhd'])
340
+
341
+    @mock.patch.object(subprocess.Popen, 'wait')
342
+    def test_diskimage_build_timeout(self, mock_wait):
343
+        mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
344
+        configfile = self.setup_config('diskimage_build_timeout.yaml')
345
+        self.useBuilder(configfile, cleanup_interval=0)
346
+        self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))

+ 6
- 0
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml View File

@@ -0,0 +1,6 @@
1
+---
2
+features:
3
+  - |
4
+    A new option (build-timeout) has been added to the builder diskimage
5
+    configuration to control how long the builder should wait for image
6
+    builds before giving up. The default is 8 hours.

Loading…
Cancel
Save