Browse Source

Merge "Revert "Revert "Add a timeout for the image build"""

tags/3.5.0
Zuul 5 months ago
parent
commit
280cd5937d

+ 6
- 0
doc/source/configuration.rst View File

@@ -277,6 +277,12 @@ Options
277 277
       Specifies the distro to be used as a base image to build the image using
278 278
       diskimage-builder.
279 279
 
280
+   .. attr:: build-timeout
281
+      :type: int
282
+
283
+      How long (in seconds) to wait for the diskimage build before giving up.
284
+      The default is 8 hours.
285
+
280 286
    .. attr:: elements
281 287
       :type: list
282 288
 

+ 74
- 11
nodepool/builder.py View File

@@ -13,8 +13,10 @@
13 13
 # License for the specific language governing permissions and limitations
14 14
 # under the License.
15 15
 
16
+import fcntl
16 17
 import logging
17 18
 import os
19
+import select
18 20
 import shutil
19 21
 import socket
20 22
 import subprocess
@@ -42,6 +44,9 @@ SUSPEND_WAIT_TIME = 30
42 44
 # so just hardcode it for all qcow2 building
43 45
 DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS = "--qemu-img-options 'compat=0.10'"
44 46
 
47
+# DIB process polling timeout, in milliseconds
48
+BUILD_PROCESS_POLL_TIMEOUT = 30 * 1000
49
+
45 50
 
46 51
 class DibImageFile(object):
47 52
     '''
@@ -750,6 +755,7 @@ class BuildWorker(BaseWorker):
750 755
         self.log.info('Logging to %s' % (log_fn,))
751 756
 
752 757
         start_time = time.monotonic()
758
+
753 759
         try:
754 760
             p = subprocess.Popen(
755 761
                 shlex.split(cmd),
@@ -761,17 +767,73 @@ class BuildWorker(BaseWorker):
761 767
                 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
762 768
             )
763 769
 
770
+        # Make subprocess stdout non-blocking
771
+        fd = p.stdout.fileno()
772
+        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
773
+        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
774
+
775
+        # Poll subprocess stdout for readability
776
+        r, w, e = select.select([fd], [], [], 0)
777
+        bitmask = (select.POLLIN | select.POLLHUP)
778
+        poll = select.poll()
779
+        poll.register(fd, bitmask)
780
+
781
+        rc = None
782
+        did_timeout = False
783
+        subprocess_done = False
784
+
785
+        def buildDidTimeout():
786
+            build_time = time.monotonic() - start_time
787
+            if build_time > diskimage.build_timeout:
788
+                return True
789
+            return False
790
+
764 791
         with open(log_fn, 'wb') as log:
765
-            while True:
766
-                ln = p.stdout.readline()
767
-                log.write(ln)
768
-                log.flush()
769
-                if not ln:
770
-                    break
771 792
 
772
-            rc = p.wait()
773
-            m = "Exit code: %s\n" % rc
774
-            log.write(m.encode('utf8'))
793
+            # While the subprocess is running, we will loop through stdout
794
+            # events. If we can read data, write that out to the log file.
795
+            # If the subprocess completes, set the flag so we can exit and
796
+            # write the return code.
797
+            #
798
+            # We check for build timeouts in two places: after we have read
799
+            # all available data in the buffer (which will cause an early exit
800
+            # of the poll loop), and after a poll timeout. If we did not have
801
+            # a check after the reads, we would have to have a poll timeout
802
+            # to occur to check for a build timeout, which may not happen if
803
+            # there is constantly data available for reading.
804
+
805
+            while not subprocess_done:
806
+                for fd, event in poll.poll(BUILD_PROCESS_POLL_TIMEOUT):
807
+
808
+                    # Data available for reading
809
+                    if event & select.POLLIN:
810
+                        data = p.stdout.read(1024)
811
+                        while data:
812
+                            log.write(data)
813
+                            log.flush()
814
+                            data = p.stdout.read(1024)
815
+                        if buildDidTimeout():
816
+                            break
817
+
818
+                    # Subprocess complete
819
+                    elif event & select.POLLHUP:
820
+                        subprocess_done = True
821
+                        rc = p.wait()
822
+
823
+                if not subprocess_done:
824
+                    if buildDidTimeout():
825
+                        did_timeout = True
826
+                        rc = 1
827
+                        self.log.error(
828
+                            "Build timeout for image %s, build %s (log: %s)",
829
+                            diskimage.name, build_id, log_fn)
830
+                        p.kill()
831
+                        break
832
+
833
+            # Subprocess finished, write return code
834
+            if not did_timeout:
835
+                m = "Exit code: %s\n" % rc
836
+                log.write(m.encode('utf8'))
775 837
 
776 838
         # It's possible the connection to the ZK cluster could have been
777 839
         # interrupted during the build. If so, wait for it to return.
@@ -796,9 +858,10 @@ class BuildWorker(BaseWorker):
796 858
             self.log.info("ZooKeeper lost while building %s" % diskimage.name)
797 859
             self._zk.resetLostFlag()
798 860
             build_data.state = zk.FAILED
799
-        elif p.returncode:
861
+        elif p.returncode or did_timeout:
800 862
             self.log.info(
801
-                "DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
863
+                "DIB failed creating %s (%s) (timeout=%s)" % (
864
+                    diskimage.name, p.returncode, did_timeout))
802 865
             build_data.state = zk.FAILED
803 866
         else:
804 867
             self.log.info("DIB image %s is built" % diskimage.name)

+ 1
- 0
nodepool/cmd/config_validator.py View File

@@ -44,6 +44,7 @@ class ConfigValidator:
44 44
             'rebuild-age': int,
45 45
             'env-vars': {str: str},
46 46
             'username': str,
47
+            'build-timeout': int,
47 48
         }
48 49
 
49 50
         webapp = {

+ 4
- 1
nodepool/config.py View File

@@ -118,6 +118,7 @@ class Config(ConfigValue):
118 118
             d.image_types = set(diskimage.get('formats', []))
119 119
             d.pause = bool(diskimage.get('pause', False))
120 120
             d.username = diskimage.get('username', 'zuul')
121
+            d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
121 122
             self.diskimages[d.name] = d
122 123
 
123 124
     def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -179,6 +180,7 @@ class DiskImage(ConfigValue):
179 180
         self.image_types = None
180 181
         self.pause = False
181 182
         self.username = None
183
+        self.build_timeout = None
182 184
 
183 185
     def __eq__(self, other):
184 186
         if isinstance(other, DiskImage):
@@ -189,7 +191,8 @@ class DiskImage(ConfigValue):
189 191
                     other.env_vars == self.env_vars and
190 192
                     other.image_types == self.image_types and
191 193
                     other.pause == self.pause and
192
-                    other.username == self.username)
194
+                    other.username == self.username and
195
+                    other.build_timeout == self.build_timeout)
193 196
         return False
194 197
 
195 198
     def __repr__(self):

+ 12
- 7
nodepool/tests/__init__.py View File

@@ -414,20 +414,25 @@ class DBTestCase(BaseTestCase):
414 414
             time.sleep(1)
415 415
         self.wait_for_threads()
416 416
 
417
-    def waitForBuild(self, image_name, build_id):
417
+    def waitForBuild(self, image_name, build_id, states=None):
418
+        if states is None:
419
+            states = (zk.READY,)
420
+
418 421
         base = "-".join([image_name, build_id])
422
+
419 423
         while True:
420 424
             self.wait_for_threads()
421
-            files = builder.DibImageFile.from_image_id(
422
-                self._config_images_dir.path, base)
423
-            if files:
425
+            build = self.zk.getBuild(image_name, build_id)
426
+            if build and build.state in states:
424 427
                 break
425 428
             time.sleep(1)
426 429
 
427
-        while True:
430
+        # We should only expect a dib manifest with a successful build.
431
+        while build.state == zk.READY:
428 432
             self.wait_for_threads()
429
-            build = self.zk.getBuild(image_name, build_id)
430
-            if build and build.state == zk.READY:
433
+            files = builder.DibImageFile.from_image_id(
434
+                self._config_images_dir.path, base)
435
+            if files:
431 436
                 break
432 437
             time.sleep(1)
433 438
 

+ 1
- 0
nodepool/tests/fixtures/config_validate/good.yaml View File

@@ -174,6 +174,7 @@ diskimages:
174 174
       - cache-devstack
175 175
     release: trusty
176 176
     rebuild-age: 3600
177
+    build-timeout: 3600
177 178
     env-vars:
178 179
       TMPDIR: /opt/dib_tmp
179 180
       DIB_IMAGE_CACHE: /opt/dib_cache

+ 27
- 0
nodepool/tests/fixtures/diskimage_build_timeout.yaml View File

@@ -0,0 +1,27 @@
1
+elements-dir: .
2
+images-dir: '{images_dir}'
3
+build-log-dir: '{build_log_dir}'
4
+
5
+zookeeper-servers:
6
+  - host: {zookeeper_host}
7
+    port: {zookeeper_port}
8
+    chroot: {zookeeper_chroot}
9
+
10
+labels: []
11
+
12
+providers: []
13
+
14
+diskimages:
15
+  - name: fake-image
16
+    build-timeout: 2
17
+    formats:
18
+      - tar
19
+    elements:
20
+      - fedora
21
+      - vm
22
+    release: 21
23
+    env-vars:
24
+      TMPDIR: /opt/dib_tmp
25
+      DIB_IMAGE_CACHE: /opt/dib_cache
26
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
27
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

+ 8
- 0
nodepool/tests/unit/test_builder.py View File

@@ -16,6 +16,7 @@
16 16
 import os
17 17
 import uuid
18 18
 import fixtures
19
+import mock
19 20
 
20 21
 from nodepool import builder, exceptions, tests
21 22
 from nodepool.driver.fake import provider as fakeprovider
@@ -335,3 +336,10 @@ class TestNodePoolBuilder(tests.DBTestCase):
335 336
 
336 337
         self.assertEqual(build_default._formats, ['qcow2'])
337 338
         self.assertEqual(build_vhd._formats, ['vhd'])
339
+
340
+    @mock.patch('select.poll')
341
+    def test_diskimage_build_timeout(self, mock_poll):
342
+        configfile = self.setup_config('diskimage_build_timeout.yaml')
343
+        builder.BUILD_PROCESS_POLL_TIMEOUT = 500
344
+        self.useBuilder(configfile, cleanup_interval=0)
345
+        self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))

+ 6
- 0
releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml View File

@@ -0,0 +1,6 @@
1
+---
2
+features:
3
+  - |
4
+    A new option (build-timeout) has been added to the builder diskimage
5
+    configuration to control how long the builder should wait for image
6
+    builds before giving up. The default is 8 hours.

Loading…
Cancel
Save