From ccf40a462a4805c7162bc34434cfe718cd96c448 Mon Sep 17 00:00:00 2001
From: David Shrewsbury <shrewsbury.dave@gmail.com>
Date: Fri, 25 Jan 2019 17:36:31 +0000
Subject: [PATCH] Revert "Add a timeout for the image build"

This reverts commit 7225354ec04f0466d95b6d68e51002723d6529c9.

The disk-image-create command does not appear to be starting.

Change-Id: I81abe25a253a385cae08a57561129a678546f18f
---
 doc/source/configuration.rst                  |  6 --
 nodepool/builder.py                           | 45 ++++-------
 nodepool/cmd/config_validator.py              |  1 -
 nodepool/config.py                            |  5 +-
 nodepool/tests/__init__.py                    | 21 ++---
 nodepool/tests/fake-image-create              | 78 ++++++-------------
 .../tests/fixtures/config_validate/good.yaml  |  1 -
 .../fixtures/diskimage_build_timeout.yaml     | 26 -------
 nodepool/tests/unit/test_builder.py           |  9 ---
 .../notes/build-timeout-bb68a1fd24f97a10.yaml |  6 --
 10 files changed, 50 insertions(+), 148 deletions(-)
 delete mode 100644 nodepool/tests/fixtures/diskimage_build_timeout.yaml
 delete mode 100644 releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml

diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst
index 7bec67054..27abacf86 100644
--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -277,12 +277,6 @@ Options
       Specifies the distro to be used as a base image to build the image using
       diskimage-builder.
 
-   .. attr:: build-timeout
-      :type: int
-
-      How long (in seconds) to wait for the diskimage build before giving up.
-      The default is 8 hours.
-
    .. attr:: elements
       :type: list
 
diff --git a/nodepool/builder.py b/nodepool/builder.py
index 16fe7148b..dce709e9e 100755
--- a/nodepool/builder.py
+++ b/nodepool/builder.py
@@ -739,28 +739,21 @@ class BuildWorker(BaseWorker):
         if 'qcow2' in img_types:
             qemu_img_options = DEFAULT_QEMU_IMAGE_COMPAT_OPTIONS
 
-        log_fn = self._getBuildLog(diskimage.name, build_id)
-
-        cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s --logfile %s %s' %
+        cmd = ('%s -x -t %s --checksum --no-tmpfs %s -o %s %s' %
                (self.dib_cmd, img_types, qemu_img_options, filename,
-                log_fn, img_elements))
+                img_elements))
 
         self._pruneBuildLogs(diskimage.name)
+        log_fn = self._getBuildLog(diskimage.name, build_id)
 
         self.log.info('Running %s' % (cmd,))
         self.log.info('Logging to %s' % (log_fn,))
 
         start_time = time.monotonic()
-
-        # We used to use readline() on stdout to output the lines to the
-        # build log. Unfortunately, this would block as long as the process
-        # ran (with no easy way to timeout the read) and wedge the builder.
-        # Now we use --logfile option to the dib command and set a timeout
-        # on the wait() call to prevent the wedge.
-        did_timeout = False
         try:
             p = subprocess.Popen(
                 shlex.split(cmd),
+                stdout=subprocess.PIPE,
                 stderr=subprocess.STDOUT,
                 env=env)
         except OSError as e:
@@ -768,20 +761,17 @@ class BuildWorker(BaseWorker):
                 "Failed to exec '%s'. Error: '%s'" % (cmd, e.strerror)
             )
 
-        try:
-            rc = p.wait(timeout=diskimage.build_timeout)
-        except subprocess.TimeoutExpired:
-            p.kill()
-            did_timeout = True
-            rc = 1
-            self.log.error(
-                "Build timeout for image %s, build %s (log: %s)",
-                diskimage.name, build_id, log_fn)
-        else:
-            # Append return code to dib's log file
-            with open(log_fn, 'ab') as log:
-                m = "Exit code: %s\n" % rc
-                log.write(m.encode('utf8'))
+        with open(log_fn, 'wb') as log:
+            while True:
+                ln = p.stdout.readline()
+                log.write(ln)
+                log.flush()
+                if not ln:
+                    break
+
+            rc = p.wait()
+            m = "Exit code: %s\n" % rc
+            log.write(m.encode('utf8'))
 
         # It's possible the connection to the ZK cluster could have been
         # interrupted during the build. If so, wait for it to return.
@@ -806,10 +796,9 @@ class BuildWorker(BaseWorker):
             self.log.info("ZooKeeper lost while building %s" % diskimage.name)
             self._zk.resetLostFlag()
             build_data.state = zk.FAILED
-        elif p.returncode or did_timeout:
+        elif p.returncode:
             self.log.info(
-                "DIB failed creating %s (%s) (timeout=%s)" % (
-                    diskimage.name, p.returncode, did_timeout))
+                "DIB failed creating %s (%s)" % (diskimage.name, p.returncode))
             build_data.state = zk.FAILED
         else:
             self.log.info("DIB image %s is built" % diskimage.name)
diff --git a/nodepool/cmd/config_validator.py b/nodepool/cmd/config_validator.py
index f2646af34..c3dde57d7 100644
--- a/nodepool/cmd/config_validator.py
+++ b/nodepool/cmd/config_validator.py
@@ -44,7 +44,6 @@ class ConfigValidator:
             'rebuild-age': int,
             'env-vars': {str: str},
             'username': str,
-            'build-timeout': int,
         }
 
         webapp = {
diff --git a/nodepool/config.py b/nodepool/config.py
index 7f09961c7..2b5e4ebd4 100755
--- a/nodepool/config.py
+++ b/nodepool/config.py
@@ -118,7 +118,6 @@ class Config(ConfigValue):
             d.image_types = set(diskimage.get('formats', []))
             d.pause = bool(diskimage.get('pause', False))
             d.username = diskimage.get('username', 'zuul')
-            d.build_timeout = diskimage.get('build-timeout', (8 * 60 * 60))
             self.diskimages[d.name] = d
 
     def setSecureDiskimageEnv(self, diskimages, secure_config_path):
@@ -180,7 +179,6 @@ class DiskImage(ConfigValue):
         self.image_types = None
         self.pause = False
         self.username = None
-        self.build_timeout = None
 
     def __eq__(self, other):
         if isinstance(other, DiskImage):
@@ -191,8 +189,7 @@ class DiskImage(ConfigValue):
                     other.env_vars == self.env_vars and
                     other.image_types == self.image_types and
                     other.pause == self.pause and
-                    other.username == self.username and
-                    other.build_timeout == self.build_timeout)
+                    other.username == self.username)
         return False
 
     def __repr__(self):
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 376b4fb5f..4a8e83d91 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -410,21 +410,9 @@ class DBTestCase(BaseTestCase):
             time.sleep(1)
         self.wait_for_threads()
 
-    def waitForBuild(self, image_name, build_id, states=None):
-        if states is None:
-            states = (zk.READY,)
-
+    def waitForBuild(self, image_name, build_id):
         base = "-".join([image_name, build_id])
-
         while True:
-            self.wait_for_threads()
-            build = self.zk.getBuild(image_name, build_id)
-            if build and build.state in states:
-                break
-            time.sleep(1)
-
-        # We should only expect a dib manifest with a successful build.
-        while build.state == zk.READY:
             self.wait_for_threads()
             files = builder.DibImageFile.from_image_id(
                 self._config_images_dir.path, base)
@@ -432,6 +420,13 @@ class DBTestCase(BaseTestCase):
                 break
             time.sleep(1)
 
+        while True:
+            self.wait_for_threads()
+            build = self.zk.getBuild(image_name, build_id)
+            if build and build.state == zk.READY:
+                break
+            time.sleep(1)
+
         self.wait_for_threads()
         return build
 
diff --git a/nodepool/tests/fake-image-create b/nodepool/tests/fake-image-create
index 1a2406fd2..9d236ff16 100755
--- a/nodepool/tests/fake-image-create
+++ b/nodepool/tests/fake-image-create
@@ -1,49 +1,10 @@
 #!/bin/bash
 
-outfile=
-outtypes=("qcow2")
-
-all_args=$*
-logfile=
-checksum=
-no_tmpfs=
-qemu_img_options=
-x=
-
-TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum,logfile: -- "$@")
-if [ $? -ne 0 ]; then
-    echo "Invalid option"
-    exit 1
-fi
-eval set -- "$TEMP"
-while true ; do
-    case "$1" in
-        --checksum)
-            checksum=1; shift 1;;
-        --no-tmpfs)
-            no_tmpfs=1; shift 1;;
-        --qemu-img-options)
-            qemu_img_options=$2; shift 2;;
-        --logfile)
-            logfile=$2; shift 2;;
-        -o) outfile=$2; shift 2;;
-        -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
-        -x) x=1; shift;;
-        --) shift ; break ;;
-        *) echo "Unknown option : $1"; exit 1;;
-    esac
-done
-
-# If --logfile was given, direct stdout to it, as well
-if [ ! -z "$logfile" ]; then
-    exec > >(tee -a ${logfile})
-fi
-
 echo "*** fake-image-create: start"
 
 echo "arguments:"
 echo "----"
-echo "$all_args"
+echo $*
 echo "----"
 
 if [[ "${SHOULD_FAIL}" == 'true' ]]; then
@@ -77,21 +38,30 @@ if [[ "${BASE_IMAGE_FILE}" != "Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2"
     exit 1
 fi
 
-if [ ! -z "$logfile" ]; then
-    echo " -> logfile: $logfile"
-fi
-if [ ! -z "$checksum" ]; then
-    echo " -> set --checksum"
-fi
-if [ ! -z "$no_tmpfs" ]; then
-    echo " -> set --no-tmpfs"
-fi
-if [ ! -z "$qemu_img_options" ]; then
-    echo " -> qemu-img-options: $qemu_img_options"
-fi
-if [ ! -z "$x" ]; then
-    echo " -> debugging enabled"
+outfile=
+outtypes=("qcow2")
+
+TEMP=$(getopt -o xo:t: --long qemu-img-options:,no-tmpfs,checksum -- "$@")
+if [ $? -ne 0 ]; then
+    echo "Invalid option"
+    exit 1
 fi
+eval set -- "$TEMP"
+while true ; do
+    case "$1" in
+        --checksum)
+            echo " -> set --checksum"; shift 1;;
+        --no-tmpfs)
+            echo " -> set --no-tmpfs"; shift 1;;
+        --qemu-img-options)
+            echo " -> qemu-img-options: $2"; shift 2;;
+        -o) outfile=$2; shift 2;;
+        -t) IFS="," read -a outtypes <<< "$2"; shift 2;;
+        -x) echo " -> debugging enabled"; shift;;
+        --) shift ; break ;;
+        *) echo "Unknown option : $1"; exit 1;;
+    esac
+done
 
 if [ -z "$outfile" ]; then
     echo "No output file specified."
diff --git a/nodepool/tests/fixtures/config_validate/good.yaml b/nodepool/tests/fixtures/config_validate/good.yaml
index fc5993741..db7d1ecfd 100644
--- a/nodepool/tests/fixtures/config_validate/good.yaml
+++ b/nodepool/tests/fixtures/config_validate/good.yaml
@@ -130,7 +130,6 @@ diskimages:
       - cache-devstack
     release: trusty
     rebuild-age: 3600
-    build-timeout: 3600
     env-vars:
       TMPDIR: /opt/dib_tmp
       DIB_IMAGE_CACHE: /opt/dib_cache
diff --git a/nodepool/tests/fixtures/diskimage_build_timeout.yaml b/nodepool/tests/fixtures/diskimage_build_timeout.yaml
deleted file mode 100644
index 01770bddd..000000000
--- a/nodepool/tests/fixtures/diskimage_build_timeout.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-elements-dir: .
-images-dir: '{images_dir}'
-build-log-dir: '{build_log_dir}'
-
-zookeeper-servers:
-  - host: {zookeeper_host}
-    port: {zookeeper_port}
-    chroot: {zookeeper_chroot}
-
-labels: []
-
-providers: []
-
-diskimages:
-  - name: fake-image
-    formats:
-      - tar
-    elements:
-      - fedora
-      - vm
-    release: 21
-    env-vars:
-      TMPDIR: /opt/dib_tmp
-      DIB_IMAGE_CACHE: /opt/dib_cache
-      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
-      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py
index cfea4b855..c992bf7aa 100644
--- a/nodepool/tests/unit/test_builder.py
+++ b/nodepool/tests/unit/test_builder.py
@@ -16,8 +16,6 @@
 import os
 import uuid
 import fixtures
-import mock
-import subprocess
 
 from nodepool import builder, exceptions, tests
 from nodepool.driver.fake import provider as fakeprovider
@@ -337,10 +335,3 @@ class TestNodePoolBuilder(tests.DBTestCase):
 
         self.assertEqual(build_default._formats, ['qcow2'])
         self.assertEqual(build_vhd._formats, ['vhd'])
-
-    @mock.patch.object(subprocess.Popen, 'wait')
-    def test_diskimage_build_timeout(self, mock_wait):
-        mock_wait.side_effect = subprocess.TimeoutExpired('dib_cmd', 1)
-        configfile = self.setup_config('diskimage_build_timeout.yaml')
-        self.useBuilder(configfile, cleanup_interval=0)
-        self.waitForBuild('fake-image', '0000000001', states=(zk.FAILED,))
diff --git a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml b/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
deleted file mode 100644
index 6901903db..000000000
--- a/releasenotes/notes/build-timeout-bb68a1fd24f97a10.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
----
-features:
-  - |
-    A new option (build-timeout) has been added to the builder diskimage
-    configuration to control how long the builder should wait for image
-    builds before giving up. The default is 8 hours.