Add port-cleanup-interval config option

There are two edge cases where the port cleanup logic is too
aggressive. This change addresses both of them in one commit:

* Some providers might spawn instances very slowly. In the past this was
  handled by hardcoding the timeout to 10 minutes. This change allows a
  user to tweak the timeout in the config.
* In the esoteric combination of using Ironic without the Ironic Neutron
  agent, it is normal for ports to remain DOWN indefinitely. Setting the
  timeout to 0 works around that edge case (a short sketch of both
  settings follows below).

Change-Id: I120d79c4b5f209bb1bd9907db172f94f29b9cb5d
Author: Jan Gutter, 2019-10-07 14:58:47 +02:00
parent d2f9c53959
commit 6789c4b618
6 changed files with 99 additions and 10 deletions
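For illustration, the two edge cases described in the commit message map to provider settings roughly as follows. This is a minimal sketch: the provider names and interval values are hypothetical, and only the port-cleanup-interval key comes from this change.

# Hypothetical provider settings for the two edge cases; everything
# except the 'port-cleanup-interval' key is a placeholder.
PROVIDERS = {
    # Provider that spawns instances slowly: allow more time before a
    # DOWN port is treated as leaked (the default is 600 seconds).
    "slow-cloud": {"port-cleanup-interval": 1800},
    # Ironic without the Ironic Neutron agent: DOWN ports are expected,
    # so disable port cleanup entirely.
    "ironic-cloud": {"port-cleanup-interval": 0},
}

for name, cfg in PROVIDERS.items():
    interval = cfg["port-cleanup-interval"]
    if interval == 0:
        print(name, "port cleanup disabled")
    else:
        print(name, "ports cleaned after", interval, "seconds in DOWN")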


@@ -577,6 +577,15 @@ Selecting the OpenStack driver adds the following options to the
      the OpenStack project and will attempt to clean unattached
      floating ips that may have leaked around restarts.

+   .. attr:: port-cleanup-interval
+      :type: int seconds
+      :default: 600
+
+      If greater than 0, nodepool will assume it is the only user of the
+      OpenStack project and will attempt to clean ports in `DOWN` state after
+      the cleanup interval has passed. This value can be reduced if the
+      instance spawn time on the provider is reliably quicker.
+
   .. attr:: diskimages
      :type: list
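As a rough illustration of the guidance in the last sentence of the new attribute's description, the interval can be sized against the provider's observed spawn time. This is an assumption on my part, not something the documentation prescribes, and the numbers are placeholders.

# Hypothetical sizing heuristic: keep the cleanup interval comfortably
# above the slowest expected instance spawn, so ports that simply have
# not attached to an instance yet are not reaped.
observed_spawn_seconds = 240   # measured on the provider (placeholder)
safety_factor = 2
port_cleanup_interval = max(60, observed_spawn_seconds * safety_factor)
print(port_cleanup_interval)   # 480: lower than the 600s default, which
                               # the docs allow for reliably quicker clouds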


@@ -230,6 +230,7 @@ class OpenStackProviderConfig(ProviderConfig):
        self.boot_timeout = None
        self.launch_timeout = None
        self.clean_floating_ips = None
+        self.port_cleanup_interval = None
        self.diskimages = {}
        self.cloud_images = {}
        self.hostname_format = None
@@ -246,6 +247,8 @@ class OpenStackProviderConfig(ProviderConfig):
                other.boot_timeout == self.boot_timeout and
                other.launch_timeout == self.launch_timeout and
                other.clean_floating_ips == self.clean_floating_ips and
+                other.port_cleanup_interval ==
+                self.port_cleanup_interval and
                other.diskimages == self.diskimages and
                other.cloud_images == self.cloud_images)
        return False
@@ -277,6 +280,10 @@ class OpenStackProviderConfig(ProviderConfig):
        self.launch_timeout = self.provider.get('launch-timeout', 3600)
        self.launch_retries = self.provider.get('launch-retries', 3)
        self.clean_floating_ips = self.provider.get('clean-floating-ips')
+        self.port_cleanup_interval = self.provider.get(
+            'port-cleanup-interval',
+            600
+        )
        self.hostname_format = self.provider.get(
            'hostname-format',
            '{label.name}-{provider.name}-{node.id}'
@@ -414,6 +421,7 @@ class OpenStackProviderConfig(ProviderConfig):
            'hostname-format': str,
            'image-name-format': str,
            'clean-floating-ips': bool,
+            'port-cleanup-interval': int,
            'pools': [pool],
            'diskimages': [provider_diskimage],
            'cloud-images': [provider_cloud_images],
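The `get()` call above keeps the historical behaviour when the option is absent. A minimal sketch of that fallback, using plain dicts in place of the real provider config (the dict contents are hypothetical):

provider_with_option = {"port-cleanup-interval": 120}
provider_without_option = {}

for cfg in (provider_with_option, provider_without_option):
    # Mirrors self.provider.get('port-cleanup-interval', 600) above:
    # omitting the key keeps the old hardcoded 10-minute default.
    print(cfg.get("port-cleanup-interval", 600))
# prints 120, then 600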


@@ -51,10 +51,6 @@ class OpenStackProvider(Provider):
        self._zk = None
        self._down_ports = set()
        self._last_port_cleanup = None
-        # Set this long enough to avoid deleting a port which simply
-        # hasn't yet been attached to an instance which is being
-        # created.
-        self._port_cleanup_interval_secs = 600
        self._statsd = stats.get_client()

    def start(self, zk_conn):
@@ -554,7 +550,7 @@
        # Return if not enough time has passed between cleanup
        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
-        if last_check_in_secs <= self._port_cleanup_interval_secs:
+        if last_check_in_secs <= self.provider.port_cleanup_interval:
            return

        ports = self.listPorts(status='DOWN')
@@ -588,7 +584,8 @@
    def cleanupLeakedResources(self):
        self.cleanupLeakedInstances()
-        self.cleanupLeakedPorts()
+        if self.provider.port_cleanup_interval:
+            self.cleanupLeakedPorts()
        if self.provider.clean_floating_ips:
            self._client.delete_unattached_floating_ips()
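Taken together, the two hunks above gate port cleanup roughly as follows. This is a simplified sketch, not the actual OpenStackProvider code; the class name and the trailing comment stand in for the real port listing and deletion logic.

import time


class PortCleanupSketch:
    """Simplified model of the gating behaviour added above."""

    def __init__(self, port_cleanup_interval):
        self.port_cleanup_interval = port_cleanup_interval
        self._last_port_cleanup = None

    def cleanup_leaked_resources(self):
        # New: an interval of 0 skips port cleanup entirely.
        if self.port_cleanup_interval:
            self.cleanup_leaked_ports()

    def cleanup_leaked_ports(self):
        if self._last_port_cleanup is None:
            self._last_port_cleanup = time.monotonic()
            return
        # Rate-limit: only clean once the configured interval has passed
        # (previously this compared against a hardcoded 600 seconds).
        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
        if last_check_in_secs <= self.port_cleanup_interval:
            return
        self._last_port_cleanup = time.monotonic()
        # ... list ports in DOWN state and delete the leaked ones ...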


@@ -0,0 +1,54 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
build-log-retention: 1

zookeeper-servers:
  - host: {zookeeper_host}
    port: {zookeeper_port}
    chroot: {zookeeper_chroot}

labels:
  - name: fake-label
    min-ready: 1

providers:
  - name: fake-provider
    cloud: fake
    driver: fake
    region-name: fake-region
    rate: 0.0001
    port-cleanup-interval: 2
    diskimages:
      - name: fake-image
        meta:
          key: value
          key2: value
    pools:
      - name: main
        max-servers: 96
        node-attributes:
          key1: value1
          key2: value2
        availability-zones:
          - az1
        networks:
          - net-name
        labels:
          - name: fake-label
            diskimage: fake-image
            min-ram: 8192
            flavor-name: 'Fake'

diskimages:
  - name: fake-image
    elements:
      - fedora
      - vm
    release: 21
    dib-cmd: nodepool/tests/fake-image-create
    env-vars:
      TMPDIR: /opt/dib_tmp
      DIB_IMAGE_CACHE: /opt/dib_cache
      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2


@@ -33,6 +33,7 @@ providers:
    boot-timeout: 120
    max-concurrency: 10
    launch-retries: 3
+    port-cleanup-interval: 600
    rate: 1
    diskimages:
      - name: trusty
@@ -75,6 +76,7 @@ providers:
    region-name: 'chocolate'
    boot-timeout: 120
    rate: 0.001
+    port-cleanup-interval: 0
    diskimages:
      - name: trusty
        pause: False


@@ -2109,10 +2109,29 @@ class TestLauncher(tests.DBTestCase):
        self.assertEqual(2, len(down_ports))
        self.log.debug("Down ports: %s", down_ports)

-        # Change the port cleanup interval to happen quicker
-        manager._port_cleanup_interval_secs = 2
-        while manager.listPorts(status='DOWN'):
-            time.sleep(1)
+        # Second config decreases cleanup interval to 2 seconds
+        self.replace_config(configfile, 'cleanup-port.yaml')
+        oldmanager = manager
+        manager = pool.getProviderManager('fake-provider')
+        for _ in iterate_timeout(10, Exception, 'assert config updated'):
+            try:
+                self.assertNotEqual(manager, oldmanager)
+                break
+            except AssertionError:
+                # config still hasn't updated, retry
+                manager = pool.getProviderManager('fake-provider')
+
+        # Reset the client as a new fake client will have been
+        # created.
+        manager.resetClient()
+        for _ in iterate_timeout(4, Exception, 'assert ports are cleaned'):
+            try:
+                down_ports = manager.listPorts(status='DOWN')
+                self.assertEqual(0, len(down_ports))
+                break
+            except AssertionError:
+                # ports not cleaned up yet, retry
+                pass

        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
                                value='2', kind='c')
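The test relies on an `iterate_timeout` helper to poll until the assertions pass. A generator with compatible behaviour might look like this; it is an approximation for illustration, not nodepool's actual implementation.

import time


def iterate_timeout(max_seconds, exc, purpose, interval=1):
    """Yield an attempt counter until the deadline, then raise exc."""
    start = time.monotonic()
    count = 0
    while time.monotonic() < start + max_seconds:
        count += 1
        yield count
        time.sleep(interval)
    raise exc("Timed out waiting for: %s" % purpose)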