Add port-cleanup-interval config option

There are two edge cases where the port cleanup logic is too
aggressive. This change addresses both of them in one commit:

* Some providers might spawn instances very slowly. In the past this was
  handled by hardcoding the timeout to 10 minutes. This change allows a
  user to tweak the timeout in the config.
* In the esoteric combination of using Ironic without the Ironic Neutron
  agent, it is normal for ports to remain DOWN indefinitely. Setting the
  timeout to 0 works around that edge case (a short sketch of both
  settings follows below).

Change-Id: I120d79c4b5f209bb1bd9907db172f94f29b9cb5d
Author: Jan Gutter, 2019-10-07 14:58:47 +02:00
parent d2f9c53959
commit 6789c4b618
6 changed files with 99 additions and 10 deletions
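For illustration, the two edge cases described in the commit message map to provider settings roughly as follows. This is a minimal sketch: the provider names and interval values are hypothetical, and only the port-cleanup-interval key comes from this change.

# Hypothetical provider settings for the two edge cases; everything
# except the 'port-cleanup-interval' key is a placeholder.
PROVIDERS = {
    # Provider that spawns instances slowly: allow more time before a
    # DOWN port is treated as leaked (the default is 600 seconds).
    "slow-cloud": {"port-cleanup-interval": 1800},
    # Ironic without the Ironic Neutron agent: DOWN ports are expected,
    # so disable port cleanup entirely.
    "ironic-cloud": {"port-cleanup-interval": 0},
}

for name, cfg in PROVIDERS.items():
    interval = cfg["port-cleanup-interval"]
    if interval == 0:
        print(name, "port cleanup disabled")
    else:
        print(name, "ports cleaned after", interval, "seconds in DOWN")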


@@ -577,6 +577,15 @@ Selecting the OpenStack driver adds the following options to the
      the OpenStack project and will attempt to clean unattached
      floating ips that may have leaked around restarts.

+   .. attr:: port-cleanup-interval
+      :type: int seconds
+      :default: 600
+
+      If greater than 0, nodepool will assume it is the only user of the
+      OpenStack project and will attempt to clean ports in `DOWN` state after
+      the cleanup interval has passed. This value can be reduced if the
+      instance spawn time on the provider is reliably quicker.
+
   .. attr:: diskimages
      :type: list
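As a rough illustration of the guidance in the last sentence of the new attribute's description, the interval can be sized against the provider's observed spawn time. This is an assumption on my part, not something the documentation prescribes, and the numbers are placeholders.

# Hypothetical sizing heuristic: keep the cleanup interval comfortably
# above the slowest expected instance spawn, so ports that simply have
# not attached to an instance yet are not reaped.
observed_spawn_seconds = 240   # measured on the provider (placeholder)
safety_factor = 2
port_cleanup_interval = max(60, observed_spawn_seconds * safety_factor)
print(port_cleanup_interval)   # 480: lower than the 600s default, which
                               # the docs allow for reliably quicker clouds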


@@ -230,6 +230,7 @@ class OpenStackProviderConfig(ProviderConfig):
        self.boot_timeout = None
        self.launch_timeout = None
        self.clean_floating_ips = None
+        self.port_cleanup_interval = None
        self.diskimages = {}
        self.cloud_images = {}
        self.hostname_format = None
@@ -246,6 +247,8 @@ class OpenStackProviderConfig(ProviderConfig):
                other.boot_timeout == self.boot_timeout and
                other.launch_timeout == self.launch_timeout and
                other.clean_floating_ips == self.clean_floating_ips and
+                other.port_cleanup_interval ==
+                self.port_cleanup_interval and
                other.diskimages == self.diskimages and
                other.cloud_images == self.cloud_images)
        return False
@@ -277,6 +280,10 @@ class OpenStackProviderConfig(ProviderConfig):
        self.launch_timeout = self.provider.get('launch-timeout', 3600)
        self.launch_retries = self.provider.get('launch-retries', 3)
        self.clean_floating_ips = self.provider.get('clean-floating-ips')
+        self.port_cleanup_interval = self.provider.get(
+            'port-cleanup-interval',
+            600
+        )
        self.hostname_format = self.provider.get(
            'hostname-format',
            '{label.name}-{provider.name}-{node.id}'
@@ -414,6 +421,7 @@ class OpenStackProviderConfig(ProviderConfig):
            'hostname-format': str,
            'image-name-format': str,
            'clean-floating-ips': bool,
+            'port-cleanup-interval': int,
            'pools': [pool],
            'diskimages': [provider_diskimage],
            'cloud-images': [provider_cloud_images],
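The `get()` call above keeps the historical behaviour when the option is absent. A minimal sketch of that fallback, using plain dicts in place of the real provider config (the dict contents are hypothetical):

provider_with_option = {"port-cleanup-interval": 120}
provider_without_option = {}

for cfg in (provider_with_option, provider_without_option):
    # Mirrors self.provider.get('port-cleanup-interval', 600) above:
    # omitting the key keeps the old hardcoded 10-minute default.
    print(cfg.get("port-cleanup-interval", 600))
# prints 120, then 600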


@@ -51,10 +51,6 @@ class OpenStackProvider(Provider):
        self._zk = None
        self._down_ports = set()
        self._last_port_cleanup = None
-        # Set this long enough to avoid deleting a port which simply
-        # hasn't yet been attached to an instance which is being
-        # created.
-        self._port_cleanup_interval_secs = 600
        self._statsd = stats.get_client()

    def start(self, zk_conn):
@@ -554,7 +550,7 @@
        # Return if not enough time has passed between cleanup
        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
-        if last_check_in_secs <= self._port_cleanup_interval_secs:
+        if last_check_in_secs <= self.provider.port_cleanup_interval:
            return

        ports = self.listPorts(status='DOWN')
@@ -588,7 +584,8 @@
    def cleanupLeakedResources(self):
        self.cleanupLeakedInstances()
-        self.cleanupLeakedPorts()
+        if self.provider.port_cleanup_interval:
+            self.cleanupLeakedPorts()
        if self.provider.clean_floating_ips:
            self._client.delete_unattached_floating_ips()
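Taken together, the two hunks above gate port cleanup roughly as follows. This is a simplified sketch, not the actual OpenStackProvider code; the class name and the trailing comment stand in for the real port listing and deletion logic.

import time


class PortCleanupSketch:
    """Simplified model of the gating behaviour added above."""

    def __init__(self, port_cleanup_interval):
        self.port_cleanup_interval = port_cleanup_interval
        self._last_port_cleanup = None

    def cleanup_leaked_resources(self):
        # New: an interval of 0 skips port cleanup entirely.
        if self.port_cleanup_interval:
            self.cleanup_leaked_ports()

    def cleanup_leaked_ports(self):
        if self._last_port_cleanup is None:
            self._last_port_cleanup = time.monotonic()
            return
        # Rate-limit: only clean once the configured interval has passed
        # (previously this compared against a hardcoded 600 seconds).
        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
        if last_check_in_secs <= self.port_cleanup_interval:
            return
        self._last_port_cleanup = time.monotonic()
        # ... list ports in DOWN state and delete the leaked ones ...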


@@ -0,0 +1,54 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
build-log-retention: 1

zookeeper-servers:
  - host: {zookeeper_host}
    port: {zookeeper_port}
    chroot: {zookeeper_chroot}

labels:
  - name: fake-label
    min-ready: 1

providers:
  - name: fake-provider
    cloud: fake
    driver: fake
    region-name: fake-region
    rate: 0.0001
    port-cleanup-interval: 2
    diskimages:
      - name: fake-image
        meta:
          key: value
          key2: value
    pools:
      - name: main
        max-servers: 96
        node-attributes:
          key1: value1
          key2: value2
        availability-zones:
          - az1
        networks:
          - net-name
        labels:
          - name: fake-label
            diskimage: fake-image
            min-ram: 8192
            flavor-name: 'Fake'

diskimages:
  - name: fake-image
    elements:
      - fedora
      - vm
    release: 21
    dib-cmd: nodepool/tests/fake-image-create
    env-vars:
      TMPDIR: /opt/dib_tmp
      DIB_IMAGE_CACHE: /opt/dib_cache
      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2


@@ -33,6 +33,7 @@ providers:
    boot-timeout: 120
    max-concurrency: 10
    launch-retries: 3
+    port-cleanup-interval: 600
    rate: 1
    diskimages:
      - name: trusty
@@ -75,6 +76,7 @@ providers:
    region-name: 'chocolate'
    boot-timeout: 120
    rate: 0.001
+    port-cleanup-interval: 0
    diskimages:
      - name: trusty
        pause: False


@@ -2109,10 +2109,29 @@ class TestLauncher(tests.DBTestCase):
        self.assertEqual(2, len(down_ports))
        self.log.debug("Down ports: %s", down_ports)

-        # Change the port cleanup interval to happen quicker
-        manager._port_cleanup_interval_secs = 2
-        while manager.listPorts(status='DOWN'):
-            time.sleep(1)
+        # Second config decreases cleanup interval to 2 seconds
+        self.replace_config(configfile, 'cleanup-port.yaml')
+        oldmanager = manager
+        manager = pool.getProviderManager('fake-provider')
+        for _ in iterate_timeout(10, Exception, 'assert config updated'):
+            try:
+                self.assertNotEqual(manager, oldmanager)
+                break
+            except AssertionError:
+                # config still hasn't updated, retry
+                manager = pool.getProviderManager('fake-provider')
+
+        # Reset the client as a new fake client will have been
+        # created.
+        manager.resetClient()
+        for _ in iterate_timeout(4, Exception, 'assert ports are cleaned'):
+            try:
+                down_ports = manager.listPorts(status='DOWN')
+                self.assertEqual(0, len(down_ports))
+                break
+            except AssertionError:
+                # ports not cleaned up yet, retry
+                pass

        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
                                value='2', kind='c')
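The test relies on an `iterate_timeout` helper to poll until the assertions pass. A generator with compatible behaviour might look like this; it is an approximation for illustration, not nodepool's actual implementation.

import time


def iterate_timeout(max_seconds, exc, purpose, interval=1):
    """Yield an attempt counter until the deadline, then raise exc."""
    start = time.monotonic()
    count = 0
    while time.monotonic() < start + max_seconds:
        count += 1
        yield count
        time.sleep(interval)
    raise exc("Timed out waiting for: %s" % purpose)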