Revert "Revert "Cleanup down ports""

This reverts commit 7e1b8a7261.

openstacksdk >=0.19.0 fixes the filtering problems that led to all
ports being deleted. However, openstacksdk <0.21.0 has problems with
dogpile.cache, so require 0.21.0 as a minimum.

Change-Id: Id642d074cbb645ced5342dda4a1c89987c91a8fc
Tag: 3.5.0
Author: Ian Wienand
Commit: 0cf8144e8c
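For context, a minimal sketch (not part of this change) of the filtered port
listing that the new minimum version makes dependable: with openstacksdk
>= 0.19.0 the status filter passed to list_ports is honoured, so only DOWN
ports are returned rather than every port in the project. The cloud name
below is a placeholder.

    # Illustrative only; 'example-cloud' is a placeholder clouds.yaml entry.
    import openstack

    conn = openstack.connect(cloud='example-cloud')
    down_ports = conn.list_ports(filters={'status': 'DOWN'})
    for port in down_ports:
        print(port.id, port.status, port.device_owner)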

doc/source/operation.rst (+6, -0)

@@ -329,6 +329,12 @@ Nodepool launcher
    * ready
    * used
 
+.. zuul:stat:: nodepool.provider.<provider>.downPorts
+   :type: counter
+
+   Number of ports in the DOWN state that have been removed automatically
+   in the cleanup resources phase of the OpenStack driver.
+
 .. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
    :type: gauge
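As a rough illustration of how a counter with that key shape is emitted
(nodepool's stats module wraps the python statsd client listed in
requirements.txt below); the host, port and provider name here are
placeholders, not nodepool configuration:

    # Illustrative only: emit the downPorts counter for a made-up provider.
    import statsd

    client = statsd.StatsClient('localhost', 8125)  # placeholder host/port
    client.incr('nodepool.provider.example-provider.downPorts', 2)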
 

nodepool/driver/fake/provider.py (+21, -0)

@@ -32,6 +32,7 @@ class Dummy(object):
     INSTANCE = 'Instance'
     FLAVOR = 'Flavor'
     LOCATION = 'Server.Location'
+    PORT = 'Port'
 
     def __init__(self, kind, **kw):
         self.__kind = kind
@@ -105,6 +106,12 @@ class FakeOpenStackCloud(object):
         self._server_list = []
         self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
             _get_quota()
+        self._down_ports = [
+            Dummy(Dummy.PORT, id='1a', status='DOWN',
+                  device_owner="compute:nova"),
+            Dummy(Dummy.PORT, id='2b', status='DOWN',
+                  device_owner=None),
+        ]
 
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -286,6 +293,20 @@ class FakeOpenStackCloud(object):
             total_ram_used=8192 * len(self._server_list)
         )
 
+    def list_ports(self, filters=None):
+        if filters and filters.get('status') == 'DOWN':
+            return self._down_ports
+        return []
+
+    def delete_port(self, port_id):
+        tmp_ports = []
+        for port in self._down_ports:
+            if port.id != port_id:
+                tmp_ports.append(port)
+            else:
+                self.log.debug("Deleted port ID: %s", port_id)
+        self._down_ports = tmp_ports
+
 
 class FakeUploadFailCloud(FakeOpenStackCloud):
     log = logging.getLogger("nodepool.FakeUploadFailCloud")

nodepool/driver/openstack/provider.py (+78, -1)

@@ -26,6 +26,7 @@ from nodepool.driver import Provider
 from nodepool.driver.utils import QuotaInformation
 from nodepool.nodeutils import iterate_timeout
 from nodepool.task_manager import TaskManager
+from nodepool import stats
 from nodepool import version
 from nodepool import zk
 
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
         self._taskmanager = None
         self._current_nodepool_quota = None
         self._zk = None
+        self._down_ports = set()
+        self._last_port_cleanup = None
+        self._port_cleanup_interval_secs = 180
+        self._statsd = stats.get_client()
 
     def start(self, zk_conn):
         if self._use_taskmanager:
@@ -428,6 +433,21 @@ class OpenStackProvider(Provider):
             **meta)
         return image.id
 
+    def listPorts(self, status=None):
+        '''
+        List known ports.
+
+        :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
+        '''
+        if status:
+            ports = self._client.list_ports(filters={'status': status})
+        else:
+            ports = self._client.list_ports()
+        return ports
+
+    def deletePort(self, port_id):
+        self._client.delete_port(port_id)
+
     def listImages(self):
         return self._client.list_images()
 
@@ -455,7 +475,7 @@
         self.log.debug('Deleting server %s' % server_id)
         self.deleteServer(server_id)
 
-    def cleanupLeakedResources(self):
+    def cleanupLeakedInstances(self):
         '''
         Delete any leaked server instances.
 
@@ -503,6 +523,63 @@
                 node.state = zk.DELETING
                 self._zk.storeNode(node)
 
+    def filterComputePorts(self, ports):
+        '''
+        Return a list of compute ports (or no device owner).
+
+        We are not interested in ports for routers or DHCP.
+        '''
+        ret = []
+        for p in ports:
+            if p.device_owner is None or p.device_owner.startswith("compute:"):
+                ret.append(p)
+        return ret
+
+    def cleanupLeakedPorts(self):
+        if not self._last_port_cleanup:
+            self._last_port_cleanup = time.monotonic()
+            ports = self.listPorts(status='DOWN')
+            ports = self.filterComputePorts(ports)
+            self._down_ports = set([p.id for p in ports])
+            return
+
+        # Return if not enough time has passed between cleanup
+        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
+        if last_check_in_secs <= self._port_cleanup_interval_secs:
+            return
+
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        current_set = set([p.id for p in ports])
+        remove_set = current_set & self._down_ports
+
+        removed_count = 0
+        for port_id in remove_set:
+            try:
+                self.deletePort(port_id)
+            except Exception:
+                self.log.exception("Exception deleting port %s in %s:",
+                                   port_id, self.provider.name)
+            else:
+                removed_count += 1
+                self.log.debug("Removed DOWN port %s in %s",
+                               port_id, self.provider.name)
+
+        if self._statsd and removed_count:
+            key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
+            self._statsd.incr(key, removed_count)
+
+        self._last_port_cleanup = time.monotonic()
+
+        # Rely on OpenStack to tell us the down ports rather than doing our
+        # own set adjustment.
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        self._down_ports = set([p.id for p in ports])
+
+    def cleanupLeakedResources(self):
+        self.cleanupLeakedInstances()
+        self.cleanupLeakedPorts()
         if self.provider.clean_floating_ips:
             self._client.delete_unattached_floating_ips()
 

nodepool/tests/unit/test_launcher.py (+21, -0)

@@ -1935,3 +1935,24 @@ class TestLauncher(tests.DBTestCase):
 
         while self.zk.client.exists(path):
             time.sleep(.1)
+
+    def test_leaked_port_cleanup(self):
+        configfile = self.setup_config('node.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+        self.waitForNodes('fake-label')
+
+        manager = pool.getProviderManager('fake-provider')
+        down_ports = manager.listPorts(status='DOWN')
+        self.assertEqual(2, len(down_ports))
+        self.log.debug("Down ports: %s", down_ports)
+
+        # Change the port cleanup interval to happen quicker
+        manager._port_cleanup_interval_secs = 2
+        while manager.listPorts(status='DOWN'):
+            time.sleep(1)
+
+        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
+                                value='2', kind='c')

releasenotes/notes/port-cleanup-667d476437f03358.yaml (+7, -0)

@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added a new routine to the OpenStack driver cleanup resources phase that
+    will remove any ports reported to be in the DOWN state. Ports will have
+    to be seen as DOWN for at least three minutes before they will be removed.
+    The number of ports removed will be reported to statsd.
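A small sketch of the rule behind the three-minute guarantee, restated
outside the driver (the real logic is cleanupLeakedPorts above; the port IDs
here are made up): a port is only deleted if it appears in two DOWN listings
taken at least the cleanup interval (180 seconds) apart.

    # Illustrative only: intersect two DOWN-port snapshots taken an interval apart.
    previous_snapshot = {'1a', '2b'}   # DOWN port IDs recorded on the last pass
    current_snapshot = {'2b', '3c'}    # DOWN port IDs seen on this pass
    to_remove = current_snapshot & previous_snapshot
    print(to_remove)                   # {'2b'}: DOWN for the whole interval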

requirements.txt (+2, -3)

@@ -6,9 +6,8 @@ python-daemon>=2.0.4,<2.1.0
 extras
 statsd>=3.0
 PrettyTable>=0.6,<0.8
-# openstacksdk 0.20.0 exposes a bug in keystoneauth. The pin can be removed
-# once keystoneauth1 has made a release
-openstacksdk>=0.17.2,!=0.18.0,!=0.20.0
+# openstacksdk before 0.21.0 has issues with dogpile.cache
+openstacksdk>=0.21.0
 diskimage-builder>=2.0.0
 voluptuous
 kazoo
