
Merge "Revert "Revert "Cleanup down ports"""

tags/3.5.0
Zuul 10 months ago
parent
commit
78d2476769
6 changed files with 135 additions and 4 deletions
  1. +6 -0 doc/source/operation.rst
  2. +21 -0 nodepool/driver/fake/provider.py
  3. +78 -1 nodepool/driver/openstack/provider.py
  4. +21 -0 nodepool/tests/unit/test_launcher.py
  5. +7 -0 releasenotes/notes/port-cleanup-667d476437f03358.yaml
  6. +2 -3 requirements.txt

+6 -0 doc/source/operation.rst

@@ -329,6 +329,12 @@ Nodepool launcher
    * ready
    * used
 
+.. zuul:stat:: nodepool.provider.<provider>.downPorts
+   :type: counter
+
+   Number of ports in the DOWN state that have been removed automatically
+   in the cleanup resources phase of the OpenStack driver.
+
 .. zuul:stat:: nodepool.provider.<provider>.nodes.<state>
    :type: gauge
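
For reference, the counter documented above is emitted through the plain `statsd` client interface that nodepool already pulls in via requirements.txt. A minimal sketch of producing such a counter; the host, port, and provider name are assumptions for illustration, not part of this change:

```python
# Sketch only: emitting a nodepool.provider.<provider>.downPorts-style
# counter with the `statsd` library. 'localhost', 8125, and
# 'example-provider' are assumed values.
import statsd

client = statsd.StatsClient('localhost', 8125)
client.incr('nodepool.provider.example-provider.downPorts', 2)
```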


+21 -0 nodepool/driver/fake/provider.py

@@ -32,6 +32,7 @@ class Dummy(object):
     INSTANCE = 'Instance'
     FLAVOR = 'Flavor'
     LOCATION = 'Server.Location'
+    PORT = 'Port'
 
     def __init__(self, kind, **kw):
         self.__kind = kind
@@ -105,6 +106,12 @@ class FakeOpenStackCloud(object):
         self._server_list = []
         self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
             _get_quota()
+        self._down_ports = [
+            Dummy(Dummy.PORT, id='1a', status='DOWN',
+                  device_owner="compute:nova"),
+            Dummy(Dummy.PORT, id='2b', status='DOWN',
+                  device_owner=None),
+        ]
 
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
@@ -286,6 +293,20 @@ class FakeOpenStackCloud(object):
             total_ram_used=8192 * len(self._server_list)
         )
 
+    def list_ports(self, filters=None):
+        if filters and filters.get('status') == 'DOWN':
+            return self._down_ports
+        return []
+
+    def delete_port(self, port_id):
+        tmp_ports = []
+        for port in self._down_ports:
+            if port.id != port_id:
+                tmp_ports.append(port)
+            else:
+                self.log.debug("Deleted port ID: %s", port_id)
+        self._down_ports = tmp_ports
+
 
 class FakeUploadFailCloud(FakeOpenStackCloud):
     log = logging.getLogger("nodepool.FakeUploadFailCloud")
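
A hypothetical direct use of the fake cloud's new port stubs, assuming a bare no-argument instantiation (the real tests go through the provider manager instead, as in test_launcher.py below):

```python
# Sketch only: exercising FakeOpenStackCloud's port stubs directly.
# Constructing it with no arguments is an assumption for illustration.
from nodepool.driver.fake.provider import FakeOpenStackCloud

cloud = FakeOpenStackCloud()

down = cloud.list_ports(filters={'status': 'DOWN'})
assert [p.id for p in down] == ['1a', '2b']  # the two seeded DOWN ports

cloud.delete_port('1a')
down = cloud.list_ports(filters={'status': 'DOWN'})
assert [p.id for p in down] == ['2b']        # '1a' is gone
```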

+78 -1 nodepool/driver/openstack/provider.py

@@ -26,6 +26,7 @@ from nodepool.driver import Provider
 from nodepool.driver.utils import QuotaInformation
 from nodepool.nodeutils import iterate_timeout
 from nodepool.task_manager import TaskManager
+from nodepool import stats
 from nodepool import version
 from nodepool import zk
 
@@ -50,6 +51,10 @@ class OpenStackProvider(Provider):
         self._taskmanager = None
         self._current_nodepool_quota = None
         self._zk = None
+        self._down_ports = set()
+        self._last_port_cleanup = None
+        self._port_cleanup_interval_secs = 180
+        self._statsd = stats.get_client()
 
     def start(self, zk_conn):
         if self._use_taskmanager:
@@ -430,6 +435,21 @@ class OpenStackProvider(Provider):
                                           **meta)
         return image.id
 
+    def listPorts(self, status=None):
+        '''
+        List known ports.
+
+        :param str status: A valid port status. E.g., 'ACTIVE' or 'DOWN'.
+        '''
+        if status:
+            ports = self._client.list_ports(filters={'status': status})
+        else:
+            ports = self._client.list_ports()
+        return ports
+
+    def deletePort(self, port_id):
+        self._client.delete_port(port_id)
+
     def listImages(self):
         return self._client.list_images()
 
@@ -457,7 +477,7 @@ class OpenStackProvider(Provider):
             self.log.debug('Deleting server %s' % server_id)
             self.deleteServer(server_id)
 
-    def cleanupLeakedResources(self):
+    def cleanupLeakedInstances(self):
         '''
         Delete any leaked server instances.
 
@@ -505,6 +525,63 @@ class OpenStackProvider(Provider):
                 node.state = zk.DELETING
                 self._zk.storeNode(node)
 
+    def filterComputePorts(self, ports):
+        '''
+        Return a list of compute ports (or no device owner).
+
+        We are not interested in ports for routers or DHCP.
+        '''
+        ret = []
+        for p in ports:
+            if p.device_owner is None or p.device_owner.startswith("compute:"):
+                ret.append(p)
+        return ret
+
+    def cleanupLeakedPorts(self):
+        if not self._last_port_cleanup:
+            self._last_port_cleanup = time.monotonic()
+            ports = self.listPorts(status='DOWN')
+            ports = self.filterComputePorts(ports)
+            self._down_ports = set([p.id for p in ports])
+            return
+
+        # Return if not enough time has passed between cleanup
+        last_check_in_secs = int(time.monotonic() - self._last_port_cleanup)
+        if last_check_in_secs <= self._port_cleanup_interval_secs:
+            return
+
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        current_set = set([p.id for p in ports])
+        remove_set = current_set & self._down_ports
+
+        removed_count = 0
+        for port_id in remove_set:
+            try:
+                self.deletePort(port_id)
+            except Exception:
+                self.log.exception("Exception deleting port %s in %s:",
+                                   port_id, self.provider.name)
+            else:
+                removed_count += 1
+                self.log.debug("Removed DOWN port %s in %s",
+                               port_id, self.provider.name)
+
+        if self._statsd and removed_count:
+            key = 'nodepool.provider.%s.downPorts' % (self.provider.name)
+            self._statsd.incr(key, removed_count)
+
+        self._last_port_cleanup = time.monotonic()
+
+        # Rely on OpenStack to tell us the down ports rather than doing our
+        # own set adjustment.
+        ports = self.listPorts(status='DOWN')
+        ports = self.filterComputePorts(ports)
+        self._down_ports = set([p.id for p in ports])
+
+    def cleanupLeakedResources(self):
+        self.cleanupLeakedInstances()
+        self.cleanupLeakedPorts()
+        if self.provider.clean_floating_ips:
+            self._client.delete_unattached_floating_ips()
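
Note that cleanupLeakedPorts deliberately needs two passes before it deletes anything: a port is removed only if it is reported DOWN on two scans at least `_port_cleanup_interval_secs` (180 seconds) apart, which is where the release note's "three minutes" comes from. A small illustration of the set intersection, using made-up port IDs:

```python
# Sketch only: the two-pass check in cleanupLeakedPorts, with assumed IDs.
previous_down = {'1a', '2b'}   # ports seen DOWN on the previous pass
current_down = {'2b', '3c'}    # ports seen DOWN >= 180 seconds later

to_remove = current_down & previous_down
assert to_remove == {'2b'}     # '3c' must stay DOWN for another interval
```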


+21 -0 nodepool/tests/unit/test_launcher.py

@@ -1935,3 +1935,24 @@ class TestLauncher(tests.DBTestCase):
 
         while self.zk.client.exists(path):
             time.sleep(.1)
+
+    def test_leaked_port_cleanup(self):
+        configfile = self.setup_config('node.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.cleanup_interval = 1
+        pool.start()
+        self.waitForNodes('fake-label')
+
+        manager = pool.getProviderManager('fake-provider')
+        down_ports = manager.listPorts(status='DOWN')
+        self.assertEqual(2, len(down_ports))
+        self.log.debug("Down ports: %s", down_ports)
+
+        # Change the port cleanup interval to happen quicker
+        manager._port_cleanup_interval_secs = 2
+        while manager.listPorts(status='DOWN'):
+            time.sleep(1)
+
+        self.assertReportedStat('nodepool.provider.fake-provider.downPorts',
+                                value='2', kind='c')

+7 -0 releasenotes/notes/port-cleanup-667d476437f03358.yaml

@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Added a new routine to the OpenStack driver cleanup resources phase that
+    will remove any ports reported to be in the DOWN state. Ports will have
+    to be seen as DOWN for at least three minutes before they will be removed.
+    The number of ports removed will be reported to statsd.

+2 -3 requirements.txt

@@ -6,9 +6,8 @@ python-daemon>=2.0.4,<2.1.0
 extras
 statsd>=3.0
 PrettyTable>=0.6,<0.8
-# openstacksdk 0.20.0 exposes a bug in keystoneauth. The pin can be removed
-# once keystoneauth1 has made a release
-openstacksdk>=0.17.2,!=0.18.0,!=0.20.0
+# openstacksdk before 0.21.0 has issues with dogpile.cache
+openstacksdk>=0.21.0
 diskimage-builder>=2.0.0
 voluptuous
 kazoo
