Manage a pool of nodes for a distributed test infrastructure
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

661 lines
24 KiB

# Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
# Copyright (C) 2014 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Common utilities used in testing"""
import glob
import itertools
import kubernetes.config.kube_config
import logging
import os
import random
import select
import string
import socket
import subprocess
import threading
import tempfile
import time
import fixtures
import kazoo.client
import testtools
from nodepool import builder
from nodepool import launcher
from nodepool import webapp
from nodepool import zk
from nodepool.cmd.config_validator import ConfigValidator
from nodepool.nodeutils import iterate_timeout
TRUE_VALUES = ('true', '1', 'yes')
SECOND = 1
ONE_MINUTE = 60 * SECOND
class LoggingPopen(subprocess.Popen):
pass
class ZookeeperServerFixture(fixtures.Fixture):
def _setUp(self):
zk_host = os.environ.get('NODEPOOL_ZK_HOST', 'localhost')
if ':' in zk_host:
host, port = zk_host.split(':')
else:
host = zk_host
port = None
self.zookeeper_host = host
if not port:
self.zookeeper_port = 2281
else:
self.zookeeper_port = int(port)
zk_ca = os.environ.get('NODEPOOL_ZK_CA', None)
if not zk_ca:
zk_ca = os.path.join(os.path.dirname(__file__),
'../../tools/ca/certs/cacert.pem')
self.zookeeper_ca = zk_ca
zk_cert = os.environ.get('NODEPOOL_ZK_CERT', None)
if not zk_cert:
zk_cert = os.path.join(os.path.dirname(__file__),
'../../tools/ca/certs/client.pem')
self.zookeeper_cert = zk_cert
zk_key = os.environ.get('NODEPOOL_ZK_KEY', None)
if not zk_key:
zk_key = os.path.join(os.path.dirname(__file__),
'../../tools/ca/keys/clientkey.pem')
self.zookeeper_key = zk_key
class ChrootedKazooFixture(fixtures.Fixture):
def __init__(self, zookeeper_host, zookeeper_port, zookeeper_ca,
zookeeper_cert, zookeeper_key):
super(ChrootedKazooFixture, self).__init__()
self.zk_args = dict(
hosts='%s:%s' % (zookeeper_host, zookeeper_port),
use_ssl=True,
ca=zookeeper_ca,
certfile=zookeeper_cert,
keyfile=zookeeper_key)
def _setUp(self):
# Make sure the test chroot paths do not conflict
random_bits = ''.join(random.choice(string.ascii_lowercase +
string.ascii_uppercase)
for x in range(8))
rand_test_path = '%s_%s' % (random_bits, os.getpid())
self.zookeeper_chroot = "/nodepool_test/%s" % rand_test_path
# Ensure the chroot path exists and clean up any pre-existing znodes.
_tmp_client = kazoo.client.KazooClient(**self.zk_args)
_tmp_client.start()
if _tmp_client.exists(self.zookeeper_chroot):
_tmp_client.delete(self.zookeeper_chroot, recursive=True)
_tmp_client.ensure_path(self.zookeeper_chroot)
_tmp_client.stop()
_tmp_client.close()
self.addCleanup(self._cleanup)
def _cleanup(self):
'''Remove the chroot path.'''
# Need a non-chroot'ed client to remove the chroot path
_tmp_client = kazoo.client.KazooClient(**self.zk_args)
_tmp_client.start()
_tmp_client.delete(self.zookeeper_chroot, recursive=True)
_tmp_client.stop()
_tmp_client.close()
class StatsdFixture(fixtures.Fixture):
def _setUp(self):
self.running = True
self.thread = threading.Thread(target=self.run)
self.thread.daemon = True
self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
self.sock.bind(('', 0))
self.port = self.sock.getsockname()[1]
self.wake_read, self.wake_write = os.pipe()
self.stats = []
self.thread.start()
self.addCleanup(self._cleanup)
def run(self):
while self.running:
poll = select.poll()
poll.register(self.sock, select.POLLIN)
poll.register(self.wake_read, select.POLLIN)
ret = poll.poll()
for (fd, event) in ret:
if fd == self.sock.fileno():
data = self.sock.recvfrom(1024)
if not data:
return
self.stats.append(data[0])
if fd == self.wake_read:
return
def _cleanup(self):
self.running = False
os.write(self.wake_write, b'1\n')
self.thread.join()
class BaseTestCase(testtools.TestCase):
def setUp(self):
super(BaseTestCase, self).setUp()
test_timeout = os.environ.get('OS_TEST_TIMEOUT', 60)
try:
test_timeout = int(test_timeout)
except ValueError:
# If timeout value is invalid, fail hard.
print("OS_TEST_TIMEOUT set to invalid value"
" defaulting to no timeout")
test_timeout = 0
if test_timeout > 0:
self.useFixture(fixtures.Timeout(test_timeout, gentle=True))
self.useFixture(fixtures.Timeout(test_timeout + 20, gentle=False))
if os.environ.get('OS_STDOUT_CAPTURE') in TRUE_VALUES:
stdout = self.useFixture(fixtures.StringStream('stdout')).stream
self.useFixture(fixtures.MonkeyPatch('sys.stdout', stdout))
if os.environ.get('OS_STDERR_CAPTURE') in TRUE_VALUES:
stderr = self.useFixture(fixtures.StringStream('stderr')).stream
self.useFixture(fixtures.MonkeyPatch('sys.stderr', stderr))
if os.environ.get('OS_LOG_CAPTURE') in TRUE_VALUES:
fs = '%(asctime)s %(levelname)s [%(name)s] %(message)s'
self.useFixture(fixtures.FakeLogger(level=logging.DEBUG,
format=fs))
else:
logging.basicConfig(level=logging.DEBUG)
l = logging.getLogger('kazoo')
l.setLevel(logging.INFO)
l.propagate = False
l = logging.getLogger('stevedore')
l.setLevel(logging.INFO)
l.propagate = False
self.useFixture(fixtures.NestedTempfile())
self.subprocesses = []
def LoggingPopenFactory(*args, **kw):
p = LoggingPopen(*args, **kw)
self.subprocesses.append(p)
return p
self.statsd = StatsdFixture()
self.useFixture(self.statsd)
# note, use 127.0.0.1 rather than localhost to avoid getting ipv6
# see: https://github.com/jsocol/pystatsd/issues/61
os.environ['STATSD_HOST'] = '127.0.0.1'
os.environ['STATSD_PORT'] = str(self.statsd.port)
self.useFixture(fixtures.MonkeyPatch('subprocess.Popen',
LoggingPopenFactory))
self.setUpFakes()
self.addCleanup(self._cleanup)
def _cleanup(self):
# This is a hack to cleanup kubernetes temp files during test runs.
# The kube_config maintains a global dict of temporary files. During
# running the tests those can get deleted during the cleanup phase of
# the tests without kube_config knowing about this so forcefully tell
# kube_config to clean this up.
kubernetes.config.kube_config._cleanup_temp_files()
def setUpFakes(self):
clouds_path = os.path.join(os.path.dirname(__file__),
'fixtures', 'clouds.yaml')
self.useFixture(fixtures.MonkeyPatch(
'openstack.config.loader.CONFIG_FILES', [clouds_path]))
def wait_for_threads(self):
# Wait until all transient threads (node launches, deletions,
# etc.) are all complete. Whitelist any long-running threads.
whitelist = ['MainThread',
'NodePool',
'NodePool Builder',
'fake-provider',
'fake-provider1',
'fake-provider2',
'fake-provider3',
'CleanupWorker',
'DeletedNodeWorker',
'ServerListWatcher',
'StatsWorker',
'pydevd.CommandThread',
'pydevd.Reader',
'pydevd.Writer',
]
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Transient threads to finish",
interval=0.1):
done = True
for t in threading.enumerate():
if t.name.startswith("Thread-"):
# Kazoo
continue
if t.name.startswith("worker "):
# paste web server
continue
if t.name.startswith("UploadWorker"):
continue
if t.name.startswith("BuildWorker"):
continue
if t.name.startswith("CleanupWorker"):
continue
if t.name.startswith("PoolWorker"):
continue
if t.name not in whitelist:
done = False
if done:
return
def assertReportedStat(self, key, value=None, kind=None):
"""Check statsd output
Check statsd return values. A ``value`` should specify a
``kind``, however a ``kind`` may be specified without a
``value`` for a generic match. Leave both empy to just check
for key presence.
:arg str key: The statsd key
:arg str value: The expected value of the metric ``key``
:arg str kind: The expected type of the metric ``key`` For example
- ``c`` counter
- ``g`` gauge
- ``ms`` timing
- ``s`` set
"""
if value:
self.assertNotEqual(kind, None)
for _ in iterate_timeout(5 * SECOND, Exception,
"Find statsd event",
interval=0.1):
# Note our fake statsd just queues up results in a queue.
# We just keep going through them until we find one that
# matches, or fail out. If statsd pipelines are used,
# large single packets are sent with stats separated by
# newlines; thus we first flatten the stats out into
# single entries.
stats = itertools.chain.from_iterable(
[s.decode('utf-8').split('\n') for s in self.statsd.stats])
for stat in stats:
k, v = stat.split(':')
if key == k:
if kind is None:
# key with no qualifiers is found
return True
s_value, s_kind = v.split('|')
# if no kind match, look for other keys
if kind != s_kind:
continue
if value:
# special-case value|ms because statsd can turn
# timing results into float of indeterminate
# length, hence foiling string matching.
if kind == 'ms':
if float(value) == float(s_value):
return True
if value == s_value:
return True
# otherwise keep looking for other matches
continue
# this key matches
return True
raise Exception("Key %s not found in reported stats" % key)
class BuilderFixture(fixtures.Fixture):
log = logging.getLogger("tests.BuilderFixture")
def __init__(self, configfile, cleanup_interval, securefile=None,
num_uploaders=1):
super(BuilderFixture, self).__init__()
self.configfile = configfile
self.securefile = securefile
self.cleanup_interval = cleanup_interval
self.builder = None
self.num_uploaders = num_uploaders
def setUp(self):
super(BuilderFixture, self).setUp()
self.builder = builder.NodePoolBuilder(
self.configfile, secure_path=self.securefile,
num_uploaders=self.num_uploaders)
self.builder.cleanup_interval = self.cleanup_interval
self.builder.build_interval = .1
self.builder.upload_interval = .1
self.builder.start()
self.addCleanup(self.cleanup)
def cleanup(self):
# The NodePoolBuilder.stop() method does not intentionally stop the
# upload workers for reasons documented in that method. But we can
# safely do so in tests.
for worker in self.builder._upload_workers:
worker.shutdown()
worker.join()
self.log.debug("Stopped worker %s", worker.name)
self.builder.stop()
class DBTestCase(BaseTestCase):
def setUp(self):
super(DBTestCase, self).setUp()
self.log = logging.getLogger("tests")
self.setupZK()
def setup_config(self, filename, images_dir=None, **kw):
if images_dir is None:
images_dir = fixtures.TempDir()
self.useFixture(images_dir)
build_log_dir = fixtures.TempDir()
self.useFixture(build_log_dir)
format_dict = dict(
images_dir=images_dir.path,
build_log_dir=build_log_dir.path,
zookeeper_host=self.zookeeper_host,
zookeeper_port=self.zookeeper_port,
zookeeper_chroot=self.zookeeper_chroot,
zookeeper_ca=self.zookeeper_ca,
zookeeper_cert=self.zookeeper_cert,
zookeeper_key=self.zookeeper_key
)
format_dict.update(kw)
if filename.startswith('/'):
path = filename
else:
configfile = os.path.join(os.path.dirname(__file__),
'fixtures', filename)
(fd, path) = tempfile.mkstemp()
with open(configfile, 'rb') as conf_fd:
config = conf_fd.read().decode('utf8')
data = config.format(**format_dict)
os.write(fd, data.encode('utf8'))
os.close(fd)
self._config_images_dir = images_dir
self._config_build_log_dir = build_log_dir
validator = ConfigValidator(path)
ret = validator.validate()
if ret != 0:
raise ValueError("Config file %s could not be validated" % path)
return path
def replace_config(self, configfile, filename):
self.log.debug("Replacing config with %s", filename)
new_configfile = self.setup_config(filename, self._config_images_dir)
os.rename(new_configfile, configfile)
def setup_secure(self, filename):
# replace entries in secure.conf
configfile = os.path.join(os.path.dirname(__file__),
'fixtures', filename)
(fd, path) = tempfile.mkstemp()
with open(configfile, 'rb') as conf_fd:
config = conf_fd.read().decode('utf8')
data = config.format(
zookeeper_host=self.zookeeper_host,
zookeeper_port=self.zookeeper_port,
zookeeper_chroot=self.zookeeper_chroot,
zookeeper_ca=self.zookeeper_ca,
zookeeper_cert=self.zookeeper_cert,
zookeeper_key=self.zookeeper_key)
os.write(fd, data.encode('utf8'))
os.close(fd)
return path
def wait_for_config(self, pool):
for x in range(300):
if pool.config is not None:
return
time.sleep(0.1)
def waitForImage(self, provider_name, image_name, ignore_list=None):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Image upload to be ready",
interval=1):
self.wait_for_threads()
image = self.zk.getMostRecentImageUpload(image_name, provider_name)
if image:
if ignore_list and image not in ignore_list:
break
elif not ignore_list:
break
self.wait_for_threads()
return image
def waitForUploadRecordDeletion(self, provider_name, image_name,
build_id, upload_id):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Image upload record deletion",
interval=1):
self.wait_for_threads()
uploads = self.zk.getUploads(image_name, build_id, provider_name)
if not uploads or upload_id not in [u.id for u in uploads]:
break
self.wait_for_threads()
def waitForImageDeletion(self, provider_name, image_name, match=None):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Image upload deletion",
interval=1):
self.wait_for_threads()
image = self.zk.getMostRecentImageUpload(image_name, provider_name)
if not image or (match and image != match):
break
self.wait_for_threads()
def waitForBuild(self, image_name, build_id, states=None):
if states is None:
states = (zk.READY,)
base = "-".join([image_name, build_id])
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Image build record to reach state",
interval=1):
self.wait_for_threads()
build = self.zk.getBuild(image_name, build_id)
if build and build.state in states:
break
# We should only expect a dib manifest with a successful build.
while build.state == zk.READY:
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if files:
break
time.sleep(1)
self.wait_for_threads()
return build
def waitForBuildDeletion(self, image_name, build_id):
base = "-".join([image_name, build_id])
for _ in iterate_timeout(ONE_MINUTE, Exception,
"DIB build files deletion",
interval=1):
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)
if not files:
break
for _ in iterate_timeout(ONE_MINUTE, Exception,
"DIB build file deletion leaks",
interval=1):
self.wait_for_threads()
# Now, check the disk to ensure we didn't leak any files.
matches = glob.glob('%s/%s.*' % (self._config_images_dir.path,
base))
if not matches:
break
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Image build record deletion",
interval=1):
self.wait_for_threads()
build = self.zk.getBuild(image_name, build_id)
if not build:
break
self.wait_for_threads()
def waitForNodeDeletion(self, node):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Node record deletion",
interval=1):
exists = False
for n in self.zk.nodeIterator():
if node.id == n.id:
exists = True
break
if not exists:
break
def waitForInstanceDeletion(self, manager, instance_id):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Cloud instance deletion",
interval=1):
servers = manager.listNodes()
if not (instance_id in [s.id for s in servers]):
break
def waitForNodeRequestLockDeletion(self, request_id):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Node request lock deletion",
interval=1):
exists = False
for lock_id in self.zk.getNodeRequestLockIDs():
if request_id == lock_id:
exists = True
break
if not exists:
break
def waitForNodes(self, label, count=1):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Ready nodes",
interval=1):
self.wait_for_threads()
ready_nodes = self.zk.getReadyNodesOfTypes([label])
if label in ready_nodes and len(ready_nodes[label]) == count:
break
self.wait_for_threads()
return ready_nodes[label]
def waitForNodeRequest(self, req, states=None):
'''
Wait for a node request to transition to a final state.
'''
if states is None:
states = (zk.FULFILLED, zk.FAILED)
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Node request state transition",
interval=1):
req = self.zk.getNodeRequest(req.id)
if req.state in states:
break
return req
def useNodepool(self, *args, **kwargs):
secure_conf = kwargs.pop('secure_conf', None)
args = (secure_conf,) + args
pool = launcher.NodePool(*args, **kwargs)
pool.cleanup_interval = .5
pool.delete_interval = .5
self.addCleanup(pool.stop)
return pool
def useWebApp(self, *args, **kwargs):
app = webapp.WebApp(*args, **kwargs)
self.addCleanup(app.stop)
return app
def useBuilder(self, configfile, securefile=None, cleanup_interval=.5,
num_uploaders=1):
builder_fixture = self.useFixture(
BuilderFixture(configfile, cleanup_interval, securefile,
num_uploaders)
)
return builder_fixture.builder
def setupZK(self):
f = ZookeeperServerFixture()
self.useFixture(f)
self.zookeeper_host = f.zookeeper_host
self.zookeeper_port = f.zookeeper_port
self.zookeeper_ca = f.zookeeper_ca
self.zookeeper_cert = f.zookeeper_cert
self.zookeeper_key = f.zookeeper_key
kz_fxtr = self.useFixture(ChrootedKazooFixture(
self.zookeeper_host,
self.zookeeper_port,
self.zookeeper_ca,
self.zookeeper_cert,
self.zookeeper_key,
))
self.zookeeper_chroot = kz_fxtr.zookeeper_chroot
self.zk = zk.ZooKeeper(enable_cache=False)
host = zk.ZooKeeperConnectionConfig(
self.zookeeper_host, self.zookeeper_port, self.zookeeper_chroot,
)
self.zk.connect([host],
tls_ca=self.zookeeper_ca,
tls_cert=self.zookeeper_cert,
tls_key=self.zookeeper_key)
self.addCleanup(self.zk.disconnect)
def printZKTree(self, node):
def join(a, b):
if a.endswith('/'):
return a + b
return a + '/' + b
data, stat = self.zk.client.get(node)
self.log.debug("Node: %s" % (node,))
if data:
self.log.debug(data)
for child in self.zk.client.get_children(node):
self.printZKTree(join(node, child))
class IntegrationTestCase(DBTestCase):
def setUpFakes(self):
pass