From fcc68b8fd6ce12348635830df88823fd5156ceb6 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 17 Jul 2017 12:04:24 -0400 Subject: [PATCH] Support UUID as builder identifier We currently use the hostname of the builder host as the identifier for the owner of built images. This value is recorded in ZooKeeper and is used to make sure that only the owner of an image build ever deletes the entry from ZooKeeper. Using hostname can be problematic if it ever changes. It will cause orphaned builds that will not get deleted. This change allows us to use a UUID as the identifier, deprecating use of hostname altogether (although we continue storing that info). The UUID will be stored in a file in the images directory so that it may persist across nodepool-builder restarts. In order to help with transitioning existing builders to using UUID instead of hostname, the code will always compare the UUID value AND the hostname so that existing ZK entries will be matched until they age away. Change-Id: Ifafbab9fb0f41564cc1af595586fa7353ce1d0d0 --- doc/source/configuration.rst | 8 +++++ nodepool/builder.py | 56 ++++++++++++++++++++++++++-------- nodepool/tests/test_builder.py | 13 ++++++++ nodepool/tests/test_zk.py | 6 ++++ nodepool/zk.py | 13 +++++++- 5 files changed, 82 insertions(+), 14 deletions(-) diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index e42582fb0..ad20db73c 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -16,6 +16,12 @@ and ``providers`` sections:: providers: ... +.. note:: The builder daemon creates a UUID to uniquely identify itself and + to mark image builds in ZooKeeper that it owns. This file will be + named ``builder_id.txt`` and will live in the directory named by the + :ref:`images-dir` option. If this file does not exist, it will be + created on builder startup and a UUID will be created automatically. + The following sections are available. All are required unless otherwise indicated. @@ -46,6 +52,8 @@ Example:: elements-dir: /path/to/elements/dir +.. _images-dir: + images-dir ---------- diff --git a/nodepool/builder.py b/nodepool/builder.py index 76013e77d..6deba91bf 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -21,6 +21,7 @@ import subprocess import threading import time import shlex +import uuid from nodepool import config as nodepool_config from nodepool import exceptions @@ -107,7 +108,7 @@ class DibImageFile(object): class BaseWorker(threading.Thread): - def __init__(self, config_path, interval, zk): + def __init__(self, builder_id, config_path, interval, zk): super(BaseWorker, self).__init__() self.log = logging.getLogger("nodepool.builder.BaseWorker") self.daemon = True @@ -118,6 +119,7 @@ class BaseWorker(threading.Thread): self._hostname = socket.gethostname() self._statsd = stats.get_client() self._interval = interval + self._builder_id = builder_id def _checkForZooKeeperChanges(self, new_config): ''' @@ -144,8 +146,9 @@ class CleanupWorker(BaseWorker): and any local DIB builds. ''' - def __init__(self, name, config_path, interval, zk): - super(CleanupWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, interval, zk): + super(CleanupWorker, self).__init__(builder_id, config_path, + interval, zk) self.log = logging.getLogger("nodepool.builder.CleanupWorker.%s" % name) self.name = 'CleanupWorker.%s' % name @@ -236,7 +239,13 @@ class CleanupWorker(BaseWorker): # NOTE(pabelanger): It is possible we don't have any files because # diskimage-builder failed. So, check to see if we have the correct # builder so we can removed the data from zookeeper. - if build.builder == self._hostname: + + # To maintain backward compatibility with builders that didn't + # use unique builder IDs before, but do now, always compare to + # hostname as well since some ZK data may still reference that. + if (build.builder_id == self._builder_id or + build.builder == self._hostname + ): return True return False @@ -510,8 +519,9 @@ class CleanupWorker(BaseWorker): class BuildWorker(BaseWorker): - def __init__(self, name, config_path, interval, zk, dib_cmd): - super(BuildWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, interval, zk, dib_cmd): + super(BuildWorker, self).__init__(builder_id, config_path, + interval, zk) self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name) self.name = 'BuildWorker.%s' % name self.dib_cmd = dib_cmd @@ -573,6 +583,7 @@ class BuildWorker(BaseWorker): data = zk.ImageBuild() data.state = zk.BUILDING + data.builder_id = self._builder_id data.builder = self._hostname data.formats = list(diskimage.image_types) @@ -623,6 +634,7 @@ class BuildWorker(BaseWorker): data = zk.ImageBuild() data.state = zk.BUILDING + data.builder_id = self._builder_id data.builder = self._hostname data.formats = list(diskimage.image_types) @@ -712,6 +724,7 @@ class BuildWorker(BaseWorker): time.sleep(SUSPEND_WAIT_TIME) build_data = zk.ImageBuild() + build_data.builder_id = self._builder_id build_data.builder = self._hostname if self._zk.didLoseConnection: @@ -778,8 +791,9 @@ class BuildWorker(BaseWorker): class UploadWorker(BaseWorker): - def __init__(self, name, config_path, interval, zk): - super(UploadWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, interval, zk): + super(UploadWorker, self).__init__(builder_id, config_path, + interval, zk) self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name) self.name = 'UploadWorker.%s' % name @@ -1055,6 +1069,17 @@ class NodePoolBuilder(object): # Private methods #======================================================================= + def _getBuilderID(self, id_file): + if not os.path.exists(id_file): + with open(id_file, "w") as f: + builder_id = str(uuid.uuid4()) + f.write(builder_id) + return builder_id + + with open(id_file, "r") as f: + builder_id = f.read() + return builder_id + def _getAndValidateConfig(self): config = nodepool_config.loadConfig(self._config_path) if not config.zookeeper_servers.values(): @@ -1082,6 +1107,10 @@ class NodePoolBuilder(object): self._config = self._getAndValidateConfig() self._running = True + builder_id_file = os.path.join(self._config.imagesdir, + "builder_id.txt") + builder_id = self._getBuilderID(builder_id_file) + # All worker threads share a single ZooKeeper instance/connection. self.zk = zk.ZooKeeper() self.zk.connect(list(self._config.zookeeper_servers.values())) @@ -1090,20 +1119,21 @@ class NodePoolBuilder(object): # Create build and upload worker objects for i in range(self._num_builders): - w = BuildWorker(i, self._config_path, self.build_interval, - self.zk, self.dib_cmd) + w = BuildWorker(i, builder_id, self._config_path, + self.build_interval, self.zk, self.dib_cmd) w.start() self._build_workers.append(w) for i in range(self._num_uploaders): - w = UploadWorker(i, self._config_path, self.upload_interval, - self.zk) + w = UploadWorker(i, builder_id, self._config_path, + self.upload_interval, self.zk) w.start() self._upload_workers.append(w) if self.cleanup_interval > 0: self._janitor = CleanupWorker( - 0, self._config_path, self.cleanup_interval, self.zk) + 0, builder_id, self._config_path, + self.cleanup_interval, self.zk) self._janitor.start() # Wait until all threads are running. Otherwise, we have a race diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index 14412ade6..0f8126499 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -14,6 +14,7 @@ # limitations under the License. import os +import uuid import fixtures from nodepool import builder, exceptions, fakeprovider, tests @@ -95,6 +96,18 @@ class TestNodePoolBuilder(tests.DBTestCase): nb.start() nb.stop() + def test_builder_id_file(self): + configfile = self.setup_config('node.yaml') + self._useBuilder(configfile) + path = os.path.join(self._config_images_dir.path, 'builder_id.txt') + + # Validate the unique ID file exists and contents are what we expect + self.assertTrue(os.path.exists(path)) + with open(path, "r") as f: + the_id = f.read() + obj = uuid.UUID(the_id, version=4) + self.assertEqual(the_id, str(obj)) + def test_image_upload_fail(self): """Test that image upload fails are handled properly.""" diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 97446f187..dde66a761 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -127,12 +127,14 @@ class TestZooKeeper(tests.DBTestCase): image = "ubuntu-trusty" orig_data = zk.ImageBuild() orig_data.builder = 'host' + orig_data.builder_id = 'ABC-123' orig_data.state = zk.READY with self.zk.imageBuildLock(image, blocking=True, timeout=1): build_num = self.zk.storeBuild(image, orig_data) data = self.zk.getBuild(image, build_num) self.assertEqual(orig_data.builder, data.builder) + self.assertEqual(orig_data.builder_id, data.builder_id) self.assertEqual(orig_data.state, data.state) self.assertEqual(orig_data.state_time, data.state_time) self.assertEqual(build_num, data.id) @@ -666,6 +668,7 @@ class TestZKModel(tests.BaseTestCase): o = zk.ImageBuild('0001') o.state = zk.BUILDING o.builder = 'localhost' + o.builder_id = 'ABC-123' o.formats = ['qemu', 'raw'] d = o.toDict() @@ -674,12 +677,14 @@ class TestZKModel(tests.BaseTestCase): self.assertIsNotNone(d['state_time']) self.assertEqual(','.join(o.formats), d['formats']) self.assertEqual(o.builder, d['builder']) + self.assertEqual(o.builder_id, d['builder_id']) def test_ImageBuild_fromDict(self): now = int(time.time()) d_id = '0001' d = { 'builder': 'localhost', + 'builder_id': 'ABC-123', 'formats': 'qemu,raw', 'state': zk.BUILDING, 'state_time': now @@ -690,6 +695,7 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.state, d['state']) self.assertEqual(o.state_time, d['state_time']) self.assertEqual(o.builder, d['builder']) + self.assertEqual(o.builder_id, d['builder_id']) self.assertEqual(o.formats, d['formats'].split(',')) def test_ImageUpload_toDict(self): diff --git a/nodepool/zk.py b/nodepool/zk.py index e2f408de4..8459f34b1 100755 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -189,13 +189,21 @@ class BaseModel(object): class ImageBuild(BaseModel): ''' Class representing a DIB image build within the ZooKeeper cluster. + + Note that the 'builder' attribute used to be used to uniquely identify + the owner of an image build in ZooKeeper. Because hostname was used, if + it ever changed, then we would get orphaned znodes. The 'builder_id' + attribute was added as a replacement, keeping 'builder' to mean the + same thing (which is why this attribute is not called 'hostname' or + similar). ''' VALID_STATES = set([BUILDING, READY, DELETING, FAILED]) def __init__(self, build_id=None): super(ImageBuild, self).__init__(build_id) self._formats = [] - self.builder = None # Builder hostname + self.builder = None # Hostname + self.builder_id = None # Unique ID def __repr__(self): d = self.toDict() @@ -223,6 +231,8 @@ class ImageBuild(BaseModel): d = super(ImageBuild, self).toDict() if self.builder is not None: d['builder'] = self.builder + if self.builder_id is not None: + d['builder_id'] = self.builder_id if len(self.formats): d['formats'] = ','.join(self.formats) return d @@ -240,6 +250,7 @@ class ImageBuild(BaseModel): o = ImageBuild(o_id) super(ImageBuild, o).fromDict(d) o.builder = d.get('builder') + o.builder_id = d.get('builder_id') # Only attempt the split on non-empty string if d.get('formats', ''): o.formats = d.get('formats', '').split(',')