diff --git a/doc/source/configuration.rst b/doc/source/configuration.rst index e42582fb0..ad20db73c 100644 --- a/doc/source/configuration.rst +++ b/doc/source/configuration.rst @@ -16,6 +16,12 @@ and ``providers`` sections:: providers: ... +.. note:: The builder daemon creates a UUID to uniquely identify itself and + to mark image builds in ZooKeeper that it owns. This file will be + named ``builder_id.txt`` and will live in the directory named by the + :ref:`images-dir` option. If this file does not exist, it will be + created on builder startup and a UUID will be created automatically. + The following sections are available. All are required unless otherwise indicated. @@ -46,6 +52,8 @@ Example:: elements-dir: /path/to/elements/dir +.. _images-dir: + images-dir ---------- diff --git a/nodepool/builder.py b/nodepool/builder.py index 76013e77d..6deba91bf 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -21,6 +21,7 @@ import subprocess import threading import time import shlex +import uuid from nodepool import config as nodepool_config from nodepool import exceptions @@ -107,7 +108,7 @@ class DibImageFile(object): class BaseWorker(threading.Thread): - def __init__(self, config_path, interval, zk): + def __init__(self, builder_id, config_path, interval, zk): super(BaseWorker, self).__init__() self.log = logging.getLogger("nodepool.builder.BaseWorker") self.daemon = True @@ -118,6 +119,7 @@ class BaseWorker(threading.Thread): self._hostname = socket.gethostname() self._statsd = stats.get_client() self._interval = interval + self._builder_id = builder_id def _checkForZooKeeperChanges(self, new_config): ''' @@ -144,8 +146,9 @@ class CleanupWorker(BaseWorker): and any local DIB builds. ''' - def __init__(self, name, config_path, interval, zk): - super(CleanupWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, interval, zk): + super(CleanupWorker, self).__init__(builder_id, config_path, + interval, zk) self.log = logging.getLogger("nodepool.builder.CleanupWorker.%s" % name) self.name = 'CleanupWorker.%s' % name @@ -236,7 +239,13 @@ class CleanupWorker(BaseWorker): # NOTE(pabelanger): It is possible we don't have any files because # diskimage-builder failed. So, check to see if we have the correct # builder so we can removed the data from zookeeper. - if build.builder == self._hostname: + + # To maintain backward compatibility with builders that didn't + # use unique builder IDs before, but do now, always compare to + # hostname as well since some ZK data may still reference that. + if (build.builder_id == self._builder_id or + build.builder == self._hostname + ): return True return False @@ -510,8 +519,9 @@ class CleanupWorker(BaseWorker): class BuildWorker(BaseWorker): - def __init__(self, name, config_path, interval, zk, dib_cmd): - super(BuildWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, interval, zk, dib_cmd): + super(BuildWorker, self).__init__(builder_id, config_path, + interval, zk) self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name) self.name = 'BuildWorker.%s' % name self.dib_cmd = dib_cmd @@ -573,6 +583,7 @@ class BuildWorker(BaseWorker): data = zk.ImageBuild() data.state = zk.BUILDING + data.builder_id = self._builder_id data.builder = self._hostname data.formats = list(diskimage.image_types) @@ -623,6 +634,7 @@ class BuildWorker(BaseWorker): data = zk.ImageBuild() data.state = zk.BUILDING + data.builder_id = self._builder_id data.builder = self._hostname data.formats = list(diskimage.image_types) @@ -712,6 +724,7 @@ class BuildWorker(BaseWorker): time.sleep(SUSPEND_WAIT_TIME) build_data = zk.ImageBuild() + build_data.builder_id = self._builder_id build_data.builder = self._hostname if self._zk.didLoseConnection: @@ -778,8 +791,9 @@ class BuildWorker(BaseWorker): class UploadWorker(BaseWorker): - def __init__(self, name, config_path, interval, zk): - super(UploadWorker, self).__init__(config_path, interval, zk) + def __init__(self, name, builder_id, config_path, interval, zk): + super(UploadWorker, self).__init__(builder_id, config_path, + interval, zk) self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name) self.name = 'UploadWorker.%s' % name @@ -1055,6 +1069,17 @@ class NodePoolBuilder(object): # Private methods #======================================================================= + def _getBuilderID(self, id_file): + if not os.path.exists(id_file): + with open(id_file, "w") as f: + builder_id = str(uuid.uuid4()) + f.write(builder_id) + return builder_id + + with open(id_file, "r") as f: + builder_id = f.read() + return builder_id + def _getAndValidateConfig(self): config = nodepool_config.loadConfig(self._config_path) if not config.zookeeper_servers.values(): @@ -1082,6 +1107,10 @@ class NodePoolBuilder(object): self._config = self._getAndValidateConfig() self._running = True + builder_id_file = os.path.join(self._config.imagesdir, + "builder_id.txt") + builder_id = self._getBuilderID(builder_id_file) + # All worker threads share a single ZooKeeper instance/connection. self.zk = zk.ZooKeeper() self.zk.connect(list(self._config.zookeeper_servers.values())) @@ -1090,20 +1119,21 @@ class NodePoolBuilder(object): # Create build and upload worker objects for i in range(self._num_builders): - w = BuildWorker(i, self._config_path, self.build_interval, - self.zk, self.dib_cmd) + w = BuildWorker(i, builder_id, self._config_path, + self.build_interval, self.zk, self.dib_cmd) w.start() self._build_workers.append(w) for i in range(self._num_uploaders): - w = UploadWorker(i, self._config_path, self.upload_interval, - self.zk) + w = UploadWorker(i, builder_id, self._config_path, + self.upload_interval, self.zk) w.start() self._upload_workers.append(w) if self.cleanup_interval > 0: self._janitor = CleanupWorker( - 0, self._config_path, self.cleanup_interval, self.zk) + 0, builder_id, self._config_path, + self.cleanup_interval, self.zk) self._janitor.start() # Wait until all threads are running. Otherwise, we have a race diff --git a/nodepool/tests/test_builder.py b/nodepool/tests/test_builder.py index 14412ade6..0f8126499 100644 --- a/nodepool/tests/test_builder.py +++ b/nodepool/tests/test_builder.py @@ -14,6 +14,7 @@ # limitations under the License. import os +import uuid import fixtures from nodepool import builder, exceptions, fakeprovider, tests @@ -95,6 +96,18 @@ class TestNodePoolBuilder(tests.DBTestCase): nb.start() nb.stop() + def test_builder_id_file(self): + configfile = self.setup_config('node.yaml') + self._useBuilder(configfile) + path = os.path.join(self._config_images_dir.path, 'builder_id.txt') + + # Validate the unique ID file exists and contents are what we expect + self.assertTrue(os.path.exists(path)) + with open(path, "r") as f: + the_id = f.read() + obj = uuid.UUID(the_id, version=4) + self.assertEqual(the_id, str(obj)) + def test_image_upload_fail(self): """Test that image upload fails are handled properly.""" diff --git a/nodepool/tests/test_zk.py b/nodepool/tests/test_zk.py index 97446f187..dde66a761 100644 --- a/nodepool/tests/test_zk.py +++ b/nodepool/tests/test_zk.py @@ -127,12 +127,14 @@ class TestZooKeeper(tests.DBTestCase): image = "ubuntu-trusty" orig_data = zk.ImageBuild() orig_data.builder = 'host' + orig_data.builder_id = 'ABC-123' orig_data.state = zk.READY with self.zk.imageBuildLock(image, blocking=True, timeout=1): build_num = self.zk.storeBuild(image, orig_data) data = self.zk.getBuild(image, build_num) self.assertEqual(orig_data.builder, data.builder) + self.assertEqual(orig_data.builder_id, data.builder_id) self.assertEqual(orig_data.state, data.state) self.assertEqual(orig_data.state_time, data.state_time) self.assertEqual(build_num, data.id) @@ -666,6 +668,7 @@ class TestZKModel(tests.BaseTestCase): o = zk.ImageBuild('0001') o.state = zk.BUILDING o.builder = 'localhost' + o.builder_id = 'ABC-123' o.formats = ['qemu', 'raw'] d = o.toDict() @@ -674,12 +677,14 @@ class TestZKModel(tests.BaseTestCase): self.assertIsNotNone(d['state_time']) self.assertEqual(','.join(o.formats), d['formats']) self.assertEqual(o.builder, d['builder']) + self.assertEqual(o.builder_id, d['builder_id']) def test_ImageBuild_fromDict(self): now = int(time.time()) d_id = '0001' d = { 'builder': 'localhost', + 'builder_id': 'ABC-123', 'formats': 'qemu,raw', 'state': zk.BUILDING, 'state_time': now @@ -690,6 +695,7 @@ class TestZKModel(tests.BaseTestCase): self.assertEqual(o.state, d['state']) self.assertEqual(o.state_time, d['state_time']) self.assertEqual(o.builder, d['builder']) + self.assertEqual(o.builder_id, d['builder_id']) self.assertEqual(o.formats, d['formats'].split(',')) def test_ImageUpload_toDict(self): diff --git a/nodepool/zk.py b/nodepool/zk.py index e2f408de4..8459f34b1 100755 --- a/nodepool/zk.py +++ b/nodepool/zk.py @@ -189,13 +189,21 @@ class BaseModel(object): class ImageBuild(BaseModel): ''' Class representing a DIB image build within the ZooKeeper cluster. + + Note that the 'builder' attribute used to be used to uniquely identify + the owner of an image build in ZooKeeper. Because hostname was used, if + it ever changed, then we would get orphaned znodes. The 'builder_id' + attribute was added as a replacement, keeping 'builder' to mean the + same thing (which is why this attribute is not called 'hostname' or + similar). ''' VALID_STATES = set([BUILDING, READY, DELETING, FAILED]) def __init__(self, build_id=None): super(ImageBuild, self).__init__(build_id) self._formats = [] - self.builder = None # Builder hostname + self.builder = None # Hostname + self.builder_id = None # Unique ID def __repr__(self): d = self.toDict() @@ -223,6 +231,8 @@ class ImageBuild(BaseModel): d = super(ImageBuild, self).toDict() if self.builder is not None: d['builder'] = self.builder + if self.builder_id is not None: + d['builder_id'] = self.builder_id if len(self.formats): d['formats'] = ','.join(self.formats) return d @@ -240,6 +250,7 @@ class ImageBuild(BaseModel): o = ImageBuild(o_id) super(ImageBuild, o).fromDict(d) o.builder = d.get('builder') + o.builder_id = d.get('builder_id') # Only attempt the split on non-empty string if d.get('formats', ''): o.formats = d.get('formats', '').split(',')