Support UUID as builder identifier

We currently use the hostname of the builder host as the
identifier for the owner of built images. This value is
recorded in ZooKeeper and is used to make sure that only
the owner of an image build ever deletes the entry from
ZooKeeper.

Using hostname can be problematic if it ever changes. It will
cause orphaned builds that will not get deleted. This change
allows us to use a UUID as the identifier, deprecating use of
hostname altogether (although we continue storing that info).
The UUID will be stored in a file in the images directory so
that it may persist across nodepool-builder restarts.

In order to help with transitioning existing builders to using
UUID instead of hostname, the code will always compare the UUID
value AND the hostname so that existing ZK entries will be
matched until they age away.

Change-Id: Ifafbab9fb0f41564cc1af595586fa7353ce1d0d0
This commit is contained in:
David Shrewsbury 2017-07-17 12:04:24 -04:00
parent 42e15e2150
commit fcc68b8fd6
5 changed files with 82 additions and 14 deletions

View File

@ -16,6 +16,12 @@ and ``providers`` sections::
providers:
...
.. note:: The builder daemon creates a UUID to uniquely identify itself and
to mark image builds in ZooKeeper that it owns. This file will be
named ``builder_id.txt`` and will live in the directory named by the
:ref:`images-dir` option. If this file does not exist, it will be
created on builder startup and a UUID will be created automatically.
The following sections are available. All are required unless
otherwise indicated.
@ -46,6 +52,8 @@ Example::
elements-dir: /path/to/elements/dir
.. _images-dir:
images-dir
----------

View File

@ -21,6 +21,7 @@ import subprocess
import threading
import time
import shlex
import uuid
from nodepool import config as nodepool_config
from nodepool import exceptions
@ -107,7 +108,7 @@ class DibImageFile(object):
class BaseWorker(threading.Thread):
def __init__(self, config_path, interval, zk):
def __init__(self, builder_id, config_path, interval, zk):
super(BaseWorker, self).__init__()
self.log = logging.getLogger("nodepool.builder.BaseWorker")
self.daemon = True
@ -118,6 +119,7 @@ class BaseWorker(threading.Thread):
self._hostname = socket.gethostname()
self._statsd = stats.get_client()
self._interval = interval
self._builder_id = builder_id
def _checkForZooKeeperChanges(self, new_config):
'''
@ -144,8 +146,9 @@ class CleanupWorker(BaseWorker):
and any local DIB builds.
'''
def __init__(self, name, config_path, interval, zk):
super(CleanupWorker, self).__init__(config_path, interval, zk)
def __init__(self, name, builder_id, config_path, interval, zk):
super(CleanupWorker, self).__init__(builder_id, config_path,
interval, zk)
self.log = logging.getLogger("nodepool.builder.CleanupWorker.%s" % name)
self.name = 'CleanupWorker.%s' % name
@ -236,7 +239,13 @@ class CleanupWorker(BaseWorker):
# NOTE(pabelanger): It is possible we don't have any files because
# diskimage-builder failed. So, check to see if we have the correct
# builder so we can removed the data from zookeeper.
if build.builder == self._hostname:
# To maintain backward compatibility with builders that didn't
# use unique builder IDs before, but do now, always compare to
# hostname as well since some ZK data may still reference that.
if (build.builder_id == self._builder_id or
build.builder == self._hostname
):
return True
return False
@ -510,8 +519,9 @@ class CleanupWorker(BaseWorker):
class BuildWorker(BaseWorker):
def __init__(self, name, config_path, interval, zk, dib_cmd):
super(BuildWorker, self).__init__(config_path, interval, zk)
def __init__(self, name, builder_id, config_path, interval, zk, dib_cmd):
super(BuildWorker, self).__init__(builder_id, config_path,
interval, zk)
self.log = logging.getLogger("nodepool.builder.BuildWorker.%s" % name)
self.name = 'BuildWorker.%s' % name
self.dib_cmd = dib_cmd
@ -573,6 +583,7 @@ class BuildWorker(BaseWorker):
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
@ -623,6 +634,7 @@ class BuildWorker(BaseWorker):
data = zk.ImageBuild()
data.state = zk.BUILDING
data.builder_id = self._builder_id
data.builder = self._hostname
data.formats = list(diskimage.image_types)
@ -712,6 +724,7 @@ class BuildWorker(BaseWorker):
time.sleep(SUSPEND_WAIT_TIME)
build_data = zk.ImageBuild()
build_data.builder_id = self._builder_id
build_data.builder = self._hostname
if self._zk.didLoseConnection:
@ -778,8 +791,9 @@ class BuildWorker(BaseWorker):
class UploadWorker(BaseWorker):
def __init__(self, name, config_path, interval, zk):
super(UploadWorker, self).__init__(config_path, interval, zk)
def __init__(self, name, builder_id, config_path, interval, zk):
super(UploadWorker, self).__init__(builder_id, config_path,
interval, zk)
self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name)
self.name = 'UploadWorker.%s' % name
@ -1055,6 +1069,17 @@ class NodePoolBuilder(object):
# Private methods
#=======================================================================
def _getBuilderID(self, id_file):
if not os.path.exists(id_file):
with open(id_file, "w") as f:
builder_id = str(uuid.uuid4())
f.write(builder_id)
return builder_id
with open(id_file, "r") as f:
builder_id = f.read()
return builder_id
def _getAndValidateConfig(self):
config = nodepool_config.loadConfig(self._config_path)
if not config.zookeeper_servers.values():
@ -1082,6 +1107,10 @@ class NodePoolBuilder(object):
self._config = self._getAndValidateConfig()
self._running = True
builder_id_file = os.path.join(self._config.imagesdir,
"builder_id.txt")
builder_id = self._getBuilderID(builder_id_file)
# All worker threads share a single ZooKeeper instance/connection.
self.zk = zk.ZooKeeper()
self.zk.connect(list(self._config.zookeeper_servers.values()))
@ -1090,20 +1119,21 @@ class NodePoolBuilder(object):
# Create build and upload worker objects
for i in range(self._num_builders):
w = BuildWorker(i, self._config_path, self.build_interval,
self.zk, self.dib_cmd)
w = BuildWorker(i, builder_id, self._config_path,
self.build_interval, self.zk, self.dib_cmd)
w.start()
self._build_workers.append(w)
for i in range(self._num_uploaders):
w = UploadWorker(i, self._config_path, self.upload_interval,
self.zk)
w = UploadWorker(i, builder_id, self._config_path,
self.upload_interval, self.zk)
w.start()
self._upload_workers.append(w)
if self.cleanup_interval > 0:
self._janitor = CleanupWorker(
0, self._config_path, self.cleanup_interval, self.zk)
0, builder_id, self._config_path,
self.cleanup_interval, self.zk)
self._janitor.start()
# Wait until all threads are running. Otherwise, we have a race

View File

@ -14,6 +14,7 @@
# limitations under the License.
import os
import uuid
import fixtures
from nodepool import builder, exceptions, fakeprovider, tests
@ -95,6 +96,18 @@ class TestNodePoolBuilder(tests.DBTestCase):
nb.start()
nb.stop()
def test_builder_id_file(self):
configfile = self.setup_config('node.yaml')
self._useBuilder(configfile)
path = os.path.join(self._config_images_dir.path, 'builder_id.txt')
# Validate the unique ID file exists and contents are what we expect
self.assertTrue(os.path.exists(path))
with open(path, "r") as f:
the_id = f.read()
obj = uuid.UUID(the_id, version=4)
self.assertEqual(the_id, str(obj))
def test_image_upload_fail(self):
"""Test that image upload fails are handled properly."""

View File

@ -127,12 +127,14 @@ class TestZooKeeper(tests.DBTestCase):
image = "ubuntu-trusty"
orig_data = zk.ImageBuild()
orig_data.builder = 'host'
orig_data.builder_id = 'ABC-123'
orig_data.state = zk.READY
with self.zk.imageBuildLock(image, blocking=True, timeout=1):
build_num = self.zk.storeBuild(image, orig_data)
data = self.zk.getBuild(image, build_num)
self.assertEqual(orig_data.builder, data.builder)
self.assertEqual(orig_data.builder_id, data.builder_id)
self.assertEqual(orig_data.state, data.state)
self.assertEqual(orig_data.state_time, data.state_time)
self.assertEqual(build_num, data.id)
@ -666,6 +668,7 @@ class TestZKModel(tests.BaseTestCase):
o = zk.ImageBuild('0001')
o.state = zk.BUILDING
o.builder = 'localhost'
o.builder_id = 'ABC-123'
o.formats = ['qemu', 'raw']
d = o.toDict()
@ -674,12 +677,14 @@ class TestZKModel(tests.BaseTestCase):
self.assertIsNotNone(d['state_time'])
self.assertEqual(','.join(o.formats), d['formats'])
self.assertEqual(o.builder, d['builder'])
self.assertEqual(o.builder_id, d['builder_id'])
def test_ImageBuild_fromDict(self):
now = int(time.time())
d_id = '0001'
d = {
'builder': 'localhost',
'builder_id': 'ABC-123',
'formats': 'qemu,raw',
'state': zk.BUILDING,
'state_time': now
@ -690,6 +695,7 @@ class TestZKModel(tests.BaseTestCase):
self.assertEqual(o.state, d['state'])
self.assertEqual(o.state_time, d['state_time'])
self.assertEqual(o.builder, d['builder'])
self.assertEqual(o.builder_id, d['builder_id'])
self.assertEqual(o.formats, d['formats'].split(','))
def test_ImageUpload_toDict(self):

View File

@ -189,13 +189,21 @@ class BaseModel(object):
class ImageBuild(BaseModel):
'''
Class representing a DIB image build within the ZooKeeper cluster.
Note that the 'builder' attribute used to be used to uniquely identify
the owner of an image build in ZooKeeper. Because hostname was used, if
it ever changed, then we would get orphaned znodes. The 'builder_id'
attribute was added as a replacement, keeping 'builder' to mean the
same thing (which is why this attribute is not called 'hostname' or
similar).
'''
VALID_STATES = set([BUILDING, READY, DELETING, FAILED])
def __init__(self, build_id=None):
super(ImageBuild, self).__init__(build_id)
self._formats = []
self.builder = None # Builder hostname
self.builder = None # Hostname
self.builder_id = None # Unique ID
def __repr__(self):
d = self.toDict()
@ -223,6 +231,8 @@ class ImageBuild(BaseModel):
d = super(ImageBuild, self).toDict()
if self.builder is not None:
d['builder'] = self.builder
if self.builder_id is not None:
d['builder_id'] = self.builder_id
if len(self.formats):
d['formats'] = ','.join(self.formats)
return d
@ -240,6 +250,7 @@ class ImageBuild(BaseModel):
o = ImageBuild(o_id)
super(ImageBuild, o).fromDict(d)
o.builder = d.get('builder')
o.builder_id = d.get('builder_id')
# Only attempt the split on non-empty string
if d.get('formats', ''):
o.formats = d.get('formats', '').split(',')