Avoid collisions after ZK image data import

When image data are imported, if there are holes in the sequence
numbers, ZooKeeper may register a collision after nodepool-builder
builds or uploads a new image.  This is because ZooKeeper stores
a sequence node counter in the parent node, and we lose that
information when exporting/importing.  Newly built images can end
up with the same sequence numbers as imported images.  To avoid this,
re-create missing sequence nodes so that the import state more
closely matches the export state.

Change-Id: I0b96ebecc53dcf47324b8a009af749a3c04e574c
This commit is contained in:
James E. Blair 2022-06-20 11:07:56 -07:00
parent b2d4e3c356
commit cacef76d3a
3 changed files with 105 additions and 4 deletions

View File

@ -474,6 +474,19 @@ class DBTestCase(BaseTestCase):
self.wait_for_threads() self.wait_for_threads()
return image return image
def waitForUpload(self, provider_name, image_name,
build_id, upload_id):
for _ in iterate_timeout(ONE_MINUTE, Exception,
"Image upload to be ready",
interval=1):
self.wait_for_threads()
upload = self.zk.getImageUpload(
image_name, build_id, provider_name, upload_id)
if upload:
break
self.wait_for_threads()
return upload
def waitForUploadRecordDeletion(self, provider_name, image_name, def waitForUploadRecordDeletion(self, provider_name, image_name,
build_id, upload_id): build_id, upload_id):
for _ in iterate_timeout(ONE_MINUTE, Exception, for _ in iterate_timeout(ONE_MINUTE, Exception,

View File

@ -409,6 +409,16 @@ class TestNodepoolCMD(tests.DBTestCase):
self.waitForImage('fake-provider', 'fake-image') self.waitForImage('fake-provider', 'fake-image')
self.waitForNodes('fake-label') self.waitForNodes('fake-label')
build = self.waitForBuild('fake-image', '0000000001')
# Delete the first build so that we have a hole in our
# numbering. This lets us validate that we reconstruct the
# sequence state correctly.
build.state = zk.DELETING
with self.zk.imageBuildLock('fake-image', blocking=True, timeout=1):
self.zk.storeBuild('fake-image', build, '0000000001')
self.waitForBuildDeletion('fake-image', '0000000001')
self.waitForBuild('fake-image', '0000000002')
pool.stop() pool.stop()
for worker in builder._upload_workers: for worker in builder._upload_workers:
worker.shutdown() worker.shutdown()
@ -417,7 +427,7 @@ class TestNodepoolCMD(tests.DBTestCase):
# Save a copy of the data in ZK # Save a copy of the data in ZK
old_data = self.getZKTree('/nodepool/images') old_data = self.getZKTree('/nodepool/images')
# We aren't backing up the lock data # We aren't backing up the lock data
old_data.pop('/nodepool/images/fake-image/builds/0000000001' old_data.pop('/nodepool/images/fake-image/builds/0000000002'
'/providers/fake-provider/images/lock') '/providers/fake-provider/images/lock')
old_data.pop('/nodepool/images/fake-image/builds/lock') old_data.pop('/nodepool/images/fake-image/builds/lock')
@ -434,3 +444,32 @@ class TestNodepoolCMD(tests.DBTestCase):
new_data = self.getZKTree('/nodepool/images') new_data = self.getZKTree('/nodepool/images')
self.assertEqual(new_data, old_data) self.assertEqual(new_data, old_data)
# Now restart the builder to make sure new builds/uploads work
builder = self.useBuilder(configfile)
# First test a new upload of the existing image and make sure
# it uses the correct sequence number.
upload = self.waitForUpload('fake-provider', 'fake-image',
'0000000002', '0000000001')
upload.state = zk.DELETING
with self.zk.imageUploadLock(upload.image_name, upload.build_id,
upload.provider_name, blocking=True,
timeout=1):
self.zk.storeImageUpload(upload.image_name, upload.build_id,
upload.provider_name, upload, upload.id)
# We skip at least one number because upload lock is a sequence
# node too (this is why builds and uploads start at 1 instead of 0).
upload = self.waitForUpload('fake-provider', 'fake-image',
'0000000002', '0000000003')
# Now build a new image and make sure it uses the correct
# sequence number.
build = self.waitForBuild('fake-image', '0000000002')
# Expire rebuild-age (default: 1day) to force a new build.
build.state_time -= 86400
with self.zk.imageBuildLock('fake-image', blocking=True, timeout=1):
self.zk.storeBuild('fake-image', build, '0000000002')
# We skip at least one number because build lock is a sequence
# node too (this is why builds and uploads start at 1 instead of 0).
self.waitForBuild('fake-image', '0000000004')

View File

@ -2376,7 +2376,56 @@ class ZooKeeper(object):
run on a quiescent system with no daemons running. run on a quiescent system with no daemons running.
''' '''
# We do some extra work to ensure that the sequence numbers
# don't collide. ZK sequence numbers are stored in the parent
# node and ZK isn't smart enough to avoid collisions if there
# are missing entries. So if we restore build 1, and then the
# builder later wants to create a new build, it will attempt
# to create build 1, and fail since the node already exists.
#
# NB: The behavior is slightly different for sequence number 1
# vs others; if 2 is the lowest number, then ZK will create
# node 0 and 1 before colliding with 2. This is further
# complicated in the nodepool context since we create lock
# entries under the build/upload znodes which also seem to
# have an effect on the counter.
#
# Regardless, if we pre-create sequence nodes up to our
# highest node numbers for builds and uploads, we are
# guaranteed that the next sequence node created will be
# greater. So we look at all the sequence nodes in our import
# data set and pre-create sequence nodes up to that number.
highest_num = {}
# 0 1 2 3 4 5
# /nodepool/images/fake-image/builds/0000000002/
# 6 7 8 9
# providers/fake-provider/images/0000000001
for path, data in import_data.items(): for path, data in import_data.items():
self.client.create(path, parts = path.split('/')
value=data.encode('utf8'), if len(parts) == 6:
makepath=True) key = '/'.join(parts[:5])
num = int(parts[5])
highest_num[key] = max(highest_num.get(key, num), num)
if len(parts) == 10:
key = '/'.join(parts[:9])
num = int(parts[9])
highest_num[key] = max(highest_num.get(key, num), num)
for path, num in highest_num.items():
for x in range(num):
node = self.client.create(
path + '/', makepath=True, sequence=True)
# If this node isn't in our import data, go ahead and
# delete it.
if node not in import_data:
self.client.delete(node)
for path, data in import_data.items():
# We may have already created a node above; in that
# case, just set the data on it.
if self.client.exists(path):
self.client.set(path,
value=data.encode('utf8'))
else:
self.client.create(path,
value=data.encode('utf8'),
makepath=True)