Ignore unparsable/empty image upload ZNode data
Multiple builders running cleanup of uploads in parallel can lead to a race condition which results in empty image upload nodes in Zookeeper. The problem is the creation of the image upload number lock inside the image upload. The lock creation can re-created an already deleted image upload w/o any data. Later iterations will fail to parse the empty znode data. It's not 100% clear how we arrive at this race condition, but we definitely see multiple builders in the 'guarded' section trying to delete an upload. The following exception shows that there is some kind of race: 2020-06-23 11:00:45,225 ERROR nodepool.builder.CleanupWorker.0: Exception cleaning up build <ImageBuild {'state': 'ready', 'state_time': 1592718469.360345, 'builder': 'nodepool-builder-2', 'builder_id': '8e2c599b-fbfd-4d68-b8df-2fd2efdaa705', 'formats': 'qcow2,raw', 'username': 'Administrator', 'python_path': 'auto', 'id': '0000013010', 'stat': ZnodeStat(czxid=129130944630, mzxid=129131204208, ctime=1592712393474, mtime=1592718469363, version=1, cversion=1, aversion=0, ephemeralOwner=0, dataLength=214, numChildren=1, pzxid=129131204230)}> of image example-image in provider <Provider example-provider>: Traceback (most recent call last): File "/opt/nodepool/lib/python3.6/site-packages/nodepool/builder.py", line 499, in _cleanupImage self._cleanupProvider(provider, image, build.id) File "/opt/nodepool/lib/python3.6/site-packages/nodepool/builder.py", line 292, in _cleanupProvider self._deleteUpload(upload) File "/opt/nodepool/lib/python3.6/site-packages/nodepool/builder.py", line 345, in _deleteUpload upload.provider_name, upload.id) File "/opt/nodepool/lib/python3.6/site-packages/nodepool/zk.py", line 1572, in deleteUpload self.client.delete(path, recursive=True) File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1341, in delete return self._delete_recursive(path) File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1374, in _delete_recursive self._delete_recursive(child_path) File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1376, in _delete_recursive self.delete(path) File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1343, in delete return self.delete_async(path, version).get() File "/opt/nodepool/lib/python3.6/site-packages/kazoo/handlers/utils.py", line 75, in get raise self._exception kazoo.exceptions.NotEmptyError Order of events that might have led to the above exception: - A locks upload - A deletes children of upload (including lock of A) - B locks upload (possible, because lock was deleted by A) - A tries do delete the parent node, which leads to the above exception - ... Related change: https://review.opendev.org/#/c/716566/ Change-Id: I5989c9f5f4c9f1615ae7372250e2e5fdad720256
This commit is contained in:
parent
9e9a5b9bfd
commit
638c6dbb1f
|
@ -417,6 +417,11 @@ class CleanupWorker(BaseWorker):
|
|||
# between the getUploads() and the _inProgressUpload() check.
|
||||
u = self._zk.getImageUpload(image, build_id, provider,
|
||||
upload.id)
|
||||
if not u:
|
||||
self.log.warning(
|
||||
"Ignoring invalid or empty image upload: %s", upload
|
||||
)
|
||||
continue
|
||||
if upload.state != u.state:
|
||||
continue
|
||||
self.log.debug("Removing failed upload record: %s" % upload)
|
||||
|
|
|
@ -217,6 +217,8 @@ def image_list(zk):
|
|||
image_name, build_no, provider):
|
||||
upload = zk.getImageUpload(image_name, build_no,
|
||||
provider, upload_no)
|
||||
if not upload:
|
||||
continue
|
||||
values = [build_no, upload_no, provider, image_name,
|
||||
upload.external_name,
|
||||
upload.external_id,
|
||||
|
|
|
@ -130,6 +130,36 @@ class TestZooKeeper(tests.DBTestCase):
|
|||
upload.id)
|
||||
self.assertIsNone(self.zk.client.exists(path))
|
||||
|
||||
def test_imageUploadNumberLock_orphan(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
|
||||
path = self.zk._imageUploadNumberLockPath(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
self.assertIsNotNone(self.zk.client.exists(path))
|
||||
self.zk.deleteUpload(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
|
||||
# Pretend we still think the image upload exists
|
||||
# (e.g. multiple cleanup workers itertating over uploads)
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
# We now recreated an empty image upload number node
|
||||
pass
|
||||
|
||||
self.assertIsNotNone(self.zk.client.exists(path))
|
||||
|
||||
# Should not throw an exception because of the empty upload
|
||||
self.zk.getImageUpload(upload.image_name, upload.build_id,
|
||||
upload.provider_name, upload.id)
|
||||
|
||||
def test_imageUploadNumberLock(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
|
|
|
@ -1365,9 +1365,9 @@ class ZooKeeper(object):
|
|||
image,
|
||||
upload_number)
|
||||
except json.decoder.JSONDecodeError:
|
||||
self.log.exception('Error loading json data from image upload '
|
||||
'%s, %s, %s', image, build_number, provider)
|
||||
raise
|
||||
self.log.exception('Error loading json data from image upload %s',
|
||||
path)
|
||||
return None
|
||||
d.stat = stat
|
||||
return d
|
||||
|
||||
|
|
Loading…
Reference in New Issue