Ignore unparsable/empty image upload ZNode data

Multiple builders running cleanup of uploads in parallel can lead to a
race condition which results in empty image upload nodes in Zookeeper.

The problem is the creation of the image upload number lock inside the
image upload. The lock creation can re-created an already deleted image
upload w/o any data.

Later iterations will fail to parse the empty znode data.

It's not 100% clear how we arrive at this race condition, but we
definitely see multiple builders in the 'guarded' section trying to
delete an upload.

The following exception shows that there is some kind of race:

    2020-06-23 11:00:45,225 ERROR nodepool.builder.CleanupWorker.0:
        Exception cleaning up build <ImageBuild {'state': 'ready',
        'state_time': 1592718469.360345, 'builder':
        'nodepool-builder-2', 'builder_id':
        '8e2c599b-fbfd-4d68-b8df-2fd2efdaa705', 'formats': 'qcow2,raw',
        'username': 'Administrator', 'python_path': 'auto', 'id':
        '0000013010', 'stat': ZnodeStat(czxid=129130944630,
        mzxid=129131204208, ctime=1592712393474, mtime=1592718469363,
        version=1, cversion=1, aversion=0, ephemeralOwner=0,
        dataLength=214, numChildren=1, pzxid=129131204230)}> of image
        example-image in provider <Provider example-provider>:
    Traceback (most recent call last):
      File "/opt/nodepool/lib/python3.6/site-packages/nodepool/builder.py", line 499, in _cleanupImage
        self._cleanupProvider(provider, image, build.id)
      File "/opt/nodepool/lib/python3.6/site-packages/nodepool/builder.py", line 292, in _cleanupProvider
        self._deleteUpload(upload)
      File "/opt/nodepool/lib/python3.6/site-packages/nodepool/builder.py", line 345, in _deleteUpload
        upload.provider_name, upload.id)
      File "/opt/nodepool/lib/python3.6/site-packages/nodepool/zk.py", line 1572, in deleteUpload
        self.client.delete(path, recursive=True)
      File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1341, in delete
        return self._delete_recursive(path)
      File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1374, in _delete_recursive
        self._delete_recursive(child_path)
      File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1376, in _delete_recursive
        self.delete(path)
      File "/opt/nodepool/lib/python3.6/site-packages/kazoo/client.py", line 1343, in delete
        return self.delete_async(path, version).get()
      File "/opt/nodepool/lib/python3.6/site-packages/kazoo/handlers/utils.py", line 75, in get
        raise self._exception
    kazoo.exceptions.NotEmptyError

Order of events that might have led to the above exception:
  - A locks upload
  - A deletes children of upload (including lock of A)
  - B locks upload (possible, because lock was deleted by A)
  - A tries do delete the parent node, which leads to the above exception
  - ...

Related change: https://review.opendev.org/#/c/716566/
Change-Id: I5989c9f5f4c9f1615ae7372250e2e5fdad720256
This commit is contained in:
Simon Westphahl 2020-06-25 15:19:10 +02:00
parent 9e9a5b9bfd
commit 638c6dbb1f
4 changed files with 40 additions and 3 deletions

View File

@ -417,6 +417,11 @@ class CleanupWorker(BaseWorker):
# between the getUploads() and the _inProgressUpload() check.
u = self._zk.getImageUpload(image, build_id, provider,
upload.id)
if not u:
self.log.warning(
"Ignoring invalid or empty image upload: %s", upload
)
continue
if upload.state != u.state:
continue
self.log.debug("Removing failed upload record: %s" % upload)

View File

@ -217,6 +217,8 @@ def image_list(zk):
image_name, build_no, provider):
upload = zk.getImageUpload(image_name, build_no,
provider, upload_no)
if not upload:
continue
values = [build_no, upload_no, provider, image_name,
upload.external_name,
upload.external_id,

View File

@ -130,6 +130,36 @@ class TestZooKeeper(tests.DBTestCase):
upload.id)
self.assertIsNone(self.zk.client.exists(path))
def test_imageUploadNumberLock_orphan(self):
upload = zk.ImageUpload()
upload.image_name = "ubuntu-trusty"
upload.build_id = "0000000003"
upload.provider_name = "providerA"
path = self.zk._imageUploadNumberLockPath(upload.image_name,
upload.build_id,
upload.provider_name,
upload.id)
with self.zk.imageUploadNumberLock(upload, blocking=False):
self.assertIsNotNone(self.zk.client.exists(path))
self.zk.deleteUpload(upload.image_name,
upload.build_id,
upload.provider_name,
upload.id)
# Pretend we still think the image upload exists
# (e.g. multiple cleanup workers itertating over uploads)
with self.zk.imageUploadNumberLock(upload, blocking=False):
# We now recreated an empty image upload number node
pass
self.assertIsNotNone(self.zk.client.exists(path))
# Should not throw an exception because of the empty upload
self.zk.getImageUpload(upload.image_name, upload.build_id,
upload.provider_name, upload.id)
def test_imageUploadNumberLock(self):
upload = zk.ImageUpload()
upload.image_name = "ubuntu-trusty"

View File

@ -1365,9 +1365,9 @@ class ZooKeeper(object):
image,
upload_number)
except json.decoder.JSONDecodeError:
self.log.exception('Error loading json data from image upload '
'%s, %s, %s', image, build_number, provider)
raise
self.log.exception('Error loading json data from image upload %s',
path)
return None
d.stat = stat
return d