From 2ec2222841f6116707fe25bdcdae6ad6c2b9beb7 Mon Sep 17 00:00:00 2001
From: Gorka Eguileor
Date: Wed, 21 Jul 2021 15:20:38 +0200
Subject: [PATCH] Fix: Race between attachment and volume deletion

There are cases where requests to delete an attachment made by Nova can
race other third-party requests to delete the overall volume.

This has been observed when running cinder-csi, where it first requests
that Nova detaches a volume before itself requesting that the overall
volume is deleted once it becomes `available`.

This is a cinder race condition, and like most race conditions it is
not simple to explain.

Some context on the issue:

- Cinder API uses the volume "status" field as a locking mechanism to
  prevent concurrent request processing on the same volume.

- Most cinder operations are asynchronous, so the API returns before
  the operation has been completed by the cinder-volume service, but
  the attachment operations such as creating/updating/deleting an
  attachment are synchronous, so the API only returns to the caller
  after the cinder-volume service has completed the operation.

- Our current code **incorrectly** modifies the status of the volume
  both on the cinder-volume and the cinder-api services on the
  attachment delete operation.

The actual sequence of events that leads to the issue reported in this
bug is:

[Cinder-CSI]

- Requests Nova to detach the volume (Request R1)

[Nova]

- R1: Asks cinder-api to delete the attachment and **waits**

[Cinder-API]

- R1: Checks the status of the volume
- R1: Sends the terminate connection request (R1) to cinder-volume and
  **waits**

[Cinder-Volume]

- R1: Asks the driver to terminate the connection
- R1: The driver asks the backend to unmap and unexport the volume
- R1: The last attachment is removed from the DB and the status of the
  volume is changed in the DB to "available"

[Cinder-CSI]

- Checks that there are no attachments on the volume and asks Cinder to
  delete it (Request R2)

[Cinder-API]

- R2: Checks that the volume's status is valid. It doesn't have
  attachments and is available, so it can be deleted.
- R2: Tells cinder-volume to delete the volume and returns immediately.

[Cinder-Volume]

- R2: The volume is deleted and its DB entry is removed
- R1: Finishes the termination of the connection

[Cinder-API]

- R1: Now that cinder-volume has finished the termination the code
  continues
- R1: Tries to modify the volume in the DB
- R1: The DB layer raises VolumeNotFound since the volume has been
  deleted from the DB
- R1: VolumeNotFound is converted to an HTTP 404 status code which is
  returned to Nova

[Nova]

- R1: Cinder responds with 404 on the attachment delete request
- R1: Nova leaves the volume as attached, since the attachment delete
  failed

At this point the Cinder and Nova DBs are out of sync, because Nova
thinks that the attachment is connected while Cinder has detached the
volume and even deleted it.

Hardening is also being done on the Nova side [2] to accept that the
volume attachment may be gone.

This patch fixes the issue mentioned above, but there is a request on
Cinder-CSI [1] to use Nova as the source of truth regarding its
attachments that, when implemented, would also fix the issue.
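To make the fix below easier to follow, here is a minimal, self-contained
sketch of the status-derivation rule implemented by the new DB helper
_get_statuses_from_attachments(): ignore 'error_*' attachments, take the
alphabetically first of the remaining attach statuses (attached < attaching <
detaching < reserved), default to 'detached' when nothing is left, and map
that value to the volume status. The real helper expresses this as a single
SQLAlchemy query against the volume_attachments table; the
statuses_from_attachments() function here is only an illustrative
plain-Python rendering of the same rule, not code from this patch.

# Plain-Python sketch of the rule implemented by the new DB helper
# _get_statuses_from_attachments().  The real code runs a single SQLAlchemy
# query; this version only mirrors the selection logic.

# Cases where the volume status differs from the attach status.
ATTACH_STATUS_MAP = {'attached': 'in-use', 'detached': 'available'}


def statuses_from_attachments(attach_statuses):
    """Return (volume_status, attach_status) for the given attachments.

    'error_*' attachments are ignored because os-brick's connect_volume has
    not been called for them, so no consumer is actually using the volume.
    Sorting the remaining values alphabetically gives the precedence we
    want: attached < attaching < detaching < reserved.
    """
    candidates = sorted(s for s in attach_statuses
                        if not s.startswith('error_'))
    attach_status = candidates[0] if candidates else 'detached'
    return (ATTACH_STATUS_MAP.get(attach_status, attach_status),
            attach_status)


# Examples of the resulting (volume status, attach_status) pairs:
assert statuses_from_attachments([]) == ('available', 'detached')
assert statuses_from_attachments(['reserved']) == ('reserved', 'reserved')
assert statuses_from_attachments(['attached', 'reserved']) == ('in-use',
                                                               'attached')
assert statuses_from_attachments(['error_attaching']) == ('available',
                                                          'detached')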
[1]: https://github.com/kubernetes/cloud-provider-openstack/issues/1645
[2]: https://review.opendev.org/q/topic:%2522bug/1937084%2522+project:openstack/nova

Closes-Bug: #1937084
Change-Id: Iaf149dadad5791e81a3c0efd089d0ee66a1a5614
---
 cinder/db/sqlalchemy/api.py                   | 111 ++++++++++--------
 cinder/db/sqlalchemy/models.py                |  10 ++
 cinder/tests/unit/api/v3/test_attachments.py  |  14 +--
 .../attachments/test_attachments_manager.py   |  30 +++--
 cinder/tests/unit/test_db_api.py              |   6 +-
 cinder/volume/api.py                          |  61 +++-------
 cinder/volume/manager.py                      |   2 -
 .../detach-race-delete-012820ad9c8dbe16.yaml  |   6 +
 8 files changed, 127 insertions(+), 113 deletions(-)
 create mode 100644 releasenotes/notes/detach-race-delete-012820ad9c8dbe16.yaml

diff --git a/cinder/db/sqlalchemy/api.py b/cinder/db/sqlalchemy/api.py
index 41d213a3168..16618190aee 100644
--- a/cinder/db/sqlalchemy/api.py
+++ b/cinder/db/sqlalchemy/api.py
@@ -63,6 +63,9 @@ from cinder.volume import volume_utils
 CONF = cfg.CONF
 LOG = logging.getLogger(__name__)
 
+# Map with cases where attach status differs from volume status
+ATTACH_STATUS_MAP = {'attached': 'in-use', 'detached': 'available'}
+
 options.set_defaults(CONF, connection='sqlite:///$state_path/cinder.sqlite')
 
 
@@ -1727,74 +1730,83 @@ def volume_include_in_cluster(context, cluster, partial_rename=True,
                                partial_rename, filters)
 
 
+def _get_statuses_from_attachments(context, session, volume_id):
+    """Get volume status and attach_status based on existing attachments."""
+    # NOTE: Current implementation ignores attachments on error attaching,
+    # since they will not have been used by any consumer because os-brick's
+    # connect_volume has not been called yet. This leads to cases where a
+    # volume will be in 'available' state yet have attachments.
+
+    # If we sort status of attachments alphabetically, ignoring errors, the
+    # first element will be the attachment status for the volume:
+    # attached > attaching > detaching > reserved
+    attach_status = session.query(models.VolumeAttachment.attach_status).\
+        filter_by(deleted=False).\
+        filter_by(volume_id=volume_id).\
+        filter(~models.VolumeAttachment.attach_status.startswith('error_')).\
+        order_by(models.VolumeAttachment.attach_status.asc()).\
+        limit(1).\
+        scalar()
+
+    # No volume attachment records means the volume is detached.
+    attach_status = attach_status or 'detached'
+
+    # Check cases where volume status is different from attach status, and
+    # default to the same value if it's not one of those cases.
+    status = ATTACH_STATUS_MAP.get(attach_status, attach_status)
+
+    return (status, attach_status)
+
+
 @require_admin_context
 def volume_detached(context, volume_id, attachment_id):
-    """This updates a volume attachment and marks it as detached.
+    """Delete an attachment and update the volume accordingly.
 
-    This method also ensures that the volume entry is correctly
-    marked as either still attached/in-use or detached/available
-    if this was the last detachment made.
+    After marking the attachment as detached the method will decide the status
+    and attach_status values for the volume based on the current status and the
+    remaining attachments and their status.
+    Volume status may be changed to: in-use, attaching, detaching, reserved, or
+    available.
+
+    Volume attach_status will be changed to one of: attached, attaching,
+    detaching, reserved, or detached.
""" # NOTE(jdg): This is a funky band-aid for the earlier attempts at # multiattach, it's a bummer because these things aren't really being used # but at the same time we don't want to break them until we work out the # new proposal for multi-attach - remain_attachment = True session = get_session() with session.begin(): + # Only load basic volume info necessary to check various status and use + # the volume row as a lock with the for_update. + volume = _volume_get(context, volume_id, session=session, + joined_load=False, for_update=True) + try: attachment = _attachment_get(context, attachment_id, session=session) + attachment_updates = attachment.delete(session) except exception.VolumeAttachmentNotFound: attachment_updates = None - attachment = None - if attachment: - now = timeutils.utcnow() - attachment_updates = { - 'attach_status': fields.VolumeAttachStatus.DETACHED, - 'detach_time': now, - 'deleted': True, - 'deleted_at': now, - 'updated_at': attachment.updated_at, - } - attachment.update(attachment_updates) - attachment.save(session=session) - del attachment_updates['updated_at'] + status, attach_status = _get_statuses_from_attachments(context, + session, + volume_id) - attachment_list = None - volume_ref = _volume_get(context, volume_id, - session=session) - volume_updates = {'updated_at': volume_ref.updated_at} - if not volume_ref.volume_attachment: - # NOTE(jdg): We kept the old arg style allowing session exclusively - # for this one call - attachment_list = volume_attachment_get_all_by_volume_id( - context, volume_id, session=session) - remain_attachment = False - if attachment_list and len(attachment_list) > 0: - remain_attachment = True + volume_updates = {'updated_at': volume.updated_at, + 'attach_status': attach_status} - if not remain_attachment: - # Hide status update from user if we're performing volume migration - # or uploading it to image - if ((not volume_ref.migration_status and - not (volume_ref.status == 'uploading')) or - volume_ref.migration_status in ('success', 'error')): - volume_updates['status'] = 'available' + # Hide volume status update to available on volume migration or upload, + # as status is updated later on those flows. 
+        if ((attach_status != 'detached')
+                or (not volume.migration_status and volume.status != 'uploading')
+                or volume.migration_status in ('success', 'error')):
+            volume_updates['status'] = status
 
-            volume_updates['attach_status'] = (
-                fields.VolumeAttachStatus.DETACHED)
-        else:
-            # Volume is still attached
-            volume_updates['status'] = 'in-use'
-            volume_updates['attach_status'] = (
-                fields.VolumeAttachStatus.ATTACHED)
-
-        volume_ref.update(volume_updates)
-        volume_ref.save(session=session)
+        volume.update(volume_updates)
+        volume.save(session=session)
         del volume_updates['updated_at']
 
     return (volume_updates, attachment_updates)
 
 
@@ -1878,11 +1890,14 @@ def _volume_get_query(context, session=None, project_only=False,
 
 
 @require_context
-def _volume_get(context, volume_id, session=None, joined_load=True):
+def _volume_get(context, volume_id, session=None, joined_load=True,
+                for_update=False):
     result = _volume_get_query(context, session=session, project_only=True,
                                joined_load=joined_load)
     if joined_load:
         result = result.options(joinedload('volume_type.extra_specs'))
+    if for_update:
+        result = result.with_for_update()
     result = result.filter_by(id=volume_id).first()
 
     if not result:
diff --git a/cinder/db/sqlalchemy/models.py b/cinder/db/sqlalchemy/models.py
index 9d9f7b8ef12..faeb1cfb5b1 100644
--- a/cinder/db/sqlalchemy/models.py
+++ b/cinder/db/sqlalchemy/models.py
@@ -56,8 +56,10 @@ class CinderBase(models.TimestampMixin,
     def delete(self, session):
         """Delete this object."""
         updated_values = self.delete_values()
+        updated_values['updated_at'] = self.updated_at
         self.update(updated_values)
         self.save(session=session)
+        del updated_values['updated_at']
         return updated_values
 
 
@@ -393,6 +395,14 @@ class VolumeAttachment(BASE, CinderBase):
     # Stores a serialized json dict of host connector information from brick.
     connector = sa.Column(sa.Text)
 
+    @staticmethod
+    def delete_values():
+        now = timeutils.utcnow()
+        return {'deleted': True,
+                'deleted_at': now,
+                'attach_status': 'detached',
+                'detach_time': now}
+
 
 class VolumeType(BASE, CinderBase):
     """Represent possible volume_types of volumes offered."""
diff --git a/cinder/tests/unit/api/v3/test_attachments.py b/cinder/tests/unit/api/v3/test_attachments.py
index 42c7b263b0b..6adf5a37cc5 100644
--- a/cinder/tests/unit/api/v3/test_attachments.py
+++ b/cinder/tests/unit/api/v3/test_attachments.py
@@ -172,13 +172,13 @@ class AttachmentsAPITestCase(test.TestCase):
         self.controller.delete(req, attachment.id)
 
         volume2 = objects.Volume.get_by_id(self.ctxt, volume1.id)
-        if status == 'reserved':
-            self.assertEqual('detached', volume2.attach_status)
-            self.assertRaises(
-                exception.VolumeAttachmentNotFound,
-                objects.VolumeAttachment.get_by_id, self.ctxt, attachment.id)
-        else:
-            self.assertEqual('attached', volume2.attach_status)
+        # Volume and attachment status is changed on the API service
+        self.assertEqual('detached', volume2.attach_status)
+        self.assertEqual('available', volume2.status)
+        self.assertRaises(
+            exception.VolumeAttachmentNotFound,
+            objects.VolumeAttachment.get_by_id, self.ctxt, attachment.id)
+        if status != 'reserved':
             mock_delete.assert_called_once_with(req.environ['cinder.context'],
                                                 attachment.id, mock.ANY)
 
diff --git a/cinder/tests/unit/attachments/test_attachments_manager.py b/cinder/tests/unit/attachments/test_attachments_manager.py
index 0aa87661f16..201ebccc66b 100644
--- a/cinder/tests/unit/attachments/test_attachments_manager.py
+++ b/cinder/tests/unit/attachments/test_attachments_manager.py
@@ -18,7 +18,6 @@ from oslo_utils import importutils
 from cinder import context
 from cinder import db
 from cinder.db.sqlalchemy import api as sqla_db
-from cinder import exception
 from cinder.objects import fields
 from cinder.objects import volume_attachment
 from cinder.tests.unit.api.v2 import fakes as v2_fakes
@@ -137,13 +136,21 @@ class AttachmentManagerTestCase(test.TestCase):
         attachment_ref = db.volume_attachment_get(
             self.context,
             attachment_ref['id'])
+
+        vref.refresh()
+        expected_status = (vref.status, vref.attach_status,
+                           attachment_ref.attach_status)
+
         self.manager.attachment_delete(self.context,
                                        attachment_ref['id'],
                                        vref)
-        self.assertRaises(exception.VolumeAttachmentNotFound,
-                          db.volume_attachment_get,
-                          self.context,
-                          attachment_ref.id)
+        # Manager doesn't change the resource status. It is changed on the API
+        attachment_ref = db.volume_attachment_get(self.context,
+                                                  attachment_ref.id)
+        vref.refresh()
+        self.assertEqual(
+            expected_status,
+            (vref.status, vref.attach_status, attachment_ref.attach_status))
 
     def test_attachment_delete_remove_export_fail(self):
         """attachment_delete removes attachment on remove_export failure."""
@@ -160,15 +167,18 @@ class AttachmentManagerTestCase(test.TestCase):
         attach = db.volume_attach(self.context, values)
         # Confirm the volume OVO has the attachment before the deletion
         vref.refresh()
+        expected_vol_status = (vref.status, vref.attach_status)
         self.assertEqual(1, len(vref.volume_attachment))
 
         self.manager.attachment_delete(self.context, attach.id, vref)
 
-        # Attachment has been removed from the DB
-        self.assertRaises(exception.VolumeAttachmentNotFound,
-                          db.volume_attachment_get, self.context, attach.id)
-        # Attachment has been removed from the volume OVO attachment list
-        self.assertEqual(0, len(vref.volume_attachment))
+        # Manager doesn't change the resource status. It is changed on the API
+        attachment = db.volume_attachment_get(self.context, attach.id)
+        self.assertEqual(attach.attach_status, attachment.attach_status)
+
+        vref = db.volume_get(self.context, vref.id)
+        self.assertEqual(expected_vol_status,
+                         (vref.status, vref.attach_status))
 
     def test_attachment_delete_multiple_attachments(self):
         volume_params = {'status': 'available'}
diff --git a/cinder/tests/unit/test_db_api.py b/cinder/tests/unit/test_db_api.py
index 8ee84e13201..987f278322b 100644
--- a/cinder/tests/unit/test_db_api.py
+++ b/cinder/tests/unit/test_db_api.py
@@ -644,11 +644,13 @@ class DBAPIVolumeTestCase(BaseTest):
                   'instance_uuid': instance_uuid,
                   'attach_status': fields.VolumeAttachStatus.ATTACHING, }
         attachment = db.volume_attach(self.ctxt, values)
+        db.volume_attached(self.ctxt, attachment.id, instance_uuid,
+                           None, '/tmp')
         values2 = {'volume_id': volume.id,
                    'instance_uuid': fake.OBJECT_ID,
                    'attach_status': fields.VolumeAttachStatus.ATTACHING, }
-        db.volume_attach(self.ctxt, values2)
-        db.volume_attached(self.ctxt, attachment.id,
+        attachment2 = db.volume_attach(self.ctxt, values2)
+        db.volume_attached(self.ctxt, attachment2.id,
                            instance_uuid,
                            None, '/tmp')
         volume_updates, attachment_updates = (
diff --git a/cinder/volume/api.py b/cinder/volume/api.py
index f2923812877..4fa5b524e09 100644
--- a/cinder/volume/api.py
+++ b/cinder/volume/api.py
@@ -2261,53 +2261,26 @@ class API(base.Base):
         volume = attachment.volume
 
         if attachment.attach_status == fields.VolumeAttachStatus.RESERVED:
-            volume_utils.notify_about_volume_usage(ctxt, volume,
-                                                   "detach.start")
-            volume.finish_detach(attachment.id)
-            do_notify = True
+            volume_utils.notify_about_volume_usage(ctxt,
+                                                   volume, "detach.start")
         else:
-            self.volume_rpcapi.attachment_delete(ctxt,
-                                                 attachment.id,
-                                                 volume)
-            do_notify = False
-            status_updates = {'status': 'available',
-                              'attach_status': 'detached'}
-        remaining_attachments = AO_LIST.get_all_by_volume_id(ctxt, volume.id)
-        LOG.debug("Remaining volume attachments: %s", remaining_attachments,
-                  resource=volume)
+            # Generate the detach.start notification on the volume service to
+            # include the host doing the operation.
+            self.volume_rpcapi.attachment_delete(ctxt, attachment.id, volume)
 
-        # NOTE(jdg) Try and figure out the > state we have left and set that
-        # attached > attaching > > detaching > reserved
-        pending_status_list = []
-        for attachment in remaining_attachments:
-            pending_status_list.append(attachment.attach_status)
-            LOG.debug("Adding status of: %s to pending status list "
-                      "for volume.", attachment.attach_status,
-                      resource=volume)
+        # Trigger attachments lazy load (missing since volume was loaded in the
+        # attachment without joined tables). With them loaded the finish_detach
+        # call removes the detached one from the list, and the notification and
+        # return have the right attachment list.
+        volume.volume_attachment
+        volume.finish_detach(attachment.id)
 
-        LOG.debug("Pending status list for volume during "
-                  "attachment-delete: %s",
-                  pending_status_list, resource=volume)
-        if 'attached' in pending_status_list:
-            status_updates['status'] = 'in-use'
-            status_updates['attach_status'] = 'attached'
-        elif 'attaching' in pending_status_list:
-            status_updates['status'] = 'attaching'
-            status_updates['attach_status'] = 'attaching'
-        elif 'detaching' in pending_status_list:
-            status_updates['status'] = 'detaching'
-            status_updates['attach_status'] = 'detaching'
-        elif 'reserved' in pending_status_list:
-            status_updates['status'] = 'reserved'
-            status_updates['attach_status'] = 'reserved'
-
-        volume.status = status_updates['status']
-        volume.attach_status = status_updates['attach_status']
-        volume.save()
-
-        if do_notify:
-            volume_utils.notify_about_volume_usage(ctxt, volume, "detach.end")
-        return remaining_attachments
+        # Do end notification on the API so it comes after finish_detach.
+        # Doing it on the volume service leads to race condition from bug
+        # #1937084, and doing the notification there with the finish here leads
+        # to bug #1916980.
+        volume_utils.notify_about_volume_usage(ctxt, volume, "detach.end")
+        return volume.volume_attachment
 
 
 class HostAPI(base.Base):
diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py
index 5aba32df95f..f4478f0cee7 100644
--- a/cinder/volume/manager.py
+++ b/cinder/volume/manager.py
@@ -4951,8 +4951,6 @@ class VolumeManager(manager.CleanableManager,
                 # Failures on detach_volume and remove_export are not considered
                 # failures in terms of detaching the volume.
                 pass
-        vref.finish_detach(attachment_id)
-        self._notify_about_volume_usage(context, vref, "detach.end")
 
     # Replication group API (Tiramisu)
     def enable_replication(self,
diff --git a/releasenotes/notes/detach-race-delete-012820ad9c8dbe16.yaml b/releasenotes/notes/detach-race-delete-012820ad9c8dbe16.yaml
new file mode 100644
index 00000000000..ec7f0be27e7
--- /dev/null
+++ b/releasenotes/notes/detach-race-delete-012820ad9c8dbe16.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    `Bug #1937084 <https://bugs.launchpad.net/cinder/+bug/1937084>`_: Fixed
+    race condition between delete attachment and delete volume that can leave
+    deleted volumes stuck as attached to instances.
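
The fix relies on volume_detached() loading the volume row with
_volume_get(..., for_update=True), which SQLAlchemy turns into
SELECT ... FOR UPDATE via with_for_update(), so the row acts as a lock for
the rest of the transaction. For readers unfamiliar with that pattern, the
standalone sketch below shows it in isolation; it assumes SQLAlchemy 1.4 or
later, and the Volume model, engine and volume id are illustrative
placeholders rather than Cinder's own code.

# Standalone sketch of the SELECT ... FOR UPDATE row-locking pattern used by
# the patched volume_detached().  Assumes SQLAlchemy >= 1.4; the model and
# the in-memory SQLite engine are illustrative only (SQLite silently ignores
# FOR UPDATE, the lock is effective on MySQL/PostgreSQL).
from sqlalchemy import Column, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Volume(Base):
    __tablename__ = 'volumes'
    id = Column(String(36), primary_key=True)
    status = Column(String(255))


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

session = Session()
session.add(Volume(id='vol-1', status='in-use'))
session.commit()

# The selected row stays locked until this transaction ends, so a concurrent
# transaction that tries to update or delete the same volume row must wait,
# which is what closes the race described in this bug.
volume = session.query(Volume).filter_by(id='vol-1').with_for_update().one()
volume.status = 'available'
session.commit()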