From 4540cd6ef9731837cf4ed1e56850bcfa3512ae1c Mon Sep 17 00:00:00 2001
From: Matthew Booth <mbooth@redhat.com>
Date: Mon, 13 May 2019 16:04:39 +0100
Subject: [PATCH] Fix retry of instance_update_and_get_original

_instance_update modifies its 'values' argument. Consequently if it is
retried due to an update conflict, the second invocation has the wrong
arguments.

A specific issue this causes is that if we called it with
expected_task_state a concurrent modification to task_state will cause
us to fail and retry. However, expected_task_state will have been popped
from values on the first invocation and will not be present for the
second. Consequently the second invocation will fail to perform the
task_state check and therefore succeed, resulting in a race.

We rewrite the old race unit test which wasn't testing the correct
thing for 2 reasons:

1. Due to the bug fixed in this patch, although we were calling
   update_on_match() twice, the second call didn't check the task state.
2. side_effect=iterable returns function items without executing them,
   but we weren't hitting this due to the bug fixed in this patch.

Closes-Bug: #1821373
Change-Id: I01c63e685113bf30e687ccb14a4d18e344b306f6
(cherry picked from commit aae5c7aa3819ad9161fd2effed3872d540099230)
(cherry picked from commit 61fef49b1555c6509398fc47c21319c1b5e50505)
---
 nova/db/sqlalchemy/api.py         | 21 +++++++-----
 nova/tests/unit/db/test_db_api.py | 56 ++++++++++++++++++++++++-------
 2 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 37d1beed29a7..5cf77cb2be80 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -2747,6 +2747,11 @@ def _instance_update(context, instance_uuid, values, expected, original=None):
     if not uuidutils.is_uuid_like(instance_uuid):
         raise exception.InvalidUUID(instance_uuid)
 
+    # NOTE(mdbooth): We pop values from this dict below, so we copy it here to
+    # ensure there are no side effects for the caller or if we retry the
+    # function due to a db conflict.
+    updates = copy.copy(values)
+
     if expected is None:
         expected = {}
     else:
@@ -2758,8 +2763,8 @@ def _instance_update(context, instance_uuid, values, expected, original=None):
     # updates
     for field in ('task_state', 'vm_state'):
         expected_field = 'expected_%s' % field
-        if expected_field in values:
-            value = values.pop(expected_field, None)
+        if expected_field in updates:
+            value = updates.pop(expected_field, None)
             # Coerce all single values to singleton lists
             if value is None:
                 expected[field] = [None]
@@ -2767,23 +2772,23 @@ def _instance_update(context, instance_uuid, values, expected, original=None):
                 expected[field] = sqlalchemyutils.to_list(value)
 
     # Values which need to be updated separately
-    metadata = values.pop('metadata', None)
-    system_metadata = values.pop('system_metadata', None)
+    metadata = updates.pop('metadata', None)
+    system_metadata = updates.pop('system_metadata', None)
 
-    _handle_objects_related_type_conversions(values)
+    _handle_objects_related_type_conversions(updates)
 
     # Hostname is potentially unique, but this is enforced in code rather
     # than the DB. The query below races, but the number of users of
     # osapi_compute_unique_server_name_scope is small, and a robust fix
     # will be complex. This is intentionally left as is for the moment.
-    if 'hostname' in values:
-        _validate_unique_server_name(context, values['hostname'])
+    if 'hostname' in updates:
+        _validate_unique_server_name(context, updates['hostname'])
 
     compare = models.Instance(uuid=instance_uuid, **expected)
     try:
         instance_ref = model_query(context, models.Instance,
                                    project_only=True).\
-                       update_on_match(compare, 'uuid', values)
+                       update_on_match(compare, 'uuid', updates)
     except update_match.NoRowsMatched:
         # Update failed. Try to find why and raise a specific error.
 
diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py
index 0433f126b1ba..e75d5e356ddc 100644
--- a/nova/tests/unit/db/test_db_api.py
+++ b/nova/tests/unit/db/test_db_api.py
@@ -44,6 +44,7 @@ from sqlalchemy import inspect
 from sqlalchemy import Integer
 from sqlalchemy import MetaData
 from sqlalchemy.orm import query
+from sqlalchemy.orm import session as sqla_session
 from sqlalchemy import sql
 from sqlalchemy import Table
 
@@ -2558,21 +2559,50 @@ class InstanceTestCase(test.TestCase, ModelsObjectComparatorMixin):
         test(self.ctxt)
 
     def test_instance_update_and_get_original_conflict_race(self):
-        # Ensure that we retry if update_on_match fails for no discernable
-        # reason
-        instance = self.create_instance_with_args()
+        # Ensure that we correctly process expected_task_state when retrying
+        # due to an unknown conflict
 
-        orig_update_on_match = update_match.update_on_match
+        # This requires modelling the MySQL read view, which means that if we
+        # have read something in the current transaction and we read it again,
+        # we will read the same data every time even if another committed
+        # transaction has since altered that data. In this test we have an
+        # instance whose task state was originally None, but has been set to
+        # SHELVING by another, concurrent transaction. Therefore the first time
+        # we read the data we will read None, but when we restart the
+        # transaction we will read the correct data.
 
-        # Reproduce the conditions of a race between fetching and updating the
-        # instance by making update_on_match fail for no discernable reason the
-        # first time it is called, but work normally the second time.
-        with mock.patch.object(update_match, 'update_on_match',
-                        side_effect=[update_match.NoRowsMatched,
-                                     orig_update_on_match]):
-            db.instance_update_and_get_original(
-                self.ctxt, instance['uuid'], {'metadata': {'mk1': 'mv3'}})
-            self.assertEqual(update_match.update_on_match.call_count, 2)
+        instance = self.create_instance_with_args(
+                task_state=task_states.SHELVING)
+
+        instance_out_of_date = copy.copy(instance)
+        instance_out_of_date['task_state'] = None
+
+        # NOTE(mdbooth): SQLA magic which makes this dirty object look
+        # like a freshly loaded one.
+        sqla_session.make_transient(instance_out_of_date)
+        sqla_session.make_transient_to_detached(instance_out_of_date)
+
+        # update_on_match will fail first time because the actual task state
+        # (SHELVING) doesn't match the expected task state (None). However,
+        # we ensure that the first time we fetch the instance object we get
+        # out-of-date data. This forces us to retry the operation to find out
+        # what really went wrong.
+        with mock.patch.object(sqlalchemy_api, '_instance_get_by_uuid',
+                    side_effect=[instance_out_of_date, instance]), \
+                 mock.patch.object(sqlalchemy_api, '_instance_update',
+                     side_effect=sqlalchemy_api._instance_update):
+            self.assertRaises(exception.UnexpectedTaskStateError,
+                              db.instance_update_and_get_original,
+                              self.ctxt, instance['uuid'],
+                              {'expected_task_state': [None]})
+            sqlalchemy_api._instance_update.assert_has_calls([
+                mock.call(self.ctxt, instance['uuid'],
+                          {'expected_task_state': [None]}, None,
+                          original=instance_out_of_date),
+                mock.call(self.ctxt, instance['uuid'],
+                          {'expected_task_state': [None]}, None,
+                          original=instance),
+            ])
 
     def test_instance_update_and_get_original_conflict_race_fallthrough(self):
         # Ensure that is update_match continuously fails for no discernable