Wait for Group Replication to finish; 3100 before commit error

The bug is triggered, as a race, usually by the
prometheus-relation-joined hook, when it tries to create a user whilst
Group Replication is recovering during a rolling restart.  This patch
alters the create_user() method so that it detects the failure condition
and then retries for up to a minute (6 times, every 10 seconds) for the
Group Replication to recover before giving up and returning False
(indicating the the user was not created).  This will usually result in
the handler not completing during the hook, and then retrying on the
next hook.

Change-Id: I5df4fd5ecbdd2b7bce525a9930dcffbc5868cbb8
Closes-Bug: #2018385
This commit is contained in:
Alex Kavanagh 2023-05-16 20:21:43 +01:00
parent cd5176e761
commit 5fb5a05be1
2 changed files with 41 additions and 11 deletions

View File

@ -262,6 +262,7 @@ class MySQLInnoDBClusterCharm(
_read_only_error = 1290
_user_create_failed = 1396
_local_socket_connection_error = 2002
_before_commit_error = 3100
@property
def mysqlsh_bin(self):
@ -557,6 +558,9 @@ class MySQLInnoDBClusterCharm(
m_helper.execute("FLUSH PRIVILEGES")
@tenacity.retry(stop=tenacity.stop_after_attempt(6),
wait=tenacity.wait_fixed(10),
retry=tenacity.retry_if_result(lambda x: x is False))
def create_user(
self,
address,
@ -579,6 +583,19 @@ class MySQLInnoDBClusterCharm(
leadership settings. If the unit isn't clustered, but is configured,
then the function gives up.
NOTE: LP#2018383 means that the function may return False in the
prometheus-relation-joined hook if the cluster is recovering from
switching to TLS and recovering Group Replication. The function
detects this error (3100), and returns False. The tenacity retry on
the method retries the function after 10 seconds, 6 times, allowing for
1 minute for the Group Replication to recover during the hook
execution. Otherwise, the function returns False to allow the caller to
decide what to do on the failure.
The function returns None if the unit is not configured in cluster or
the helper can't connect to the local socket, neither of which are
recoverable in this function.
:param address: User's address
:type address: str
:param exporter_user: User's username
@ -588,8 +605,8 @@ class MySQLInnoDBClusterCharm(
:param privilege: User permission
:type privilege: Literal[all, read_only, prom_exporter]
:side effect: Executes SQL to create DB user
:returns: True if successful, False if there are failures
:rtype: Boolean
:returns: True if successful, False|None if there are failures
:rtype: Optional(Bool)
"""
SQL_CLUSTER_USER_CREATE = (
"CREATE USER '{user}'@'{host}' "
@ -620,7 +637,7 @@ class MySQLInnoDBClusterCharm(
"when this instance is configured for the cluster but "
"not yet in the cluster. Skipping.",
"INFO")
return False
return None
# Otherwise, this unit is not configured for the cluster
m_helper = self.get_db_helper()
@ -632,7 +649,7 @@ class MySQLInnoDBClusterCharm(
"Couldn't connect to local socket when trying to "
"create the user: '{}'.".format(user),
"WARNING")
return False
return None
raise
for address in addresses:
@ -655,16 +672,21 @@ class MySQLInnoDBClusterCharm(
return False
if e.args[0] == self._user_create_failed:
ch_core.hookenv.log(
"User {} exists."
.format(user), "WARNING")
"User {} exists.".format(user), "WARNING")
# NOTE (rgildein): This is necessary to ensure that the
# existing user has the correct privileges.
self._grant_user_privileges(
m_helper, address, user, user_privilege,
)
continue
else:
raise
if e.args[0] == self._before_commit_error:
ch_core.hookenv.log(
"Couldn't commit due to error {}; most likely the "
"cluster's group replication recovery is in process "
.format(self._before_commit_error),
"WARNING")
return False
raise
return True
def configure_instance(self, address):

View File

@ -504,7 +504,7 @@ class TestMySQLInnoDBClusterCharm(test_utils.PatchHelper):
# test no cluster rw_db_helper, configured, but not yet in cluster
self.is_flag_set.side_effect = [True, False]
mock_cluster_address.return_value = "10.1.2.3"
self.assertFalse(midbc.create_user(_addr, _user, _pass, "all"))
self.assertIsNone(midbc.create_user(_addr, _user, _pass, "all"))
self.is_flag_set.assert_has_calls([
mock.call("leadership.set.{}".format(
mysql_innodb_cluster.make_cluster_instance_configured_key(
@ -567,7 +567,7 @@ class TestMySQLInnoDBClusterCharm(test_utils.PatchHelper):
_helper.reset_mock()
# _helper.connect.side_effect = _error
_helper.connect.side_effect = FakeException(2002, "Failed connection")
self.assertFalse(midbc.create_user(_localhost, _user, _pass, "all"))
self.assertIsNone(midbc.create_user(_localhost, _user, _pass, "all"))
_helper.connect.assert_called_once_with(password=mock.ANY)
# _helper.connect raises a different error.
@ -593,7 +593,15 @@ class TestMySQLInnoDBClusterCharm(test_utils.PatchHelper):
_helper.reset_mock()
_helper.execute.side_effect = (
self._exceptions.OperationalError(1290, "Super read only"))
self.assertFalse(midbc.create_user(_localhost, _user, _pass, "all"))
self.assertEqual(midbc.create_user(_localhost, _user, _pass, "all"),
False)
# before commit error (3100)
_helper.reset_mock()
_helper.execute.side_effect = (
self._exceptions.OperationalError(3100, "Before commit error"))
self.assertEqual(midbc.create_user(_localhost, _user, _pass, "all"),
False)
# Unhandled Exception
_helper.reset_mock()