From ecef670efcfdd4a7101f6e81bd8cd8d71cb542e6 Mon Sep 17 00:00:00 2001 From: Arnaud Morin Date: Thu, 20 Nov 2025 15:29:52 +0100 Subject: [PATCH] Do not delete queues on error When a queue is in error, we used to delete the faulty queue. The problem with that code is that under heavy load, for queues that are shared across agents, this could fail in a loop and never recover. Agent A detects the queue is broken: it deletes the queue and then recreates it. In the meantime, Agent B detects the same thing: it deletes the queue and then recreates it as well. Now imagine this with more than 3k agents... Change-Id: I762bf2839482ee06c5b2fc9fa50f38f5542cbe99 Closes-bug: #2133389 Signed-off-by: Arnaud Morin --- oslo_messaging/_drivers/impl_rabbit.py | 26 ------------------- ...void-quorum-deletion-ec5fede2d4ecb001.yaml | 5 ++++ 2 files changed, 5 insertions(+), 26 deletions(-) create mode 100644 releasenotes/notes/avoid-quorum-deletion-ec5fede2d4ecb001.yaml diff --git a/oslo_messaging/_drivers/impl_rabbit.py b/oslo_messaging/_drivers/impl_rabbit.py index 94506dca8..88001e17d 100644 --- a/oslo_messaging/_drivers/impl_rabbit.py +++ b/oslo_messaging/_drivers/impl_rabbit.py @@ -558,14 +558,6 @@ class Consumer: 'Queue: [%(queue)s], ' 'error message: [%(err_str)s]', info) time.sleep(interval) - if self.queue_arguments.get('x-queue-type') == 'quorum': - # Before re-declare queue, try to delete it - # This is helping with issue #2028384 - # NOTE(amorin) we need to make sure the connection is - # established again, because when an error occur, the - # connection is closed. 
- conn.ensure_connection() - self.queue.delete() self.queue.declare() else: raise @@ -608,24 +600,6 @@ class Consumer: nowait=self.nowait) else: raise - except amqp_ex.InternalError as exc: - if self.queue_arguments.get('x-queue-type') == 'quorum': - # Before re-consume queue, try to delete it - # This is helping with issue #2028384 - if exc.code == 541: - LOG.warning('Queue %s seems broken, will try delete it ' - 'before starting over.', self.queue.name) - # NOTE(amorin) we need to make sure the connection is - # established again, because when an error occur, the - # connection is closed. - conn.ensure_connection() - self.queue.delete() - self.declare(conn) - self.queue.consume(callback=self._callback, - consumer_tag=str(tag), - nowait=self.nowait) - else: - raise def cancel(self, tag): LOG.trace('ConsumerBase.cancel: canceling %s', tag) diff --git a/releasenotes/notes/avoid-quorum-deletion-ec5fede2d4ecb001.yaml b/releasenotes/notes/avoid-quorum-deletion-ec5fede2d4ecb001.yaml new file mode 100644 index 000000000..efe7d4c1e --- /dev/null +++ b/releasenotes/notes/avoid-quorum-deletion-ec5fede2d4ecb001.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Avoid deleting RabbitMQ ``quorum`` queues if they are failing on server + side with ``Internal Server Error`` (error ``541``).