charm-rabbitmq-server/actions/actions.py
Liam Young 3d5e1e22d8 Coordination module for rabbit restarts
Use the coordination module to manage restarting the rabbitmq
services. This is to ensure that restarts are only
performed on one unit at a time. This helps prevent
situations which can cause the cluster to become
split-brained (e.g. if two or more nodes are restarted at the
same time).

* Manually run the _run_atstart & _run_atexit methods when actions
  are run, as this does not happen automatically and is needed by
  the coordination layer.
* Replace restart_on_change decorator with
  coordinated_restart_on_change. coordinated_restart_on_change
  includes logic for requesting restart locks from the coordination
  module.
* The coordination module works via the leader and cluster events so
  the hooks now include calls to check_coordinated_functions
  which will run any function that is waiting for a lock.
* Logic has been added to check for the situation where a hook is
  being run via the run_deferred_hooks action. If this is the
  case then restarts are immediate, as the action should only be run
  on one unit at a time.

Change-Id: Ia133c90a610793d4da96d3400a3906b801b52b73
2022-02-17 11:06:00 +00:00

341 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
#
# Copyright 2016 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
from collections import OrderedDict
from subprocess import check_output, CalledProcessError, PIPE
import sys
# Locate this script on disk, then derive the parent directory and its
# 'hooks' sibling from it. Both are added to sys.path below, before the
# remaining imports in this file run.
_path = os.path.dirname(os.path.realpath(__file__))
_root = os.path.abspath(os.path.join(_path, '..'))
_hooks = os.path.abspath(os.path.join(_path, '../hooks'))


def _add_path(path):
    """Put ``path`` on ``sys.path`` unless it is already listed.

    The entry is inserted at index 1, so the existing first entry of
    ``sys.path`` keeps its position.

    :param path: Directory to make importable
    :type path: str
    """
    if path in sys.path:
        return
    sys.path.insert(1, path)


_add_path(_root)
_add_path(_hooks)
import charmhelpers.contrib.openstack.deferred_events as deferred_events
import charmhelpers.contrib.openstack.utils as os_utils
from charmhelpers import coordinator
from charmhelpers.core.host import (
service_start,
service_stop,
)
from charmhelpers.core.hookenv import (
action_fail,
action_set,
action_get,
is_leader,
leader_set,
log,
INFO,
ERROR,
_run_atstart,
_run_atexit,
)
from charmhelpers.core.host import (
cmp_pkgrevno,
)
import rabbitmq_server_relations
from hooks.rabbit_utils import (
ConfigRenderer,
CONFIG_FILES,
pause_unit_helper,
resume_unit_helper,
assess_status,
list_vhosts,
vhost_queue_info,
rabbitmq_version_newer_or_equal,
)
def pause(args):
    """Pause the RabbitMQ services on this unit.

    Builds a ConfigRenderer over CONFIG_FILES() and passes it to
    pause_unit_helper.

    :param args: Unused
    :type args: List[str]
    @raises Exception should the service fail to stop.
    """
    renderer = ConfigRenderer(CONFIG_FILES())
    pause_unit_helper(renderer)
def resume(args):
    """Resume the RabbitMQ services on this unit.

    Builds a ConfigRenderer over CONFIG_FILES() and passes it to
    resume_unit_helper.

    :param args: Unused
    :type args: List[str]
    @raises Exception should the service fail to start.
    """
    renderer = ConfigRenderer(CONFIG_FILES())
    resume_unit_helper(renderer)
def cluster_status(args):
    """Publish the output of 'rabbitmqctl cluster_status' as action output.

    When rabbitmq_version_newer_or_equal('3.7') is true the command is run
    with '--formatter json' and the result is re-serialised with an indent
    of 4; otherwise the raw command output is published unchanged.

    If the command exits non-zero its captured output is published and the
    action is marked as failed. Any other exception propagates to the
    caller.

    :param args: Unused
    :type args: List[str]
    """
    try:
        # The version probe stays inside the try so that a
        # CalledProcessError raised by it is reported like a command failure.
        wants_json = rabbitmq_version_newer_or_equal('3.7')
        command = ['rabbitmqctl', 'cluster_status']
        if wants_json:
            command += ['--formatter', 'json']
        status_text = check_output(command, universal_newlines=True)
        if wants_json:
            # Round-trip through json to pretty-print the status.
            status_text = json.dumps(json.loads(status_text), indent=4)
        action_set({'output': status_text})
    except CalledProcessError as err:
        action_set({'output': err.output})
        action_fail('Failed to run rabbitmqctl cluster_status')
def check_queues(args):
    """Report queues holding at least ``queue-depth`` messages.

    Reads the ``queue-depth`` and ``vhost`` action parameters, runs
    'rabbitmqctl list_queues -q -p <vhost>' and publishes a mapping of
    queue name to message count for every queue whose count is greater
    than or equal to ``queue-depth``.

    If the command exits non-zero, its captured output is published (as
    text, not as a bytes repr) and the action is marked as failed.

    :param args: Unused
    :type args: List[str]
    """
    queue_depth = action_get('queue-depth')
    vhost = action_get('vhost')
    # rabbitmqctl's output contains lines we don't want, such as
    # 'Listing queues ..' and '...done.', which may vary by release.
    # Actual queue results *should* always look like 'test\t0'
    queue_pattern = re.compile(r"^(.*)\t([0-9]+$)")
    try:
        raw_listing = check_output(
            ['rabbitmqctl', 'list_queues', '-q', '-p', vhost])
        result = {}
        for line in raw_listing.decode('utf-8').splitlines():
            matched = queue_pattern.match(line)
            if matched is None:
                # Not a '<name>\t<count>' record; skip banner/blank lines.
                continue
            size = int(matched.group(2))
            if size >= queue_depth:
                result[matched.group(1)] = size
        action_set({'output': result, 'outcome': 'Success'})
    except CalledProcessError as e:
        # check_output ran in bytes mode, so e.output is bytes; decode it so
        # the action result carries readable text rather than "b'...'".
        error_output = e.output
        if isinstance(error_output, bytes):
            error_output = error_output.decode('utf-8', errors='replace')
        action_set({'output': error_output})
        action_fail('Failed to run rabbitmqctl list_queues')
def complete_cluster_series_upgrade(args):
    """Complete the series upgrade process.

    After all nodes have been upgraded, this action is run to inform the
    whole cluster that the upgrade is done. On the leader the
    ``cluster_series_upgrading`` leader setting is reset to an empty
    string; the unit's status is then re-assessed.

    :param args: Unused
    :type args: List[str]
    """
    if is_leader():
        # An empty value unsets cluster_series_upgrading.
        leader_set(cluster_series_upgrading="")
    renderer = ConfigRenderer(CONFIG_FILES())
    assess_status(renderer)
def forget_cluster_node(args):
    """Remove previously departed node from cluster.

    Reads the ``node`` action parameter and runs
    'rabbitmqctl forget_cluster_node <node>'. The action fails without
    running the command when the installed rabbitmq-server package is older
    than 3.0.0.

    On success the command's stdout is published as the action output. On
    failure the command's stderr is published (decoded to text) and the
    action is marked as failed; exit code 2 gets a more specific message.

    :param args: Unused
    :type args: List[str]
    """
    node = action_get('node')
    if cmp_pkgrevno('rabbitmq-server', '3.0.0') < 0:
        action_fail(
            'rabbitmq-server version < 3.0.0, '
            'forget_cluster_node not supported.')
        return
    try:
        output = check_output(
            ['rabbitmqctl', 'forget_cluster_node', node],
            stderr=PIPE)
        action_set({'output': output.decode('utf-8'), 'outcome': 'Success'})
    except CalledProcessError as e:
        # stderr was captured in bytes mode; decode it so neither the action
        # output nor the failure message embeds a "b'...'" repr.
        error_text = e.stderr
        if isinstance(error_text, bytes):
            error_text = error_text.decode('utf-8', errors='replace')
        action_set({'output': error_text})
        if e.returncode == 2:
            action_fail(
                "Unable to remove node '{}' from cluster. It is either still "
                "running or already removed. (Output: '{}')"
                .format(node, error_text))
        else:
            action_fail('Failed running rabbitmqctl forget_cluster_node')
def list_unconsumed_queues(args):
    """List queues which are unconsumed in RabbitMQ.

    Walks every vhost returned by list_vhosts() and publishes one
    ``unconsumed-queues.<n>`` action result (a JSON object with the vhost,
    queue name and message count) for each queue reporting zero consumers.
    The total is published as ``unconsumed-queue-count``.

    :param args: Unused
    :type args: List[str]
    :returns: False if querying a vhost fails, otherwise None
    """
    log("Listing unconsumed queues...", level=INFO)
    found = 0
    for vhost in list_vhosts():
        try:
            vhost_queues = vhost_queue_info(vhost)
        except CalledProcessError as err:
            # if no queues, just raises an exception
            action_set({'output': err.output,
                        'return-code': err.returncode})
            action_fail(
                "Failed to query RabbitMQ vhost {} queues".format(vhost))
            return False

        for queue in vhost_queues:
            if queue['consumers'] != 0:
                continue
            # Pre-set so the error log below can always format both values.
            result_key = ''
            payload = ''
            try:
                result_key = "unconsumed-queues.{}".format(found)
                payload = OrderedDict([
                    ('vhost', vhost),
                    ('name', queue['name']),
                    ('messages', queue['messages']),
                ])
                action_set({result_key: json.dumps(payload)})
            except Exception as err:
                log('{}, vhostqueue={}, value={}'.format(
                    err, result_key, payload), level=ERROR)
            # Counted even when publishing the entry failed.
            found += 1

    log("{} unconsumed queue(s) found".format(found), level=INFO)
    action_set({'unconsumed-queue-count': found})
def force_boot(args):
    """Set the force_boot flag and start RabbitMQ broker.

    Runs three steps in order: stop the 'rabbitmq-server' service, run
    'rabbitmqctl force_boot' (publishing its output), then start the
    service again. A CalledProcessError in any step publishes that error's
    output, marks the action as failed and stops the sequence.

    :param args: Unused
    :type args: List[str]
    :returns: False if a step raised CalledProcessError, otherwise None
    """
    try:
        service_stop('rabbitmq-server')
    except CalledProcessError as stop_err:
        action_set({'output': stop_err.output})
        action_fail('Failed to stop rabbitmqctl service')
        return False

    try:
        boot_output = check_output(['rabbitmqctl', 'force_boot'],
                                   universal_newlines=True)
        action_set({'output': boot_output})
    except CalledProcessError as boot_err:
        action_set({'output': boot_err.output})
        action_fail('Failed to run rabbitmqctl force_boot')
        return False

    try:
        service_start('rabbitmq-server')
    except CalledProcessError as start_err:
        action_set({'output': start_err.output})
        action_fail('Failed to start rabbitmqctl service after force_boot')
        return False
def restart(args):
    """Restart services.

    Reads the ``deferred-only``, ``services`` and ``run-hooks`` action
    parameters. Exactly one of ``deferred-only`` and ``services`` must be
    supplied; otherwise the action is marked as failed and nothing is
    restarted. When ``run-hooks`` is set, deferred hooks are run before the
    restart. The unit's status is re-assessed afterwards.

    :param args: Unused
    :type args: List[str]
    """
    only_deferred = action_get("deferred-only")
    requested = action_get("services").split()

    # Check input
    problem = None
    if only_deferred and requested:
        problem = "Cannot set deferred-only and services"
    elif not only_deferred and not requested:
        problem = "Please specify deferred-only or services"
    if problem is not None:
        action_fail(problem)
        return

    if action_get('run-hooks'):
        _run_deferred_hooks()

    if only_deferred:
        restart_kwargs = {'deferred_only': True}
    else:
        restart_kwargs = {'services': requested}
    os_utils.restart_services_action(**restart_kwargs)
    assess_status(ConfigRenderer(CONFIG_FILES()))
def _run_deferred_hooks():
    """Run supported deferred hooks as needed.

    Does nothing when deferred_events.is_restart_permitted() is true.
    Otherwise each supported hook (config-changed, amqp-relation-changed)
    that is listed as deferred is run with check_deferred_restarts=False
    and then cleared, and the hooks still deferred are logged.

    If support for deferring a new hook is added to the charm then this
    method will need updating.
    """
    if deferred_events.is_restart_permitted():
        return

    if 'config-changed' in deferred_events.get_deferred_hooks():
        log("Running hook config-changed", level=INFO)
        rabbitmq_server_relations.config_changed(
            check_deferred_restarts=False)
        deferred_events.clear_deferred_hook('config-changed')

    # The deferred list is re-read because the hook above may have run.
    if 'amqp-relation-changed' in deferred_events.get_deferred_hooks():
        log("Running hook amqp-relation-changed", level=INFO)
        # update_clients cycles through amqp relations running
        # amqp-relation-changed hook.
        rabbitmq_server_relations.update_clients(
            check_deferred_restarts=False)
        deferred_events.clear_deferred_hook('amqp-relation-changed')

    remaining = deferred_events.get_deferred_hooks()
    log("Remaining hooks: {}".format(remaining), level=INFO)
def run_deferred_hooks(args):
    """Run deferred hooks.

    Runs the supported deferred hooks, asks os_utils to restart services
    with deferred_only=True, then re-assesses the unit's status.

    :param args: Unused
    :type args: List[str]
    """
    _run_deferred_hooks()
    os_utils.restart_services_action(deferred_only=True)
    renderer = ConfigRenderer(CONFIG_FILES())
    assess_status(renderer)
def show_deferred_events(args):
    """Show the deferred events.

    Delegates entirely to os_utils.show_deferred_events_action_helper().

    :param args: Unused
    :type args: List[str]
    :returns: None
    """
    os_utils.show_deferred_events_action_helper()
    return None
# A dictionary of all the defined actions to callables (which take
# parsed arguments).
# Keys are action names as looked up by main(), which derives the name from
# the basename of the invoked script path (args[0]); values are the handler
# functions defined above, each called with the full argument list.
ACTIONS = {
    "pause": pause,
    "resume": resume,
    "cluster-status": cluster_status,
    "check-queues": check_queues,
    "complete-cluster-series-upgrade": complete_cluster_series_upgrade,
    "forget-cluster-node": forget_cluster_node,
    "list-unconsumed-queues": list_unconsumed_queues,
    "force-boot": force_boot,
    "restart-services": restart,
    "run-deferred-hooks": run_deferred_hooks,
    "show-deferred-events": show_deferred_events,
}
def main(args):
    """Dispatch to the action named by the invoked script.

    The action name is the basename of ``args[0]`` and is looked up in
    ACTIONS. Any exception raised by the action is converted into an action
    failure rather than propagated.

    NOTE: when the action name is not defined the function returns early,
    so the atexit callbacks are not run on that path.

    :param args: Command line arguments; args[0] selects the action
    :type args: List[str]
    :returns: An error message if the action is undefined, otherwise None
    :rtype: Optional[str]
    """
    _ = coordinator.Serial()
    # Manually trigger any registered atstart events as these are not
    # triggered by actions
    _run_atstart()

    action_name = os.path.basename(args[0])
    if action_name not in ACTIONS:
        message = "Action {} undefined".format(action_name)
        action_fail(message)
        return message

    handler = ACTIONS[action_name]
    try:
        handler(args)
    except Exception as err:
        action_fail("Action {} failed: {}".format(action_name, str(err)))

    # Counterpart of the manual _run_atstart() call above.
    _run_atexit()
if __name__ == "__main__":
    # main() returns an error message for an undefined action, else None;
    # sys.exit turns that into the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)