Merge "Improve OCF script diagnostics for timed-out 'list_channels'"
This commit is contained in:
commit
c08ac40fb8
@ -1500,6 +1500,7 @@ get_monitor() {
|
|||||||
local timeout_alive
|
local timeout_alive
|
||||||
su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null"
|
su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null"
|
||||||
rc_alive=$?
|
rc_alive=$?
|
||||||
|
[ $rc_alive -eq 137 -o $rc_alive -eq 124 ] && ocf_log err "${LH} 'rabbitmqctl list_channels' timed out, per-node explanation: $(enhanced_list_channels)"
|
||||||
check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels"
|
check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels"
|
||||||
timeout_alive=$?
|
timeout_alive=$?
|
||||||
|
|
||||||
@ -1693,6 +1694,114 @@ action_stop() {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# Enhanced list_channels:
|
||||||
|
# - nodes are processed in parallel
|
||||||
|
# - report contains information about which nodes timed out
|
||||||
|
#
|
||||||
|
# 'list_channels' is used as a health-check for the current node, but it
|
||||||
|
# actually checks overall health of all nodes in the cluster. And there were
|
||||||
|
# some bugs where only one (non-local) channel became stuck, but OCF
|
||||||
|
# script was wrongfully killing local node.
|
||||||
|
#
|
||||||
|
# Hopefully all such bugs are fixed, but if not - this will allow us to
|
||||||
|
# detect such conditions.
|
||||||
|
#
|
||||||
|
# Somewhat strange implementation is due to the following reasons:
|
||||||
|
# - ability to support older versions of RabbitMQ which have reached
|
||||||
|
# end-of-life with single version of the script
|
||||||
|
# - zero dependencies - for older versions this functionality could be
|
||||||
|
# implemented as a plugin, but it'll require this plugin installation
|
||||||
|
enhanced_list_channels() {
# Runs an Erlang snippet through "${OCF_RESKEY_ctl} eval" that inspects the
# channels of every running cluster node in parallel and evaluates to a list
# of {Node, Status, OkChannelsCount} tuples, one per node. Status is 'ok',
# the unexpected term produced by the rpc call that stopped the scan, or
# 'no_data_collected' when nothing was received from the node before the
# deadline.
# Reads: TIMEOUT_ARG (falls back to 5 when unset or empty), OCF_RESKEY_ctl.
# Output: whatever su_rabbit_cmd emits for the eval (that helper is not
# visible here); the caller shown in this diff embeds it in its
# "'rabbitmqctl list_channels' timed out" log message.
|
||||||
|
# One second less than timeout of su_rabbit_cmd
# NOTE(review): presumably su_rabbit_cmd enforces TIMEOUT_ARG itself, so the
# Erlang side gets a chance to report before being killed - confirm.
# With TIMEOUT_ARG <= 1 this value is zero or negative, in which case the
# Erlang Timeout() helper below always yields 0.
|
||||||
|
local timeout=$((${TIMEOUT_ARG:-5} - 1))
|
||||||
|
|
||||||
|
# The heredoc delimiter is unquoted, so the shell substitutes $timeout into
# the Erlang text. The text contains no NUL byte, so "xargs -0" hands it to
# "eval" as one single argument.
su_rabbit_cmd "xargs -0 ${OCF_RESKEY_ctl} eval" <<EOF
|
||||||
|
SecondsToCompletion = $timeout,
|
||||||
|
|
||||||
|
%% Milliseconds since unix epoch
|
||||||
|
Now = fun() ->
|
||||||
|
{Mega, Secs, Micro} = os:timestamp(),
|
||||||
|
Mili = Micro div 1000,
|
||||||
|
Mili + 1000 * (Secs + 1000000 * Mega)
|
||||||
|
end,
|
||||||
|
|
||||||
|
%% We shouldn't continue execution past this time
|
||||||
|
ShouldEndAt = Now() + SecondsToCompletion * 1000,
|
||||||
|
|
||||||
|
%% How many milliseconds we still have
|
||||||
|
Timeout = fun() ->
|
||||||
|
case ShouldEndAt - Now() of
|
||||||
|
Past when Past =< 0 ->
|
||||||
|
0;
|
||||||
|
Timeout ->
|
||||||
|
Timeout
|
||||||
|
end
|
||||||
|
end,
|
||||||
|
|
||||||
|
%% Lambda combinator - for defining anonymous recursive functions
|
||||||
|
Y = fun(F) ->
|
||||||
|
(fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)(
|
||||||
|
fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)
|
||||||
|
end,
|
||||||
|
|
||||||
|
Parent = self(),
|
||||||
|
|
||||||
|
ListChannels = Y(fun(Rec) ->
|
||||||
|
fun (({Node, [], OkChannelsCount})) ->
|
||||||
|
Parent ! {Node, ok, OkChannelsCount};
|
||||||
|
({Node, [Chan|Rest], OkChannelsCount}) ->
|
||||||
|
case catch rpc:call(Node, rabbit_channel, info, [Chan], Timeout()) of
|
||||||
|
Infos when is_list(Infos) ->
|
||||||
|
Rec({Node, Rest, OkChannelsCount + 1});
|
||||||
|
{badrpc, {'EXIT', {noproc, _}}} ->
|
||||||
|
%% Channel became dead before we could request it's status, don't care
|
||||||
|
Rec({Node, Rest, OkChannelsCount});
|
||||||
|
Err ->
|
||||||
|
Parent ! {Node, Err, OkChannelsCount}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end),
|
||||||
|
|
||||||
|
SingleNodeListing = fun(Node) ->
|
||||||
|
case catch rpc:call(Node, pg_local, get_members, [rabbit_channels], Timeout()) of
|
||||||
|
LocalChannels when is_list(LocalChannels) ->
|
||||||
|
ListChannels({Node, LocalChannels, 0});
|
||||||
|
Err ->
|
||||||
|
Parent ! {Node, Err, 0}
|
||||||
|
end
|
||||||
|
end,
|
||||||
|
|
||||||
|
AllNodes = rabbit_mnesia:cluster_nodes(running),
|
||||||
|
[ spawn(fun() -> SingleNodeListing(Node) end) || Node <- AllNodes ],
|
||||||
|
|
||||||
|
WaitForNodes = Y(fun(Rec) ->
|
||||||
|
fun ({[], Acc}) ->
|
||||||
|
Acc;
|
||||||
|
({RemainingNodes, Acc}) ->
|
||||||
|
receive
|
||||||
|
{Node, _Status, _ChannelCount} = Smth ->
|
||||||
|
RemainingNodes1 = lists:delete(Node, RemainingNodes),
|
||||||
|
Rec({RemainingNodes1, [Smth|Acc]})
|
||||||
|
after Timeout() + 100 ->
|
||||||
|
Acc
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end),
|
||||||
|
|
||||||
|
Result = WaitForNodes({AllNodes, []}),
|
||||||
|
|
||||||
|
ExpandedResult = [ case lists:keysearch(Node, 1, Result) of
|
||||||
|
{value, NodeResult} ->
|
||||||
|
NodeResult;
|
||||||
|
false ->
|
||||||
|
{Node, no_data_collected, 0}
|
||||||
|
end || Node <- AllNodes ],
|
||||||
|
|
||||||
|
ExpandedResult.
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
#######################################################################
|
#######################################################################
|
||||||
# Join the cluster and return OCF_SUCCESS, if joined.
|
# Join the cluster and return OCF_SUCCESS, if joined.
|
||||||
# Return 10, if node is trying to join to itself or empty destination.
|
# Return 10, if node is trying to join to itself or empty destination.
|
||||||
|
Loading…
Reference in New Issue
Block a user