Fix neutron-ha-tool for active/passive usage

The neutron-ha-tool Pacemaker resource primitive is only intended to be run on a single node at a time, i.e. in active/passive mode, rather than as a clone. However until now, the RA didn't change behaviour depending on whether it was supposed to be active on the current node. So if Pacemaker did a probe on a node where it was not expecting it to be active, the monitor action would typically return OCF_SUCCESS, causing messages from pengine like: error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on 2 nodes attempting recovery warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active for more information. and then Pacemaker could attempt unnecessary recovery according to the value of the cluster-wide "multiple-active" option, which defaults to "stop-start". This would stop the resource everywhere (which is a noop), and then start it on one node, resulting in unnecessary cluster transitions and unnecessary runs of this RA's "start" action. To avoid this, we introduce a state file to keep track of whether it's active on the current node, and if not, skip the l3-agent check and always return OCF_NOT_RUNNING. This is the same technique already used by NovaEvacuate. Change-Id: I459e49d27802552ef5424d290ef3fca51640723b Closes-Bug: #1555711 Signed-off-by: Adam Spiers <aspiers@suse.com>
2016-03-10 15:53:41 +00:00 · 2016-03-10 15:53:41 +00:00 · 34447f8fa8
parent 04051d7bb6
commit 34447f8fa8
1 changed files with 35 additions and 1 deletions
--- a/ocf/neutron-ha-tool
+++ b/ocf/neutron-ha-tool
@ -192,6 +192,26 @@ neutron_ha_tool_status() {
 }

 neutron_ha_tool_monitor() {
+    if ! [ -e "$statefile" ]; then
+        # neutron-ha-tool is run on a single node at a time, i.e. in
+        # active/passive mode.  So we use this state file to keep
+        # track of whether it's active on the current node, and if
+        # Pacemaker does a probe on a node where it's not active, we
+        # skip the l3-agent check and always return OCF_NOT_RUNNING,
+        # otherwise we'd get messages from pengine like:
+        #
+        #   error: Resource neutron-ha-tool (ocf::neutron-ha-tool) is active on
+        #       2 nodes attempting recovery
+        #   warning: See http://clusterlabs.org/wiki/FAQ#Resource_is_Too_Active
+        #       for more information.
+        #
+        # and Pacemaker could attempt unnecessary recovery according to the
+        # value of the cluster-wide "multiple-active" option.
+        ocf_log debug "neutron-ha-tool not currently active on this node; " \
+            "skipping l3-agent check"
+        return $OCF_NOT_RUNNING
+    fi
+
    INSECURE=""
    if ocf_is_true $OCF_RESKEY_os_insecure; then
        INSECURE="--insecure"
@ -210,6 +230,12 @@ neutron_ha_tool_monitor() {
 }

 neutron_ha_tool_start() {
+    touch "$statefile"
+    if ! [ -e "$statefile" ]; then
+        ocf_log err "Failed to create $statefile - aborting!"
+        return $OCF_ERR_GENERIC
+    fi
+
    INSECURE=""
    if ocf_is_true $OCF_RESKEY_os_insecure; then
        INSECURE="--insecure"
@ -238,7 +264,13 @@ neutron_ha_tool_start() {
 }

 neutron_ha_tool_stop() {
-    # This is a noop
+    rm -f "$statefile"
+    if [ -e "$statefile" ]; then
+        ocf_log err "Uh-oh - failed to remove $statefile!"
+        # If we can't even remove a file in tmpfs (/run), something
+        # is *really* badly wrong, so fence the node.
+        return $OCF_ERR_GENERIC
+    fi
    return $OCF_SUCCESS
 }

@ -268,6 +300,8 @@ if [ -n "$OCF_RESKEY_os_cacert" ]; then
    export OS_CACERT=$OCF_RESKEY_os_cacert
 fi

+statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active"
+
 # What kind of method was invoked?
 case "$1" in
    start)