Files
puppet-pacemaker/lib/pacemaker/xml/status.rb
Dmitry Ilyin c1c2031c84 Fixes for race condition when adding primitives without OCF present
* Change the service "status" to report "stopped" if the
  primitive has failures on the node.
* Enable "status" failure check.
* Support error detection for missing OCF monitor operations.
* Add operations status debug method
* Add forgotten cib_reset to the wait_for_online

Change-Id: I34fbf8b4a7d2420fb568719f473bc9b40063cc82
2017-02-23 17:45:07 +00:00

278 lines
9.4 KiB
Ruby

module Pacemaker
# functions related to the primitive and node status
# main structure "node_status"
module Status
# collect the operation records (lrm_rsc_op) of a single lrm_resource element
# @param lrm_resource [REXML::Element]
# at /cib/status/node_state/lrm[@id="node-name"]/lrm_resources/lrm_resource[@id="resource-name"]
# @return [Array<REXML::Element>,nil] the direct lrm_rsc_op children
# or nil if the argument is not an XML element
def cib_section_lrm_rsc_ops(lrm_resource)
  if lrm_resource.is_a?(REXML::Element)
    REXML::XPath.match(lrm_resource, 'lrm_rsc_op')
  end
end
# find every node_state element anywhere in the CIB
# @return [Array<REXML::Element>] the elements matched by //node_state
# (expected at /cib/status/node_state)
def cib_section_node_state
  node_state_xpath = '//node_state'
  REXML::XPath.match(cib, node_state_xpath)
end
# collect the lrm_resource elements stored inside a node's lrm section
# @param lrm [REXML::Element]
# at /cib/status/node_state/lrm[@id="node-name"]
# @return [Array<REXML::Element>,nil] the elements at lrm_resources/lrm_resource
# or nil if the argument is not an XML element
def cib_section_lrm_resources(lrm)
  if lrm.is_a?(REXML::Element)
    REXML::XPath.match(lrm, 'lrm_resources/lrm_resource')
  end
end
# translate a single operation record into the status it implies
# @param op [Hash<String => String>]
# @return ['start','stop','master',nil]
# nil means this operation says nothing about the status
def operation_status(op)
  # pending operations (op-status -1) are skipped:
  # we should wait until their result becomes known
  return if op['op-status'] == '-1'

  operation = op['operation']
  succeeded = op['rc-code'] == '0'

  case operation
  when 'monitor'
    # a monitor reports the state through its rc-code:
    # 0 - start, 8 - master, 7 - stop
    # any other code is an error; it is not entirely correct
    # but a failed monitor is counted as 'stop' too
    { '0' => 'start', '8' => 'master' }.fetch(op['rc-code'], 'stop')
  when 'promote'
    # an unsuccessful start/stop/promote/demote leaves the status unknown,
    # it will be determined by the next monitor
    # (if Pacemaker is unable to bring the resource to a known state
    # it can use STONITH on this node if it's configured)
    # a successful promote is reported as 'master'
    'master' if succeeded
  when 'demote'
    # a successful demote leaves the resource started
    'start' if succeeded
  when 'start', 'stop'
    operation if succeeded
  end
end
# work out the status of a resource from its operations
# the operations are walked from the first to the last and every
# operation with a defined status overrides the previous one,
# so the value left at the end is the actual status of the primitive
# @param ops [Array<Hash>]
# @return ['start','stop','master',nil]
# nil means that the status is unknown
def determine_primitive_status(ops)
  ops.inject(nil) do |known_status, op|
    operation_status(op) || known_status
  end
end
# turn the lrm_resource elements of a node into a structure keyed by resource id
# every decoded resource holds its attributes plus the keys
# 'ops' (decoded operations), 'status' and 'failed'
# resources without an id or without an operations section are skipped
# @param lrm_resources [Array<REXML::Element>]
# @param [String] node_name only used for the operations debug report
# @return [Hash<String => Hash>]
def decode_lrm_resources(lrm_resources, node_name=nil)
  lrm_resources.each_with_object({}) do |lrm_resource, decoded|
    primitive = attributes_to_hash(lrm_resource)
    primitive_id = primitive['id']
    next if !primitive_id
    raw_ops = cib_section_lrm_rsc_ops(lrm_resource)
    next if !raw_ops
    operations = decode_lrm_rsc_ops(raw_ops)
    primitive['ops'] = operations
    primitive['status'] = determine_primitive_status(operations)
    primitive['failed'] = failed_operations_found?(operations)
    # the operations report is only built when the debug option asks for it
    if pacemaker_options[:debug_show_operations]
      debug resource_operations_report(operations, primitive, node_name)
    end
    decoded[primitive_id] = primitive
  end
end
# decode lrm_rsc_ops section of the resource's CIB
# records without a call-id are dropped, the rest are ordered by
# their numeric call-id
# @param lrm_rsc_ops [Array<REXML::Element>]
# @return [Array<Hash>] operations sorted by call-id, ascending
def decode_lrm_rsc_ops(lrm_rsc_ops)
  ops = lrm_rsc_ops.map { |lrm_rsc_op| attributes_to_hash lrm_rsc_op }
  ops = ops.select { |op| op['call-id'] }
  # Array#sort gives no stability guarantee, so operations sharing
  # a call-id could come out in arbitrary order; the status is taken
  # from the last operations of this list, therefore ties are broken
  # by the original position to keep the result predictable
  ordered = ops.each_with_index.sort_by do |op, position|
    [op['call-id'].to_i, position]
  end
  ordered.map(&:first)
end
# build the nodes status structure: node name => node attributes
# with the decoded resources of this node stored under the 'primitives' key
# the structure is built once and then served from @node_status_structure
# nodes without a name, an lrm section or its resources are left out
# @return [Hash<String => Hash>]
def node_status
  return @node_status_structure if @node_status_structure
  @node_status_structure = {}
  cib_section_node_state.each do |node_state|
    node_data = attributes_to_hash(node_state)
    uname = node_data['uname']
    # every step is attempted only if the previous one gave a result
    lrm_section = uname && node_state.elements['lrm']
    primitives_section = lrm_section && cib_section_lrm_resources(lrm_section)
    next if !primitives_section
    node_data['primitives'] = decode_lrm_resources(primitives_section, uname)
    @node_status_structure[uname] = node_data
  end
  @node_status_structure
end
# check if the operations contain some failed ones
# that should be cleaned up later
# @param ops [Array<Hash>]
# @return [TrueClass,FalseClass]
def failed_operations_found?(ops)
  ops.any? do |op|
    # a pending operation has no result to judge yet
    next false if op['op-status'] == '-1'
    case op['operation']
    when 'start', 'stop', 'promote'
      # these operations fail with any rc-code other than 0
      op['rc-code'] != '0'
    when 'monitor'
      # a monitor may legitimately return 0, 7 or 8
      !%w(0 7 8).include?(op['rc-code'])
    else
      # all other operations are not taken into account
      false
    end
  end
end
# get the status of a primitive on the entire cluster
# or on a single node if the node name param is given
# for the entire cluster the "highest" status found on any node wins
# (master over start over stop), but the result is nil as soon as
# the status of the primitive is not known on one of the nodes
# @param primitive [String]
# @param node [String]
# @return [String,nil]
def primitive_status(primitive, node = nil)
  status_on = lambda do |node_data|
    node_data.fetch('primitives', {}).fetch(primitive, {}).fetch('status', nil)
  end
  return status_on.call(node_status.fetch(node, {})) if node

  rank = { 'stop' => 0, 'start' => 1, 'master' => 2 }
  statuses = node_status.map { |_node_name, node_data| status_on.call(node_data) }
  statuses.max_by do |status|
    # leaves the whole method: one unknown status makes the result unknown
    return nil unless status
    rank[status]
  end
end
# does this primitive have failed operations?
# without a node the answer is true if any node reports failures
# @param primitive [String] primitive name
# @param node [String] on this node if given
# @return [TrueClass,FalseClass,nil]
# nil if the primitive does not exist or nothing is recorded
# for it on the given node
def primitive_has_failures?(primitive, node = nil)
  return unless primitive_exists? primitive
  failed_on = lambda do |node_data|
    node_data.fetch('primitives', {}).fetch(primitive, {}).fetch('failed', nil)
  end
  if node
    failed_on.call(node_status.fetch(node, {}))
  else
    node_status.any? { |_node_name, node_data| failed_on.call(node_data) }
  end
end
# determine if a primitive is running on the entire cluster
# or on a node if the node name param is given
# both the 'start' and the 'master' statuses count as running
# @param primitive [String] primitive id
# @param node [String] on this node if given
# @return [TrueClass,FalseClass,nil]
# nil if the primitive does not exist or its status is unknown
def primitive_is_running?(primitive, node = nil)
  return unless primitive_exists? primitive
  current = primitive_status(primitive, node)
  # an unknown status is passed through as it is
  current && %w(start master).include?(current)
end
# check if the primitive is running as a master
# either anywhere or on the given node
# @param primitive [String] primitive id
# @param node [String] on this node if given
# @return [TrueClass,FalseClass,nil]
# the falsy answer of primitive_is_master? or an unknown status
# is returned as it is
def primitive_has_master_running?(primitive, node = nil)
  master_capable = primitive_is_master?(primitive)
  # the status is looked up only for the master capable primitives
  current = master_capable && primitive_status(primitive, node)
  current && current == 'master'
end
# generate the report of primitive statuses by node
# the report is keyed by the primitive name, every value maps
# the node names to the status of the primitive on that node
# @return [Hash<String => Hash>,nil] nil if the node status is not a Hash
def primitives_status_by_node
  nodes = node_status
  return unless nodes.is_a? Hash
  nodes.each_with_object({}) do |(node_name, node_data), report|
    primitives = node_data['primitives']
    # nodes without decoded primitives add nothing to the report
    next unless primitives.is_a? Hash
    primitives.each do |primitive, primitive_data|
      (report[primitive] ||= {})[node_name] = primitive_data['status']
    end
  end
end
# Get the list of node names where this primitive
# has the specified status.
# An empty list is returned if there are no status records
# for this primitive (or no status report at all) instead of
# failing on the missing data.
# @param [String] primitive
# @param [String,Symbol] expected_status (stop/start/master)
# @return [Array<String>] The array of node names where the primitive has this status
def primitive_has_status_on(primitive, expected_status = 'start')
  expected_status = expected_status.to_s.downcase
  report = primitives_status_by_node
  status_by_node = report.is_a?(Hash) ? report[primitive] : nil
  # an unknown primitive has no entry in the report
  return [] unless status_by_node.is_a? Hash
  status_by_node.select { |_node_name, status| status == expected_status }.keys
end
end
end