Fixes for race condition when adding primitives without OCF present
* Change the service "status" to report "stopped" if the primitive has failures on the node. * Enable the "status" failure check. * Support error detection for missing OCF monitor operations. * Add an operations status debug method. * Add the forgotten cib_reset to wait_for_online. Change-Id: I34fbf8b4a7d2420fb568719f473bc9b40063cc82
This commit is contained in:
parent
fe4d448938
commit
c1c2031c84
@ -19,6 +19,9 @@
|
||||
- symmetric-cluster
|
||||
- no-quorum-policy
|
||||
|
||||
# Show the debug messages for the resource operations status calculation
|
||||
:debug_show_operations: false
|
||||
|
||||
# don't actually make any changes to the system
|
||||
# only show what command would have been run
|
||||
:debug_enabled: false
|
||||
@ -48,10 +51,14 @@
|
||||
:status_mode_simple: :global
|
||||
|
||||
# cleanup the primitive during these actions?
|
||||
:cleanup_on_status: false
|
||||
:cleanup_on_start: true
|
||||
:cleanup_on_stop: true
|
||||
|
||||
# set the primitive status to stopped if there are failures
|
||||
# forcing the primitive to be started again and cleaned up
|
||||
# on this node
|
||||
:cleanup_on_status: true
|
||||
|
||||
# try to stop and disable the basic service on these provider actions
|
||||
# the basic service is the service managed by the system
|
||||
# init scripts or the upstart/systemd units
|
||||
|
@ -31,7 +31,10 @@ module Pacemaker
|
||||
message = "Waiting #{max_wait_time} seconds for Pacemaker to become online"
|
||||
message += " (#{comment})" if comment
|
||||
debug message
|
||||
retry_block { online? }
|
||||
retry_block do
|
||||
cib_reset 'wait_for_online'
|
||||
online?
|
||||
end
|
||||
debug 'Pacemaker is online'
|
||||
end
|
||||
|
||||
|
@ -102,7 +102,7 @@ module Pacemaker
|
||||
node_status_string = '?' unless node_status_string.is_a? String
|
||||
node_status_string = node_status_string.upcase
|
||||
node_block = "#{node_name}: #{node_status_string}"
|
||||
node_block += ' (F)' if primitive_has_failures?(primitive, node_name) && (!primitive_is_running? primitive, node_name)
|
||||
node_block += ' (F)' if primitive_has_failures? primitive, node_name
|
||||
node_block += ' (L)' if service_location_exists? primitive_full_name(primitive), node_name
|
||||
nodes << node_block
|
||||
end
|
||||
@ -115,5 +115,24 @@ module Pacemaker
|
||||
report += " at '#{tag}'" if tag
|
||||
report + "\n"
|
||||
end
|
||||
|
||||
# Generate the report message for the operation status calculation
|
||||
# @param [Array<Hash>] operations
|
||||
# @param [Hash<String => String>] resource
|
||||
# @param [String] node_name
|
||||
# @return [String]
|
||||
def resource_operations_report(operations, resource, node_name)
|
||||
report = "Operations status debug start for the node: '#{node_name}'\n"
|
||||
report += "Resource: '#{resource['id']}'\n"
|
||||
operations.each do |operation|
|
||||
type = operation.fetch('operation', '?').capitalize
|
||||
rc_code = operation.fetch('rc-code', '?')
|
||||
op_code = operation.fetch('op-status', '?')
|
||||
report += "* #{type.ljust 7}: rc:#{rc_code} op:#{op_code}\n"
|
||||
end
|
||||
report += "Status: #{resource['status']} Failed: #{resource['failed']}\n"
|
||||
report + "Operations status debug end for the node: '#{node_name}'\n"
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
|
@ -84,8 +84,9 @@ module Pacemaker
|
||||
|
||||
# decode lrm_resources section of CIB
|
||||
# @param lrm_resources [REXML::Element]
|
||||
# @param [String] node_name
|
||||
# @return [Hash<String => Hash>]
|
||||
def decode_lrm_resources(lrm_resources)
|
||||
def decode_lrm_resources(lrm_resources, node_name=nil)
|
||||
resources = {}
|
||||
lrm_resources.each do |lrm_resource|
|
||||
resource = attributes_to_hash lrm_resource
|
||||
@ -97,6 +98,7 @@ module Pacemaker
|
||||
resource.store 'ops', ops
|
||||
resource.store 'status', determine_primitive_status(ops)
|
||||
resource.store 'failed', failed_operations_found?(ops)
|
||||
debug resource_operations_report ops, resource, node_name if pacemaker_options[:debug_show_operations]
|
||||
resources.store id, resource
|
||||
end
|
||||
resources
|
||||
@ -128,7 +130,7 @@ module Pacemaker
|
||||
next unless lrm
|
||||
lrm_resources = cib_section_lrm_resources lrm
|
||||
next unless lrm_resources
|
||||
resources = decode_lrm_resources lrm_resources
|
||||
resources = decode_lrm_resources lrm_resources, node_name
|
||||
node.store 'primitives', resources
|
||||
@node_status_structure.store node_name, node
|
||||
end
|
||||
@ -141,8 +143,9 @@ module Pacemaker
|
||||
# @return [TrueClass,FalseClass]
|
||||
def failed_operations_found?(ops)
|
||||
ops.each do |op|
|
||||
# skip incomplete ops
|
||||
next unless op['op-status'] == '0'
|
||||
# skip pending ops
|
||||
next if op['op-status'] == '-1'
|
||||
|
||||
# skip useless ops
|
||||
next unless %w(start stop monitor promote).include? op['operation']
|
||||
|
||||
@ -173,8 +176,8 @@ module Pacemaker
|
||||
.fetch('status', nil)
|
||||
else
|
||||
statuses = []
|
||||
node_status.each do |_k, v|
|
||||
status = v.fetch('primitives', {})
|
||||
node_status.each do |_node_name, node_status|
|
||||
status = node_status.fetch('primitives', {})
|
||||
.fetch(primitive, {})
|
||||
.fetch('status', nil)
|
||||
statuses << status
|
||||
|
@ -147,12 +147,6 @@ Puppet::Type.type(:service).provide(:pacemaker_xml, parent: Puppet::Provider::Pa
|
||||
cib_reset 'service_status'
|
||||
wait_for_online 'service_status'
|
||||
|
||||
if pacemaker_options[:cleanup_on_status]
|
||||
if !pacemaker_options[:cleanup_only_if_failures] || primitive_has_failures?(name, hostname)
|
||||
cleanup
|
||||
end
|
||||
end
|
||||
|
||||
out = if primitive_is_master? name
|
||||
service_status_mode pacemaker_options[:status_mode_master]
|
||||
elsif primitive_is_clone? name
|
||||
@ -168,6 +162,13 @@ Puppet::Type.type(:service).provide(:pacemaker_xml, parent: Puppet::Provider::Pa
|
||||
end
|
||||
end
|
||||
|
||||
if pacemaker_options[:cleanup_on_status]
|
||||
if out == :running and primitive_has_failures? name, hostname
|
||||
debug "Primitive: '#{name}' has failures on the node: '#{hostname}' Service status set to 'stopped'."
|
||||
out = :stopped
|
||||
end
|
||||
end
|
||||
|
||||
debug "Return: '#{out}' (#{out.class})"
|
||||
debug cluster_debug_report "#{@resource} status"
|
||||
out
|
||||
|
@ -75,13 +75,13 @@ describe Puppet::Provider::PacemakerXML do
|
||||
'then' => 'p_neutron-dhcp-agent',
|
||||
},
|
||||
'order-test1-test2-Mandatory' => {
|
||||
'first'=>'test1',
|
||||
'first-action'=>'promote',
|
||||
'id'=>'order-test1-test2-Mandatory',
|
||||
'kind'=>'Mandatory',
|
||||
'symmetrical'=>'true',
|
||||
'then'=>'test2',
|
||||
'then-action'=>'start',
|
||||
'first' => 'test1',
|
||||
'first-action' => 'promote',
|
||||
'id' => 'order-test1-test2-Mandatory',
|
||||
'kind' => 'Mandatory',
|
||||
'symmetrical' => 'true',
|
||||
'then' => 'test2',
|
||||
'then-action' => 'start',
|
||||
},
|
||||
}
|
||||
end
|
||||
@ -278,7 +278,7 @@ Pacemaker debug block start at 'test'
|
||||
-> Simple primitive: 'p_ceilometer-alarm-evaluator'
|
||||
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
|
||||
-> Simple primitive: 'p_heat-engine'
|
||||
node-1: START (L) | node-2: STOP | node-3: STOP
|
||||
node-1: START (F) (L) | node-2: STOP | node-3: STOP
|
||||
-> Simple primitive: 'p_ceilometer-agent-central' (M)
|
||||
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
|
||||
-> Simple primitive: 'vip__management'
|
||||
@ -292,7 +292,7 @@ Pacemaker debug block start at 'test'
|
||||
-> Clone primitive: 'p_mysql-clone'
|
||||
node-1: START (L) | node-2: START (L) | node-3: STOP
|
||||
-> Simple primitive: 'p_neutron-dhcp-agent'
|
||||
node-1: START (L) | node-2: STOP | node-3: STOP
|
||||
node-1: START (F) (L) | node-2: STOP | node-3: STOP
|
||||
-> Simple primitive: 'vip__public'
|
||||
node-1: START (L) | node-2: STOP (L) | node-3: STOP (L)
|
||||
-> Clone primitive: 'p_haproxy-clone'
|
||||
@ -354,14 +354,14 @@ Pacemaker debug block end at 'test'
|
||||
|
||||
it 'can determine if primitive is failed or not globally' do
|
||||
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central').to eq true
|
||||
expect(subject.primitive_has_failures? 'p_heat-engine').to eq false
|
||||
expect(subject.primitive_has_failures? 'p_heat-engine').to eq true
|
||||
expect(subject.primitive_has_failures? 'UNKNOWN').to eq nil
|
||||
end
|
||||
|
||||
it 'can determine if primitive is failed or not locally' do
|
||||
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-1').to eq false
|
||||
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-2').to eq true
|
||||
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-1').to eq false
|
||||
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-1').to eq true
|
||||
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-2').to eq false
|
||||
expect(subject.primitive_has_failures? 'UNKNOWN', 'node-1').to eq nil
|
||||
end
|
||||
|
@ -166,6 +166,15 @@ describe Puppet::Type.type(:service).provider(:pacemaker_xml) do
|
||||
provider.stubs(:service_location_exists?).returns(true)
|
||||
expect(provider.status).to eq :running
|
||||
end
|
||||
|
||||
it 'counts a service as stopped if the primitive has failures' do
|
||||
provider.stubs(:get_primitive_puppet_status).returns(:running)
|
||||
provider.stubs(:service_location_exists?).returns(true)
|
||||
provider.expects(:primitive_has_failures?).with(name, hostname).returns(true)
|
||||
expect(provider.status).to eq :stopped
|
||||
provider.expects(:primitive_has_failures?).with(name, hostname).returns(false)
|
||||
expect(provider.status).to eq :running
|
||||
end
|
||||
end
|
||||
|
||||
context '#start' do
|
||||
|
Loading…
Reference in New Issue
Block a user