Fixes for race condition when adding primitives without OCF present
* Change the service "status" to report "stopped" if the primitive has failures on the node. * Enable the "status" failure check. * Support error detection for missing OCF monitor operations. * Add an operations status debug method. * Add the forgotten cib_reset to wait_for_online. Change-Id: I34fbf8b4a7d2420fb568719f473bc9b40063cc82
This commit is contained in:

committed by
Alex Schultz

parent
fe4d448938
commit
c1c2031c84
@@ -19,6 +19,9 @@
|
|||||||
- symmetric-cluster
|
- symmetric-cluster
|
||||||
- no-quorum-policy
|
- no-quorum-policy
|
||||||
|
|
||||||
|
# Show the debug messages for the resource operations status calculation
|
||||||
|
:debug_show_operations: false
|
||||||
|
|
||||||
# don't actually do any changes to the system
|
# don't actually do any changes to the system
|
||||||
# only show what command would have been run
|
# only show what command would have been run
|
||||||
:debug_enabled: false
|
:debug_enabled: false
|
||||||
@@ -48,10 +51,14 @@
|
|||||||
:status_mode_simple: :global
|
:status_mode_simple: :global
|
||||||
|
|
||||||
# cleanup the primitive during these actions?
|
# cleanup the primitive during these actions?
|
||||||
:cleanup_on_status: false
|
|
||||||
:cleanup_on_start: true
|
:cleanup_on_start: true
|
||||||
:cleanup_on_stop: true
|
:cleanup_on_stop: true
|
||||||
|
|
||||||
|
# set the primitive status to stopped if there are failures
|
||||||
|
# forcing the primitive to be started again and cleaned up
|
||||||
|
# on this node
|
||||||
|
:cleanup_on_status: true
|
||||||
|
|
||||||
# try to stop and disable the basic service on these provider actions
|
# try to stop and disable the basic service on these provider actions
|
||||||
# the basic service is the service managed by the system
|
# the basic service is the service managed by the system
|
||||||
# init scripts or the upstart/systemd units
|
# init scripts or the upstart/systemd units
|
||||||
|
@@ -31,7 +31,10 @@ module Pacemaker
|
|||||||
message = "Waiting #{max_wait_time} seconds for Pacemaker to become online"
|
message = "Waiting #{max_wait_time} seconds for Pacemaker to become online"
|
||||||
message += " (#{comment})" if comment
|
message += " (#{comment})" if comment
|
||||||
debug message
|
debug message
|
||||||
retry_block { online? }
|
retry_block do
|
||||||
|
cib_reset 'wait_for_online'
|
||||||
|
online?
|
||||||
|
end
|
||||||
debug 'Pacemaker is online'
|
debug 'Pacemaker is online'
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@@ -102,7 +102,7 @@ module Pacemaker
|
|||||||
node_status_string = '?' unless node_status_string.is_a? String
|
node_status_string = '?' unless node_status_string.is_a? String
|
||||||
node_status_string = node_status_string.upcase
|
node_status_string = node_status_string.upcase
|
||||||
node_block = "#{node_name}: #{node_status_string}"
|
node_block = "#{node_name}: #{node_status_string}"
|
||||||
node_block += ' (F)' if primitive_has_failures?(primitive, node_name) && (!primitive_is_running? primitive, node_name)
|
node_block += ' (F)' if primitive_has_failures? primitive, node_name
|
||||||
node_block += ' (L)' if service_location_exists? primitive_full_name(primitive), node_name
|
node_block += ' (L)' if service_location_exists? primitive_full_name(primitive), node_name
|
||||||
nodes << node_block
|
nodes << node_block
|
||||||
end
|
end
|
||||||
@@ -115,5 +115,24 @@ module Pacemaker
|
|||||||
report += " at '#{tag}'" if tag
|
report += " at '#{tag}'" if tag
|
||||||
report + "\n"
|
report + "\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Generate the report message for the operation status calculation
|
||||||
|
# @param [Array<Hash>] operations
|
||||||
|
# @param [Hash<String => String>] resource
|
||||||
|
# @param [String] node_name
|
||||||
|
# @return [String]
|
||||||
|
def resource_operations_report(operations, resource, node_name)
|
||||||
|
report = "Operations status debug start for the node: '#{node_name}'\n"
|
||||||
|
report += "Resource: '#{resource['id']}'\n"
|
||||||
|
operations.each do |operation|
|
||||||
|
type = operation.fetch('operation', '?').capitalize
|
||||||
|
rc_code = operation.fetch('rc-code', '?')
|
||||||
|
op_code = operation.fetch('op-status', '?')
|
||||||
|
report += "* #{type.ljust 7}: rc:#{rc_code} op:#{op_code}\n"
|
||||||
|
end
|
||||||
|
report += "Status: #{resource['status']} Failed: #{resource['failed']}\n"
|
||||||
|
report + "Operations status debug end for the node: '#{node_name}'\n"
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@@ -84,8 +84,9 @@ module Pacemaker
|
|||||||
|
|
||||||
# decode lrm_resources section of CIB
|
# decode lrm_resources section of CIB
|
||||||
# @param lrm_resources [REXML::Element]
|
# @param lrm_resources [REXML::Element]
|
||||||
|
# @param [String] node_name
|
||||||
# @return [Hash<String => Hash>]
|
# @return [Hash<String => Hash>]
|
||||||
def decode_lrm_resources(lrm_resources)
|
def decode_lrm_resources(lrm_resources, node_name=nil)
|
||||||
resources = {}
|
resources = {}
|
||||||
lrm_resources.each do |lrm_resource|
|
lrm_resources.each do |lrm_resource|
|
||||||
resource = attributes_to_hash lrm_resource
|
resource = attributes_to_hash lrm_resource
|
||||||
@@ -97,6 +98,7 @@ module Pacemaker
|
|||||||
resource.store 'ops', ops
|
resource.store 'ops', ops
|
||||||
resource.store 'status', determine_primitive_status(ops)
|
resource.store 'status', determine_primitive_status(ops)
|
||||||
resource.store 'failed', failed_operations_found?(ops)
|
resource.store 'failed', failed_operations_found?(ops)
|
||||||
|
debug resource_operations_report ops, resource, node_name if pacemaker_options[:debug_show_operations]
|
||||||
resources.store id, resource
|
resources.store id, resource
|
||||||
end
|
end
|
||||||
resources
|
resources
|
||||||
@@ -128,7 +130,7 @@ module Pacemaker
|
|||||||
next unless lrm
|
next unless lrm
|
||||||
lrm_resources = cib_section_lrm_resources lrm
|
lrm_resources = cib_section_lrm_resources lrm
|
||||||
next unless lrm_resources
|
next unless lrm_resources
|
||||||
resources = decode_lrm_resources lrm_resources
|
resources = decode_lrm_resources lrm_resources, node_name
|
||||||
node.store 'primitives', resources
|
node.store 'primitives', resources
|
||||||
@node_status_structure.store node_name, node
|
@node_status_structure.store node_name, node
|
||||||
end
|
end
|
||||||
@@ -141,8 +143,9 @@ module Pacemaker
|
|||||||
# @return [TrueClass,FalseClass]
|
# @return [TrueClass,FalseClass]
|
||||||
def failed_operations_found?(ops)
|
def failed_operations_found?(ops)
|
||||||
ops.each do |op|
|
ops.each do |op|
|
||||||
# skip incompleate ops
|
# skip pending ops
|
||||||
next unless op['op-status'] == '0'
|
next if op['op-status'] == '-1'
|
||||||
|
|
||||||
# skip useless ops
|
# skip useless ops
|
||||||
next unless %w(start stop monitor promote).include? op['operation']
|
next unless %w(start stop monitor promote).include? op['operation']
|
||||||
|
|
||||||
@@ -173,8 +176,8 @@ module Pacemaker
|
|||||||
.fetch('status', nil)
|
.fetch('status', nil)
|
||||||
else
|
else
|
||||||
statuses = []
|
statuses = []
|
||||||
node_status.each do |_k, v|
|
node_status.each do |_node_name, node_status|
|
||||||
status = v.fetch('primitives', {})
|
status = node_status.fetch('primitives', {})
|
||||||
.fetch(primitive, {})
|
.fetch(primitive, {})
|
||||||
.fetch('status', nil)
|
.fetch('status', nil)
|
||||||
statuses << status
|
statuses << status
|
||||||
|
@@ -147,12 +147,6 @@ Puppet::Type.type(:service).provide(:pacemaker_xml, parent: Puppet::Provider::Pa
|
|||||||
cib_reset 'service_status'
|
cib_reset 'service_status'
|
||||||
wait_for_online 'service_status'
|
wait_for_online 'service_status'
|
||||||
|
|
||||||
if pacemaker_options[:cleanup_on_status]
|
|
||||||
if !pacemaker_options[:cleanup_only_if_failures] || primitive_has_failures?(name, hostname)
|
|
||||||
cleanup
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
out = if primitive_is_master? name
|
out = if primitive_is_master? name
|
||||||
service_status_mode pacemaker_options[:status_mode_master]
|
service_status_mode pacemaker_options[:status_mode_master]
|
||||||
elsif primitive_is_clone? name
|
elsif primitive_is_clone? name
|
||||||
@@ -168,6 +162,13 @@ Puppet::Type.type(:service).provide(:pacemaker_xml, parent: Puppet::Provider::Pa
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
if pacemaker_options[:cleanup_on_status]
|
||||||
|
if out == :running and primitive_has_failures? name, hostname
|
||||||
|
debug "Primitive: '#{name}' has failures on the node: '#{hostname}' Service status set to 'stopped'."
|
||||||
|
out = :stopped
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
debug "Return: '#{out}' (#{out.class})"
|
debug "Return: '#{out}' (#{out.class})"
|
||||||
debug cluster_debug_report "#{@resource} status"
|
debug cluster_debug_report "#{@resource} status"
|
||||||
out
|
out
|
||||||
|
@@ -75,13 +75,13 @@ describe Puppet::Provider::PacemakerXML do
|
|||||||
'then' => 'p_neutron-dhcp-agent',
|
'then' => 'p_neutron-dhcp-agent',
|
||||||
},
|
},
|
||||||
'order-test1-test2-Mandatory' => {
|
'order-test1-test2-Mandatory' => {
|
||||||
'first'=>'test1',
|
'first' => 'test1',
|
||||||
'first-action'=>'promote',
|
'first-action' => 'promote',
|
||||||
'id'=>'order-test1-test2-Mandatory',
|
'id' => 'order-test1-test2-Mandatory',
|
||||||
'kind'=>'Mandatory',
|
'kind' => 'Mandatory',
|
||||||
'symmetrical'=>'true',
|
'symmetrical' => 'true',
|
||||||
'then'=>'test2',
|
'then' => 'test2',
|
||||||
'then-action'=>'start',
|
'then-action' => 'start',
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
@@ -278,7 +278,7 @@ Pacemaker debug block start at 'test'
|
|||||||
-> Simple primitive: 'p_ceilometer-alarm-evaluator'
|
-> Simple primitive: 'p_ceilometer-alarm-evaluator'
|
||||||
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
|
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
|
||||||
-> Simple primitive: 'p_heat-engine'
|
-> Simple primitive: 'p_heat-engine'
|
||||||
node-1: START (L) | node-2: STOP | node-3: STOP
|
node-1: START (F) (L) | node-2: STOP | node-3: STOP
|
||||||
-> Simple primitive: 'p_ceilometer-agent-central' (M)
|
-> Simple primitive: 'p_ceilometer-agent-central' (M)
|
||||||
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
|
node-1: STOP | node-2: STOP (F) | node-3: STOP (F)
|
||||||
-> Simple primitive: 'vip__management'
|
-> Simple primitive: 'vip__management'
|
||||||
@@ -292,7 +292,7 @@ Pacemaker debug block start at 'test'
|
|||||||
-> Clone primitive: 'p_mysql-clone'
|
-> Clone primitive: 'p_mysql-clone'
|
||||||
node-1: START (L) | node-2: START (L) | node-3: STOP
|
node-1: START (L) | node-2: START (L) | node-3: STOP
|
||||||
-> Simple primitive: 'p_neutron-dhcp-agent'
|
-> Simple primitive: 'p_neutron-dhcp-agent'
|
||||||
node-1: START (L) | node-2: STOP | node-3: STOP
|
node-1: START (F) (L) | node-2: STOP | node-3: STOP
|
||||||
-> Simple primitive: 'vip__public'
|
-> Simple primitive: 'vip__public'
|
||||||
node-1: START (L) | node-2: STOP (L) | node-3: STOP (L)
|
node-1: START (L) | node-2: STOP (L) | node-3: STOP (L)
|
||||||
-> Clone primitive: 'p_haproxy-clone'
|
-> Clone primitive: 'p_haproxy-clone'
|
||||||
@@ -354,14 +354,14 @@ Pacemaker debug block end at 'test'
|
|||||||
|
|
||||||
it 'can determine if primitive is failed or not globally' do
|
it 'can determine if primitive is failed or not globally' do
|
||||||
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central').to eq true
|
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central').to eq true
|
||||||
expect(subject.primitive_has_failures? 'p_heat-engine').to eq false
|
expect(subject.primitive_has_failures? 'p_heat-engine').to eq true
|
||||||
expect(subject.primitive_has_failures? 'UNKNOWN').to eq nil
|
expect(subject.primitive_has_failures? 'UNKNOWN').to eq nil
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'can determine if primitive is failed or not locally' do
|
it 'can determine if primitive is failed or not locally' do
|
||||||
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-1').to eq false
|
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-1').to eq false
|
||||||
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-2').to eq true
|
expect(subject.primitive_has_failures? 'p_ceilometer-agent-central', 'node-2').to eq true
|
||||||
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-1').to eq false
|
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-1').to eq true
|
||||||
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-2').to eq false
|
expect(subject.primitive_has_failures? 'p_heat-engine', 'node-2').to eq false
|
||||||
expect(subject.primitive_has_failures? 'UNKNOWN', 'node-1').to eq nil
|
expect(subject.primitive_has_failures? 'UNKNOWN', 'node-1').to eq nil
|
||||||
end
|
end
|
||||||
|
@@ -166,6 +166,15 @@ describe Puppet::Type.type(:service).provider(:pacemaker_xml) do
|
|||||||
provider.stubs(:service_location_exists?).returns(true)
|
provider.stubs(:service_location_exists?).returns(true)
|
||||||
expect(provider.status).to eq :running
|
expect(provider.status).to eq :running
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it 'counts a service as stopped if the primitive has failures' do
|
||||||
|
provider.stubs(:get_primitive_puppet_status).returns(:running)
|
||||||
|
provider.stubs(:service_location_exists?).returns(true)
|
||||||
|
provider.expects(:primitive_has_failures?).with(name, hostname).returns(true)
|
||||||
|
expect(provider.status).to eq :stopped
|
||||||
|
provider.expects(:primitive_has_failures?).with(name, hostname).returns(false)
|
||||||
|
expect(provider.status).to eq :running
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context '#start' do
|
context '#start' do
|
||||||
|
Reference in New Issue
Block a user