Network problem tolerance puppet status check
The connection between a node and Astute can sometimes be lost, so we need more attempts to get the task status from a node. Two changes: - instead of 1 try, Astute will make 6 tries with a timeout of 10 for every attempt; - it will handle this situation for puppet using a separate retry counter: puppet_undefined_retries. Unlike a full puppet retry, a status retry is safe because it is idempotent. Puppet undefined retries can be set up using the Astute config or by sending undefined_retries in the puppet task parameters, the same way as for the usual retries. Most important: the counter is reset to its original value every time Astute gets a defined answer. Change-Id: Ie86576a3400be5a6b11041c8e6acf89abf3bbd51 Related-Bug: #1653210 Closes-Bug: #1653737
This commit is contained in:
parent
06fe7856f6
commit
7c0485eb1a
@ -62,6 +62,7 @@ module Astute
|
||||
conf[:puppet_start_interval] = 2 # interval between attemps to start puppet
|
||||
conf[:puppet_retries] = 2 # how many times astute will try to run puppet
|
||||
conf[:puppet_succeed_retries] = 0 # use this to rerun a puppet task again if it was successful (idempotency)
|
||||
conf[:puppet_undefined_retries] = 3 # how many times astute will try to get actual status of node before fail
|
||||
conf[:puppet_module_path] = '/etc/puppet/modules' # where we should find basic modules for puppet
|
||||
conf[:puppet_noop_run] = false # enable Puppet noop run
|
||||
conf[:mc_retries] = 10 # MClient tries to call mcagent before failure
|
||||
|
@ -56,14 +56,14 @@ module Astute
|
||||
|
||||
# Create configured puppet mcollective agent
|
||||
# @return [Astute::MClient]
|
||||
def puppetd
|
||||
def puppetd(timeout=nil, retries=1)
|
||||
puppetd = MClient.new(
|
||||
@ctx,
|
||||
"puppetd",
|
||||
[@node_id],
|
||||
_check_result=true,
|
||||
_timeout=nil,
|
||||
_retries=1,
|
||||
_timeout=timeout,
|
||||
_retries=retries,
|
||||
_enable_result_logging=false
|
||||
)
|
||||
puppetd.on_respond_timeout do |uids|
|
||||
@ -77,7 +77,7 @@ module Astute
|
||||
# Run last_run_summary action using mcollective puppet agent
|
||||
# @return [Hash] return hash with status and resources
|
||||
def last_run_summary
|
||||
@summary = puppetd.last_run_summary(
|
||||
@summary = puppetd(_timeout=10, _retries=6).last_run_summary(
|
||||
:puppet_noop_run => @options['puppet_noop_run'],
|
||||
:raw_report => @options['raw_report']
|
||||
).first[:data]
|
||||
|
@ -39,8 +39,6 @@ module Astute
|
||||
'stopped', 'disabled'
|
||||
]
|
||||
|
||||
FAILED_STATUSES = UNDEFINED_STATUSES + STOPED_STATUSES
|
||||
|
||||
def initialize(task, puppet_mclient, options)
|
||||
@task = task
|
||||
@retries = options['retries']
|
||||
@ -48,6 +46,8 @@ module Astute
|
||||
@puppet_start_interval = options['puppet_start_interval']
|
||||
@time_observer = TimeObserver.new(options['timeout'])
|
||||
@succeed_retries = options['succeed_retries']
|
||||
@undefined_retries = options['undefined_retries']
|
||||
@original_undefined_retries = options['undefined_retries']
|
||||
@puppet_mclient = puppet_mclient
|
||||
end
|
||||
|
||||
@ -76,6 +76,8 @@ module Astute
|
||||
processing_running_task
|
||||
when 'failed'
|
||||
processing_error_task
|
||||
when 'undefined'
|
||||
processing_undefined_task
|
||||
end
|
||||
|
||||
time_is_up! if should_stop?
|
||||
@ -169,8 +171,10 @@ module Astute
|
||||
'successful'
|
||||
when BUSY_STATUSES.include?(mco_puppet_status)
|
||||
'running'
|
||||
when FAILED_STATUSES.include?(mco_puppet_status)
|
||||
when STOPED_STATUSES.include?(mco_puppet_status)
|
||||
'failed'
|
||||
when UNDEFINED_STATUSES.include?(mco_puppet_status)
|
||||
'undefined'
|
||||
else
|
||||
raise StatusValidationError,
|
||||
"Unknow puppet status: #{mco_puppet_status}"
|
||||
@ -189,7 +193,7 @@ module Astute
|
||||
# @return [void]
|
||||
def log_current_status(status)
|
||||
message = "#{task_details_for_log}, status: #{status}"
|
||||
if FAILED_STATUSES.include?(status)
|
||||
if (UNDEFINED_STATUSES + STOPED_STATUSES).include?(status)
|
||||
Astute.logger.error message
|
||||
else
|
||||
Astute.logger.debug message
|
||||
@ -197,8 +201,9 @@ module Astute
|
||||
end
|
||||
|
||||
# Process additional action in case of puppet succeed
|
||||
# @return [String] Task status: successful, failed or running
|
||||
# @return [String] Task status: successful or running
|
||||
def processing_succeed_task
|
||||
reset_undefined_retries!
|
||||
Astute.logger.debug "Puppet completed within "\
|
||||
"#{@time_observer.since_start} seconds"
|
||||
if @succeed_retries > 0
|
||||
@ -218,8 +223,9 @@ module Astute
|
||||
end
|
||||
|
||||
# Process additional action in case of puppet failed
|
||||
# @return [String] Task status: successful, failed or running
|
||||
# @return [String] Task status: failed or running
|
||||
def processing_error_task
|
||||
reset_undefined_retries!
|
||||
if @retries > 0
|
||||
@retries -= 1
|
||||
Astute.logger.debug "Puppet on node will be "\
|
||||
@ -236,12 +242,39 @@ module Astute
|
||||
end
|
||||
end
|
||||
|
||||
# Handle a status poll that came back as 'undefined'.
#
# While the undefined-retries budget is not exhausted, one retry is
# consumed and the task is kept 'running' so the status is checked again.
# Once the budget is spent the task is reported as 'failed'.
#
# @return [String] Task status: failed or running
def processing_undefined_task
  # Guard: nothing left in the budget, give up on this node.
  unless @undefined_retries > 0
    Astute.logger.error "Node has failed to get status. There is"\
      " no more retries for status check. #{task_details_for_log}"
    return 'failed'
  end

  # Consume one retry first so the log shows the remaining amount.
  @undefined_retries -= 1

  remaining_message = "Puppet on node has undefined status. "\
    "#{@undefined_retries} retries remained. "\
    "#{task_details_for_log}"
  Astute.logger.debug remaining_message

  retry_message = "Retrying to check status for following "\
    "nodes: #{@puppet_mclient.node_id}"
  Astute.logger.info retry_message

  'running'
end
|
||||
|
||||
# Handle a status poll that reported puppet as still running.
#
# A defined answer was received, so the undefined-retries counter is
# restored before the task is reported as in progress.
#
# @return [String] Task status: always running
def processing_running_task
  reset_undefined_retries!

  'running'
end
|
||||
|
||||
# Restore the undefined-status retry counter to its original value
# (@original_undefined_retries).
#
# The method is invoked on every defined answer, including each 'running'
# poll. When the counter is already at its original value there is nothing
# to restore, so the method returns early without writing to the log;
# otherwise every poll would add an identical debug line.
#
# @return [void]
def reset_undefined_retries!
  return if @undefined_retries == @original_undefined_retries

  Astute.logger.debug "Reset undefined retries to original "\
    "value: #{@original_undefined_retries}"
  @undefined_retries = @original_undefined_retries
end
|
||||
|
||||
end #PuppetJob
|
||||
|
||||
end
|
||||
|
@ -47,6 +47,7 @@ module Astute
|
||||
'timeout' => Astute.config.puppet_timeout,
|
||||
'puppet_debug' => false,
|
||||
'succeed_retries' => Astute.config.puppet_succeed_retries,
|
||||
'undefined_retries' => Astute.config.puppet_undefined_retries,
|
||||
'raw_report' => Astute.config.puppet_raw_report,
|
||||
'puppet_noop_run' => Astute.config.puppet_noop_run,
|
||||
'puppet_start_timeout' => Astute.config.puppet_start_timeout,
|
||||
@ -67,6 +68,7 @@ module Astute
|
||||
{
|
||||
'retries' => @task['parameters']['retries'],
|
||||
'succeed_retries' => @task['parameters']['succeed_retries'],
|
||||
'undefined_retries' => @task['parameters']['undefined_retries'],
|
||||
'timeout' => @task['parameters']['timeout'],
|
||||
'puppet_start_timeout' => @task['parameters'][
|
||||
'puppet_start_timeout'],
|
||||
|
@ -31,6 +31,7 @@ describe Astute::PuppetJob do
|
||||
{
|
||||
'retries' => 1,
|
||||
'succeed_retries' => 0,
|
||||
'undefined_retries' => 1,
|
||||
'timeout' => 1,
|
||||
'puppet_start_timeout' => 1,
|
||||
'puppet_start_interval' => 0
|
||||
@ -71,7 +72,7 @@ describe Astute::PuppetJob do
|
||||
describe '#task_status=' do
|
||||
it 'should raise error if status do not support' do
|
||||
expect {subject.send(:task_status=, 'unknow_status')}.to \
|
||||
raise_error(StatusValidationError, /unknow_status/)
|
||||
raise_error(Astute::StatusValidationError, /unknow_status/)
|
||||
end
|
||||
end
|
||||
|
||||
@ -84,7 +85,7 @@ describe Astute::PuppetJob do
|
||||
|
||||
puppet_mclient.expects(:status).returns('unknow_status')
|
||||
expect {subject.status}.to raise_error(
|
||||
StatusValidationError, /unknow_status/
|
||||
Astute::StatusValidationError, /unknow_status/
|
||||
)
|
||||
end
|
||||
end
|
||||
@ -124,7 +125,7 @@ describe Astute::PuppetJob do
|
||||
end
|
||||
|
||||
it 'should return runing when magent failed but can retry' do
|
||||
puppet_mclient.expects(:run).twice.returns(true)
|
||||
puppet_mclient.expects(:run).once.returns(true)
|
||||
subject.run
|
||||
|
||||
puppet_mclient.stubs(:status)
|
||||
@ -149,13 +150,45 @@ describe Astute::PuppetJob do
|
||||
expect(subject.status).to eq('successful')
|
||||
end
|
||||
|
||||
# Undefined answers and a failed run are each retried once here; the job
# must stay 'running' through all three and finish 'successful'.
it 'should successful if undefined/failed but retry succeed' do
  puppet_mclient.stubs(:run).returns(true)
  options['undefined_retries'] = 1
  options['retries'] = 1
  subject.run

  # Consecutive answers of the agent, in polling order.
  puppet_mclient.stubs(:status).returns(
    'undefined', 'stopped', 'undefined', 'succeed'
  )

  %w[running running running successful].each do |expected_status|
    expect(subject.status).to eq(expected_status)
  end
end
|
||||
|
||||
# Two failed runs are covered by two ordinary retries; no undefined
# retries are available or needed.
it 'should successful if failed but retry succeed' do
  puppet_mclient.stubs(:run).returns(true)
  options['undefined_retries'] = 0
  options['retries'] = 2
  subject.run

  # Consecutive answers of the agent, in polling order.
  puppet_mclient.stubs(:status).returns('stopped', 'stopped', 'succeed')

  %w[running running successful].each do |expected_status|
    expect(subject.status).to eq(expected_status)
  end
end
|
||||
|
||||
it 'should successful if undefined but retry succeed' do
|
||||
puppet_mclient.stubs(:run).returns(true)
|
||||
options['undefined_retries'] = 2
|
||||
options['retries'] = 0
|
||||
subject.run
|
||||
|
||||
puppet_mclient.stubs(:status)
|
||||
.returns('undefined')
|
||||
.then.returns('undefined')
|
||||
.then.returns('succeed')
|
||||
|
||||
@ -188,7 +221,7 @@ describe Astute::PuppetJob do
|
||||
subject.run
|
||||
|
||||
puppet_mclient.stubs(:status)
|
||||
.returns('undefined')
|
||||
.returns('stopped')
|
||||
.then.returns('stopped')
|
||||
|
||||
expect(subject.status).to eq('running')
|
||||
@ -210,13 +243,67 @@ describe Astute::PuppetJob do
|
||||
subject.run
|
||||
|
||||
puppet_mclient.stubs(:status)
|
||||
.returns('undefined')
|
||||
.returns('stopped')
|
||||
.then.returns('stopped')
|
||||
|
||||
expect(subject.status).to eq('running')
|
||||
3.times { expect(subject.status). to eq('failed') }
|
||||
end
|
||||
end
|
||||
|
||||
# Behaviour of PuppetJob#status when the agent answers 'undefined'.
context 'undefined' do
  # Relies on the shared options: one undefined retry is available.
  it 'should return failed if undefined and no more retries' do
    puppet_mclient.stubs(:run).returns(true)
    subject.run

    puppet_mclient.stubs(:status)
      .returns('undefined')
      .then.returns('undefined')

    expect(subject.status).to eq('running')
    expect(subject.status).to eq('failed')
  end

  it 'should return failed if time is over and no result' do
    puppet_mclient.stubs(:run).returns(true)
    options['timeout'] = 0
    subject.run

    puppet_mclient.stubs(:status).returns('undefined')
    expect(subject.status).to eq('failed')
  end

  it 'should do nothing if final status set and retries end' do
    puppet_mclient.stubs(:run).returns(true)
    # The option key is 'undefined_retries'; the former 'undefined' key was
    # never read. One retry is what the expectations below require: a
    # single 'running' answer followed by a sticky 'failed'.
    options['undefined_retries'] = 1
    subject.run

    puppet_mclient.stubs(:status)
      .returns('undefined')
      .then.returns('undefined')

    expect(subject.status).to eq('running')
    3.times { expect(subject.status).to eq('failed') }
  end

  it 'should reset retries if answer was received' do
    puppet_mclient.stubs(:run).returns(true)
    options['undefined_retries'] = 1
    options['retries'] = 0
    subject.run

    # The 'running' answer between the two 'undefined' ones must restore
    # the single undefined retry, otherwise the third poll would fail.
    puppet_mclient.stubs(:status)
      .then.returns('undefined')
      .returns('running')
      .then.returns('undefined')
      .then.returns('succeed')

    3.times { expect(subject.status).to eq('running') }
    expect(subject.status).to eq('successful')
  end
end
|
||||
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
Loading…
Reference in New Issue
Block a user