New behavior for idling puppet state

* Kill the process instead of sending the USR1 signal;
* Increase time to examine running process from 60 to 120 sec;
* Increase time between last_run request from 5 to 10 sec.

Change-Id: I3fce9ac162a0201f7dc7062dead3350f55e350fc
Closes-Bug: #1261276
This commit is contained in:
Vladimir 2013-12-16 16:25:45 +04:00
parent 75aa0877cb
commit f655181184
4 changed files with 14 additions and 13 deletions

View File

@ -15,8 +15,8 @@ PUPPET_DEPLOY_INTERVAL: 2
# After the Puppet agent has finished its real work, it spends some time exiting gracefully.
# PUPPET_FADE_TIMEOUT is how long (in seconds) Astute waits for Puppet
# to exit after the real work has finished.
PUPPET_FADE_TIMEOUT: 60
PUPPET_FADE_TIMEOUT: 120
# PUPPET_FADE_INTERVAL is used in puppetd.rb file.
# Retry every PUPPET_FADE_INTERVAL seconds to check puppet state if it was
# in 'running' state.
PUPPET_FADE_INTERVAL: 5
PUPPET_FADE_INTERVAL: 10

View File

@ -55,10 +55,10 @@ module Astute
conf = {}
conf[:PUPPET_TIMEOUT] = 60 * 60 # maximum time it waits for the whole deployment
conf[:PUPPET_DEPLOY_INTERVAL] = 2 # sleep for ## sec, then check puppet status again
conf[:PUPPET_FADE_TIMEOUT] = 60 # how long it can take for puppet to exit after dumping to last_run_summary
conf[:PUPPET_FADE_TIMEOUT] = 120 # how long it can take for puppet to exit after dumping to last_run_summary
conf[:MC_RETRIES] = 5 # MClient tries to call mcagent before failure
conf[:MC_RETRY_INTERVAL] = 1 # MClient sleeps for ## sec between retries
conf[:PUPPET_FADE_INTERVAL] = 5 # retry every ## seconds to check puppet state if it was running
conf[:PUPPET_FADE_INTERVAL] = 10 # retry every ## seconds to check puppet state if it was running
conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
conf[:REBOOT_TIMEOUT] = 120 # how long it can take for node to reboot

View File

@ -91,13 +91,12 @@ module Astute
# but we should turn it on only in error_nodes
succeed_nodes -= hung_nodes
error_nodes = (error_nodes + hung_nodes).uniq
running_nodes = last_run.map {|n| n.results[:sender]} - stopped_nodes - hung_nodes
running_nodes -= hung_nodes
nodes_to_check = running_nodes + succeed_nodes + error_nodes
unless nodes_to_check.size == last_run.size
raise "Should never happen. Internal error in nodes statuses calculation. Statuses calculated for: #{nodes_to_check.inspect},"
"nodes passed to check statuses of: #{last_run.map {|n| n.results[:sender]}}"
all_nodes = last_run.map { |n| n.results[:sender] }
if nodes_to_check.size != all_nodes.size
raise "Internal error. Check: #{nodes_to_check.inspect}, passed #{all_nodes.inspect}"
end
{'succeed' => succeed_nodes, 'error' => error_nodes, 'running' => running_nodes}
end

View File

@ -148,10 +148,12 @@ module MCollective
when 'idling' then # signal daemon
pid = puppet_agent_pid
begin
::Process.kill('USR1', pid)
reply[:output] = "Signalled daemonized puppet to run (process #{pid}); " + (reply[:output] || '')
rescue => ex
reply.fail "Failed to signal the puppet daemon (process #{pid}): #{ex}"
::Process.kill('INT', pid)
rescue Errno::ESRCH => e
reply[:err_msg] = "Failed to signal the puppet apply daemon (process #{pid}): #{e}"
ensure
runonce_background
reply[:output] = "Kill old idling puppet process #{pid})." + (reply[:output] || '')
end
when 'stopped' then # just run