New behavior for idling puppet state

* Kill the process instead of sending the USR1 signal; * Increase the time to examine a running process from 60 to 120 sec; * Increase the time between last_run requests from 5 to 10 sec. Change-Id: I3fce9ac162a0201f7dc7062dead3350f55e350fc Closes-Bug: #1261276
2013-12-16 16:25:45 +04:00 · 2013-12-16 16:25:45 +04:00 · f655181184
parent 75aa0877cb
commit f655181184
4 changed files with 14 additions and 13 deletions
--- a/examples/example_astute_config.yaml
+++ b/examples/example_astute_config.yaml
@ -15,8 +15,8 @@ PUPPET_DEPLOY_INTERVAL: 2
 # After the Puppet agent has finished its real work, it spends some time exiting gracefully.
 # PUPPET_FADE_TIMEOUT means how long (in seconds) Astute can take for Puppet
 # to exit after real work has finished.
-PUPPET_FADE_TIMEOUT: 60
+PUPPET_FADE_TIMEOUT: 120
 # PUPPET_FADE_INTERVAL is used in puppetd.rb file.
 # Retry every PUPPET_FADE_INTERVAL seconds to check puppet state if it was
 # in 'running' state.
-PUPPET_FADE_INTERVAL: 5
+PUPPET_FADE_INTERVAL: 10
--- a/lib/astute/config.rb
+++ b/lib/astute/config.rb
@ -55,10 +55,10 @@ module Astute
    conf = {}
    conf[:PUPPET_TIMEOUT] = 60 * 60       # maximum time it waits for the whole deployment
    conf[:PUPPET_DEPLOY_INTERVAL] = 2     # sleep for ## sec, then check puppet status again
-    conf[:PUPPET_FADE_TIMEOUT] = 60       # how long it can take for puppet to exit after dumping to last_run_summary
+    conf[:PUPPET_FADE_TIMEOUT] = 120      # how long it can take for puppet to exit after dumping to last_run_summary
    conf[:MC_RETRIES] = 5                 # MClient tries to call mcagent before failure
    conf[:MC_RETRY_INTERVAL] = 1          # MClient sleeps for ## sec between retries
-    conf[:PUPPET_FADE_INTERVAL] = 5       # retry every ## seconds to check puppet state if it was running
+    conf[:PUPPET_FADE_INTERVAL] = 10      # retry every ## seconds to check puppet state if it was running
    conf[:PROVISIONING_TIMEOUT] = 90 * 60 # timeout for booting target OS in provision
    conf[:REBOOT_TIMEOUT] = 120           # how long it can take for node to reboot

--- a/lib/astute/puppetd.rb
+++ b/lib/astute/puppetd.rb
@ -91,13 +91,12 @@ module Astute
      # but we should turn it on only in error_nodes
      succeed_nodes -= hung_nodes
      error_nodes = (error_nodes + hung_nodes).uniq
-      running_nodes = last_run.map {|n| n.results[:sender]} - stopped_nodes - hung_nodes
-
+      running_nodes -= hung_nodes

      nodes_to_check = running_nodes + succeed_nodes + error_nodes
-      unless nodes_to_check.size == last_run.size
-        raise "Should never happen. Internal error in nodes statuses calculation. Statuses calculated for: #{nodes_to_check.inspect},"
-                    "nodes passed to check statuses of: #{last_run.map {|n| n.results[:sender]}}"
+      all_nodes = last_run.map { |n| n.results[:sender] }
+      if nodes_to_check.size != all_nodes.size
+        raise "Internal error. Check: #{nodes_to_check.inspect}, passed #{all_nodes.inspect}"
      end
      {'succeed' => succeed_nodes, 'error' => error_nodes, 'running' => running_nodes}
    end
--- a/mcagents/puppetd.rb
+++ b/mcagents/puppetd.rb
@ -148,10 +148,12 @@ module MCollective
        when 'idling' then       # signal daemon
          pid = puppet_agent_pid
          begin
-            ::Process.kill('USR1', pid)
-            reply[:output] = "Signalled daemonized puppet to run (process #{pid}); " + (reply[:output] || '')
-          rescue => ex
-            reply.fail "Failed to signal the puppet daemon (process #{pid}): #{ex}"
+            ::Process.kill('INT', pid)
+          rescue Errno::ESRCH => e
+            reply[:err_msg] = "Failed to signal the puppet apply daemon (process #{pid}): #{e}"
+          ensure
+            runonce_background
+            reply[:output] = "Kill old idling puppet process #{pid})." + (reply[:output] || '')
          end

        when 'stopped' then      # just run