Erase provisioned nodes when provisioning is cancelled

* always erase nodes in bootstrap state (failsafe optimization);
* do not erase nodes in provisioned/bootstrap state using the shell script;
* for nodes in provisioned/bootstrap state, use the mcollective agent.

Change-Id: I2a3df52920f57f9c66e237de0d0d48a814ebf409
Related-Bug: #1316583
Closes-Bug: #1322573
parent 4b5a0003f6
commit 5fa18e8e08
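In outline, the cancellation flow this commit introduces: erase over SSH first (only nodes still in the discovery image comply), then retry over mcollective for nodes that already report a provisioned or bootstrap system type, and finally merge the two results, preferring the mcollective outcome for nodes it handled. A simplified, self-contained sketch of that flow follows; the stubbed helpers below stand in for the real SSH/mcollective calls and are not the Astute code itself:

    # Simplified sketch of the new Orchestrator#stop_provision flow.
    def stop_provision_via_ssh(nodes)
      # Nodes still in the discovery image erase themselves over SSH;
      # provisioned/bootstrap nodes refuse and show up as inaccessible.
      { 'nodes' => [], 'inaccessible_nodes' => nodes }
    end

    def stop_provision_via_mcollective(nodes)
      # Provisioned/bootstrap nodes answer via the mcollective agent instead.
      [nodes, { 'nodes' => nodes }]
    end

    def merge_rm_nodes_result(res1, res2)
      %w(nodes error_nodes inaccessible_nodes).inject({}) do |acc, status|
        acc[status] = (res1.fetch(status, []) + res2.fetch(status, [])).uniq
        acc
      end
    end

    nodes = [{ 'uid' => '1' }]
    ssh_result = stop_provision_via_ssh(nodes)
    provisioned_nodes, mco_result = stop_provision_via_mcollective(nodes)

    # Prefer the mcollective result for nodes it handled, then merge.
    %w(nodes error_nodes inaccessible_nodes).each do |status|
      ssh_result[status] = ssh_result.fetch(status, []) - provisioned_nodes
    end
    p merge_rm_nodes_result(ssh_result, mco_result)
    #=> {"nodes"=>[{"uid"=>"1"}], "error_nodes"=>[], "inaccessible_nodes"=>[]}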
@@ -72,7 +72,6 @@ module Astute
   conf[:SSH_RETRIES] = 5 # SSH tries to call ssh client before failure
   conf[:SSH_RETRY_TIMEOUT] = 30 # SSH sleeps for ## sec between retries

   conf[:MAX_NODES_PER_REMOVE_CALL] = 10 # how many nodes to remove in one call
+  conf[:NODES_REMOVE_INTERVAL] = 10 # sleeps for ## sec between remove calls

@@ -20,10 +20,9 @@ module Astute
     @log_parsing = log_parsing
   end

-  def node_type(reporter, task_id, nodes, timeout=nil)
+  def node_type(reporter, task_id, nodes_uids, timeout=nil)
     context = Context.new(task_id, reporter)
-    uids = nodes.map {|n| n['uid']}
-    systemtype = MClient.new(context, "systemtype", uids, check_result=false, timeout)
+    systemtype = MClient.new(context, "systemtype", nodes_uids, check_result=false, timeout)
     systems = systemtype.get_type
     systems.map do |n|
       {
@@ -94,7 +93,7 @@ module Astute
     catch :done do
       loop do
         sleep_not_greater_than(5) do
-          nodes_types = node_type(proxy_reporter, task_id, nodes, 2)
+          nodes_types = node_type(proxy_reporter, task_id, nodes.map {|n| n['uid']}, 2)
           target_uids, nodes_not_booted = analize_node_types(nodes_types, nodes_not_booted)

           if nodes.length == target_uids.length
@@ -146,9 +145,9 @@ module Astute
   def remove_nodes(reporter, task_id, engine_attrs, nodes, reboot=true)
     cobbler = CobblerManager.new(engine_attrs, reporter)
     cobbler.remove_nodes(nodes)
-    ctxt = Context.new(task_id, reporter)
-    result = NodesRemover.new(ctxt, nodes, reboot).remove
-    Rsyslogd.send_sighup(ctxt, engine_attrs["master_ip"])
+    ctx = Context.new(task_id, reporter)
+    result = NodesRemover.new(ctx, nodes, reboot).remove
+    Rsyslogd.send_sighup(ctx, engine_attrs["master_ip"])

     result
   end
@@ -160,13 +159,21 @@ module Astute
   end

   def stop_provision(reporter, task_id, engine_attrs, nodes)
-    Ssh.execute(Context.new(task_id, reporter), nodes, SshEraseNodes.command)
-    CobblerManager.new(engine_attrs, reporter).remove_nodes(nodes)
-    Ssh.execute(Context.new(task_id, reporter),
-                nodes,
-                SshHardReboot.command,
-                timeout=5,
-                retries=1)
+    ctx = Context.new(task_id, reporter)
+
+    ssh_result = stop_provision_via_ssh(ctx, nodes, engine_attrs)
+
+    # Remove already provisioned node. Possible erasing nodes twice
+    provisioned_nodes, mco_result = stop_provision_via_mcollective(ctx, nodes)
+
+    # For nodes responded via mcollective use mcollective result instead of ssh
+    ['nodes', 'error_nodes', 'inaccessible_nodes'].each do |node_status|
+      ssh_result[node_status] = ssh_result.fetch(node_status, []) - provisioned_nodes
+    end
+
+    result = merge_rm_nodes_result(ssh_result, mco_result)
+    result['status'] = 'error' if result['error_nodes'].present?
+    result
   end

   def dump_environment(reporter, task_id, settings)
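One subtlety in the merge step above: `ssh_result[node_status] - provisioned_nodes` leans on Ruby's `Array#-`, which compares elements with `eql?`/`hash`, and two Hashes with the same keys and values are `eql?`. That is why both sides are normalised to bare `{'uid' => ...}` hashes. A minimal illustration (values invented):

    ssh_nodes         = [{'uid' => '1'}, {'uid' => '2'}]
    provisioned_nodes = [{'uid' => '1'}]

    # Hash equality is content-based, so node '1' is filtered out:
    p ssh_nodes - provisioned_nodes #=> [{"uid"=>"2"}]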
@@ -232,5 +239,61 @@ module Astute
       end
     end

+    def stop_provision_via_mcollective(ctx, nodes)
+      return [], {} if nodes.empty?
+
+      mco_result = {}
+      nodes_uids = nodes.map{ |n| n['uid'] }
+
+      Astute.config.MC_RETRIES.times do |i|
+        sleep Astute.config.NODES_REMOVE_INTERVAL
+
+        Astute.logger.debug "Trying to connect to nodes #{nodes_uids} using mcollective"
+        nodes_types = node_type(ctx.reporter, ctx.task_id, nodes_uids, 2)
+        next if nodes_types.empty?
+
+        provisioned = nodes_types.select{ |n| ['target', 'bootstrap'].include? n['node_type'] }
+                                 .map{ |n| {'uid' => n['uid']} }
+        current_mco_result = NodesRemover.new(ctx, provisioned, reboot=true).remove
+        Astute.logger.debug "Retry result #{i}: "\
+          "mco success nodes: #{current_mco_result['nodes']}, "\
+          "mco error nodes: #{current_mco_result['error_nodes']}, "\
+          "mco inaccessible nodes: #{current_mco_result['inaccessible_nodes']}"
+
+        mco_result = merge_rm_nodes_result(mco_result, current_mco_result)
+        nodes_uids -= provisioned.map{ |n| n['uid'] }
+
+        break if nodes_uids.empty?
+      end
+
+      provisioned_nodes = nodes.map{ |n| {'uid' => n['uid']} } - nodes_uids.map {|n| {'uid' => n} }
+
+      Astute.logger.debug "MCO final result: "\
+        "mco success nodes: #{mco_result['nodes']}, "\
+        "mco error nodes: #{mco_result['error_nodes']}, "\
+        "mco inaccessible nodes: #{mco_result['inaccessible_nodes']}, "\
+        "all mco nodes: #{provisioned_nodes}"
+
+      return provisioned_nodes, mco_result
+    end
+
+    def stop_provision_via_ssh(ctx, nodes, engine_attrs)
+      ssh_result = Ssh.execute(ctx, nodes, SshEraseNodes.command)
+      CobblerManager.new(engine_attrs, ctx.reporter).remove_nodes(nodes)
+      Ssh.execute(ctx,
+                  nodes,
+                  SshHardReboot.command,
+                  timeout=5,
+                  retries=1)
+      ssh_result
+    end
+
+    def merge_rm_nodes_result(res1, res2)
+      ['nodes', 'error_nodes', 'inaccessible_nodes'].inject({}) do |result, node_status|
+        result[node_status] = (res1.fetch(node_status, []) + res2.fetch(node_status, [])).uniq
+        result
+      end
+    end
+
   end
 end
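The retry loop keeps its books in `nodes_uids`: each pass removes the uids that mcollective successfully handled, and whatever disappeared from the list by the end is, by elimination, the set of provisioned nodes. A small sketch of that bookkeeping (values invented for the example):

    nodes      = [{'uid' => '1'}, {'uid' => '2'}]
    nodes_uids = nodes.map { |n| n['uid'] }        #=> ["1", "2"]

    # Suppose the first retry only reaches node '1' as bootstrap/target:
    provisioned = [{'uid' => '1'}]
    nodes_uids -= provisioned.map { |n| n['uid'] } #=> ["2"]

    # After the loop, everything that vanished from nodes_uids was
    # handled via mcollective:
    provisioned_nodes =
      nodes.map { |n| {'uid' => n['uid']} } - nodes_uids.map { |n| {'uid' => n} }
    p provisioned_nodes #=> [{"uid"=>"1"}]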
@@ -92,8 +92,15 @@ module Astute
         # execute commands on all servers
         # FIXME: debug not show a messages if command contain a several
         # strings
-        channel = session.exec cmd do |ch, stream, data|
-          Astute.logger.debug "[#{ch[:host]} : #{stream}] #{data}"
+        channel = session.exec cmd do |ch, success|
+
+          ch.on_data do |ichannel, data|
+            Astute.logger.debug "[#{ch[:host]} : #{ichannel}] #{data}"
+          end
+
+          ch.on_request "exit-status" do |_ichannel, data|
+            exit_status = data.read_long
+          end
         end

         Timeout::timeout(timeout) { session.loop }
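The rewritten exec block registers per-channel callbacks instead of the flat data callback: `on_data` streams remote output, and the SSH "exit-status" channel request carries the remote command's exit code. A minimal standalone sketch of that net-ssh pattern (host, user, and command are placeholders, and error handling is elided):

    require 'net/ssh'

    exit_status = nil
    Net::SSH.start('example-host', 'root') do |ssh|
      channel = ssh.open_channel do |ch|
        ch.exec('uname -a') do |ch2, success|
          raise 'could not start command' unless success
          # Streamed stdout from the remote command:
          ch2.on_data { |_c, data| puts data }
          # The "exit-status" request carries the command's exit code:
          ch2.on_request('exit-status') { |_c, data| exit_status = data.read_long }
        end
      end
      channel.wait # process the event loop until the channel closes
    end
    puts "exit status: #{exit_status}"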
@@ -64,11 +64,15 @@ module Astute
            done
        }

+       # Need more robust mechanizm to detect provisining or provisined node
+       node_type=$(cat /etc/nailgun_systemtype)
+       if [ "$node_type" == "target" ] || [ "$node_type" == "bootstrap" ]; then
+           echo "Do not erase $node_type node using shell"
+           exit
+       fi
+
        echo "Run erase node command"
        erase_boot_devices

        # Avoid shell hang using nohup and stdout/stderr redirections
        # nohup reboot_with_sleep > /dev/null 2>&1 &
      ERASE_COMMAND
    end
  end
@@ -17,6 +17,12 @@ module Astute

    def self.command
      <<-REBOOT_COMMAND
+       # Need more robust mechanizm to detect provisining or provisined node
+       node_type=$(cat /etc/nailgun_systemtype)
+       if [ "$node_type" == "target" ] || [ "$node_type" == "bootstrap" ]; then
+           echo "Do not affect $node_type node"
+           exit
+       fi
        echo "Run node rebooting command using 'SB' to sysrq-trigger"
        echo "1" > /proc/sys/kernel/panic_on_oops
        echo "10" > /proc/sys/kernel/panic
@@ -37,6 +37,7 @@ Astute.config.PUPPET_FADE_TIMEOUT = 1
 Astute.config.MC_RETRY_INTERVAL = 0
 Astute.config.PROVISIONING_TIMEOUT = 0
 Astute.config.REBOOT_TIMEOUT = 0
 Astute.config.SSH_RETRY_TIMEOUT = 0
+Astute.config.NODES_REMOVE_INTERVAL = 0
 Astute.logger = Logger.new(STDERR)

@@ -44,7 +44,7 @@ describe Astute::Orchestrator do
     rpcclient = mock_rpcclient(nodes, mc_timeout)
     rpcclient.expects(:get_type).once.returns([mc_res])

-    types = @orchestrator.node_type(@reporter, 'task_uuid', nodes, mc_timeout)
+    types = @orchestrator.node_type(@reporter, 'task_uuid', nodes.map { |n| n['uid'] }, mc_timeout)
     types.should eql([{"node_type"=>"target", "uid"=>"1"}])
   end
 end
@@ -149,6 +149,7 @@ describe Astute::Orchestrator do
       'uid' => '1',
       'profile' => 'centos-x86_64',
       "slave_name"=>"controller-1",
+      "admin_ip" =>'1.2.3.5',
       'power_type' => 'ssh',
       'power_user' => 'root',
       'power_pass' => '/root/.ssh/bootstrap.rsa',
@@ -355,4 +356,186 @@ describe Astute::Orchestrator do
     end

   end

+  describe '#stop_provision' do
+    around(:each) do |example|
+      old_ssh_retries = Astute.config.SSH_RETRIES
+      old_mc_retries = Astute.config.MC_RETRIES
+      old_nodes_rm_interal = Astute.config.NODES_REMOVE_INTERVAL
+      example.run
+      Astute.config.SSH_RETRIES = old_ssh_retries
+      Astute.config.MC_RETRIES = old_mc_retries
+      Astute.config.NODES_REMOVE_INTERVAL = old_nodes_rm_interal
+    end
+
+    before(:each) do
+      Astute.config.SSH_RETRIES = 1
+      Astute.config.MC_RETRIES = 1
+      Astute.config.NODES_REMOVE_INTERVAL = 0
+    end
+
+    it 'erase nodes using ssh' do
+      Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
+      @orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
+      Astute::Ssh.stubs(:execute).returns({'inaccessible_nodes' => [{'uid' => 1}]}).once
+
+      Astute::Ssh.expects(:execute).with(instance_of(Astute::Context),
+                                         data['nodes'],
+                                         Astute::SshEraseNodes.command)
+                 .returns({'nodes' => [{'uid' => 1}]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>1}]
+                    })
+    end
+
+    it 'always remove nodes from Cobbler' do
+      Astute::Ssh.stubs(:execute).twice.returns({'inaccessible_nodes' => [{'uid' => 1}]})
+      @orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
+
+      Astute::CobblerManager.any_instance.expects(:remove_nodes)
+                            .with(data['nodes'])
+                            .returns([])
+
+      @orchestrator.stop_provision(@reporter,
+                                   data['task_uuid'],
+                                   data['engine'],
+                                   data['nodes'])
+    end
+
+    it 'reboot nodes using ssh' do
+      Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
+      @orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
+      Astute::Ssh.stubs(:execute).returns({'nodes' => [{'uid' => 1}]}).once
+
+      Astute::Ssh.expects(:execute).with(instance_of(Astute::Context),
+                                         data['nodes'],
+                                         Astute::SshHardReboot.command,
+                                         timeout=5,
+                                         retries=1)
+                 .returns({'inaccessible_nodes' => [{'uid' => 1}]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>1}]
+                    })
+    end
+
+    it 'stop provision if provision operation stop immediately' do
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'inaccessible_nodes' => [{'uid' => '1'}]})
+      @orchestrator.stubs(:node_type).returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
+
+      Astute::NodesRemover.any_instance.expects(:remove)
+                          .once.returns({"nodes"=>[{"uid"=>"1", }]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>"1"}]
+                    })
+    end
+
+    it 'stop provision if provision operation stop in the end' do
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'nodes' => [{'uid' => "1"}]})
+      @orchestrator.stubs(:node_type).returns([{'uid' => "1", 'node_type' => 'target'}])
+
+      Astute::NodesRemover.any_instance.expects(:remove)
+                          .once.returns({"nodes"=>[{"uid"=>"1", }]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>"1"}]
+                    })
+    end
+
+    it 'inform about inaccessible nodes' do
+      Astute::Ssh.stubs(:execute).returns({'inaccessible_nodes' => [{'uid' => 1}]}).twice
+      Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
+      @orchestrator.stubs(:node_type).returns([])
+
+      Astute::NodesRemover.any_instance.expects(:remove).never
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [{"uid"=>1}],
+                     "nodes" => []
+                    })
+    end
+
+    it 'sleep between attempts to find and erase nodes using mcollective' do
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'inaccessible_nodes' => [{'uid' => '1'}]})
+      @orchestrator.stubs(:node_type).returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
+      Astute::NodesRemover.any_instance.stubs(:remove)
+                          .once.returns({"nodes"=>[{"uid"=>"1", }]})
+
+      @orchestrator.expects(:sleep).with(Astute.config.NODES_REMOVE_INTERVAL)
+
+      @orchestrator.stop_provision(@reporter,
+                                   data['task_uuid'],
+                                   data['engine'],
+                                   data['nodes'])
+    end
+
+    it 'perform several attempts to find and erase nodes using mcollective' do
+      Astute.config.MC_RETRIES = 2
+      Astute.config.NODES_REMOVE_INTERVAL = 0
+
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'nodes' => [{'uid' => "1"}],
+                             'inaccessible_nodes' => [{'uid' => '2'}]})
+
+      @orchestrator.stubs(:node_type).twice
+                   .returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
+                   .then.returns([{'uid' => '2', 'node_type' => 'target'}])
+
+      Astute::NodesRemover.any_instance.stubs(:remove).twice
+                          .returns({"nodes"=>[{"uid"=>"1"}]}).then
+                          .returns({"error_nodes"=>[{"uid"=>"2"}]})
+
+      data['nodes'] << {
+        "uid" => '2',
+        "slave_name"=>"controller-2",
+        "admin_ip" =>'1.2.3.6'
+      }
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [{"uid"=>'2'}],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>"1"}],
+                     "status" => "error"
+                    })
+    end
+
+  end # stop_provision
 end
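A note on the spec structure: because these examples mutate global Astute.config values, they wrap each example in an around(:each) hook that saves the affected keys and restores them after the example runs (RSpec captures example failures, so the restore still executes). The same pattern in isolation, with a hypothetical FakeConfig standing in for Astute.config:

    require 'rspec/autorun'

    # Hypothetical stand-in for a mutable global config such as Astute.config.
    FakeConfig = Struct.new(:retries).new(5)

    RSpec.describe 'a config-sensitive example' do
      around(:each) do |example|
        old_retries = FakeConfig.retries # save global state
        example.run                      # failures are captured here, so...
        FakeConfig.retries = old_retries # ...the restore always runs
      end

      before(:each) { FakeConfig.retries = 1 }

      it 'sees the per-example value' do
        expect(FakeConfig.retries).to eq(1)
      end
    end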