Erase provisioned nodes when provisioning is cancelled

* always erase nodes in bootstrap state (failsafe optimization);
* do not erase nodes in provisioned/bootstrap state using the shell script;
* for nodes in provisioned/bootstrap state, use the mcollective agent.

Change-Id: I2a3df52920f57f9c66e237de0d0d48a814ebf409
Related-Bug: #1316583
Closes-Bug: #1322573
parent 4b5a0003f6
commit 5fa18e8e08
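In outline, the cancellation flow this commit introduces: erase over SSH first (only nodes still in the discovery image comply), then retry over mcollective for nodes that already report a provisioned or bootstrap system type, and finally merge the two results, preferring the mcollective outcome for nodes it handled. A simplified, self-contained sketch of that flow follows; the stubbed helpers below stand in for the real SSH/mcollective calls and are not the Astute code itself:

    # Simplified sketch of the new Orchestrator#stop_provision flow.
    def stop_provision_via_ssh(nodes)
      # Nodes still in the discovery image erase themselves over SSH;
      # provisioned/bootstrap nodes refuse and show up as inaccessible.
      { 'nodes' => [], 'inaccessible_nodes' => nodes }
    end

    def stop_provision_via_mcollective(nodes)
      # Provisioned/bootstrap nodes answer via the mcollective agent instead.
      [nodes, { 'nodes' => nodes }]
    end

    def merge_rm_nodes_result(res1, res2)
      %w(nodes error_nodes inaccessible_nodes).inject({}) do |acc, status|
        acc[status] = (res1.fetch(status, []) + res2.fetch(status, [])).uniq
        acc
      end
    end

    nodes = [{ 'uid' => '1' }]
    ssh_result = stop_provision_via_ssh(nodes)
    provisioned_nodes, mco_result = stop_provision_via_mcollective(nodes)

    # Prefer the mcollective result for nodes it handled, then merge.
    %w(nodes error_nodes inaccessible_nodes).each do |status|
      ssh_result[status] = ssh_result.fetch(status, []) - provisioned_nodes
    end
    p merge_rm_nodes_result(ssh_result, mco_result)
    #=> {"nodes"=>[{"uid"=>"1"}], "error_nodes"=>[], "inaccessible_nodes"=>[]}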
@@ -72,7 +72,6 @@ module Astute
   conf[:SSH_RETRIES] = 5 # SSH tries to call ssh client before failure
   conf[:SSH_RETRY_TIMEOUT] = 30 # SSH sleeps for ## sec between retries

   conf[:MAX_NODES_PER_REMOVE_CALL] = 10 # how many nodes to remove in one call
+  conf[:NODES_REMOVE_INTERVAL] = 10 # sleeps for ## sec between remove calls

@@ -20,10 +20,9 @@ module Astute
     @log_parsing = log_parsing
   end

-  def node_type(reporter, task_id, nodes, timeout=nil)
+  def node_type(reporter, task_id, nodes_uids, timeout=nil)
     context = Context.new(task_id, reporter)
-    uids = nodes.map {|n| n['uid']}
-    systemtype = MClient.new(context, "systemtype", uids, check_result=false, timeout)
+    systemtype = MClient.new(context, "systemtype", nodes_uids, check_result=false, timeout)
     systems = systemtype.get_type
     systems.map do |n|
       {
@@ -94,7 +93,7 @@ module Astute
     catch :done do
       loop do
         sleep_not_greater_than(5) do
-          nodes_types = node_type(proxy_reporter, task_id, nodes, 2)
+          nodes_types = node_type(proxy_reporter, task_id, nodes.map {|n| n['uid']}, 2)
           target_uids, nodes_not_booted = analize_node_types(nodes_types, nodes_not_booted)

           if nodes.length == target_uids.length
@@ -146,9 +145,9 @@ module Astute
   def remove_nodes(reporter, task_id, engine_attrs, nodes, reboot=true)
     cobbler = CobblerManager.new(engine_attrs, reporter)
     cobbler.remove_nodes(nodes)
-    ctxt = Context.new(task_id, reporter)
-    result = NodesRemover.new(ctxt, nodes, reboot).remove
-    Rsyslogd.send_sighup(ctxt, engine_attrs["master_ip"])
+    ctx = Context.new(task_id, reporter)
+    result = NodesRemover.new(ctx, nodes, reboot).remove
+    Rsyslogd.send_sighup(ctx, engine_attrs["master_ip"])

     result
   end
@@ -160,13 +159,21 @@ module Astute
   end

   def stop_provision(reporter, task_id, engine_attrs, nodes)
-    Ssh.execute(Context.new(task_id, reporter), nodes, SshEraseNodes.command)
-    CobblerManager.new(engine_attrs, reporter).remove_nodes(nodes)
-    Ssh.execute(Context.new(task_id, reporter),
-                nodes,
-                SshHardReboot.command,
-                timeout=5,
-                retries=1)
+    ctx = Context.new(task_id, reporter)
+
+    ssh_result = stop_provision_via_ssh(ctx, nodes, engine_attrs)
+
+    # Remove already provisioned node. Possible erasing nodes twice
+    provisioned_nodes, mco_result = stop_provision_via_mcollective(ctx, nodes)
+
+    # For nodes responded via mcollective use mcollective result instead of ssh
+    ['nodes', 'error_nodes', 'inaccessible_nodes'].each do |node_status|
+      ssh_result[node_status] = ssh_result.fetch(node_status, []) - provisioned_nodes
+    end
+
+    result = merge_rm_nodes_result(ssh_result, mco_result)
+    result['status'] = 'error' if result['error_nodes'].present?
+    result
   end

   def dump_environment(reporter, task_id, settings)
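One subtlety in the merge step above: `ssh_result[node_status] - provisioned_nodes` leans on Ruby's `Array#-`, which compares elements with `eql?`/`hash`, and two Hashes with the same keys and values are `eql?`. That is why both sides are normalised to bare `{'uid' => ...}` hashes. A minimal illustration (values invented):

    ssh_nodes         = [{'uid' => '1'}, {'uid' => '2'}]
    provisioned_nodes = [{'uid' => '1'}]

    # Hash equality is content-based, so node '1' is filtered out:
    p ssh_nodes - provisioned_nodes #=> [{"uid"=>"2"}]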
@@ -232,5 +239,61 @@ module Astute
       end
     end

+    def stop_provision_via_mcollective(ctx, nodes)
+      return [], {} if nodes.empty?
+
+      mco_result = {}
+      nodes_uids = nodes.map{ |n| n['uid'] }
+
+      Astute.config.MC_RETRIES.times do |i|
+        sleep Astute.config.NODES_REMOVE_INTERVAL
+
+        Astute.logger.debug "Trying to connect to nodes #{nodes_uids} using mcollective"
+        nodes_types = node_type(ctx.reporter, ctx.task_id, nodes_uids, 2)
+        next if nodes_types.empty?
+
+        provisioned = nodes_types.select{ |n| ['target', 'bootstrap'].include? n['node_type'] }
+                                 .map{ |n| {'uid' => n['uid']} }
+        current_mco_result = NodesRemover.new(ctx, provisioned, reboot=true).remove
+        Astute.logger.debug "Retry result #{i}: "\
+          "mco success nodes: #{current_mco_result['nodes']}, "\
+          "mco error nodes: #{current_mco_result['error_nodes']}, "\
+          "mco inaccessible nodes: #{current_mco_result['inaccessible_nodes']}"
+
+        mco_result = merge_rm_nodes_result(mco_result, current_mco_result)
+        nodes_uids -= provisioned.map{ |n| n['uid'] }
+
+        break if nodes_uids.empty?
+      end
+
+      provisioned_nodes = nodes.map{ |n| {'uid' => n['uid']} } - nodes_uids.map {|n| {'uid' => n} }
+
+      Astute.logger.debug "MCO final result: "\
+        "mco success nodes: #{mco_result['nodes']}, "\
+        "mco error nodes: #{mco_result['error_nodes']}, "\
+        "mco inaccessible nodes: #{mco_result['inaccessible_nodes']}, "\
+        "all mco nodes: #{provisioned_nodes}"
+
+      return provisioned_nodes, mco_result
+    end
+
+    def stop_provision_via_ssh(ctx, nodes, engine_attrs)
+      ssh_result = Ssh.execute(ctx, nodes, SshEraseNodes.command)
+      CobblerManager.new(engine_attrs, ctx.reporter).remove_nodes(nodes)
+      Ssh.execute(ctx,
+                  nodes,
+                  SshHardReboot.command,
+                  timeout=5,
+                  retries=1)
+      ssh_result
+    end
+
+    def merge_rm_nodes_result(res1, res2)
+      ['nodes', 'error_nodes', 'inaccessible_nodes'].inject({}) do |result, node_status|
+        result[node_status] = (res1.fetch(node_status, []) + res2.fetch(node_status, [])).uniq
+        result
+      end
+    end
+
   end
 end
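The retry loop keeps its books in `nodes_uids`: each pass removes the uids that mcollective successfully handled, and whatever disappeared from the list by the end is, by elimination, the set of provisioned nodes. A small sketch of that bookkeeping (values invented for the example):

    nodes      = [{'uid' => '1'}, {'uid' => '2'}]
    nodes_uids = nodes.map { |n| n['uid'] }        #=> ["1", "2"]

    # Suppose the first retry only reaches node '1' as bootstrap/target:
    provisioned = [{'uid' => '1'}]
    nodes_uids -= provisioned.map { |n| n['uid'] } #=> ["2"]

    # After the loop, everything that vanished from nodes_uids was
    # handled via mcollective:
    provisioned_nodes =
      nodes.map { |n| {'uid' => n['uid']} } - nodes_uids.map { |n| {'uid' => n} }
    p provisioned_nodes #=> [{"uid"=>"1"}]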
@@ -92,8 +92,15 @@ module Astute
         # execute commands on all servers
         # FIXME: debug not show a messages if command contain a several
         # strings
-        channel = session.exec cmd do |ch, stream, data|
-          Astute.logger.debug "[#{ch[:host]} : #{stream}] #{data}"
+        channel = session.exec cmd do |ch, success|
+
+          ch.on_data do |ichannel, data|
+            Astute.logger.debug "[#{ch[:host]} : #{ichannel}] #{data}"
+          end
+
+          ch.on_request "exit-status" do |_ichannel, data|
+            exit_status = data.read_long
+          end
         end

         Timeout::timeout(timeout) { session.loop }
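The rewritten exec block registers per-channel callbacks instead of the flat data callback: `on_data` streams remote output, and the SSH "exit-status" channel request carries the remote command's exit code. A minimal standalone sketch of that net-ssh pattern (host, user, and command are placeholders, and error handling is elided):

    require 'net/ssh'

    exit_status = nil
    Net::SSH.start('example-host', 'root') do |ssh|
      channel = ssh.open_channel do |ch|
        ch.exec('uname -a') do |ch2, success|
          raise 'could not start command' unless success
          # Streamed stdout from the remote command:
          ch2.on_data { |_c, data| puts data }
          # The "exit-status" request carries the command's exit code:
          ch2.on_request('exit-status') { |_c, data| exit_status = data.read_long }
        end
      end
      channel.wait # process the event loop until the channel closes
    end
    puts "exit status: #{exit_status}"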
@@ -64,11 +64,15 @@ module Astute
            done
        }

+       # Need more robust mechanizm to detect provisining or provisined node
+       node_type=$(cat /etc/nailgun_systemtype)
+       if [ "$node_type" == "target" ] || [ "$node_type" == "bootstrap" ]; then
+           echo "Do not erase $node_type node using shell"
+           exit
+       fi
+
        echo "Run erase node command"
        erase_boot_devices

        # Avoid shell hang using nohup and stdout/stderr redirections
        # nohup reboot_with_sleep > /dev/null 2>&1 &
      ERASE_COMMAND
    end
  end
@@ -17,6 +17,12 @@ module Astute

    def self.command
      <<-REBOOT_COMMAND
+       # Need more robust mechanizm to detect provisining or provisined node
+       node_type=$(cat /etc/nailgun_systemtype)
+       if [ "$node_type" == "target" ] || [ "$node_type" == "bootstrap" ]; then
+           echo "Do not affect $node_type node"
+           exit
+       fi
        echo "Run node rebooting command using 'SB' to sysrq-trigger"
        echo "1" > /proc/sys/kernel/panic_on_oops
        echo "10" > /proc/sys/kernel/panic
@@ -37,6 +37,7 @@ Astute.config.PUPPET_FADE_TIMEOUT = 1
 Astute.config.MC_RETRY_INTERVAL = 0
 Astute.config.PROVISIONING_TIMEOUT = 0
 Astute.config.REBOOT_TIMEOUT = 0
 Astute.config.SSH_RETRY_TIMEOUT = 0
+Astute.config.NODES_REMOVE_INTERVAL = 0
 Astute.logger = Logger.new(STDERR)

@@ -44,7 +44,7 @@ describe Astute::Orchestrator do
     rpcclient = mock_rpcclient(nodes, mc_timeout)
     rpcclient.expects(:get_type).once.returns([mc_res])

-    types = @orchestrator.node_type(@reporter, 'task_uuid', nodes, mc_timeout)
+    types = @orchestrator.node_type(@reporter, 'task_uuid', nodes.map { |n| n['uid'] }, mc_timeout)
     types.should eql([{"node_type"=>"target", "uid"=>"1"}])
   end
 end
@@ -149,6 +149,7 @@ describe Astute::Orchestrator do
       'uid' => '1',
       'profile' => 'centos-x86_64',
       "slave_name"=>"controller-1",
+      "admin_ip" =>'1.2.3.5',
       'power_type' => 'ssh',
       'power_user' => 'root',
       'power_pass' => '/root/.ssh/bootstrap.rsa',
@@ -355,4 +356,186 @@ describe Astute::Orchestrator do
     end

   end

+  describe '#stop_provision' do
+    around(:each) do |example|
+      old_ssh_retries = Astute.config.SSH_RETRIES
+      old_mc_retries = Astute.config.MC_RETRIES
+      old_nodes_rm_interal = Astute.config.NODES_REMOVE_INTERVAL
+      example.run
+      Astute.config.SSH_RETRIES = old_ssh_retries
+      Astute.config.MC_RETRIES = old_mc_retries
+      Astute.config.NODES_REMOVE_INTERVAL = old_nodes_rm_interal
+    end
+
+    before(:each) do
+      Astute.config.SSH_RETRIES = 1
+      Astute.config.MC_RETRIES = 1
+      Astute.config.NODES_REMOVE_INTERVAL = 0
+    end
+
+    it 'erase nodes using ssh' do
+      Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
+      @orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
+      Astute::Ssh.stubs(:execute).returns({'inaccessible_nodes' => [{'uid' => 1}]}).once
+
+      Astute::Ssh.expects(:execute).with(instance_of(Astute::Context),
+                                         data['nodes'],
+                                         Astute::SshEraseNodes.command)
+                 .returns({'nodes' => [{'uid' => 1}]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>1}]
+                    })
+    end
+
+    it 'always remove nodes from Cobbler' do
+      Astute::Ssh.stubs(:execute).twice.returns({'inaccessible_nodes' => [{'uid' => 1}]})
+      @orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
+
+      Astute::CobblerManager.any_instance.expects(:remove_nodes)
+                            .with(data['nodes'])
+                            .returns([])
+
+      @orchestrator.stop_provision(@reporter,
+                                   data['task_uuid'],
+                                   data['engine'],
+                                   data['nodes'])
+    end
+
+    it 'reboot nodes using ssh' do
+      Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
+      @orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
+      Astute::Ssh.stubs(:execute).returns({'nodes' => [{'uid' => 1}]}).once
+
+      Astute::Ssh.expects(:execute).with(instance_of(Astute::Context),
+                                         data['nodes'],
+                                         Astute::SshHardReboot.command,
+                                         timeout=5,
+                                         retries=1)
+                 .returns({'inaccessible_nodes' => [{'uid' => 1}]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>1}]
+                    })
+    end
+
+    it 'stop provision if provision operation stop immediately' do
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'inaccessible_nodes' => [{'uid' => '1'}]})
+      @orchestrator.stubs(:node_type).returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
+
+      Astute::NodesRemover.any_instance.expects(:remove)
+                          .once.returns({"nodes"=>[{"uid"=>"1", }]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>"1"}]
+                    })
+    end
+
+    it 'stop provision if provision operation stop in the end' do
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'nodes' => [{'uid' => "1"}]})
+      @orchestrator.stubs(:node_type).returns([{'uid' => "1", 'node_type' => 'target'}])
+
+      Astute::NodesRemover.any_instance.expects(:remove)
+                          .once.returns({"nodes"=>[{"uid"=>"1", }]})
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>"1"}]
+                    })
+    end
+
+    it 'inform about inaccessible nodes' do
+      Astute::Ssh.stubs(:execute).returns({'inaccessible_nodes' => [{'uid' => 1}]}).twice
+      Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
+      @orchestrator.stubs(:node_type).returns([])
+
+      Astute::NodesRemover.any_instance.expects(:remove).never
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [],
+                     "inaccessible_nodes" => [{"uid"=>1}],
+                     "nodes" => []
+                    })
+    end
+
+    it 'sleep between attempts to find and erase nodes using mcollective' do
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'inaccessible_nodes' => [{'uid' => '1'}]})
+      @orchestrator.stubs(:node_type).returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
+      Astute::NodesRemover.any_instance.stubs(:remove)
+                          .once.returns({"nodes"=>[{"uid"=>"1", }]})
+
+      @orchestrator.expects(:sleep).with(Astute.config.NODES_REMOVE_INTERVAL)
+
+      @orchestrator.stop_provision(@reporter,
+                                   data['task_uuid'],
+                                   data['engine'],
+                                   data['nodes'])
+    end
+
+    it 'perform several attempts to find and erase nodes using mcollective' do
+      Astute.config.MC_RETRIES = 2
+      Astute.config.NODES_REMOVE_INTERVAL = 0
+
+      @orchestrator.stubs(:stop_provision_via_ssh)
+                   .returns({'nodes' => [{'uid' => "1"}],
+                             'inaccessible_nodes' => [{'uid' => '2'}]})
+
+      @orchestrator.stubs(:node_type).twice
+                   .returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
+                   .then.returns([{'uid' => '2', 'node_type' => 'target'}])
+
+      Astute::NodesRemover.any_instance.stubs(:remove).twice
+                          .returns({"nodes"=>[{"uid"=>"1"}]}).then
+                          .returns({"error_nodes"=>[{"uid"=>"2"}]})
+
+      data['nodes'] << {
+        "uid" => '2',
+        "slave_name"=>"controller-2",
+        "admin_ip" =>'1.2.3.6'
+      }
+
+      expect(@orchestrator.stop_provision(@reporter,
+                                          data['task_uuid'],
+                                          data['engine'],
+                                          data['nodes']))
+            .to eql({
+                     "error_nodes" => [{"uid"=>'2'}],
+                     "inaccessible_nodes" => [],
+                     "nodes" => [{"uid"=>"1"}],
+                     "status" => "error"
+                    })
+    end
+
+  end # stop_provision
 end
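A note on the spec structure: because these examples mutate global Astute.config values, they wrap each example in an around(:each) hook that saves the affected keys and restores them after the example runs (RSpec captures example failures, so the restore still executes). The same pattern in isolation, with a hypothetical FakeConfig standing in for Astute.config:

    require 'rspec/autorun'

    # Hypothetical stand-in for a mutable global config such as Astute.config.
    FakeConfig = Struct.new(:retries).new(5)

    RSpec.describe 'a config-sensitive example' do
      around(:each) do |example|
        old_retries = FakeConfig.retries # save global state
        example.run                      # failures are captured here, so...
        FakeConfig.retries = old_retries # ...the restore always runs
      end

      before(:each) { FakeConfig.retries = 1 }

      it 'sees the per-example value' do
        expect(FakeConfig.retries).to eq(1)
      end
    end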