Erase provisioned nodes when cancelling provisioning

* always erase nodes in the bootstrap state (failsafe optimization);
* do not erase nodes in the provisioned/bootstrap state using the shell script;
* erase nodes in the provisioned/bootstrap state using the mcollective agent instead.

Change-Id: I2a3df52920f57f9c66e237de0d0d48a814ebf409
Related-Bug: #1316583
Closes-Bug: #1322573
Vladimir Sharshov 2014-05-29 22:49:01 +04:00
parent 4b5a0003f6
commit 5fa18e8e08
7 changed files with 284 additions and 21 deletions
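In short, cancelling provisioning now splits cleanup by node state: the ssh erase script refuses to touch nodes whose /etc/nailgun_systemtype reports 'target' or 'bootstrap', and exactly those node types are picked up by the mcollective-based NodesRemover instead. A small self-contained sketch of that split (illustrative node data only; the 'provision' type below is just a stand-in for whatever a mid-provisioning node reports):

# Illustrative data only; the select mirrors stop_provision_via_mcollective below.
reported_types = [
  { 'uid' => '1', 'node_type' => 'bootstrap' },  # failsafe: always erased via mcollective
  { 'uid' => '2', 'node_type' => 'target'    },  # already provisioned, erased via mcollective
  { 'uid' => '3', 'node_type' => 'provision' }   # still provisioning, left to the ssh script
]

erase_via_mcollective = reported_types
  .select { |n| ['target', 'bootstrap'].include?(n['node_type']) }
  .map    { |n| { 'uid' => n['uid'] } }
# => [{ 'uid' => '1' }, { 'uid' => '2' }]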


@@ -72,7 +72,6 @@ module Astute
conf[:SSH_RETRIES] = 5 # SSH tries to call ssh client before failure
conf[:SSH_RETRY_TIMEOUT] = 30 # SSH sleeps for ## sec between retries
conf[:MAX_NODES_PER_REMOVE_CALL] = 10 # how many nodes to remove in one call
conf[:NODES_REMOVE_INTERVAL] = 10 # sleeps for ## sec between remove calls
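With the mcollective fallback added to stop_provision below, up to MC_RETRIES attempts are made and the loop sleeps NODES_REMOVE_INTERVAL seconds before each one, so these two settings bound how long cancellation can spend just sleeping. A rough back-of-the-envelope (the MC_RETRIES value is assumed here; only NODES_REMOVE_INTERVAL is set in this hunk):

mc_retries            = 5    # assumed value, not part of this diff
nodes_remove_interval = 10   # seconds, from the config line added above
max_sleep = mc_retries * nodes_remove_interval  # => 50 seconds of sleeps alone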


@@ -20,10 +20,9 @@ module Astute
@log_parsing = log_parsing
end
def node_type(reporter, task_id, nodes, timeout=nil)
def node_type(reporter, task_id, nodes_uids, timeout=nil)
context = Context.new(task_id, reporter)
uids = nodes.map {|n| n['uid']}
systemtype = MClient.new(context, "systemtype", uids, check_result=false, timeout)
systemtype = MClient.new(context, "systemtype", nodes_uids, check_result=false, timeout)
systems = systemtype.get_type
systems.map do |n|
{
@@ -94,7 +93,7 @@ module Astute
catch :done do
loop do
sleep_not_greater_than(5) do
nodes_types = node_type(proxy_reporter, task_id, nodes, 2)
nodes_types = node_type(proxy_reporter, task_id, nodes.map {|n| n['uid']}, 2)
target_uids, nodes_not_booted = analize_node_types(nodes_types, nodes_not_booted)
if nodes.length == target_uids.length
@@ -146,9 +145,9 @@ module Astute
def remove_nodes(reporter, task_id, engine_attrs, nodes, reboot=true)
cobbler = CobblerManager.new(engine_attrs, reporter)
cobbler.remove_nodes(nodes)
ctxt = Context.new(task_id, reporter)
result = NodesRemover.new(ctxt, nodes, reboot).remove
Rsyslogd.send_sighup(ctxt, engine_attrs["master_ip"])
ctx = Context.new(task_id, reporter)
result = NodesRemover.new(ctx, nodes, reboot).remove
Rsyslogd.send_sighup(ctx, engine_attrs["master_ip"])
result
end
@@ -160,13 +159,21 @@ module Astute
end
def stop_provision(reporter, task_id, engine_attrs, nodes)
Ssh.execute(Context.new(task_id, reporter), nodes, SshEraseNodes.command)
CobblerManager.new(engine_attrs, reporter).remove_nodes(nodes)
Ssh.execute(Context.new(task_id, reporter),
nodes,
SshHardReboot.command,
timeout=5,
retries=1)
ctx = Context.new(task_id, reporter)
ssh_result = stop_provision_via_ssh(ctx, nodes, engine_attrs)
# Remove already provisioned nodes. Nodes may be erased twice
provisioned_nodes, mco_result = stop_provision_via_mcollective(ctx, nodes)
# For nodes that responded via mcollective, use the mcollective result instead of the ssh one
['nodes', 'error_nodes', 'inaccessible_nodes'].each do |node_status|
ssh_result[node_status] = ssh_result.fetch(node_status, []) - provisioned_nodes
end
result = merge_rm_nodes_result(ssh_result, mco_result)
result['status'] = 'error' if result['error_nodes'].present?
result
end
def dump_environment(reporter, task_id, settings)
@@ -232,5 +239,61 @@ module Astute
end
end
def stop_provision_via_mcollective(ctx, nodes)
return [], {} if nodes.empty?
mco_result = {}
nodes_uids = nodes.map{ |n| n['uid'] }
Astute.config.MC_RETRIES.times do |i|
sleep Astute.config.NODES_REMOVE_INTERVAL
Astute.logger.debug "Trying to connect to nodes #{nodes_uids} using mcollective"
nodes_types = node_type(ctx.reporter, ctx.task_id, nodes_uids, 2)
next if nodes_types.empty?
provisioned = nodes_types.select{ |n| ['target', 'bootstrap'].include? n['node_type'] }
.map{ |n| {'uid' => n['uid']} }
current_mco_result = NodesRemover.new(ctx, provisioned, reboot=true).remove
Astute.logger.debug "Retry result #{i}: "\
"mco success nodes: #{current_mco_result['nodes']}, "\
"mco error nodes: #{current_mco_result['error_nodes']}, "\
"mco inaccessible nodes: #{current_mco_result['inaccessible_nodes']}"
mco_result = merge_rm_nodes_result(mco_result, current_mco_result)
nodes_uids -= provisioned.map{ |n| n['uid'] }
break if nodes_uids.empty?
end
provisioned_nodes = nodes.map{ |n| {'uid' => n['uid']} } - nodes_uids.map {|n| {'uid' => n} }
Astute.logger.debug "MCO final result: "\
"mco success nodes: #{mco_result['nodes']}, "\
"mco error nodes: #{mco_result['error_nodes']}, "\
"mco inaccessible nodes: #{mco_result['inaccessible_nodes']}, "\
"all mco nodes: #{provisioned_nodes}"
return provisioned_nodes, mco_result
end
def stop_provision_via_ssh(ctx, nodes, engine_attrs)
ssh_result = Ssh.execute(ctx, nodes, SshEraseNodes.command)
CobblerManager.new(engine_attrs, ctx.reporter).remove_nodes(nodes)
Ssh.execute(ctx,
nodes,
SshHardReboot.command,
timeout=5,
retries=1)
ssh_result
end
def merge_rm_nodes_result(res1, res2)
['nodes', 'error_nodes', 'inaccessible_nodes'].inject({}) do |result, node_status|
result[node_status] = (res1.fetch(node_status, []) + res2.fetch(node_status, [])).uniq
result
end
end
end
end
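merge_rm_nodes_result above simply unions the per-status lists from the ssh and mcollective runs and drops duplicates, defaulting missing keys to empty arrays. A standalone illustration with made-up result hashes:

# Made-up inputs; the inject below is the same merge used by merge_rm_nodes_result.
ssh_result = { 'nodes' => [{ 'uid' => '2' }], 'inaccessible_nodes' => [{ 'uid' => '1' }] }
mco_result = { 'nodes' => [{ 'uid' => '1' }] }

merged = ['nodes', 'error_nodes', 'inaccessible_nodes'].inject({}) do |result, status|
  result[status] = (ssh_result.fetch(status, []) + mco_result.fetch(status, [])).uniq
  result
end
# merged == {
#   'nodes'              => [{ 'uid' => '2' }, { 'uid' => '1' }],
#   'error_nodes'        => [],
#   'inaccessible_nodes' => [{ 'uid' => '1' }]
# }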


@@ -92,8 +92,15 @@ module Astute
# execute commands on all servers
# FIXME: debug does not show messages if the command contains
# several strings
channel = session.exec cmd do |ch, stream, data|
Astute.logger.debug "[#{ch[:host]} : #{stream}] #{data}"
channel = session.exec cmd do |ch, success|
ch.on_data do |ichannel, data|
Astute.logger.debug "[#{ch[:host]} : #{ichannel}] #{data}"
end
ch.on_request "exit-status" do |_ichannel, data|
exit_status = data.read_long
end
end
Timeout::timeout(timeout) { session.loop }
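The old session.exec convenience block only yielded output data, so the exit status of the remote command was lost; the new version wires up on_data and the 'exit-status' channel request by hand. For reference, the plain Net::SSH pattern for capturing an exit status looks roughly like this (hypothetical host, user and command):

require 'net/ssh'

exit_status = nil
Net::SSH.start('10.20.0.2', 'root') do |session|
  channel = session.open_channel do |ch|
    ch.exec('uptime') do |ch, success|
      raise 'could not execute command' unless success
      ch.on_data          { |_c, data| puts data }          # remote stdout
      ch.on_extended_data { |_c, _t, data| warn data }      # remote stderr
      ch.on_request('exit-status') { |_c, data| exit_status = data.read_long }
    end
  end
  channel.wait
end
puts "exit status: #{exit_status}"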


@@ -64,11 +64,15 @@ module Astute
done
}
# Need a more robust mechanism to detect a provisioning or provisioned node
node_type=$(cat /etc/nailgun_systemtype)
if [ "$node_type" == "target" ] || [ "$node_type" == "bootstrap" ]; then
echo "Do not erase $node_type node using shell"
exit
fi
echo "Run erase node command"
erase_boot_devices
# Avoid a shell hang by using nohup and stdout/stderr redirections
# nohup reboot_with_sleep > /dev/null 2>&1 &
ERASE_COMMAND
end
end


@@ -17,6 +17,12 @@ module Astute
def self.command
<<-REBOOT_COMMAND
# Need a more robust mechanism to detect a provisioning or provisioned node
node_type=$(cat /etc/nailgun_systemtype)
if [ "$node_type" == "target" ] || [ "$node_type" == "bootstrap" ]; then
echo "Do not affect $node_type node"
exit
fi
echo "Run node rebooting command using 'SB' to sysrq-trigger"
echo "1" > /proc/sys/kernel/panic_on_oops
echo "10" > /proc/sys/kernel/panic


@@ -37,6 +37,7 @@ Astute.config.PUPPET_FADE_TIMEOUT = 1
Astute.config.MC_RETRY_INTERVAL = 0
Astute.config.PROVISIONING_TIMEOUT = 0
Astute.config.REBOOT_TIMEOUT = 0
Astute.config.SSH_RETRY_TIMEOUT = 0
Astute.config.NODES_REMOVE_INTERVAL = 0
Astute.logger = Logger.new(STDERR)


@@ -44,7 +44,7 @@ describe Astute::Orchestrator do
rpcclient = mock_rpcclient(nodes, mc_timeout)
rpcclient.expects(:get_type).once.returns([mc_res])
types = @orchestrator.node_type(@reporter, 'task_uuid', nodes, mc_timeout)
types = @orchestrator.node_type(@reporter, 'task_uuid', nodes.map { |n| n['uid'] }, mc_timeout)
types.should eql([{"node_type"=>"target", "uid"=>"1"}])
end
end
@@ -149,6 +149,7 @@ describe Astute::Orchestrator do
'uid' => '1',
'profile' => 'centos-x86_64',
"slave_name"=>"controller-1",
"admin_ip" =>'1.2.3.5',
'power_type' => 'ssh',
'power_user' => 'root',
'power_pass' => '/root/.ssh/bootstrap.rsa',
@@ -355,4 +356,186 @@ describe Astute::Orchestrator do
end
end
describe '#stop_provision' do
around(:each) do |example|
old_ssh_retries = Astute.config.SSH_RETRIES
old_mc_retries = Astute.config.MC_RETRIES
old_nodes_rm_interval = Astute.config.NODES_REMOVE_INTERVAL
example.run
Astute.config.SSH_RETRIES = old_ssh_retries
Astute.config.MC_RETRIES = old_mc_retries
Astute.config.NODES_REMOVE_INTERVAL = old_nodes_rm_interval
end
before(:each) do
Astute.config.SSH_RETRIES = 1
Astute.config.MC_RETRIES = 1
Astute.config.NODES_REMOVE_INTERVAL = 0
end
it 'erases nodes using ssh' do
Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
@orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
Astute::Ssh.stubs(:execute).returns({'inaccessible_nodes' => [{'uid' => 1}]}).once
Astute::Ssh.expects(:execute).with(instance_of(Astute::Context),
data['nodes'],
Astute::SshEraseNodes.command)
.returns({'nodes' => [{'uid' => 1}]})
expect(@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes']))
.to eql({
"error_nodes" => [],
"inaccessible_nodes" => [],
"nodes" => [{"uid"=>1}]
})
end
it 'always removes nodes from Cobbler' do
Astute::Ssh.stubs(:execute).twice.returns({'inaccessible_nodes' => [{'uid' => 1}]})
@orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
Astute::CobblerManager.any_instance.expects(:remove_nodes)
.with(data['nodes'])
.returns([])
@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes'])
end
it 'reboots nodes using ssh' do
Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
@orchestrator.stubs(:stop_provision_via_mcollective).returns([[], {}])
Astute::Ssh.stubs(:execute).returns({'nodes' => [{'uid' => 1}]}).once
Astute::Ssh.expects(:execute).with(instance_of(Astute::Context),
data['nodes'],
Astute::SshHardReboot.command,
timeout=5,
retries=1)
.returns({'inaccessible_nodes' => [{'uid' => 1}]})
expect(@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes']))
.to eql({
"error_nodes" => [],
"inaccessible_nodes" => [],
"nodes" => [{"uid"=>1}]
})
end
it 'stops provision if the provision operation stops immediately' do
@orchestrator.stubs(:stop_provision_via_ssh)
.returns({'inaccessible_nodes' => [{'uid' => '1'}]})
@orchestrator.stubs(:node_type).returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
Astute::NodesRemover.any_instance.expects(:remove)
.once.returns({"nodes"=>[{"uid"=>"1", }]})
expect(@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes']))
.to eql({
"error_nodes" => [],
"inaccessible_nodes" => [],
"nodes" => [{"uid"=>"1"}]
})
end
it 'stops provision if the provision operation stops at the end' do
@orchestrator.stubs(:stop_provision_via_ssh)
.returns({'nodes' => [{'uid' => "1"}]})
@orchestrator.stubs(:node_type).returns([{'uid' => "1", 'node_type' => 'target'}])
Astute::NodesRemover.any_instance.expects(:remove)
.once.returns({"nodes"=>[{"uid"=>"1", }]})
expect(@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes']))
.to eql({
"error_nodes" => [],
"inaccessible_nodes" => [],
"nodes" => [{"uid"=>"1"}]
})
end
it 'informs about inaccessible nodes' do
Astute::Ssh.stubs(:execute).returns({'inaccessible_nodes' => [{'uid' => 1}]}).twice
Astute::CobblerManager.any_instance.stubs(:remove_nodes).returns([])
@orchestrator.stubs(:node_type).returns([])
Astute::NodesRemover.any_instance.expects(:remove).never
expect(@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes']))
.to eql({
"error_nodes" => [],
"inaccessible_nodes" => [{"uid"=>1}],
"nodes" => []
})
end
it 'sleeps between attempts to find and erase nodes using mcollective' do
@orchestrator.stubs(:stop_provision_via_ssh)
.returns({'inaccessible_nodes' => [{'uid' => '1'}]})
@orchestrator.stubs(:node_type).returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
Astute::NodesRemover.any_instance.stubs(:remove)
.once.returns({"nodes"=>[{"uid"=>"1", }]})
@orchestrator.expects(:sleep).with(Astute.config.NODES_REMOVE_INTERVAL)
@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes'])
end
it 'performs several attempts to find and erase nodes using mcollective' do
Astute.config.MC_RETRIES = 2
Astute.config.NODES_REMOVE_INTERVAL = 0
@orchestrator.stubs(:stop_provision_via_ssh)
.returns({'nodes' => [{'uid' => "1"}],
'inaccessible_nodes' => [{'uid' => '2'}]})
@orchestrator.stubs(:node_type).twice
.returns([{'uid' => '1', 'node_type' => 'bootstrap'}])
.then.returns([{'uid' => '2', 'node_type' => 'target'}])
Astute::NodesRemover.any_instance.stubs(:remove).twice
.returns({"nodes"=>[{"uid"=>"1"}]}).then
.returns({"error_nodes"=>[{"uid"=>"2"}]})
data['nodes'] << {
"uid" => '2',
"slave_name"=>"controller-2",
"admin_ip" =>'1.2.3.6'
}
expect(@orchestrator.stop_provision(@reporter,
data['task_uuid'],
data['engine'],
data['nodes']))
.to eql({
"error_nodes" => [{"uid"=>'2'}],
"inaccessible_nodes" => [],
"nodes" => [{"uid"=>"1"}],
"status" => "error"
})
end
end # stop_provision
end