Reset env reboots nodes twice sometimes
There was a second node reboot during reset environment. The root cause of this is a kernel panic on the node, caused by removing partitions and data on the node (the dd command). Because of this, the node can't send status about the removal process, and Astute retries sending the removal and reboot requests to the node. This problem is fixed by detecting already-removed nodes by comparing the time from boot. Change-Id: I5e54b9f741cdc762ffdcf46781e2a62dd7057a6c Closes-Bug: #1478020 Signed-off-by: Ruslan Aliev <raliev@mirantis.com>
This commit is contained in:
parent
1bb6db2c4b
commit
c14a4ddf69
@ -77,6 +77,26 @@ module Astute
|
||||
[mclient_skipped_nodes, mclient_nodes]
|
||||
end
|
||||
|
||||
# Detects nodes that have already been erased and rebooted.
#
# Fetches each node's current boot time (batched to respect the
# max_nodes_per_call limit) and compares it with the boot time recorded
# on the node object. A changed boot time means the node rebooted since
# we first saw it — i.e. the erase already happened even if the node
# never managed to report it (e.g. because of a kernel panic).
#
# @param nodes [NodesHash] nodes scheduled for removal
# @return [Array<String>] uids of nodes that are already removed
def get_already_removed_nodes(nodes)
  boot_times = {}
  nodes.uids.sort.each_slice(Astute.config[:max_nodes_per_call]) do |batch|
    boot_times.merge!(get_boot_time(batch))
  end

  already_removed = []
  nodes.each do |uid, node|
    reported_time = boot_times[uid].to_i
    # Zero means the node did not answer — nothing to compare against.
    next if reported_time.zero?

    if node.boot_time
      # Boot time changed => the node rebooted after the erase.
      already_removed << uid unless reported_time == node.boot_time
    else
      # First observation: remember the baseline for later comparison.
      node.boot_time = reported_time
    end
  end
  already_removed
end
|
||||
|
||||
def remove_nodes(nodes)
|
||||
if nodes.empty?
|
||||
Astute.logger.info "#{@ctx.task_id}: Nodes to remove are not provided. Do nothing."
|
||||
@ -84,10 +104,19 @@ module Astute
|
||||
end
|
||||
|
||||
erased_nodes, mclient_nodes = skipped_unskipped_mclient_nodes(nodes)
|
||||
|
||||
removed_nodes = get_already_removed_nodes(mclient_nodes)
|
||||
removed_nodes.each do |uid|
|
||||
erased_node = Node.new('uid' => uid)
|
||||
erased_nodes << erased_node
|
||||
mclient_nodes.delete(uid)
|
||||
Astute.logger.info "#{@ctx.task_id}: Node #{uid} is removed already, skipping"
|
||||
end
|
||||
|
||||
responses = mclient_remove_nodes(mclient_nodes)
|
||||
inaccessible_uids = mclient_nodes.uids - responses.map { |response| response[:sender] }
|
||||
inaccessible_nodes = NodesHash.build(inaccessible_uids.map do |uid|
|
||||
{'uid' => uid, 'error' => 'Node not answered by RPC.'}
|
||||
{'uid' => uid, 'error' => 'Node not answered by RPC.', 'boot_time' => mclient_nodes[uid][:boot_time]}
|
||||
end)
|
||||
error_nodes = NodesHash.new
|
||||
|
||||
@ -136,5 +165,33 @@ module Astute
|
||||
responses.map(&:results)
|
||||
end
|
||||
|
||||
# Runs a shell command on the given nodes via mcollective without raising
# on failed/partial results, and returns each answering node's stdout.
#
# @param context [Astute::Context] current task context (used for logging)
# @param node_uids [Array<String>] uids of nodes to run the command on
# @param cmd [String] shell command to execute
# @param timeout [Integer] mcollective call timeout in seconds
# @return [Hash{String => String}] sender uid => chomped stdout
def run_shell_without_check(context, node_uids, cmd, timeout=10)
  shell = MClient.new(
    context,
    'execute_shell_command',
    node_uids,
    false,    # check_result: best effort — nodes may be dead or mid-reboot
    timeout
  )
  # NOTE: the original passed these as `check_result=false, timeout=timeout`,
  # which only creates throwaway locals — MClient.new takes positionals.
  results = shell.execute(:cmd => cmd)
  results.inject({}) do |acc, res|
    Astute.logger.debug(
      "#{context.task_id}: cmd: #{cmd}\n" \
      "stdout: #{res.results[:data][:stdout]}\n" \
      "stderr: #{res.results[:data][:stderr]}\n" \
      "exit code: #{res.results[:data][:exit_code]}")
    acc.merge({res.results[:sender] => res.results[:data][:stdout].chomp})
  end
end
|
||||
|
||||
# Queries the boot time of each node by stat-ing /proc/1: the mtime of
# the init process directory approximates when the node last booted.
#
# @param node_uids [Array<String>] uids of nodes to query
# @return [Hash{String => String}] sender uid => boot time (epoch seconds,
#   as a string; empty for nodes that did not answer)
def get_boot_time(node_uids)
  run_shell_without_check(
    @ctx,
    node_uids,
    "stat --printf='%Y' /proc/1",
    10  # short timeout: unanswered nodes are simply skipped by the caller
  )
end
|
||||
|
||||
end
|
||||
end
|
||||
|
@ -20,6 +20,7 @@ describe Astute::NodesRemover do
|
||||
|
||||
let(:nodes) { [{'uid' => '1'}, {'uid' => '2'}] }
|
||||
let(:ctx) { mock_ctx }
|
||||
let(:ctl_time) { {'1' => '100', '2' => '200'} }
|
||||
|
||||
let(:mcollective_answer) do
|
||||
[
|
||||
@ -30,6 +31,7 @@ describe Astute::NodesRemover do
|
||||
|
||||
before(:each) do
|
||||
Astute::NodesRemover.any_instance.stubs(:mclient_remove_piece_nodes).returns(mcollective_answer)
|
||||
Astute::NodesRemover.any_instance.stubs(:run_shell_without_check).returns(ctl_time)
|
||||
end
|
||||
|
||||
it 'should erase nodes (mbr) and reboot nodes(default)' do
|
||||
@ -54,7 +56,7 @@ describe Astute::NodesRemover do
|
||||
{'uid' => '3', 'mclient_remove' => false},
|
||||
{'uid' => '2'},
|
||||
],
|
||||
"inaccessible_nodes" => [{"uid"=>"1", "error"=>"Node not answered by RPC."}]
|
||||
"inaccessible_nodes" => [{"uid"=>"1", "error"=>"Node not answered by RPC.", "boot_time"=>100}]
|
||||
}
|
||||
)
|
||||
end
|
||||
@ -78,8 +80,8 @@ describe Astute::NodesRemover do
|
||||
nr = Astute::NodesRemover.new(ctx, nodes)
|
||||
nr.stubs(:mclient_remove_nodes).with(
|
||||
Astute::NodesHash.build([
|
||||
{'uid' => '1'},
|
||||
{'uid' => '2', 'mclient_remove' => true}
|
||||
{'uid' => '1', 'boot_time' => 100},
|
||||
{'uid' => '2', 'mclient_remove' => true, 'boot_time' => 200}
|
||||
])
|
||||
).returns(mcollective_answer).once
|
||||
nr.remove
|
||||
@ -106,8 +108,8 @@ describe Astute::NodesRemover do
|
||||
{ "nodes"=>[],
|
||||
"status" => "error",
|
||||
"error_nodes" => [
|
||||
{"uid"=>"1", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"1\", :statuscode=>1, :data=>{:rebooted=>false}}\n"},
|
||||
{"uid"=>"2", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"2\", :statuscode=>1, :data=>{:rebooted=>false}}\n"}
|
||||
{"uid"=>"1", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"1\", :statuscode=>1, :data=>{:rebooted=>false}}\n", "boot_time"=>100},
|
||||
{"uid"=>"2", "error"=>"RPC agent 'erase_node' failed. Result:\n{:sender=>\"2\", :statuscode=>1, :data=>{:rebooted=>false}}\n", "boot_time"=>200}
|
||||
]
|
||||
}
|
||||
)
|
||||
@ -159,14 +161,32 @@ describe Astute::NodesRemover do
|
||||
{ "nodes"=>[],
|
||||
"status" => "error",
|
||||
"error_nodes" => [
|
||||
{"uid"=>"1", "error"=>"RPC method 'erase_node' failed with message: Could not reboot"},
|
||||
{"uid"=>"2", "error"=>"RPC method 'erase_node' failed with message: Could not reboot"}
|
||||
{"uid"=>"1", "error"=>"RPC method 'erase_node' failed with message: Could not reboot", "boot_time"=>100},
|
||||
{"uid"=>"2", "error"=>"RPC method 'erase_node' failed with message: Could not reboot", "boot_time"=>200}
|
||||
]
|
||||
}
|
||||
)
|
||||
end
|
||||
end
|
||||
|
||||
# Simulates the bug scenario from #1478020: the erase succeeds and the node
# reboots, but a kernel panic prevents it from reporting status (empty
# mcollective answer). The boot-time probe is stubbed to return, in order:
#   ctl_time  — baseline boot times recorded on first contact,
#   ctl_time2 — no answer while the nodes are mid-reboot,
#   ctl_time3 — new, different boot times after the reboot.
# The changed boot times should make NodesRemover treat the nodes as
# already erased instead of retrying the removal.
context 'nodes fail to send status, but erased and rebooted' do
  # Nodes never acknowledge the erase request.
  let(:mcollective_answer) do
    []
  end

  # No boot-time answer (nodes unreachable during reboot).
  let(:ctl_time2) { {} }
  # Boot times differ from the ctl_time baseline => nodes have rebooted.
  let(:ctl_time3) { {'1' => '150', '2' => '250'} }

  it 'should process rebooted nodes as erased' do
    Astute::NodesRemover.any_instance.stubs(:mclient_remove_piece_nodes).returns(mcollective_answer)
    # mocha sequential returns: first call -> ctl_time, second -> ctl_time2,
    # third and later -> ctl_time3.
    Astute::NodesRemover.any_instance.stubs(:run_shell_without_check).returns(ctl_time)
      .then.returns(ctl_time2).then.returns(ctl_time3)
    expect(Astute::NodesRemover.new(ctx, nodes, reboot=true).remove).to eq(
      { "nodes"=>[{"uid"=>"1"}, {"uid"=>"2"}] }
    )
  end
end
|
||||
|
||||
context 'erase node when change node status from bootstrap to provisioning' do
|
||||
let(:mcollective_answer) do
|
||||
[
|
||||
|
Loading…
Reference in New Issue
Block a user