c14a4ddf69
A second node reboot occurred during environment reset. The root cause is a kernel panic on the node, triggered by removing partitions and data on the node (dd command). As a result, the node cannot report the status of the removal process, and Astute retries sending removal and reboot requests to the node. This problem is fixed by detecting already-removed nodes by comparing their boot times. Change-Id: I5e54b9f741cdc762ffdcf46781e2a62dd7057a6c Closes-Bug: #1478020 Signed-off-by: Ruslan Aliev <raliev@mirantis.com>
198 lines
6.9 KiB
Ruby
198 lines
6.9 KiB
Ruby
# Copyright 2013 Mirantis, Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
|
|
module Astute
|
|
class NodesRemover
|
|
|
|
# Build a remover for a batch of nodes.
#
# ctx    - task context; stored as-is (other methods read ctx.task_id for
#          log prefixes and hand ctx to MClient).
# nodes  - nodes to remove; normalized through NodesHash.build.
# reboot - forwarded to the erase_node agent as its :reboot option and used
#          to decide whether a reply without :rebooted counts as a failure
#          (default: true).
def initialize(ctx, nodes, reboot=true)
  @reboot = reboot
  @nodes = NodesHash.build(nodes)
  @ctx = ctx
end
|
|
|
|
# Erase every node handed to the constructor and report the outcome.
#
# Runs one removal pass, then retries the failed group and the unresponsive
# group separately; retry count and pause come from
# Astute.config[:mc_retries] and Astute.config[:mc_retry_interval].
#
# Returns a Hash:
#   'nodes'              - serialized nodes that were erased
#   'inaccessible_nodes' - only when some nodes never answered
#   'status' => 'error'
#   'error_nodes'        - both only when some nodes reported a failure
def remove
  # TODO(mihgen): 1. Nailgun should process node error message
  #   2. Should we rename nodes -> removed_nodes array?
  #   3. If exception is raised here, we should not fully fall into error, but only failed node
  erased_nodes, error_nodes, inaccessible_nodes = remove_nodes(@nodes)

  # Failed nodes first, then the ones that never replied; nodes that succeed
  # on a retry are moved into erased_nodes by retry_remove_nodes.
  [error_nodes, inaccessible_nodes].each do |pending|
    retry_remove_nodes(pending, erased_nodes,
                       Astute.config[:mc_retries], Astute.config[:mc_retry_interval])
  end

  answer = {'nodes' => serialize_nodes(erased_nodes)}

  if inaccessible_nodes.present?
    unreachable = serialize_nodes(inaccessible_nodes)
    answer['inaccessible_nodes'] = unreachable

    Astute.logger.warn "#{@ctx.task_id}: Removing of nodes\n#{@nodes.uids.pretty_inspect} finished " \
      "with errors. Nodes\n#{unreachable.pretty_inspect} are inaccessible"
  end

  if error_nodes.present?
    failed = serialize_nodes(error_nodes)
    answer['status'] = 'error'
    answer['error_nodes'] = failed

    Astute.logger.error "#{@ctx.task_id}: Removing of nodes\n#{@nodes.uids.pretty_inspect} finished " \
      "with errors:\n#{failed.pretty_inspect}"
  end

  Astute.logger.info "#{@ctx.task_id}: Finished removing of nodes:\n#{@nodes.uids.pretty_inspect}"

  answer
end
|
|
|
|
private
|
|
|
|
# Convert a node collection into an array of hashes.
#
# nodes - collection responding to #nodes; each element must respond to
#         #to_hash.
#
# Returns an Array holding the #to_hash result of every node, in order.
def serialize_nodes(nodes)
  node_list = nodes.nodes
  node_list.map { |node| node.to_hash }
end
|
|
|
|
# Split nodes by their :mclient_remove flag.
#
# A node whose :mclient_remove is true (the default when the flag is not
# set) is sent to mclient for removal (MBR, restarting etc.); a node with a
# false flag is skipped from mclient.
#
# nodes - collection responding to #values; each node must respond to
#         #fetch(key, default).
#
# Returns a two-element Array: [skipped nodes, nodes for mclient], both
# built with NodesHash.build.
def skipped_unskipped_mclient_nodes(nodes)
  for_mclient, to_skip = nodes.values.partition do |node|
    node.fetch(:mclient_remove, true)
  end

  mclient_skipped_nodes = NodesHash.build(to_skip)
  mclient_nodes = NodesHash.build(for_mclient)

  Astute.logger.debug "#{@ctx.task_id}: Split nodes: #{mclient_skipped_nodes}, #{mclient_nodes}"

  [mclient_skipped_nodes, mclient_nodes]
end
|
|
|
|
# Find nodes whose reported boot time differs from the one recorded earlier.
#
# Boot times are queried in slices of Astute.config[:max_nodes_per_call]
# uids (sorted). For every node with a non-zero reported time:
#   * a node without a boot_time gets the reported value stored on it;
#   * a node with a boot_time is collected when the reported value differs.
# Nodes with no reported time (or one that converts to 0) are left untouched.
#
# nodes - collection responding to #uids and #each (yielding uid, node);
#         each node must respond to #boot_time and #boot_time=.
#
# Returns an Array of uids; the caller treats these as already removed.
def get_already_removed_nodes(nodes)
  slice_size = Astute.config[:max_nodes_per_call]
  reported = nodes.uids.sort.each_slice(slice_size).each_with_object({}) do |uids, times|
    times.merge!(get_boot_time(uids))
  end

  nodes.each_with_object([]) do |(uid, node), changed|
    current = reported[uid].to_i
    next if current.zero?

    if node.boot_time
      changed << uid unless current == node.boot_time
    else
      # First observation: remember it so a later pass can detect a reboot.
      node.boot_time = current
    end
  end
end
|
|
|
|
# Run one removal pass over the given nodes.
#
# Steps:
#   1. nodes flagged to skip mclient count as erased immediately;
#   2. nodes reported by get_already_removed_nodes are moved to the erased
#      group without being contacted again;
#   3. the remaining nodes are sent to the erase_node agent, and each is
#      classified by its reply (or by the lack of one).
#
# nodes - NodesHash of nodes to process.
#
# Returns [erased_nodes, error_nodes, inaccessible_nodes], three NodesHash
# instances (all empty when nodes is empty).
def remove_nodes(nodes)
  if nodes.empty?
    Astute.logger.info "#{@ctx.task_id}: Nodes to remove are not provided. Do nothing."
    return Array.new(3){ NodesHash.new }
  end

  erased_nodes, mclient_nodes = skipped_unskipped_mclient_nodes(nodes)

  get_already_removed_nodes(mclient_nodes).each do |uid|
    erased_nodes << Node.new('uid' => uid)
    mclient_nodes.delete(uid)
    Astute.logger.info "#{@ctx.task_id}: Node #{uid} is removed already, skipping"
  end

  responses = mclient_remove_nodes(mclient_nodes)

  # Any uid that was asked but did not show up among the senders is
  # inaccessible; its boot_time is carried over onto the new record.
  answered_uids = responses.map { |reply| reply[:sender] }
  silent_records = (mclient_nodes.uids - answered_uids).map do |uid|
    {'uid' => uid, 'error' => 'Node not answered by RPC.', 'boot_time' => mclient_nodes[uid][:boot_time]}
  end
  inaccessible_nodes = NodesHash.build(silent_records)

  error_nodes = NodesHash.new
  responses.each do |reply|
    node = Node.new('uid' => reply[:sender])

    failure =
      if reply[:statuscode] != 0
        "RPC agent 'erase_node' failed. Result:\n#{reply.pretty_inspect}"
      elsif @reboot && !reply[:data][:rebooted]
        "RPC method 'erase_node' failed with message: #{reply[:data][:error_msg]}"
      end

    if failure
      node['error'] = failure
      error_nodes << node
    else
      erased_nodes << node
    end
  end

  [erased_nodes, error_nodes, inaccessible_nodes]
end
|
|
|
|
# Re-run removal for nodes that failed, up to `retries` times.
#
# Every node that a retry manages to erase is deleted from error_nodes and
# appended to erased_nodes (both collections are mutated in place).
#
# error_nodes  - collection of nodes to retry (responds to #empty?, #delete).
# erased_nodes - collection receiving recovered nodes via #<<.
# retries      - maximum number of attempts (default: 3).
# interval     - pause in seconds between attempts; no pause when it is not
#                positive (default: 1).
#
# Returns nil once nothing is left to retry, otherwise the result of
# Integer#times; callers in this file ignore the value.
def retry_remove_nodes(error_nodes, erased_nodes, retries=3, interval=1)
  # Nothing failed: skip the call entirely, otherwise remove_nodes would log
  # a misleading "Nodes to remove are not provided" message.
  return if error_nodes.empty?

  retries.times do |attempt|
    recovered = remove_nodes(error_nodes).first
    recovered.each do |uid, node|
      error_nodes.delete uid
      erased_nodes << node
    end
    return if error_nodes.empty?

    # Only pause when another attempt will actually follow.
    final_attempt = attempt == retries - 1
    sleep(interval) if interval > 0 && !final_attempt
  end
end
|
|
|
|
# Send the erase request to the given nodes in batches.
#
# Uids are sorted and cut into batches of
# Astute.config[:max_nodes_per_remove_call]; before every batch except the
# first the method sleeps for Astute.config[:nodes_remove_interval].
#
# nodes - collection responding to #uids.
#
# Returns an Array with the concatenated results of all batches.
def mclient_remove_nodes(nodes)
  Astute.logger.info "#{@ctx.task_id}: Starting removing of nodes:\n#{nodes.uids.pretty_inspect}"

  batches = nodes.uids.sort.each_slice(Astute.config[:max_nodes_per_remove_call])
  batches.each_with_index.flat_map do |batch, index|
    sleep Astute.config[:nodes_remove_interval] unless index.zero?
    mclient_remove_piece_nodes(batch)
  end
end
|
|
|
|
# Ask one batch of nodes to erase themselves through the erase_node agent.
#
# nodes - Array of node uids for this batch.
#
# Returns an Array with the #results of every response.
def mclient_remove_piece_nodes(nodes)
  # The 4th positional argument is MClient's check_result flag; Ruby has no
  # `name=value` call syntax, so the former `check_result=false` merely
  # assigned an unused local before passing `false`.
  # NOTE(review): presumably false lets failed replies through to the caller
  # instead of raising — confirm against MClient.
  remover = MClient.new(@ctx, "erase_node", nodes, false)
  responses = remover.erase_node(:reboot => @reboot)
  Astute.logger.debug "#{@ctx.task_id}: Data received from nodes:\n#{responses.pretty_inspect}"
  responses.map(&:results)
end
|
|
|
|
# Run a shell command on the given nodes and collect its stdout per node.
#
# Best-effort: a reply that carries no :data or no :stdout yields an empty
# string for that node instead of raising.
#
# context   - task context (responds to #task_id); handed to MClient.
# node_uids - Array of node uids to run the command on.
# cmd       - shell command line (String).
# timeout   - passed to MClient as its 5th positional argument (default: 10).
#
# Returns a Hash mapping each reply's :sender to its chomped stdout. Nodes
# that did not reply have no entry.
def run_shell_without_check(context, node_uids, cmd, timeout=10)
  # Positional arguments: Ruby has no `name=value` call syntax, so the former
  # `check_result=false, timeout=timeout` only assigned throwaway locals.
  shell = MClient.new(context, 'execute_shell_command', node_uids, false, timeout)
  results = shell.execute(:cmd => cmd)

  results.each_with_object({}) do |res, stdout_by_sender|
    data = res.results[:data] || {}
    Astute.logger.debug(
      "#{context.task_id}: cmd: #{cmd}\n" \
      "stdout: #{data[:stdout]}\n" \
      "stderr: #{data[:stderr]}\n" \
      "exit code: #{data[:exit_code]}")
    # to_s guards against a reply whose stdout is nil.
    stdout_by_sender[res.results[:sender]] = data[:stdout].to_s.chomp
  end
end
|
|
|
|
# Query the given nodes for a value identifying their current boot.
#
# Runs `stat --printf='%Y' /proc/1` on every node, i.e. prints the last
# modification time of /proc/1 in seconds since the Epoch.
# NOTE(review): presumably that mtime stays fixed for the lifetime of a boot,
# which is what makes it usable as a reboot marker — confirm.
#
# node_uids - Array of node uids to query.
#
# Returns whatever run_shell_without_check returns: a Hash mapping node uid
# to the command's stdout (a String).
def get_boot_time(node_uids)
  # The timeout is passed positionally; the former `timeout=10` only
  # assigned an unused local before passing 10.
  run_shell_without_check(@ctx, node_uids, "stat --printf='%Y' /proc/1", 10)
end
|
|
|
|
end
|
|
end
|