Support stop provisioning mechanism

* add the ability to run commands on remote nodes using SSH;
* add stop_provision action.

Implements: blueprint fuel-stop-provision

Change-Id: Ibcd588fc3dae5961ea51239cad6f2bdee5f16bbf
This commit is contained in:
Vladimir Sharshov 2014-02-19 07:53:14 +04:00
parent 2aba323f3a
commit 5c9d8cb355
7 changed files with 234 additions and 0 deletions

View File

@ -15,6 +15,7 @@ Gem::Specification.new do |s|
s.add_dependency 'symboltable', '1.0.2'
s.add_dependency 'rest-client', '~> 1.6.7'
s.add_dependency 'popen4', '~> 0.1.2'
s.add_dependency 'net-ssh-multi', '~> 1.1'
# Astute as service
s.add_dependency 'amqp', '0.9.10'

View File

@ -36,6 +36,9 @@ require 'astute/post_deploy_actions'
require 'astute/post_deploy_actions/restart_radosgw'
require 'astute/post_deploy_actions/update_cluster_hosts_info'
require 'astute/post_deploy_actions/upload_cirros_image'
require 'astute/ssh'
require 'astute/ssh_actions/ssh_erase_nodes'
require 'astute/ssh_actions/ssh_hard_reboot'
# Server
require 'astute/server/worker'

View File

@ -72,6 +72,8 @@ module Astute
conf[:PUPPET_SSH_KEYS] = ['neutron', 'nova', 'ceph', 'mysql'] # name of ssh keys what will be generated
#and uploaded to all nodes before deploy
conf[:MAX_NODES_PER_CALL] = 50 # how many nodes to deploy in one puppet call
conf[:SSH_RETRIES] = 5 # SSH tries to call ssh client before failure
conf[:SSH_RETRY_TIMEOUT] = 30 # SSH sleeps for ## sec between retries
# Server settings
conf[:broker_host] = 'localhost'

View File

@ -155,6 +155,16 @@ module Astute
puppetd.stop_and_disable
end
# Stop provisioning of the given nodes.
#
# Steps, in order:
#   1. run the SshEraseNodes command on the nodes over SSH (default timeout
#      and retry count of Ssh.execute);
#   2. remove the nodes from Cobbler through CobblerManager;
#   3. run the SshHardReboot command over SSH with a 5 second timeout and a
#      single attempt.
#
# reporter     - reporter object handed to Context and CobblerManager
# task_id      - identifier of the task, handed to Context
# engine_attrs - provisioning engine attributes handed to CobblerManager
# nodes        - list of node descriptions passed to Ssh.execute and
#                CobblerManager#remove_nodes
#
# Returns the result of the final (hard reboot) Ssh.execute call; the result
# of the erase step is not propagated.
def stop_provision(reporter, task_id, engine_attrs, nodes)
  Ssh.execute(Context.new(task_id, reporter), nodes, SshEraseNodes.command)
  CobblerManager.new(engine_attrs, reporter).remove_nodes(nodes)

  # Ssh.execute takes timeout and retries as positional arguments, so pass
  # them as plain values instead of `name=value` pseudo-keywords, which only
  # assign throwaway local variables.
  reboot_timeout = 5
  reboot_attempts = 1
  Ssh.execute(Context.new(task_id, reporter),
              nodes,
              SshHardReboot.command,
              reboot_timeout,
              reboot_attempts)
end
# Build a Context from the task id and reporter and delegate to
# Dump.dump_environment, returning whatever that call returns.
#
# reporter - reporter object handed to Context
# task_id  - identifier of the task, handed to Context
# lastdump - passed through unchanged to Dump.dump_environment
def dump_environment(reporter, task_id, lastdump)
  dump_context = Context.new(task_id, reporter)
  Dump.dump_environment(dump_context, lastdump)
end

124
lib/astute/ssh.rb Normal file
View File

@ -0,0 +1,124 @@
# Copyright 2013 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
require 'net/ssh/multi'
require 'timeout'
module Astute

  # Runs a shell command on a group of nodes in parallel over SSH
  # (Net::SSH::Multi), retrying the nodes that could not be reached.
  class Ssh

    # Execute +cmd+ on every node, retrying inaccessible nodes.
    #
    # ctx     - object responding to #task_id (used in log messages)
    # nodes   - Array of Hashes; 'slave_name' is used as the SSH host and
    #           'uid' is what gets reported back
    # cmd     - String with the shell command to run
    # timeout - seconds allowed for one attempt (all nodes together)
    # retries - maximum number of attempts
    #
    # Returns a Hash:
    #   'nodes'              => [{'uid' => ...}, ...] nodes the command ran on
    #   'inaccessible_nodes' => same format, present only if some nodes were
    #                           still unreachable after the last attempt
    #   'status'/'error_nodes' => present only if error nodes were reported
    def self.execute(ctx, nodes, cmd, timeout=60, retries=Astute.config.SSH_RETRIES)
      nodes_to_process = nodes.map { |n| n['slave_name'] }

      Astute.logger.debug "Run shell command '#{cmd}' using ssh"
      ready_nodes = []
      error_nodes = []

      retries.times do |i|
        Astute.logger.debug "Run shell command using ssh. Retry #{i}"
        Astute.logger.debug "Affected nodes: #{nodes_to_process}"

        new_ready_nodes, new_error_nodes, nodes_to_process = run_remote_command(nodes_to_process, cmd, timeout)
        Astute.logger.debug "Retry result: "\
                            "success nodes: #{new_ready_nodes}, "\
                            "error nodes: #{new_error_nodes}, "\
                            "inaccessible nodes: #{nodes_to_process}"

        ready_nodes += new_ready_nodes
        error_nodes += new_error_nodes

        break if nodes_to_process.empty?

        # Pause only when another attempt follows; sleeping after the final
        # attempt would just delay the answer.
        sleep Astute.config.SSH_RETRY_TIMEOUT if i < retries - 1
      end

      inaccessible_nodes = nodes_to_process
      nodes_uids = nodes.map { |n| n['uid'] }

      answer = {'nodes' => to_report_format(ready_nodes, nodes)}

      unless inaccessible_nodes.empty?
        answer.merge!({'inaccessible_nodes' => to_report_format(inaccessible_nodes, nodes)})
        Astute.logger.warn "#{ctx.task_id}: Running shell command on nodes #{nodes_uids.inspect} finished " \
                           "with errors. Nodes #{answer['inaccessible_nodes'].inspect} are inaccessible"
      end

      unless error_nodes.empty?
        answer.merge!({'status' => 'error', 'error_nodes' => to_report_format(error_nodes, nodes)})
        Astute.logger.error "#{ctx.task_id}: Running shell command on nodes #{nodes_uids.inspect} finished " \
                            "with errors: #{answer['error_nodes'].inspect}"
      end
      Astute.logger.info "#{ctx.task_id}: Finished running shell command: #{nodes_uids.inspect}"

      answer
    end

    # Internal helpers. A bare `private` does not apply to `def self.`
    # methods, so these have always been callable from outside; their
    # visibility is intentionally left unchanged.

    # Convert a list of slave names into the report format: one
    # {'uid' => ...} Hash per node whose 'slave_name' is in +slave_names+.
    # Nodes without a 'uid' are left out.
    def self.to_report_format(slave_names, nodes)
      nodes.select { |n| slave_names.include?(n['slave_name']) && n['uid'] }
           .map { |n| {'uid' => n['uid']} }
    end

    # Run +cmd+ once on all +nodes+ (Array of host names), waiting at most
    # +timeout+ seconds for the whole session.
    #
    # Returns [executed_hosts, error_hosts, inaccessible_hosts].
    def self.run_remote_command(nodes, cmd, timeout)
      servers = []
      Net::SSH::Multi.start(:concurrent_connections => Astute.config.MAX_NODES_PER_CALL,
                            :on_error => :warn) do |session|
        nodes.each do |name|
          session.use name,
                      :user => 'root',
                      :host_key => 'ssh-rsa',
                      :keys => ['/root/.ssh/id_rsa']
        end
        servers = session.servers_for

        # Execute the command on all servers.
        # FIXME: debug output is not shown when the command consists of
        # several lines.
        session.exec cmd do |ch, stream, data|
          Astute.logger.debug "[#{ch[:host]} : #{stream}] #{data}"
        end

        Timeout.timeout(timeout) { session.loop }
      end

      detect_status(servers)
    rescue Timeout::Error
      Astute.logger.debug "SSH session is closed due to the achievement of a timeout"
      # No server objects to inspect: report every requested node as
      # inaccessible.
      return [[], [], nodes] if servers.empty?

      servers.each do |s|
        next unless s.busy?
        # Always mark a server that is still busy at the deadline as failed.
        # This must not depend on the return value of shutdown!, otherwise a
        # hung node would be reported as successfully executed.
        connection = s.session
        connection.shutdown! if connection
        s.fail!
      end
      detect_status(servers)
    end

    # Split servers into executed and inaccessible (failed) hosts.
    # The middle element (error hosts) is always empty for now.
    # TODO: support exit code from shell command
    def self.detect_status(servers)
      failed, executed = servers.partition(&:failed?)
      [executed.map(&:host), [], failed.map(&:host)]
    end

  end
end

View File

@ -0,0 +1,67 @@
# Copyright 2013 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
module Astute

  # Provides the shell script used to stop an OS installation that is in
  # progress on a node and to wipe the beginning and the end of its disks.
  class SshEraseNodes

    # Returns the script as a String. The script:
    #   * suspends the installer processes (anaconda, debootstrap, dpkg);
    #   * enables sysrq and sets the kernel panic timeout;
    #   * for every non-removable block device whose major number is in
    #     storages_codes, zeroes the first 1 MiB and the last 512-byte
    #     sector.
    # The reboot_with_sleep helper is defined but its invocation is left
    # commented out in the script.
    #
    # The heredoc is single-quoted on purpose: the script must reach the
    # shell verbatim. In an interpolating heredoc Ruby turns "\b" into a
    # backspace character, which breaks the word-boundary match in grep.
    #
    # erase_data takes: device, length (block count), offset (in blocks),
    # block size. /sys/block/<dev>/size is expressed in 512-byte sectors, so
    # the last sector sits at offset size - 1.
    def self.command
      <<-'ERASE_COMMAND'
        killall -STOP anaconda
        killall -STOP debootstrap dpkg
        echo "5" > /proc/sys/kernel/panic
        echo "1" > /proc/sys/kernel/sysrq

        storages_codes="3, 8, 65, 66, 67, 68, 69, 70, 71, 104, 105, 106, 107, 108, 109, 110, 111, 202, 252, 253"

        reboot_with_sleep() {
          sleep 5
          echo "1" > /proc/sys/kernel/panic_on_oops
          echo "10" > /proc/sys/kernel/panic
          echo "b" > /proc/sysrq-trigger
        }

        erase_data() {
          echo "Run erase_node with dev= $1 length = $2 offset = $3 bs = $4"
          dd if=/dev/zero of=/dev/$1 bs=$4 count=$2 seek=$3 oflag=direct
        }

        erase_boot_devices() {
          for d in /sys/block/*
          do
            basename_dir=$(basename $d)
            major_raw=$(udevadm info --query=property --name=$basename_dir | grep MAJOR | sed 's/ *$//g')
            major=$(echo ${major_raw##*=})
            if [ -z "$major" ]; then continue; fi
            echo $storages_codes | grep -o "\b$major\b"
            if [ $? -ne 0 ]; then continue; fi
            removable=$(grep -o '[[:digit:]]' /sys/block/$basename_dir/removable)
            if [ $removable -ne 0 ]; then continue; fi
            size=$(cat /sys/block/$basename_dir/size)
            erase_data $basename_dir 1 0 '1M'
            erase_data $basename_dir 1 $(($size - 1)) '512'
          done
        }

        echo "Run erase node command"
        erase_boot_devices

        # Avoid shell hang using nohup and stdout/stderr redirections
        # nohup reboot_with_sleep > /dev/null 2>&1 &
      ERASE_COMMAND
    end
  end
end

View File

@ -0,0 +1,27 @@
# Copyright 2013 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
module Astute

  # Provides the shell script that reboots a node by writing to the
  # kernel's sysrq trigger.
  class SshHardReboot

    # Returns the script as a String, one shell statement per line, each
    # line terminated by a newline. The script prints a notice, sets
    # panic_on_oops and the panic timeout, then writes "b" to
    # /proc/sysrq-trigger.
    def self.command
      script_lines = [
        %q(echo "Run node rebooting command using 'SB' to sysrq-trigger"),
        'echo "1" > /proc/sys/kernel/panic_on_oops',
        'echo "10" > /proc/sys/kernel/panic',
        'echo "b" > /proc/sysrq-trigger'
      ]
      script_lines.map { |statement| "#{statement}\n" }.join
    end
  end
end