NOTE: whitespace inside context lines was reconstructed from a collapsed copy
of this patch and the index hashes are stale; apply with
`git apply --ignore-whitespace` if the context does not match exactly.

diff --git a/astute.gemspec b/astute.gemspec
index 5be1eea5..6eed817e 100644
--- a/astute.gemspec
+++ b/astute.gemspec
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
   s.add_dependency 'symboltable', '1.0.2'
   s.add_dependency 'rest-client', '~> 1.6.7'
   s.add_dependency 'popen4', '~> 0.1.2'
+  s.add_dependency 'net-ssh-multi', '~> 1.1'
 
   # Astute as service
   s.add_dependency 'amqp', '0.9.10'
diff --git a/lib/astute.rb b/lib/astute.rb
index 5c1af591..a2651469 100644
--- a/lib/astute.rb
+++ b/lib/astute.rb
@@ -36,6 +36,9 @@ require 'astute/post_deploy_actions'
 require 'astute/post_deploy_actions/restart_radosgw'
 require 'astute/post_deploy_actions/update_cluster_hosts_info'
 require 'astute/post_deploy_actions/upload_cirros_image'
+require 'astute/ssh'
+require 'astute/ssh_actions/ssh_erase_nodes'
+require 'astute/ssh_actions/ssh_hard_reboot'
 
 # Server
 require 'astute/server/worker'
diff --git a/lib/astute/config.rb b/lib/astute/config.rb
index aa427c3e..bf9d5943 100644
--- a/lib/astute/config.rb
+++ b/lib/astute/config.rb
@@ -72,6 +72,8 @@ module Astute
     conf[:PUPPET_SSH_KEYS] = ['neutron', 'nova', 'ceph', 'mysql'] # name of ssh keys what will be generated
     #and uploaded to all nodes before deploy
     conf[:MAX_NODES_PER_CALL] = 50 # how many nodes to deploy in one puppet call
+    conf[:SSH_RETRIES] = 5 # how many times to try running a command over SSH before giving up
+    conf[:SSH_RETRY_TIMEOUT] = 30 # pause between SSH retries, sec
 
     # Server settings
     conf[:broker_host] = 'localhost'
diff --git a/lib/astute/orchestrator.rb b/lib/astute/orchestrator.rb
index eec0336d..161b7b82 100644
--- a/lib/astute/orchestrator.rb
+++ b/lib/astute/orchestrator.rb
@@ -155,6 +155,18 @@ module Astute
       puppetd.stop_and_disable
     end
 
+    # Stop provisioning of the given nodes: erase their boot devices over SSH,
+    # remove them from Cobbler, then force an immediate reboot.
+    def stop_provision(reporter, task_id, engine_attrs, nodes)
+      Ssh.execute(Context.new(task_id, reporter), nodes, SshEraseNodes.command)
+      CobblerManager.new(engine_attrs, reporter).remove_nodes(nodes)
+      Ssh.execute(Context.new(task_id, reporter),
+                  nodes,
+                  SshHardReboot.command,
+                  5, # timeout, sec
+                  1) # retries
+    end
+
     def dump_environment(reporter, task_id, lastdump)
       Dump.dump_environment(Context.new(task_id, reporter), lastdump)
     end
diff --git a/lib/astute/ssh.rb b/lib/astute/ssh.rb
new file mode 100644
index 00000000..243673f5
--- /dev/null
+++ b/lib/astute/ssh.rb
@@ -0,0 +1,149 @@
+# Copyright 2013 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+require 'net/ssh/multi'
+require 'timeout'
+
+module Astute
+  # Runs one shell command on many nodes in parallel over SSH.
+  class Ssh
+
+    # Run +cmd+ on all +nodes+, retrying the nodes that were inaccessible.
+    #
+    # ctx     - context; only ctx.task_id is used, for log messages
+    # nodes   - array of hashes with 'slave_name' (SSH host) and 'uid' keys
+    # cmd     - text of the shell command
+    # timeout - seconds one attempt may take before busy sessions are aborted
+    # retries - maximum number of attempts
+    #
+    # Returns a hash with 'nodes' (uids where the command was executed),
+    # 'inaccessible_nodes' (present if some nodes were never reached) and
+    # 'status' => 'error' plus 'error_nodes' (present if some nodes failed).
+    def self.execute(ctx, nodes, cmd, timeout=60, retries=Astute.config.SSH_RETRIES)
+      nodes_to_process = nodes.map { |n| n['slave_name'] }
+
+      Astute.logger.debug "Run shell command '#{cmd}' using ssh"
+      ready_nodes = []
+      error_nodes = []
+
+      retries.times do |i|
+        Astute.logger.debug "Run shell command using ssh. Retry #{i}"
+        Astute.logger.debug "Affected nodes: #{nodes_to_process}"
+
+        new_ready_nodes, new_error_nodes, nodes_to_process = run_remote_command(nodes_to_process, cmd, timeout)
+        Astute.logger.debug "Retry result: "\
+                            "success nodes: #{new_ready_nodes}, "\
+                            "error nodes: #{new_error_nodes}, "\
+                            "inaccessible nodes: #{nodes_to_process}"
+
+        ready_nodes += new_ready_nodes
+        error_nodes += new_error_nodes
+
+        break if nodes_to_process.empty?
+
+        # Pause only between attempts, not after the last one
+        sleep Astute.config.SSH_RETRY_TIMEOUT if i < retries - 1
+      end
+
+      inaccessible_nodes = nodes_to_process
+      nodes_uids = nodes.map { |n| n['uid'] }
+
+      answer = {'nodes' => to_report_format(ready_nodes, nodes)}
+      if inaccessible_nodes.present?
+        answer.merge!({'inaccessible_nodes' => to_report_format(inaccessible_nodes, nodes)})
+        Astute.logger.warn "#{ctx.task_id}: Running shell command on nodes #{nodes_uids.inspect} finished " \
+                           "with errors. Nodes #{answer['inaccessible_nodes'].inspect} are inaccessible"
+      end
+
+      if error_nodes.present?
+        answer.merge!({'status' => 'error', 'error_nodes' => to_report_format(error_nodes, nodes)})
+
+        Astute.logger.error "#{ctx.task_id}: Running shell command on nodes #{nodes_uids.inspect} finished " \
+                            "with errors: #{answer['error_nodes'].inspect}"
+      end
+      Astute.logger.info "#{ctx.task_id}: Finished running shell command: #{nodes_uids.inspect}"
+
+      answer
+    end
+
+
+    # NOTE: a bare `private` does not affect `def self.` methods; the helpers
+    # below stay callable from outside and are internal by convention only.
+    private
+
+    # Map SSH host names back to report entries: [{'uid' => ...}, ...].
+    # Nodes without a 'uid' are skipped.
+    def self.to_report_format(slave_names, nodes)
+      result_nodes = nodes.select { |n| slave_names.include?(n['slave_name']) }
+      result_nodes.inject([]) do |result, node|
+        result << {'uid' => node['uid']} if node['uid']
+        result
+      end
+    end
+
+    # Run +cmd+ once on the hosts in +nodes+ (array of host names).
+    #
+    # Returns [executed_hosts, error_hosts, inaccessible_hosts].
+    def self.run_remote_command(nodes, cmd, timeout)
+      servers = []
+
+      Net::SSH::Multi.start(:concurrent_connections => Astute.config.MAX_NODES_PER_CALL,
+                            :on_error => :warn) do |session|
+        nodes.each do |name|
+          session.use name,
+                      :user => 'root',
+                      :host_key => 'ssh-rsa',
+                      :keys => ['/root/.ssh/id_rsa']
+        end
+        servers = session.servers_for
+
+        # execute commands on all servers
+        # FIXME: debug does not show messages if the command contains
+        # several lines
+        session.exec cmd do |ch, stream, data|
+          Astute.logger.debug "[#{ch[:host]} : #{stream}] #{data}"
+        end
+
+        Timeout::timeout(timeout) { session.loop }
+      end
+
+      detect_status(servers)
+    rescue Timeout::Error
+      Astute.logger.debug "SSH session is closed due to the achievement of a timeout"
+      # No server list was obtained: every node is still to be processed
+      return [[], [], nodes] if servers.empty?
+
+      servers.each do |s|
+        next unless s.busy?
+        # Mark the server as failed regardless of what shutdown! returns,
+        # otherwise a timed-out node would be reported as executed.
+        s.session.shutdown!
+        s.fail!
+      end
+      detect_status(servers)
+    end
+
+    # Split servers into executed and inaccessible (failed) host lists.
+    # TODO: support exit code from shell command
+    def self.detect_status(servers)
+      executed_nodes = []
+      inaccessible_nodes = []
+      servers.each do |s|
+        s.failed? ? inaccessible_nodes << s.host : executed_nodes << s.host
+      end
+      [executed_nodes, [], inaccessible_nodes]
+    end
+
+  end
+end
diff --git a/lib/astute/ssh_actions/ssh_erase_nodes.rb b/lib/astute/ssh_actions/ssh_erase_nodes.rb
new file mode 100644
index 00000000..4c418e99
--- /dev/null
+++ b/lib/astute/ssh_actions/ssh_erase_nodes.rb
@@ -0,0 +1,75 @@
+# Copyright 2013 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+module Astute
+  # Builds the shell script that wipes boot data from node block devices.
+  class SshEraseNodes
+
+    # Returns the text of a shell script that sends SIGSTOP to anaconda,
+    # debootstrap and dpkg, then overwrites the first 1M and the last 512-byte
+    # sector of every non-removable device whose major is in storages_codes.
+    def self.command
+      <<-ERASE_COMMAND
+        killall -STOP anaconda
+        killall -STOP debootstrap dpkg
+        echo "5" > /proc/sys/kernel/panic
+        echo "1" > /proc/sys/kernel/sysrq
+
+        storages_codes="3, 8, 65, 66, 67, 68, 69, 70, 71, 104, 105, 106, 107, 108, 109, 110, 111, 202, 252, 253"
+
+        reboot_with_sleep() {
+          sleep 5
+          echo "1" > /proc/sys/kernel/panic_on_oops
+          echo "10" > /proc/sys/kernel/panic
+          echo "b" > /proc/sysrq-trigger
+        }
+
+        # erase_data <device> <length in blocks> <offset in blocks> <block size>
+        erase_data() {
+          echo "Run erase_node with dev= $1 length = $2 offset = $3 bs = $4"
+          dd if=/dev/zero of=/dev/$1 bs=$4 count=$2 seek=$3 oflag=direct
+        }
+
+        erase_boot_devices() {
+          for d in /sys/block/*
+          do
+            basename_dir=$(basename $d)
+            major_raw=$(udevadm info --query=property --name=$basename_dir | grep MAJOR | sed 's/ *$//g')
+            major=$(echo ${major_raw##*=})
+            if [ -z "$major" ]; then continue; fi
+
+            # Match the major number as a whole word
+            echo $storages_codes | grep -ow "$major"
+            if [ $? -ne 0 ]; then continue; fi
+
+            removable=$(grep -o '[[:digit:]]' /sys/block/$basename_dir/removable)
+            if [ "$removable" -ne 0 ]; then continue; fi
+
+            # Device size in 512-byte sectors
+            size=$(cat /sys/block/$basename_dir/size)
+
+            erase_data $basename_dir 1 0 '1M'
+            erase_data $basename_dir 1 $((size - 1)) '512'
+          done
+        }
+
+        echo "Run erase node command"
+        erase_boot_devices
+
+        # Avoid shell hang using nohup and stdout/stderr redirections
+        # nohup reboot_with_sleep > /dev/null 2>&1 &
+      ERASE_COMMAND
+    end
+  end
+end
diff --git a/lib/astute/ssh_actions/ssh_hard_reboot.rb b/lib/astute/ssh_actions/ssh_hard_reboot.rb
new file mode 100644
index 00000000..7f1e40e8
--- /dev/null
+++ b/lib/astute/ssh_actions/ssh_hard_reboot.rb
@@ -0,0 +1,30 @@
+# Copyright 2013 Mirantis, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+module Astute
+  # Builds the shell script that reboots a node immediately via SysRq.
+  class SshHardReboot
+
+    # Returns the text of a shell script that sets the kernel panic knobs
+    # and writes 'b' to /proc/sysrq-trigger.
+    def self.command
+      <<-REBOOT_COMMAND
+        echo "Run node rebooting command using 'SB' to sysrq-trigger"
+        echo "1" > /proc/sys/kernel/panic_on_oops
+        echo "10" > /proc/sys/kernel/panic
+        echo "b" > /proc/sysrq-trigger
+      REBOOT_COMMAND
+    end
+  end
+end