Wait until node boots completely before reboot

The reboot hook is used to reboot a node running the bootstrap image
or a provisioned node. It's possible that this hook is called
when the boot process of a provisioned node hasn't completed.
The Upstart system doesn't handle such a situation and the node may
hang.

So let's reboot immediately if we're in a bootstrap. Wait until the
system boots completely in the case of a provisioned node. We check this by
the existence of /run/cloud-init/status.json (it's located on tmpfs, so
no stale file from a previous boot can be found). If this file hasn't
appeared after 60 seconds, reboot as is.

Partial-Bug: #1573105

Change-Id: Iae113425a0034bc137c88db305d405f79807a5dc
This commit is contained in:
Dmitry Guryanov 2016-05-05 19:07:52 +03:00
parent 94c96e3582
commit 4698918d4e
6 changed files with 52 additions and 14 deletions

View File

@ -55,6 +55,7 @@ require 'astute/task_deployment'
require 'astute/task_node'
require 'astute/task_proxy_reporter'
require 'astute/task_cluster'
require 'astute/common/reboot.rb'
require 'fuel_deployment'
['/astute/pre_deployment_actions/*.rb',

View File

@ -0,0 +1,38 @@
# Copyright 2014 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
module Astute
  # Namespace for the shell snippet used to reboot a node. Callers pass
  # RebootCommand::CMD as the command string to run_shell_without_check.
  module RebootCommand
    # Shell script that reboots the node it runs on.
    #
    # Reboot immediately if we're in a bootstrap (hostname is "bootstrap").
    # Wait until the system boots completely in the case of a provisioned
    # node. We check this by the existence of /run/cloud-init/status.json
    # (it's located on tmpfs, so no stale file from a previous boot can be
    # found). If this file hasn't appeared after 60 seconds - reboot as is.
    #
    # The counter `t` is initialised once, *before* the loop: resetting it
    # inside the loop body would keep it at 0 forever, the `$t -gt 60`
    # fallback would never trigger, and a node without the status file
    # would never be rebooted.
    CMD = <<-REBOOT_COMMAND
      if [ $(hostname) = bootstrap ]; then
        reboot;
      fi;
      t=0;
      while true; do
        if [ -f /run/cloud-init/status.json -o $t -gt 60 ]; then
          reboot;
        else
          sleep 1;
          t=$((t + 1));
        fi;
      done
    REBOOT_COMMAND
  end
end

View File

@ -280,9 +280,8 @@ module Astute
control_time.merge!(boot_time(node_uids))
end
#TODO(vsharshov): will be enough for safe reboot without exceptions?
perform_with_limit(hook['uids']) do |node_uids|
run_shell_without_check(@ctx, node_uids, 'reboot', timeout=10)
run_shell_without_check(@ctx, node_uids, RebootCommand::CMD, timeout=60)
end
already_rebooted = Hash[hook['uids'].collect { |uid| [uid, false] }]

View File

@ -59,7 +59,7 @@ module Astute
def reboot
run_shell_without_check(
@task['node_id'],
'reboot',
RebootCommand::CMD,
timeout=2
)
rescue Astute::MClientTimeout, Astute::MClientError => e
@ -81,4 +81,4 @@ module Astute
end
end
end
end

View File

@ -1107,8 +1107,8 @@ describe Astute::NailgunHooks do
hooks.expects(:run_shell_without_check).once.with(
ctx,
['2','3'],
"reboot",
10,
regexp_matches(/reboot/),
60,
)
.returns('2' => '', '3' => '')
@ -1134,7 +1134,7 @@ describe Astute::NailgunHooks do
ctx,
['2','3'],
regexp_matches(/reboot/),
10,
60,
)
.returns('2' => '', '3' => '')
@ -1160,7 +1160,7 @@ describe Astute::NailgunHooks do
ctx,
['2','3'],
regexp_matches(/reboot/),
10,
60,
)
.returns('2' => '', '3' => '')
@ -1187,7 +1187,7 @@ describe Astute::NailgunHooks do
ctx,
['2','3'],
regexp_matches(/reboot/),
10,
60,
)
.returns('2' => '', '3' => '')
@ -1222,7 +1222,7 @@ describe Astute::NailgunHooks do
ctx,
['2'],
regexp_matches(/reboot/),
10,
60,
)
.returns('2' => '')
@ -1230,7 +1230,7 @@ describe Astute::NailgunHooks do
ctx,
['3'],
regexp_matches(/reboot/),
10,
60,
)
.returns('3' => '')
@ -1260,7 +1260,7 @@ describe Astute::NailgunHooks do
ctx,
['2','3'],
regexp_matches(/reboot/),
10,
60,
)
.returns('2' => '', '3' => '')

View File

@ -131,7 +131,7 @@ describe Astute::Reboot do
subject.stubs(:boot_time).returns(12)
subject.expects(:run_shell_without_check).with(
task['node_id'],
'reboot',
regexp_matches(/reboot/),
_timeout=2
)
subject.run
@ -151,7 +151,7 @@ describe Astute::Reboot do
subject.stubs(:boot_time).returns(12)
subject.expects(:run_shell_without_check).with(
task['node_id'],
'reboot',
regexp_matches(/reboot/),
_timeout=2
).raises(Astute::MClientTimeout)
expect{subject.run}.not_to raise_error(Astute::MClientTimeout)