Gracefully stop if tolerance limit exceeded

Several changes:

- support fault tolerance groups;
- stop the deployment internally instead of raising in case of error;
- do not show the last run summary debug report from MCollective;
- fix detection of offline nodes before running the deployment;
- support fail_on_error behavior.

Support fault tolerance groups

  Nailgun sends fault tolerance groups that tell Astute how many
  nodes in each group are allowed to fail during this deployment and
  how important every node is for this task.

  If the number of failed nodes exceeds the allowed number of
  failures, the deployment will be gracefully stopped.
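
  A minimal, illustrative sketch of the metadata shape (group names,
  limits and node ids here are only an example, not data from a real
  environment):

    'fault_tolerance_groups' => [
      # no primary controller may fail: this node is critical
      {'fault_tolerance' => 0, 'name' => 'primary-controller', 'node_ids' => ['1']},
      # at most one compute node may fail before the deployment stops
      {'fault_tolerance' => 1, 'name' => 'compute', 'node_ids' => ['2', '3']}
    ]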

Stop the deployment internally instead of raising in case of error

  Before this change Astute ended processing immediately, marked all
  nodes as errored and did not wait for the Puppet runs on the nodes
  to finish.

  Now we use the same approach as for a requested stop of deployment:
  failed nodes are marked as errored, the remaining nodes as skipped
  (stopped), and already deployed nodes as ready. Astute also waits
  until the currently running tasks finish.
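
  Simplified sketch of the new flow in Deployment::Cluster (the full
  change is in fuel_deployment below; argument handling is condensed
  here):

    def hook_internal_post_node_poll(node)
      gracefully_stop(node)           # mark still-ready nodes as skipped once we are stopping
      validate_fault_tolerance(node)  # pull the internal 'emergency brake' if the limit is hit
    end

    def gracefully_stop(node)
      if gracefully_stop? && node.ready?
        node.set_status_skipped
        hook 'post_gracefully_stop', node  # TaskCluster reports the new status
      end
    end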

Do not show the last run summary debug report from MCollective

  At the moment it is not very useful, but it quickly fills the log
  file and makes debugging harder.

Fix detection of offline nodes before running the deployment

  Astute queries MCollective to detect node availability. If a node
  does not respond, it is marked as failed. Offline nodes are also
  taken into account by the fault tolerance mechanism.
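
  A rough sketch of the check (the real fail_offline_nodes and
  detect_offline_nodes are in the task deployment diff below; the
  MCollective call is abbreviated to a hypothetical helper here):

    def detect_offline_nodes(uids)
      uids -= ['master', 'virtual_sync_node']
      Astute.config.mc_retries.times do
        responded = systemtype_responders(uids)  # hypothetical wrapper around the 'systemtype' MClient call
        uids -= responded
        break if uids.empty?
        sleep Astute.config.mc_retry_interval
      end
      uids  # nodes that never answered; they will be reported as failed
    end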

Support fail_on_error behavior

  From now on, a task that sets fail_on_error to false is marked as
  skipped instead of failed in case of error.
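
  The corresponding check added to Astute::TaskNode (shown in full in
  the diff below):

    def setup_task_status
      # a failed task with fail_on_error: false is downgraded to :skipped
      if !task.data.fetch('fail_on_error', true) && @task_engine.failed?
        Astute.logger.warn "Task #{task.name} failed, but marked as skipped "\
          "because of 'fail on error' behavior"
        return :skipped
      end
      @task_engine.status
    end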

Change-Id: Ica2a4ae64b4dfa4f7fccfbc95108d1412c40dc3f
Closes-Bug: #1435610
Vladimir Sharshov (warpc) 2016-05-24 20:46:30 +03:00
parent 4df5a45042
commit 5a9f87c080
21 changed files with 459 additions and 269 deletions

.gitignore
View File

@ -16,3 +16,4 @@ docs/_build
# Local raemon copy
raemon/
*.svg

View File

@ -75,9 +75,9 @@ module Astute
deployment_engine = TaskDeployment.new(context)
deployment_engine.deploy(
deployment_info: deployment_options[:deployment_info],
tasks_graph: deployment_options[:tasks_graph],
tasks_directory: deployment_options[:tasks_directory],
tasks_metadata: deployment_options[:tasks_metadata],
dry_run: deployment_options.fetch(:dry_run, false)
)
ensure

View File

@ -85,7 +85,15 @@ module Astute
private
def puppetd
puppetd = MClient.new(@ctx, "puppetd", [@node['uid']])
puppetd = MClient.new(
@ctx,
"puppetd",
[@node['uid']],
_check_result=true,
_timeout=nil,
_retries=Astute.config.mc_retries,
_enable_result_logging=false
)
puppetd.on_respond_timeout do |uids|
nodes = uids.map do |uid|
{

View File

@ -116,9 +116,9 @@ module Astute
reporter,
data['args']['task_uuid'],
{
:deployment_info => data['args'].fetch('deployment_info', []),
:tasks_graph => data['args'].fetch('tasks_graph', {}),
:tasks_directory => data['args'].fetch('tasks_directory', {}),
:tasks_metadata => data['args'].fetch('tasks_metadata', {}),
:dry_run => data['args'].fetch('dry_run', false)
}
)

View File

@ -74,6 +74,30 @@ module Astute
{}
end
def finished?
[:successful, :failed, :skipped].include? @status
end
def successful?
@status == :successful
end
def pending?
@status == :pending
end
def skipped?
@status == :skipped
end
def running?
@status == :running
end
def failed?
@status == :failed
end
private
# Run current task on node, specified in task
@ -183,44 +207,20 @@ module Astute
false
end
def finished?
[:successful, :failed, :skipped].include? @status
end
def failed!
self.status = :failed
time_summary
end
def failed?
@status == :failed
end
def running!
self.status = :running
end
def running?
@status == :running
end
def succeed!
self.status = :successful
time_summary
end
def successful?
@status == :successful
end
def pending?
@status == :pending
end
def skipped?
@status == :skipped
end
def skipped!
self.status = :skipped
time_summary

View File

@ -15,28 +15,14 @@ require 'fuel_deployment'
module Astute
class TaskCluster < Deployment::Cluster
attr_accessor :gracefully_stop_mark
def stop_condition(&block)
self.gracefully_stop_mark = block
def hook_post_gracefully_stop(*args)
report_new_node_status(args[0])
end
def hook_post_node_poll(*args)
gracefully_stop(args[0])
end
# Check if the deployment process should stop
# @return [true, false]
def gracefully_stop?
gracefully_stop_mark ? gracefully_stop_mark.call : false
end
def gracefully_stop(node)
if gracefully_stop? && node.ready?
node.set_status_skipped
node.report_node_status
end
def report_new_node_status(node)
node.report_node_status
end
end
end
end

View File

@ -20,12 +20,10 @@ module Astute
@ctx = context
end
def deploy(tasks_graph: {}, tasks_directory: {} , deployment_info: [], dry_run: false)
def deploy(tasks_graph: {}, tasks_directory: {} , tasks_metadata: {}, dry_run: false)
raise DeploymentEngineError, "Deployment graph was not provided!" if
tasks_graph.blank?
deployment_info, offline_uids = pre_deployment_process(deployment_info)
support_virtual_node(tasks_graph)
unzip_graph(tasks_graph, tasks_directory)
@ -33,12 +31,20 @@ module Astute
cluster = TaskCluster.new
cluster.node_concurrency.maximum = Astute.config.max_nodes_per_call
cluster.stop_condition { Thread.current[:gracefully_stop] }
cluster.fault_tolerance_groups = tasks_metadata.fetch(
'fault_tolerance_groups',
[]
)
offline_uids = fail_offline_nodes(tasks_graph)
critical_uids = critical_node_uids(cluster.fault_tolerance_groups)
tasks_graph.keys.each do |node_id|
node = TaskNode.new(node_id, cluster)
node.context = @ctx
node.set_critical if critical_node_uids(deployment_info).include?(node_id)
node.set_status_failed if offline_uids.include? node_id
node.set_critical if critical_uids.include?(node_id)
node.set_as_sync_point if sync_point?(node_id)
node.set_status_failed if offline_uids.include?(node_id)
end
setup_tasks(tasks_graph, cluster)
@ -57,6 +63,10 @@ module Astute
private
def sync_point?(node_id)
'virtual_sync_node' == node_id
end
def unzip_graph(tasks_graph, tasks_directory)
tasks_graph.each do |node_id, tasks|
tasks.each do |task|
@ -111,34 +121,14 @@ module Astute
"non-negative integer, but got #{value}. Please check task #{task}"
end
def pre_deployment_process(deployment_info)
return [[],[]] if deployment_info.blank?
deployment_info, offline_uids = remove_failed_nodes(deployment_info)
Astute::TaskPreDeploymentActions.new(deployment_info, @ctx).process
[deployment_info, offline_uids]
end
def report_deploy_result(result)
if result[:success]
if result[:success] && result.fetch(:failed_nodes, []).empty?
@ctx.report('status' => 'ready', 'progress' => 100)
elsif result[:success] && result.fetch(:failed_nodes, []).present?
report_failed_nodes(result)
@ctx.report('status' => 'ready', 'progress' => 100)
else
result[:failed_nodes].each do |node|
node_status = {
'uid' => node.id,
'status' => 'error',
'error_type' => 'deploy',
'error_msg' => result[:status]
}
task = result[:failed_tasks].find{ |t| t.node == node }
if task
node_status.merge!({
'deployment_graph_task_name' => task.name,
'task_status' => task.status.to_s
})
end
@ctx.report('nodes' => [node_status])
end
report_failed_nodes(result)
@ctx.report(
'status' => 'error',
'progress' => 100,
@ -147,6 +137,25 @@ module Astute
end
end
def report_failed_nodes(result)
result.fetch(:failed_nodes, []).each do |node|
node_status = {
'uid' => node.id,
'status' => 'error',
'error_type' => 'deploy',
'error_msg' => result[:status]
}
task = result[:failed_tasks].find{ |t| t.node == node }
if task
node_status.merge!({
'deployment_graph_task_name' => task.name,
'task_status' => task.status.to_s
})
end
@ctx.report('nodes' => [node_status])
end
end
def write_graph_to_file(deployment)
return unless Astute.config.enable_graph_file
@ -180,21 +189,20 @@ module Astute
tasks_graph
end
def critical_node_uids(deployment_info)
@critical_nodes ||= deployment_info.select{ |n| n['fail_if_error'] }
.map{ |n| n['uid'] }.uniq
def critical_node_uids(fault_tolerance_groups)
return [] unless fault_tolerance_groups
critical_nodes = fault_tolerance_groups.inject([]) do |critical_uids, group|
critical_uids += group['node_ids'] if group['fault_tolerance'].zero?
critical_uids
end
Astute.logger.info "Critical node #{critical_nodes}" if critical_nodes.present?
critical_nodes
end
# Removes nodes which failed to provision
def remove_failed_nodes(deployment_info)
uids = get_uids_from_deployment_info deployment_info
required_uids = critical_node_uids(deployment_info)
available_uids = detect_available_nodes(uids)
offline_uids = uids - available_uids
def fail_offline_nodes(tasks_graph)
offline_uids = detect_offline_nodes(tasks_graph.keys)
if offline_uids.present?
# set status for all failed nodes to error
nodes = (uids - available_uids).map do |uid|
nodes = offline_uids.map do |uid|
{'uid' => uid,
'status' => 'error',
'error_type' => 'provision',
@ -208,9 +216,7 @@ module Astute
'error' => 'Node is not ready for deployment'
)
# check if all required nodes are online
# if not, raise error
missing_required = required_uids - available_uids
missing_required = critical_node_uids(tasks_graph) & offline_uids
if missing_required.present?
error_message = "Critical nodes are not available for deployment: " \
"#{missing_required}"
@ -218,57 +224,21 @@ module Astute
end
end
return remove_offline_nodes(
uids,
available_uids,
deployment_info,
offline_uids)
offline_uids
end
def remove_offline_nodes(uids, available_uids, deployment_info, offline_uids)
if offline_uids.blank?
return [deployment_info, offline_uids]
end
Astute.logger.info "Removing nodes which failed to provision: " \
"#{offline_uids}"
deployment_info = cleanup_nodes_block(deployment_info, offline_uids)
deployment_info = deployment_info.select do |node|
available_uids.include? node['uid']
end
[deployment_info, offline_uids]
end
def cleanup_nodes_block(deployment_info, offline_uids)
return deployment_info if offline_uids.blank?
nodes = deployment_info.first['nodes']
# In case of deploy in already existing cluster in nodes block
# we will have all cluster nodes. We should remove only missing
# nodes instead of stay only available.
# Example: deploy 3 nodes, after it deploy 2 nodes.
# In 1 of 2 seconds nodes missing, in nodes block we should
# contain only 4 nodes.
nodes_wthout_missing = nodes.select do |node|
!offline_uids.include?(node['uid'])
end
deployment_info.each { |node| node['nodes'] = nodes_wthout_missing }
deployment_info
end
def detect_available_nodes(uids)
all_uids = uids.clone
def detect_offline_nodes(uids)
available_uids = []
uids.delete('master')
uids.delete('virtual_sync_node')
# In case of big amount of nodes we should do several calls to be sure
# about node status
Astute.config[:mc_retries].times.each do
Astute.config.mc_retries.times.each do
systemtype = Astute::MClient.new(
@ctx,
"systemtype",
all_uids,
uids,
_check_result=false,
10
)
@ -277,22 +247,15 @@ module Astute
end
available_uids += available_nodes.map { |node| node.results[:sender] }
all_uids -= available_uids
break if all_uids.empty?
uids -= available_uids
break if uids.empty?
sleep Astute.config[:mc_retry_interval]
sleep Astute.config.mc_retry_interval
end
available_uids
Astute.logger.warn "Offline node #{uids}" if uids.present?
uids
end
def get_uids_from_deployment_info(deployment_info)
top_level_uids = deployment_info.map{ |node| node["uid"] }
inside_uids = deployment_info.inject([]) do |uids, node|
uids += node.fetch('nodes', []).map{ |n| n['uid'] }
end
top_level_uids | inside_uids
end
end
end

View File

@ -37,7 +37,7 @@ module Astute
# Please be informed that this code define special method
# of Deployment::Node class. We use special method `task`
# to manage task status, graph of tasks and nodes.
task.status = @task_engine.status
task.status = setup_task_status
if @task.running?
@ctx.report({
'nodes' => [{
@ -49,8 +49,7 @@ module Astute
}]
})
else
set_status_online
setup_node_status
report_node_status
end
end
@ -70,10 +69,13 @@ module Astute
'uid' => id,
'status' => deploy_status,
'progress' => current_progress_bar,
}
node_status.merge!(
'deployment_graph_task_name' => task.name,
'task_status' => task.status.to_s,
'custom' => @task_engine.summary,
}
'custom' => @task_engine.summary
) if task
node_status.merge!('error_type' => 'deploy') if
deploy_status == 'error'
@ -83,6 +85,27 @@ module Astute
private
# This method support special task behavior. If task failed
# and we do not think that deployment should be stopped, Astute
# will mark such task as skipped and do not report error
def setup_task_status
if !task.data.fetch('fail_on_error', true) && @task_engine.failed?
Astute.logger.warn "Task #{task.name} failed, but marked as skipped "\
"because of 'fail on error' behavior"
return :skipped
end
@task_engine.status
end
def setup_node_status
if task
set_status_failed && return if task.failed?
set_status_skipped && return if task.dep_failed?
end
set_status_online
end
def current_progress_bar
100 * tasks_finished_count / tasks_total_count
end

View File

@ -32,15 +32,19 @@ module Deployment
@id = id
@node_concurrency = Deployment::Concurrency::Counter.new
@task_concurrency = Deployment::Concurrency::Group.new
@emergency_brake = false
end
include Enumerable
include Deployment::Log
attr_accessor :id
attr_accessor :gracefully_stop_mark
attr_reader :emergency_brake
attr_reader :nodes
attr_reader :node_concurrency
attr_reader :task_concurrency
attr_reader :fault_tolerance_groups
# Add an existing node object to the cluster
# @param [Deployment::Node] node a new node object
@ -211,6 +215,7 @@ module Deployment
hook 'pre_node', node
return if node.skipped?
node.poll
hook 'internal_post_node_poll', node
hook 'post_node_poll', node
return unless node.ready?
ready_task = node.ready_task
@ -248,35 +253,42 @@ module Deployment
def run
ready_nodes = each_ready_task.to_a.join ', '
info "Starting the deployment process. Starting tasks: #{ready_nodes}"
hook 'internal_pre_run'
hook 'pre_run'
topology_sort
result = loop do
if all_nodes_are_successful?
status = 'All nodes are deployed successfully. Stopping the deployment process!'
status = 'All nodes are deployed successfully.'\
'Stopping the deployment process!'
result = {
:success => true,
:status => status,
}
break result
end
if has_failed_critical_nodes?
status = "Critical nodes failed: #{failed_critical_nodes.join ', '}. Stopping the deployment process!"
result = {
:success => false,
:status => status,
:failed_nodes => failed_critical_nodes,
:failed_tasks => failed_tasks,
}
break result
end
gracefully_stop! if has_failed_critical_nodes?
if all_nodes_are_finished?
status = "All nodes are finished. Failed tasks: #{failed_tasks.join ', '} Stopping the deployment process!"
result = {
status = "All nodes are finished. Failed tasks: "\
"#{failed_tasks.join ', '} Stopping the "\
"deployment process!"
result = if has_failed_critical_nodes?
{
:success => false,
:status => status,
:failed_nodes => failed_nodes,
:failed_tasks => failed_tasks,
}
:skipped_nodes => skipped_nodes,
:failed_tasks => failed_tasks
}
else
{
:success => true,
:status => status,
:failed_nodes => failed_nodes,
:skipped_nodes => skipped_nodes,
:failed_tasks => failed_tasks
}
end
break result
end
# run loop over all nodes
@ -300,7 +312,7 @@ module Deployment
# @return [Array<Deployment::Node>]
def failed_critical_nodes
critical_nodes.select do |node|
node.failed?
node.failed? && !node.skipped?
end
end
@ -315,10 +327,17 @@ module Deployment
# @return [Array<Deployment::Node>]
def failed_nodes
select do |node|
node.failed?
node.failed? && !node.skipped?
end
end
def skipped_nodes
select do |node|
node.skipped?
end
end
# Get the list of the failed nodes
# @return [Array<Deployment::Task>]
def failed_tasks
@ -476,6 +495,79 @@ digraph "<%= id || 'graph' %>" {
end.sort
end
def stop_condition(&block)
self.gracefully_stop_mark = block
end
def hook_internal_post_node_poll(*args)
gracefully_stop(args[0])
validate_fault_tolerance(args[0])
end
def hook_internal_pre_run(*args)
return unless has_failed_nodes?
failed_nodes.each { |node| validate_fault_tolerance(node) }
end
# Check if the deployment process should stop
# @return [true, false]
def gracefully_stop?
return true if @emergency_brake
if gracefully_stop_mark && gracefully_stop_mark.call
info "Stop deployment by stop condition (external reason)"
@emergency_brake = true
end
@emergency_brake
end
def gracefully_stop(node)
if gracefully_stop? && node.ready?
node.set_status_skipped
hook 'post_gracefully_stop', node
end
end
def gracefully_stop!
return if @emergency_brake
info "Stop deployment by internal reason"
@emergency_brake = true
end
def fault_tolerance_groups=(groups=[])
@fault_tolerance_groups = groups.select { |group| group['node_ids'].present? }
@fault_tolerance_groups.each { |group| group['failed_node_ids'] = [] }
debug "Setup fault tolerance groups: #{@fault_tolerance_groups}"
end
def validate_fault_tolerance(node)
return if gracefully_stop?
if node.failed?
count_tolerance_fail(node)
gracefully_stop! if fault_tolerance_excess?
end
end
def count_tolerance_fail(node)
@fault_tolerance_groups.select do |g|
g['node_ids'].include?(node.name)
end.each do |group|
debug "Count faild node #{node.name} for group #{group['name']}"
group['fault_tolerance'] -= 1
group['node_ids'].delete(node.name)
group['failed_node_ids'] << node.name
end
end
def fault_tolerance_excess?
is_failed = @fault_tolerance_groups.select { |group| group['fault_tolerance'] < 0 }
return false if is_failed.empty?
warn "Fault tolerance exceeded the stop conditions #{is_failed}"
true
end
# @return [String]
def to_s
"Cluster[#{id}]"

View File

@ -203,7 +203,7 @@ module Deployment
task.finished?
end
if finished
debug 'All tasks are finished'
debug "All tasks on node #{name} are finished"
@tasks_are_finished = true
end
finished
@ -220,7 +220,7 @@ module Deployment
task.successful? || task.skipped?
end
if successful
debug 'All tasks are successful'
debug "All tasks on node #{name} are successful"
@tasks_are_successful = true
end
successful
@ -236,7 +236,7 @@ module Deployment
task.failed?
end
if failed.any?
debug "Found failed tasks: #{failed.map { |t| t.name }.join ', '}"
debug "Found failed tasks on node #{name}: #{failed.map { |t| t.name }.join ', '}"
@tasks_have_failed = true
end
failed.any?
@ -311,7 +311,9 @@ module Deployment
def inspect
message = "#{self}{"
message += "Tasks: #{tasks_finished_count}/#{tasks_total_count}"
message += " Finished: #{tasks_are_finished?} Failed: #{tasks_have_failed?} Successful: #{tasks_are_successful?}"
message += " Finished: #{tasks_are_finished?}"
message += " Failed: #{tasks_have_failed?}"
message += " Successful: #{tasks_are_successful?}"
message + '}'
end
end

View File

@ -34,6 +34,8 @@ module Deployment
ALLOWED_STATUSES = [:online, :busy, :offline, :failed, :successful, :skipped]
# A node is considered finished with one of these statuses
FINISHED_STATUSES = [:failed, :successful, :skipped]
# A node is considered failed with these statuses
FAILED_STATUSES = [:failed]
# @param [String, Symbol] name
# @param [Deployment::Cluster] cluster
@ -61,6 +63,8 @@ module Deployment
attr_accessor :id
attr_reader :critical
alias :critical? :critical
attr_reader :sync_point
alias :sync_point? :sync_point
# Set a new status of this node
# @param [Symbol, String] value
@ -83,15 +87,36 @@ module Deployment
# Set this node to be a critical node
# @return [true]
def set_critical
debug "Setup #{self} as critical node"
self.critical = true
end
# Set this node to be a normal node
# @return [false]
def set_normal
debug "Setup #{self} as normal node"
self.critical = false
end
# Set this node as sync point node
# @return [true]
def set_as_sync_point
self.sync_point = true
end
# Set this node as normal point node
# @return [false]
def unset_as_sync_point
self.sync_point = false
end
# Set the sync point property of this node
# @param [true, false] value
# @return [true, false]
def sync_point=(value)
@sync_point = !!value
end
# Set this node's Cluster Object
# @param [Deployment::Cluster] cluster The new cluster object
# @raise [Deployment::InvalidArgument] if the object is not a Node
@ -169,7 +194,7 @@ module Deployment
# or has the failed status
# @return [true, false]
def failed?
status == :failed or tasks_have_failed?
FAILED_STATUSES.include? status or tasks_have_failed?
end
# The node has all tasks successful
@ -181,7 +206,7 @@ module Deployment
# The node is skipped and will not get any tasks
def skipped?
status == :skipped
status == :skipped #or tasks_have_only_dep_failed?
end
ALLOWED_STATUSES.each do |status|
@ -235,7 +260,7 @@ module Deployment
def inspect
message = "#{self}{Status: #{status}"
message += " Tasks: #{tasks_finished_count}/#{tasks_total_count}"
message += " CurrentTask: #{task.name}" if task
message += " CurrentTask: #{task.name}, task status: #{task.status}" if task
message + '}'
end

View File

@ -315,6 +315,7 @@ module Deployment
# task are failed and set dep_failed status if so.
# @return [true, false]
def check_for_failed_dependencies
return if self.sync_point?
return false if FAILED_STATUSES.include? status
failed = each_backward_dependency.any? do |task|
FAILED_STATUSES.include? task.status
@ -329,17 +330,28 @@ module Deployment
def check_for_ready_dependencies
return false unless status == :pending
ready = each_backward_dependency.all? do |task|
SUCCESS_STATUSES.include? task.status
ready_statuses = SUCCESS_STATUSES
ready_statuses += FAILED_STATUSES if sync_point?
ready_statuses.include? task.status
end
self.status = :ready if ready
ready
end
# set the pending tasks to dep_failed if the node have failed
def check_for_node_status
return unless node
if Deployment::Node::FAILED_STATUSES.include? node.status and NOT_RUN_STATUSES.include? status
self.status = :dep_failed
end
end
# Poll direct task dependencies if
# the failed or ready status of this task should change
def poll_dependencies
check_for_ready_dependencies
check_for_failed_dependencies
check_for_node_status
end
alias :poll :poll_dependencies
@ -408,6 +420,24 @@ module Deployment
FAILED_STATUSES.include? status
end
# This task have not been run because of failed dependencies
# @return [true, false]
def dep_failed?
status == :dep_failed
end
# # This task failed
# # @return [true, false]
# def abortive?
# status == :failed
# end
#This task is sync point
# @return [true, false]
def sync_point?
self.node.sync_point?
end
# @return [String]
def to_s
"Task[#{name}/#{node.name}]"
@ -445,7 +475,7 @@ module Deployment
poll_dependencies
case status
when :pending;
:white
sync_point? ? :cyan : :white
when :ready
:yellow
when :successful;

View File

@ -33,7 +33,7 @@ $LOAD_PATH << lib_dir
require 'astute'
require 'fuel_deployment'
Deployment::Log.logger.level = Logger::WARN
Deployment::Log.logger.level = Logger::DEBUG
Dir[File.join(File.dirname(__FILE__), 'unit/fixtures/*.rb')].each { |file| require file }

View File

@ -200,6 +200,54 @@ describe Deployment::Cluster do
expect(subject.has_failed_nodes?).to eq true
end
context 'fault_tolerance_groups' do
let(:fault_tolerance_groups) do
[{
"fault_tolerance"=>1,
"name"=>"test_group",
"node_ids"=>['node2']
},
{
"fault_tolerance"=> 0,
"name"=>"test_group2",
"node_ids"=>[]
}]
end
it 'can find tolerance group' do
cluster.fault_tolerance_groups = fault_tolerance_groups
task1_1.status = :successful
task1_2.status = :successful
task2_1.status = :successful
task2_2.status = :failed
expect(cluster.fault_tolerance_groups).to eq [fault_tolerance_groups.first]
end
it 'can validate tolerance group' do
cluster.fault_tolerance_groups = fault_tolerance_groups
task1_1.status = :successful
task1_2.status = :successful
task2_1.status = :failed
cluster.validate_fault_tolerance(node1)
cluster.validate_fault_tolerance(node2)
expect(cluster.fault_tolerance_excess?).to eq false
expect(cluster.gracefully_stop?).to eq false
end
it 'can control deploy using tolerance group' do
fault_tolerance_groups.first['fault_tolerance'] = 0
cluster.fault_tolerance_groups = fault_tolerance_groups
task1_1.status = :successful
task1_2.status = :successful
task2_1.status = :failed
cluster.validate_fault_tolerance(node1)
cluster.validate_fault_tolerance(node2)
expect(cluster.fault_tolerance_excess?).to eq true
expect(cluster.gracefully_stop?).to eq true
end
end
it 'can find critical nodes' do
expect(subject.critical_nodes).to eq([])
node1.critical = true

View File

@ -245,7 +245,7 @@ describe Deployment::Node do
subject.status = :offline
expect(subject.inspect).to eq 'Node[node1]{Status: offline Tasks: 0/1}'
subject.task = task1
expect(subject.inspect).to eq 'Node[node1]{Status: offline Tasks: 0/1 CurrentTask: task1}'
expect(subject.inspect).to eq 'Node[node1]{Status: offline Tasks: 0/1 CurrentTask: task1, task status: ready}'
end
end

View File

@ -25,7 +25,11 @@ describe Astute::Orchestrator do
end
describe '#task_deployment' do
let(:deployment_info) { [] }
let(:tasks_metadata) do
{
'fault_tolerance_groups' => []
}
end
let(:tasks_graph) do
{"1"=>
@ -63,7 +67,7 @@ describe Astute::Orchestrator do
it 'should run task deployment' do
Astute::TaskDeployment.any_instance.expects(:deploy).with(
:deployment_info => deployment_info,
:tasks_metadata => tasks_metadata,
:tasks_graph => tasks_graph,
:tasks_directory => tasks_directory,
:dry_run => false
@ -73,7 +77,7 @@ describe Astute::Orchestrator do
@reporter,
'task_id',
{
:deployment_info => deployment_info,
:tasks_metadata => tasks_metadata,
:tasks_graph => tasks_graph,
:tasks_directory => tasks_directory
}
@ -92,7 +96,7 @@ describe Astute::Orchestrator do
@reporter,
'task_id',
{
:deployment_info => deployment_info,
:tasks_metadata => tasks_metadata,
:tasks_graph => tasks_graph,
:tasks_directory => tasks_directory
}

View File

@ -22,23 +22,27 @@ describe Astute::TaskCluster do
let(:node) { Astute::TaskNode.new('node_name', subject) }
describe "#hook_post_node_poll" do
before(:each) do
subject.stubs(:validate_fault_tolerance)
end
describe "#hook_internal_post_node_poll" do
it 'should call gracefully_stop with node' do
subject.expects(:gracefully_stop).with(node)
subject.hook_post_node_poll(node)
subject.hook_internal_post_node_poll(node)
end
end
describe "#gracefully_stop" do
it 'should check if node should be stopped' do
subject.expects(:gracefully_stop?).returns(false)
subject.hook_post_node_poll(node)
subject.hook_internal_post_node_poll(node)
end
it 'should check if node ready' do
subject.stop_condition { true }
node.expects(:ready?).returns(false)
subject.hook_post_node_poll(node)
subject.hook_internal_post_node_poll(node)
end
it 'should set node status as skipped if stopped' do
@ -47,7 +51,7 @@ describe Astute::TaskCluster do
node.stubs(:report_node_status)
node.expects(:set_status_skipped).once
subject.hook_post_node_poll(node)
subject.hook_internal_post_node_poll(node)
end
it 'should report new node status if stopped' do
@ -56,7 +60,7 @@ describe Astute::TaskCluster do
node.stubs(:set_status_skipped).once
node.expects(:report_node_status)
subject.hook_post_node_poll(node)
subject.hook_internal_post_node_poll(node)
end
end

View File

@ -24,13 +24,26 @@ describe Astute::TaskDeployment do
ctx
end
let(:deployment_info) do
[
let(:tasks_metadata) do
{
'uid' => '1',
'fail_if_error' => false
'fault_tolerance_groups' =>[
{"fault_tolerance"=>0, "name"=>"primary-controller", "node_ids"=>["1"]},
{"fault_tolerance"=>1, "name"=>"controller", "node_ids"=>[]},
{"fault_tolerance"=>0, "name"=>"cinder", "node_ids"=>[]},
{"fault_tolerance"=>0, "name"=>"cinder-block-device", "node_ids"=>[]},
{"fault_tolerance"=>1, "name"=>"cinder-vmware", "node_ids"=>[]},
{"fault_tolerance"=>0, "name"=>"compute", "node_ids"=>["3", "2"]},
{"fault_tolerance"=>1, "name"=>"compute-vmware", "node_ids"=>[]},
{"fault_tolerance"=>1, "name"=>"mongo", "node_ids"=>[]},
{"fault_tolerance"=>1, "name"=>"primary-mongo", "node_ids"=>[]},
{"fault_tolerance"=>1,
"name"=>"ceph-osd",
"node_ids"=>["3", "2", "5", "4"]},
{"fault_tolerance"=>1, "name"=>"base-os", "node_ids"=>[]},
{"fault_tolerance"=>1, "name"=>"virt", "node_ids"=>[]},
{"fault_tolerance"=>1, "name"=>"ironic", "node_ids"=>[]}
]
}
]
end
let(:tasks_graph) do
@ -41,7 +54,7 @@ describe Astute::TaskDeployment do
"required_for"=>[],
"requires"=> [],
"id"=>"ironic_post_swift_key",
"parameters"=>{}
"parameters"=>{},
}],
"null"=> [{
"skipped"=>true,
@ -74,21 +87,19 @@ describe Astute::TaskDeployment do
describe '#deploy' do
it 'should run deploy' do
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:fail_offline_nodes).returns([])
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
Astute::TaskCluster.any_instance.expects(:run).returns({:success => true})
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
it 'should not raise error if deployment info not provided' do
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:fail_offline_nodes).returns([])
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
@ -106,40 +117,33 @@ describe Astute::TaskDeployment do
)
end
it 'should run pre deployment task' do
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
pre_deployment = Astute::TaskPreDeploymentActions.new(deployment_info, ctx)
Astute::TaskPreDeploymentActions.expects(:new)
.with(deployment_info, ctx)
.returns(pre_deployment)
Astute::TaskPreDeploymentActions.any_instance.expects(:process)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
it 'should support virtual node' do
d_t = task_deployment.send(:support_virtual_node, tasks_graph)
expect(d_t.keys).to include 'virtual_sync_node'
expect(d_t.keys).not_to include 'null'
end
it 'should remove failed nodes' do
#TODO(vsharshov): improve remove failed nodes check. Check mcollective
it 'should support critical nodes' do
critical_nodes = task_deployment.send(
:critical_node_uids,
tasks_metadata['fault_tolerance_groups']
)
expect(critical_nodes).to include '1'
expect(critical_nodes).to include '2'
expect(critical_nodes).to include '3'
expect(critical_nodes.size).to eql(3)
end
it 'should fail offline nodes' do
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
task_deployment.expects(:remove_failed_nodes).returns([deployment_info, []])
task_deployment.expects(:fail_offline_nodes).returns([])
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -148,12 +152,12 @@ describe Astute::TaskDeployment do
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
task_deployment.stubs(:fail_offline_nodes).returns([])
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
Astute::TaskCluster.any_instance.expects(:stop_condition)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -162,12 +166,12 @@ describe Astute::TaskDeployment do
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
task_deployment.stubs(:fail_offline_nodes).returns([])
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
Deployment::Log.expects(:logger=).with(Astute.logger)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -176,10 +180,9 @@ describe Astute::TaskDeployment do
let(:task_concurrency) { mock('task_concurrency') }
before(:each) do
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
task_deployment.stubs(:fail_offline_nodes).returns([])
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
Deployment::Concurrency::Counter.any_instance
.stubs(:maximum=).with(
@ -190,7 +193,7 @@ describe Astute::TaskDeployment do
Deployment::Concurrency::Counter.any_instance.expects(:maximum=).with(0).times(5)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -204,7 +207,7 @@ describe Astute::TaskDeployment do
.with(1)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -218,7 +221,7 @@ describe Astute::TaskDeployment do
.with(7)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -229,7 +232,7 @@ describe Astute::TaskDeployment do
.with(0).times(5)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -241,7 +244,7 @@ describe Astute::TaskDeployment do
.with(0).times(2)
expect {task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)}.to raise_error(
Astute::DeploymentEngineError, /expect only non-negative integer, but got -4./
@ -252,15 +255,14 @@ describe Astute::TaskDeployment do
context 'dry_run' do
it 'should not run actual deployment if dry_run is set to True' do
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:fail_offline_nodes).returns([])
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
Astute::TaskCluster.any_instance.expects(:run).never
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory,
dry_run: true)
@ -277,8 +279,7 @@ describe Astute::TaskDeployment do
it 'should setup max nodes per call using config' do
Astute.config.max_nodes_per_call = 33
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:fail_offline_nodes).returns([])
task_deployment.stubs(:write_graph_to_file)
ctx.stubs(:report)
@ -293,7 +294,7 @@ describe Astute::TaskDeployment do
node_concurrency.expects(:maximum=).with(Astute.config.max_nodes_per_call)
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -302,21 +303,18 @@ describe Astute::TaskDeployment do
context 'should report final status' do
it 'succeed status' do
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
task_deployment.stubs(:fail_offline_nodes).returns([])
task_deployment.stubs(:write_graph_to_file)
ctx.expects(:report).with({'status' => 'ready', 'progress' => 100})
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
it 'failed status' do
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
failed_node = mock('node')
failed_node.expects(:id).returns('1')
@ -330,7 +328,7 @@ describe Astute::TaskDeployment do
:failed_nodes => [failed_node],
:failed_tasks => [failed_task],
:status => 'Failed because of'})
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
task_deployment.stubs(:fail_offline_nodes).returns([])
task_deployment.stubs(:write_graph_to_file)
ctx.expects(:report).with('nodes' => [{
'uid' => '1',
@ -346,7 +344,7 @@ describe Astute::TaskDeployment do
'error' => 'Failed because of'})
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -363,8 +361,7 @@ describe Astute::TaskDeployment do
it 'should write if disable' do
Astute.config.enable_graph_file = false
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:fail_offline_nodes).returns([])
ctx.stubs(:report)
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
@ -374,7 +371,7 @@ describe Astute::TaskDeployment do
.yields(file_handle).never
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end
@ -382,8 +379,7 @@ describe Astute::TaskDeployment do
it 'should write graph if enable' do
Astute.config.enable_graph_file = true
task_deployment.stubs(:remove_failed_nodes).returns([deployment_info, []])
Astute::TaskPreDeploymentActions.any_instance.stubs(:process)
task_deployment.stubs(:fail_offline_nodes).returns([])
ctx.stubs(:report)
Astute::TaskCluster.any_instance.stubs(:run).returns({:success => true})
@ -393,7 +389,7 @@ describe Astute::TaskDeployment do
.yields(file_handle).once
task_deployment.deploy(
deployment_info: deployment_info,
tasks_metadata: tasks_metadata,
tasks_graph: tasks_graph,
tasks_directory: tasks_directory)
end

View File

@ -247,17 +247,19 @@ describe Astute::TaskNode do
Astute::Puppet.any_instance.stubs(:run)
end
context 'mark online' do
it 'if task successful' do
Astute::Puppet.any_instance.stubs(:status).returns(:successful)
context 'mark failed' do
it 'if task failed' do
Astute::Puppet.any_instance.stubs(:status).returns(:failed)
ctx.stubs(:report)
task_node.run(task)
task_node.poll
expect(task_node.status).to eql(:online)
expect(task_node.status).to eql(:failed)
end
end
it 'if task failed' do
Astute::Puppet.any_instance.stubs(:status).returns(:failed)
context 'mark online' do
it 'if task successful' do
Astute::Puppet.any_instance.stubs(:status).returns(:successful)
ctx.stubs(:report)
task_node.run(task)
task_node.poll
@ -396,7 +398,7 @@ describe Astute::TaskNode do
task_node.poll
end
it 'should report deploy progress if task failed and another tasks exists' do
it 'should not report deploy progress if task failed and another tasks exists' do
Astute::Puppet.any_instance.expects(:status).returns(:failed)
task_node.graph.create_task(
'second_task',
@ -404,15 +406,7 @@ describe Astute::TaskNode do
)
task_node.run(task)
ctx.expects(:report).with({
'nodes' => [{
'uid' => 'node_id',
'status' => 'deploying',
'deployment_graph_task_name' => task.name,
'custom' => {},
'task_status' => 'failed',
'progress' => 50}]
})
ctx.expects(:report).never
task_node.poll
end
end

View File

@ -32,7 +32,7 @@ node1_data = [
[11, 13],
[12, 13],
[13, 9],
[9, 14],
# [9, 14],
[14, 15],
]
@ -42,7 +42,7 @@ node2_data = [
[0, 3],
[3, 4],
[4, 5],
[5, 6],
# [5, 6],
[5, 7],
[6, 8],
]
@ -54,8 +54,14 @@ cluster.plot = true if options[:plot]
node1 = cluster.node_create 'node1', Deployment::TestNode
node2 = cluster.node_create 'node2', Deployment::TestNode
sync_node = cluster.node_create 'sync_node', Deployment::TestNode
node2.set_critical if options[:critical]
sync_node.set_as_sync_point
sync_node.create_task 'sync_task'
node1_data.each do |task_from, task_to|
task_from = node1.graph.create_task "task#{task_from}"
task_to = node1.graph.create_task "task#{task_to}"
@ -74,6 +80,11 @@ node2['task4'].depends node1['task3']
node2['task5'].depends node1['task13']
node1['task15'].depends node2['task6']
sync_node['sync_task'].depends node2['task5']
sync_node['sync_task'].depends node1['task9']
node2['task6'].depends sync_node['sync_task']
node1['task14'].depends sync_node['sync_task']
if options[:plot]
cluster.make_image 'start'
end
@ -81,5 +92,5 @@ end
if options[:interactive]
binding.pry
else
cluster.run
p cluster.run
end

View File

@ -72,6 +72,9 @@ module Deployment
debug "#{task} finished with: #{status}"
self.task.status = status
self.status = :online
self.status = :skipped if task.status == :dep_failed
self.status = :failed if task.status == :failed
end
end
end