Prevent deletion of nodes which have running OSD processes

Node deletion will fail if a node with the ceph-osd role still
has PGs placed on any of its OSDs. Removing too many OSDs can result in
data loss. The end-user will be required to manually remove
those OSDs from the cluster and allow it to rebalance to ensure
no data is lost when deleting these nodes as described here:
http://ceph.com/docs/master/rados/operations/add-or-rm-osds/#removing-osds-manual

DocImpact
Change-Id: I41173a83a3268455148652680a534e47296af319
Closes-bug: #1424060
This commit is contained in:
Ryan Moe 2015-03-03 16:36:41 -08:00
parent 5cdd4ae403
commit 1b42ec374a
5 changed files with 180 additions and 5 deletions

View File

@ -42,6 +42,7 @@ require 'astute/deploy_actions'
require 'astute/nailgun_hooks'
require 'astute/puppet_task'
require 'astute/task_manager'
require 'astute/pre_delete'
['/astute/pre_deployment_actions/*.rb',
'/astute/pre_deploy_actions/*.rb',

View File

@ -106,6 +106,10 @@ module Astute
Network.multicast_verification(ctx, nodes)
end
# Runs the pre-deletion Ceph OSD check for +nodes+.
#
# Builds a Context from +task_id+ and +reporter+ and hands it, together
# with +nodes+, to PreDelete.check_ceph_osds; that call's return value is
# returned unchanged.
def check_ceph_osds(reporter, task_id, nodes)
  ctx = Context.new(task_id, reporter)
  PreDelete.check_ceph_osds(ctx, nodes)
end
private
def deploy_cluster(up_reporter, task_id, deployment_info, deploy_engine, pre_deployment, post_deployment)

70
lib/astute/pre_delete.rb Normal file
View File

@ -0,0 +1,70 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
module Astute
  # Safety checks that are run before nodes are deleted.
  module PreDelete
    # Verifies that none of the ceph-osd nodes in +nodes+ still has placement
    # groups (PGs) mapped to any of its OSDs.
    #
    # ctx   - context object passed straight through to MClient.
    # nodes - Array of node hashes. Reads "roles" for every node, and
    #         "slave_name" / "id" for nodes whose roles include "ceph-osd".
    #
    # Two shell commands are executed through MClient on the first ceph-osd
    # node of the list: `ceph -f json osd tree` to learn which OSD ids belong
    # to each node, then a `ceph pg dump` pipeline that lists which of those
    # ids still appear in a PG's up or acting set.
    #
    # Returns {"status" => "ready"} when no ceph-osd node is involved or no
    # PGs are placed on the affected OSDs; otherwise
    # {"status" => "error", "error" => <message naming the nodes>}.
    # JSON::ParserError propagates if the osd tree output is not valid JSON.
    def self.check_ceph_osds(ctx, nodes)
      answer = {"status" => "ready"}
      ceph_nodes = nodes.select { |n| n["roles"].include? "ceph-osd" }
      ceph_osds = ceph_nodes.map { |n| n["slave_name"] }
      return answer if ceph_osds.empty?

      # NOTE(review): these two values are passed positionally as the 4th and
      # 5th arguments, exactly as before. Confirm that those positions really
      # are the timeout and retries parameters of MClient#initialize.
      timeout = 60
      retries = 1
      shell = MClient.new(ctx, "execute_shell_command", [ceph_nodes.first["id"]], timeout, retries)

      result = shell.execute(:cmd => "ceph -f json osd tree").first.results
      tree = JSON.parse(result[:data][:stdout])

      # Map each node being deleted to the OSD ids listed as its children.
      # Array() guards against tree entries that carry no "children" key.
      osds = {}
      tree["nodes"].each do |entry|
        next unless ceph_osds.include?(entry["name"])
        osds[entry["name"]] = Array(entry["children"])
      end

      # Nothing to look for: skip the second remote call instead of running
      # egrep with an empty alternation.
      osd_ids = osds.values.flatten
      return answer if osd_ids.empty?

      # pg dump lists all pgs in the cluster and where they are located.
      # $14 is the 'up set' (the list of OSDs responsible for a particular
      # pg for an epoch) and $16 is the 'acting set' (list of OSDs who
      # are [or were at some point] responsible for a pg). These sets
      # will generally be the same.
      osd_list = osd_ids.join("|")
      cmd = "ceph pg dump 2>/dev/null | " \
            "awk '//{print $14, $16}' | " \
            "egrep -o '\\<(#{osd_list})\\>' | " \
            "sort -un"
      result = shell.execute(:cmd => cmd).first.results

      # JSON.parse returns the children as integers, so the result from the
      # shell command needs to be converted for the set operations to work.
      # Only purely numeric lines are kept: to_i would turn a blank or
      # unexpected line into 0 and wrongly implicate OSD 0.
      placed = result[:data][:stdout].split("\n").map(&:strip).grep(/\A\d+\z/).map(&:to_i)

      error_nodes = osds.keys.select { |name| !(placed & osds[name]).empty? }
      return answer if error_nodes.empty?

      msg = "Ceph data still exists on: #{error_nodes.join(', ')}. " \
            "You must manually remove the OSDs from the cluster " \
            "and allow Ceph to rebalance before deleting these nodes."
      {"status" => "error", "error" => msg}
    end
  end
end

View File

@ -148,11 +148,15 @@ module Astute
nodes = data['args']['nodes']
engine = data['args']['engine']
result = if nodes.empty?
Astute.logger.debug("#{task_uuid} Node list is empty")
nil
else
@orchestrator.remove_nodes(reporter, task_uuid, engine, nodes)
result = @orchestrator.check_ceph_osds(reporter, task_uuid, nodes)
if result["status"] == "ready"
if nodes.empty?
Astute.logger.debug("#{task_uuid} Node list is empty")
result = nil
else
result = @orchestrator.remove_nodes(reporter, task_uuid, engine, nodes)
end
end
report_result(result, reporter)

View File

@ -0,0 +1,96 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
require File.join(File.dirname(__FILE__), '../spec_helper')
# Specs for Astute::PreDelete.check_ceph_osds. The MClient RPC layer is
# stubbed, so only the decision logic is exercised.
describe '#check_ceph_osds' do
  include SpecHelpers

  let(:ctx) { mock_ctx }
  let(:success_result) { {"status"=>"ready"} }

  # RPC client double. The listed MClient instance methods are stubbed to
  # return it, so expectations set on it observe every `execute` call.
  let(:mclient) do
    rpc = mock_rpcclient
    [:rpcclient, :log_result, :check_results_with_retries].each do |meth|
      Astute::MClient.any_instance.stubs(meth).returns(rpc)
    end
    rpc
  end

  # Builds a one-element result list whose only item answers `results`
  # with a payload carrying the given stdout and sender.
  def build_mcresult(stdout="", sender="1")
    payload = {:sender => sender, :data => {:stdout => stdout}}
    each_result = mock_mc_result(payload)
    response = mock
    response.stubs(:results).returns(payload)
    response.stubs(:each).returns(each_result)
    [response]
  end

  context "no ceph-osd nodes" do
    let(:nodes) do
      [
        {"id" => "1", "roles" => ["controller"]},
        {"id" => "2", "roles" => ["compute"]}
      ]
    end

    it "should do nothing if no nodes have ceph-osd role" do
      outcome = Astute::PreDelete.check_ceph_osds(ctx, nodes)
      expect(outcome).to eq(success_result)
    end
  end

  context "nodes with ceph-osd role" do
    let(:nodes) do
      [
        {"id" => "1", "roles" => ["primary-controller"]},
        {"id" => "2", "roles" => ["compute", "ceph-osd"],
         "slave_name" => "node-2"}
      ]
    end

    # Exact pipeline the implementation is expected to run for OSDs 1 and 2.
    let(:pg_cmd) do
      [
        "ceph pg dump 2>/dev/null",
        "awk '//{print $14, $16}'",
        "egrep -o '\\<(1|2)\\>'",
        "sort -un"
      ].join(" | ")
    end

    let(:osd_cmd) { "ceph -f json osd tree" }
    let(:json_resp) { '{"nodes": [{"name": "node-2", "children": [1,2]}]}'}

    let(:error_result) do
      {
        "status" => "error",
        "error" => "Ceph data still exists on: node-2. You must manually " \
                   "remove the OSDs from the cluster and allow Ceph to " \
                   "rebalance before deleting these nodes."
      }
    end

    it "should raise error if OSDs contain data" do
      mclient.expects(:execute).with({:cmd => osd_cmd})
        .returns(build_mcresult(json_resp))
      mclient.expects(:execute).with({:cmd => pg_cmd})
        .returns(build_mcresult("1\n2"))

      outcome = Astute::PreDelete.check_ceph_osds(ctx, nodes)
      expect(outcome).to eq(error_result)
    end

    it "should succeed with no pgs placed on node" do
      mclient.expects(:execute).with({:cmd => osd_cmd})
        .returns(build_mcresult(json_resp))
      mclient.expects(:execute).with({:cmd => pg_cmd})
        .returns(build_mcresult("3\n4"))

      outcome = Astute::PreDelete.check_ceph_osds(ctx, nodes)
      expect(outcome).to eq(success_result)
    end
  end
end # describe