Add fencing agent script as a separate task
Based on nailgun agent, should be installed the same way.

To run unit tests use:
rspec ./bin/fencing-agent_spec.rb --color --format documentation

Requirements for tests:
ruby 1.9.3p429 / 1.8.7p371
rspec 2.14.7
---------------------
httpclient (2.3.4.1)
json (1.8.1)
logger (1.2.8)
ohai (6.20.0)
rethtool (0.0.3)

Depends on: https://review.openstack.org/66607
Implements blueprint: fencing-in-nailgun-agent
Change-Id: Iceee47c47d35969f4ff08a0eb5b1e74b12b8b92f
Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
This commit is contained in:
parent
0466543ce4
commit
894a1c68e8
|
@ -0,0 +1 @@
|
|||
* * * * * root flock -w 0 -o /var/lock/fencing-agent.lock -c "/opt/nailgun/bin/fencing-agent.rb 2>&1 | tee -a /var/log/fencing-agent.log | /usr/bin/logger -t fencing-agent || true"
|
|
@ -0,0 +1,188 @@
|
|||
#!/usr/bin/env ruby
|
||||
|
||||
# Copyright 2014 Mirantis, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
begin
|
||||
require 'rubygems'
|
||||
rescue LoadError
|
||||
end
|
||||
require 'ohai/system'
|
||||
require 'logger'
|
||||
require 'open3'
|
||||
require 'rexml/document'
|
||||
|
||||
# Refuse to run without root privileges. `abort` prints the message to
# stderr and terminates the process with exit status 1.
abort "You must be root" unless Process.euid == 0

# Pin PATH to the standard system directories so that the external tools are
# resolved the same way regardless of the caller's environment.
# NOTE(review): presumably needed because cron provides a minimal PATH - confirm.
ENV['PATH'] = "/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin"
|
||||
|
||||
# Watches the free root space reported by every node of the corosync cluster
# (the 'root_free' attributes stored in the CIB) and asks the cluster to fence
# the online nodes that have none left.
class FenceAgent
  # Marker file holding the provisioning state of the node.
  SYSTEM_TYPE_FILE = "/etc/nailgun_systemtype"

  # logger:: an object responding to debug/info/error (e.g. a ::Logger).
  # Collects the system facts with Ohai right away.
  def initialize(logger)
    @logger = logger
    @os = Ohai::System.new()
    @os.all_plugins
  end

  # Returns a hash with the :fqdn and :hostname of this node. Keys whose value
  # is missing, empty or "Not Specified" are dropped. :fqdn falls back to the
  # hostname when Ohai has no fqdn.
  def system_info
    {
      :fqdn => (@os[:fqdn].strip rescue @os[:hostname].strip rescue nil),
      :hostname => (@os[:hostname].strip rescue nil),
    }.delete_if { |_key, value| value.nil? || value.empty? || value == "Not Specified" }
  end

  # Check free root space for all nodes in the corosync cluster, if any up and running.
  # Does not wait or check for the fence actions results, if any were taken
  # (it is in cluster's responsibility).
  # TODO report to nailgun if fencing actions were taken
  # * return 0, if nodes in the cluster don't need fencing by root free space criteria
  # * return 1, if fence action is not applicable atm, e.g. corosync is absent or
  #   not accessible yet, or node wasn't yet provisioned
  # * return 2, if some nodes has been ordered to fence and all corresponding crm
  #   commands were issued to corosync
  # * return 3, if some nodes has been ordered to fence, but some of crm commands
  #   were not issued for some reasons
  def check_and_fence
    @logger.debug("Starting cluster free root space check")

    unless File.exist?(SYSTEM_TYPE_FILE)
      @logger.debug("The /etc/nailgun_systemtype file is missing, exiting with 1")
      return 1
    end

    # exit, if node is not provisioned yet
    if get_system_type(SYSTEM_TYPE_FILE) != "target"
      @logger.debug("The system state is not 'target' yet, exiting with 1")
      return 1
    end

    # exit, if corosync or its CIB is not reachable yet
    unless is_corosync_up
      @logger.debug("Corosync is absent or not ready yet, exiting with 1")
      return 1
    end

    # query CIB for nodes' root free space
    stanzas = get_free_root_space_from_CIB
    if stanzas.nil?
      @logger.debug("Free space monitoring resource is not configured yet, exiting with 1")
      return 1
    end

    nodes_to_fence = nodes_needing_fence(stanzas)
    if nodes_to_fence.empty?
      @logger.debug("Cluster has PASSED free root space check successfully")
      return 0
    end

    # fence the failed nodes by random delay (15..30) and report an alert
    result = fence_nodes(nodes_to_fence)
    @logger.error("Cluster has FAILED free root space check!")
    result
  end

  # Returns s plus a random integer below n. Kept as a separate method so
  # that unit tests can stub the delays away.
  def random(s, n)
    s + rand(n)
  end

  # Sleeps for sleep_time seconds (skipped for nil or 0), runs cmd in a forked
  # child and returns its exit status. Note that inside this class the method
  # takes precedence over Kernel#exec.
  # A child that did not exit normally (e.g. was killed by a signal) has no
  # exit status; it is reported as 1 so callers can always compare numbers.
  def exec(cmd, sleep_time)
    unless sleep_time.nil? || sleep_time == 0
      @logger.info("Sleep #{Process.pid} for #{sleep_time}s, before issuing cmd:#{cmd}")
      sleep(sleep_time)
    end
    pid = Process.fork do
      Process.exec(cmd)
    end
    # wait for exactly the child started above
    Process.wait(pid)
    $?.exitstatus || 1
  end

  # Returns the first line of filename with trailing whitespace removed, or
  # an empty string when the file is empty.
  # * return target, if provisioned
  # * return bootstrap, if not provisioned yet
  def get_system_type(filename)
    fl = File.open(filename, "r")
    begin
      (fl.gets || "").rstrip
    ensure
      fl.close
    end
  end

  # * return true, if corosync running and CIB is up
  def is_corosync_up
    # Portable redirection: the bash-only '&>' form is parsed by a POSIX
    # /bin/sh as "run in background", which always yields exit status 0.
    cmd = "/usr/sbin/crm_attribute --type crm_config --query --name dc-version >/dev/null 2>&1"
    exec(cmd, random(5, 10)) == 0
  end

  # assume is_corosync_up true
  # * return xml with free root space data from CIB, or nil when the query
  #   fails or its output has no <xpath-query> root
  def get_free_root_space_from_CIB
    cmd = "/usr/sbin/cibadmin --query --xpath \"//nvpair[@name='root_free']\""
    sleep(random(3, 5))
    # the block form closes the pipes once the output has been read
    output = Open3.popen3(cmd) { |_stdin, stdout, _stderr| stdout.read }
    REXML::Document.new(output).root.elements['/xpath-query']
  rescue StandardError => e
    @logger.debug("Cannot get free root space data from CIB: #{e.message}")
    nil
  end

  # assume is_corosync_up true
  # * return true, if node is OFFLINE (or not applicable for any actions by
  #   corosync cluster services)
  def is_offline(fqdn)
    cmd = "/usr/sbin/cibadmin --query --xpath \"//node_state[@uname='#{fqdn}']\" | grep -q 'crmd=\"online\"'"
    exec(cmd, random(5, 10)) > 0
  end

  # assume is_corosync_up true
  # issue fencing action to cluster services for given nodes
  # * return 2, if some nodes has been ordered to fence and all crm command has been issued.
  # * return 3, if some nodes has been ordered to fence, but some of crm commands
  #   was not issued for some reasons.
  def fence_nodes(nodes_to_fence)
    failed = false
    nodes_to_fence.each do |node|
      cmd = "/usr/sbin/crm --force node fence #{node}"
      if exec(cmd, random(15, 15)) > 0
        @logger.error("Cannot issue the command: #{cmd}")
        failed = true
      else
        # NOTE(review): logged at error level, presumably so that the fence
        # action stands out as an alert - confirm.
        @logger.error("Issued the fence action: #{cmd}")
      end
    end
    failed ? 3 : 2
  end

  private

  # Walks the 'root_free' entries of the CIB and returns the names of the
  # nodes that must be fenced: no free root space, online, and not this node.
  def nodes_needing_fence(stanzas)
    myself = system_info
    nodes = []
    stanzas.each_element do |e|
      items = e.attributes
      # get the node's name and free space at root partition from CIB
      id_match = /\Astatus-(.*)-root_free\z/.match(items['id'].to_s)
      if id_match.nil?
        @logger.debug("Skipping CIB entry with unexpected id: #{items['id'].inspect}")
        next
      end
      fqdn = id_match[1]
      root_free = items['value']
      @logger.debug("Got fqdn:#{fqdn}, root free space:#{root_free}G")

      # to_i turns a missing or garbled value into 0; never fence on that
      unless root_free.to_s =~ /\A\s*-?\d/
        @logger.debug("Skipping node #{fqdn}: root free space value is missing or not a number")
        next
      end
      next if root_free.to_i > 0

      # get the node's status from CIB
      offline = is_offline(fqdn)
      @logger.debug("Ignoring offline node #{fqdn}") if offline

      # the CIB entry may carry either the fqdn or the short host name
      itself = (myself[:fqdn] == fqdn || myself[:hostname] == fqdn)
      @logger.debug("Ignoring my own node #{fqdn} (cannot shoot myself)") if itself

      next if offline || itself || nodes.include?(fqdn)
      nodes.push(fqdn)
    end
    nodes
  end
end
|
||||
|
||||
# skip it, if under unit testing
|
||||
# Run the check only when the file is executed as a script; it is skipped
# when the file is loaded by the unit tests.
if $0 == __FILE__
  logger = Logger.new(STDOUT)
  logger.level = Logger::DEBUG

  agent = FenceAgent.new(logger)
  begin
    agent.check_and_fence
  rescue => ex
    # backtrace is an Array: join it so that every frame gets its own line
    logger.error "Cluster free root space check cannot be performed: #{ex.message}\n#{Array(ex.backtrace).join("\n")}"
  end
end
|
|
@ -0,0 +1,148 @@
|
|||
require 'rubygems'
|
||||
require 'rspec'
|
||||
require 'mocha/api'
|
||||
# stub the root rights for agent script under test
|
||||
Process.stubs(:euid).returns(0)
|
||||
# load the agent script under test (require appends the '.rb' extension itself)
|
||||
require './bin/fencing-agent'
|
||||
|
||||
# fixtures
|
||||
# CIB query result where every node still has free root space
$xml_all_ok = <<END
<xpath-query>
<nvpair id="status-node-7.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-8.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-9.test.domain.local-root_free" name="root_free" value="5"/>
</xpath-query>
END

# CIB query result where one node (node-9) has no free root space
$xml_need_fence1 = <<END
<xpath-query>
<nvpair id="status-node-7.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-8.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-9.test.domain.local-root_free" name="root_free" value="0"/>
</xpath-query>
END

# CIB query result where two nodes (node-7 and node-9) have no free root space
$xml_need_fence2 = <<END
<xpath-query>
<nvpair id="status-node-7.test.domain.local-root_free" name="root_free" value="0"/>
<nvpair id="status-node-8.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-9.test.domain.local-root_free" name="root_free" value="0"/>
</xpath-query>
END

# in-memory stand-in for the system type file of a provisioned node
$fl = StringIO.new("target")
|
||||
|
||||
describe FenceAgent do
  # Parses an XML fixture into the element set that
  # get_free_root_space_from_CIB hands over to check_and_fence.
  def cib_stanzas(xml)
    REXML::Document.new(xml).root.elements['/xpath-query']
  end

  before :each do
    logger = Logger.new(STDOUT)
    logger.level = Logger::DEBUG
    @agent = FenceAgent.new(logger)
    # no random delays while testing
    @agent.stubs(:random).returns(0)
    File.stub(:exist?).with("/etc/nailgun_systemtype").and_return(true)
    File.stub(:open).with("/etc/nailgun_systemtype", "r").and_return($fl)
  end

  describe "#new" do
    it "takes a logger parameter and returns a fence agent instance" do
      @agent.should be_an_instance_of FenceAgent
    end
  end

  # Fence daemon tests
  #
  # The examples below replace is_offline and fence_nodes themselves, so the
  # shell commands those methods would run are never reached and need no stubs.
  describe "#check_and_fence" do
    before :each do
      @agent.stubs(:is_corosync_up).returns(true)
      @agent.stubs(:get_system_type).returns("target")
    end

    it "Check N/A: should return 1, if system type file is missing" do
      File.stub(:exist?).with("/etc/nailgun_systemtype").and_return(false)
      @agent.check_and_fence.should eq(1)
    end

    it "Check N/A: should return 1, if fence action is not applicable because of wrong system type" do
      @agent.stubs(:get_system_type).returns("bootstrap")
      @agent.check_and_fence.should eq(1)
    end

    it "Check N/A: should return 1, if corosync is not ready" do
      @agent.stub(:is_corosync_up).and_return(false)
      @agent.check_and_fence.should eq(1)
    end

    it "Check N/A: should return 1, if none of free space monitoring ocf resources ready" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(nil)
      @agent.check_and_fence.should eq(1)
    end

    it "Check PASSED: should return 0, if nodes in the cluster don't need fencing by root free space criteria" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_all_ok))
      @agent.check_and_fence.should eq(0)
    end

    it "Check FAILED: if one node must be fenced and is online, should issue fence command to corosync and return 2" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence1))
      expected_node = "node-9.test.domain.local"
      expected_nodes = [ expected_node ]
      @agent.should_receive(:is_offline).with(expected_node).exactly(1).times.and_return(false)
      @agent.should_receive(:fence_nodes).with(expected_nodes).exactly(1).times.and_return(2)
      @agent.check_and_fence.should eq(2)
    end

    it "Check FAILED: if some nodes must be fenced and are online, should issue fence commands to corosync and return 2" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence2))
      expected_node1 = "node-7.test.domain.local"
      expected_node2 = "node-9.test.domain.local"
      expected_nodes = [ expected_node1, expected_node2 ]
      expected_nodes.each do |node|
        @agent.should_receive(:is_offline).with(node).exactly(1).times.and_return(false)
      end
      @agent.should_receive(:fence_nodes).with(expected_nodes).exactly(1).times.and_return(2)
      @agent.check_and_fence.should eq(2)
    end

    it "Check FAILED: should return 3, if some nodes are online and has been ordered to fence, but some of crm commands were not issued for some reasons" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence2))
      expected_node1 = "node-7.test.domain.local"
      expected_node2 = "node-9.test.domain.local"
      expected_nodes = [ expected_node1, expected_node2 ]
      expected_nodes.each do |node|
        @agent.should_receive(:is_offline).with(node).exactly(1).times.and_return(false)
      end
      # fence_nodes reports 3 when at least one crm command could not be issued
      @agent.should_receive(:fence_nodes).with(expected_nodes).exactly(1).times.and_return(3)
      @agent.check_and_fence.should eq(3)
    end

    it "Check consider PASSED behavior: should exclude itself from the fencing and return 0" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence1))
      @agent.stubs(:is_offline).returns(false)
      @agent.stubs(:system_info).returns({ :fqdn => 'node-9.test.domain.local', :name => 'node-9' })
      @agent.check_and_fence.should eq(0)
    end

    it "Check consider PASSED behavior: should exclude offline nodes from the fencing and return 0" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence1))
      @agent.stubs(:is_offline).returns(true)
      @agent.check_and_fence.should eq(0)
    end
  end
end
|
Loading…
Reference in New Issue