Add fencing agent script as a separate task

The script is based on the nailgun agent and should be installed the same way.
To run the unit tests, use:
rspec ./bin/fencing-agent_spec.rb --color --format documentation

Requirements for tests:
ruby 1.9.3p429 / 1.8.7p371
rspec 2.14.7
---------------------
httpclient (2.3.4.1)
json (1.8.1)
logger (1.2.8)
ohai (6.20.0)
rethtool (0.0.3)

Depends on: https://review.openstack.org/66607
Implements blueprint: fencing-in-nailgun-agent

Change-Id: Iceee47c47d35969f4ff08a0eb5b1e74b12b8b92f
Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
This commit is contained in:
Bogdan Dobrelya 2013-11-27 18:57:32 +02:00 committed by Bogdan Dobrelya
parent 0466543ce4
commit 894a1c68e8
3 changed files with 337 additions and 0 deletions

1
bin/fencing-agent.cron Normal file
View File

@ -0,0 +1 @@
# Run the fencing agent every minute as root.
# flock takes /var/lock/fencing-agent.lock with a zero wait (-w 0), so the command is
# not started while the lock cannot be acquired. The agent's stdout and stderr are
# appended to /var/log/fencing-agent.log and also passed to logger with the
# "fencing-agent" tag.
* * * * * root flock -w 0 -o /var/lock/fencing-agent.lock -c "/opt/nailgun/bin/fencing-agent.rb 2>&1 | tee -a /var/log/fencing-agent.log | /usr/bin/logger -t fencing-agent || true"

188
bin/fencing-agent.rb Executable file
View File

@ -0,0 +1,188 @@
#!/usr/bin/env ruby
# Copyright 2014 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
begin
require 'rubygems'
rescue LoadError
end
require 'ohai/system'
require 'logger'
require 'open3'
require 'rexml/document'
# Refuse to run without root privileges: report on stdout and exit with status 1.
if Process.euid != 0
  puts "You must be root"
  exit 1
end
# Pin the command search path to the standard system directories.
ENV['PATH'] = "/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin"
# Checks the free root space that cluster nodes publish in the corosync CIB
# (nvpair entries named root_free) and orders the cluster to fence the nodes
# that report none.
class FenceAgent
  # logger - object used for debug/info/error reporting
  def initialize(logger)
    @logger = logger
    # collect the system facts once; system_info reads them later
    @os = Ohai::System.new()
    @os.all_plugins
  end

  # Identity of the node the agent runs on, taken from the Ohai data.
  # * return a hash with :fqdn and :hostname; entries that are nil, empty
  #   or "Not Specified" are removed
  def system_info
    {
      :fqdn => (@os[:fqdn].strip rescue @os[:hostname].strip rescue nil),
      :hostname => (@os[:hostname].strip rescue nil),
    }.delete_if { |key, value| value.nil? or value.empty? or value == "Not Specified" }
  end

  # Check free root space for all nodes in the corosync cluster, if any up and running
  # Do not wait or check for the fence actions results, if any were taken (it is in cluster's responsibility)
  # TODO report to nailgun if fencing actions were taken
  # * return 0, if nodes in the cluster don't need fencing by root free space criteria
  # * return 1, if fence action is not applicable atm, e.g. corosync is absent or not accessible yet, or node wasn't yet provisioned
  # * return 2, if some nodes has been ordered to fence and all corresponding crm commands were issued to corosync
  # * return 3, if some nodes has been ordered to fence, but some of crm commands were not issued for some reasons
  def check_and_fence
    @logger.debug("Starting cluster free root space check")
    if File.exist?("/etc/nailgun_systemtype")
      # exit, if node is not provisioned yet
      if get_system_type("/etc/nailgun_systemtype") != "target"
        @logger.debug("The system state is not 'target' yet, exiting with 1")
        return 1
      end
    else
      @logger.debug("The /etc/nailgun_systemtype file is missing, exiting with 1")
      return 1
    end
    # exit, if corosync is not answering yet
    unless is_corosync_up
      @logger.debug("Corosync is absent or not ready yet, exiting with 1")
      return 1
    end
    # query CIB for nodes' root free space
    stanzas = get_free_root_space_from_CIB
    if stanzas.nil?
      @logger.debug("Free space monitoring resource is not configured yet, exiting with 1")
      return 1
    end
    # the agent's own identity does not change while iterating, so look it up once
    myself = system_info
    nodes_to_fence = []
    # for every node in the cluster
    stanzas.each_element do |e|
      items = e.attributes
      # get the node's fqdn and free space at root partition from CIB
      matched = /\Astatus-(.*)-root_free\z/.match(items['id'].to_s)
      if matched.nil?
        # an entry that does not follow the status-<node>-root_free naming
        # must not abort the check for the remaining nodes
        @logger.debug("Skipping CIB entry with unexpected id:#{items['id']}")
        next
      end
      fqdn = matched[1]
      root_free = items['value']
      @logger.debug("Got fqdn:#{fqdn}, root free space:#{root_free}G")
      # nodes which still have free root space need no further checks
      next if root_free.to_i > 0
      # get the node's status from CIB
      offline = is_offline(fqdn)
      @logger.debug("Ignoring offline node #{fqdn}") if offline
      # the CIB may name the node by its fqdn or by its short hostname
      itself = (myself[:fqdn] == fqdn || myself[:hostname] == fqdn)
      @logger.debug("Ignoring my own node #{fqdn} (cannot shoot myself)") if itself
      # online, not the agent's own node and not queued yet: must be fenced
      nodes_to_fence.push(fqdn) unless offline || itself || nodes_to_fence.include?(fqdn)
    end
    if nodes_to_fence.empty?
      @logger.debug("Cluster has PASSED free root space check successfully")
      return 0
    end
    # fence the failed nodes, each after a random delay (15..29s), and report an alert
    result = fence_nodes(nodes_to_fence)
    @logger.error("Cluster has FAILED free root space check!")
    result
  end

  # Helpers of check_and_fence. They are ordinary public instance methods so
  # that the unit tests can stub them.

  # for unit tests' stubs
  # * return s plus a random integer from 0 up to (but not including) n
  def random(s,n)
    s+rand(n)
  end

  # sleep and exec cmd
  # cmd - command line, run in a forked child
  # sleep_time - seconds to sleep before running it (nil or 0 - do not sleep)
  # * return exit status of cmd, or 128 + signal number, if cmd was killed by a signal
  def exec(cmd,sleep_time)
    unless sleep_time.nil? or sleep_time == 0
      @logger.info("Sleep #{Process.pid} for #{sleep_time}s, before issuing cmd:#{cmd}")
      sleep(sleep_time)
    end
    pid = Process.fork do
      Process.exec(cmd)
    end
    # wait for this very child, not for any child of the process
    status = Process.wait2(pid)[1]
    # exitstatus is nil when the child did not exit normally; callers compare
    # the result with 0, so always hand them an integer
    status.exitstatus || (128 + status.termsig.to_i)
  end

  # * return the first line of the file without trailing whitespace:
  #   target, if provisioned; bootstrap, if not provisioned yet
  # * return empty string, if the file is empty
  def get_system_type(filename)
    fl = File.open(filename, "r")
    begin
      fl.gets.to_s.rstrip
    ensure
      fl.close
    end
  end

  # * return true, if corosync running and CIB is up
  def is_corosync_up
    # the command line is run by /bin/sh, so use the POSIX redirection form
    # (a bash-only "&>" would put the command into background under other shells)
    cmd = "/usr/sbin/crm_attribute --type crm_config --query --name dc-version >/dev/null 2>&1"
    exec(cmd,random(5,10)) == 0
  end

  # assume is_corosync_up true
  # * return xml with free root space data from CIB, or nil
  def get_free_root_space_from_CIB
    cmd = "/usr/sbin/cibadmin --query --xpath \"//nvpair[@name='root_free']\""
    sleep(random(3,5))
    begin
      # the block form closes the pipes once the output has been read
      output = Open3.popen3(cmd) do |stdin, stdout, stderr|
        stdin.close
        stdout.read
      end
      REXML::Document.new(output).root.elements['/xpath-query']
    rescue StandardError
      # best effort: no output or unparsable output means there is no data yet
      nil
    end
  end

  # assume is_corosync_up true
  # * return true, if node is OFFLINE (or not applicable for any actions by corosync cluster services)
  def is_offline(fqdn)
    cmd = "/usr/sbin/cibadmin --query --xpath \"//node_state[@uname='#{fqdn}']\" | grep -q 'crmd=\"online\"'"
    exec(cmd,random(5,10)) > 0
  end

  # assume is_corosync_up true
  # issue fencing action to cluster services for given nodes
  # * return 2, if some nodes has been ordered to fence and all crm command has been issued.
  # * return 3, if some nodes has been ordered to fence, but some of crm commands was not issued for some reasons.
  def fence_nodes(nodes_to_fence)
    failed = false
    nodes_to_fence.each do |node|
      cmd = "/usr/sbin/crm --force node fence #{node}"
      if exec(cmd,random(15,15)) > 0
        @logger.error("Cannot issue the command: #{cmd}")
        failed = true
      else
        # NOTE(review): success is logged at error level too - presumably so that
        # fencing always shows up as an alert; confirm before lowering the level
        @logger.error("Issued the fence action: #{cmd}")
      end
    end
    failed ? 3 : 2
  end
end
# Run the check only when this file is executed directly; nothing here runs
# when the file is loaded by the unit tests.
if __FILE__ == $PROGRAM_NAME
  log = Logger.new(STDOUT)
  log.level = Logger::DEBUG
  fencer = FenceAgent.new(log)
  begin
    fencer.check_and_fence
  rescue StandardError => error
    # report the failure instead of letting the script die with the exception
    log.error "Cluster free root space check cannot be performed: #{error.message}\n#{error.backtrace}"
  end
end

148
bin/fencing-agent_spec.rb Normal file
View File

@ -0,0 +1,148 @@
require 'rubygems'
require 'rspec'
require 'mocha/api'
# stub the root rights for agent script under test
Process.stubs(:euid).returns(0)
# load the agent script under test (require appends the '.rb' extension)
require './bin/fencing-agent'
# fixtures
# CIB query result: all three nodes report 5 as root free space
$xml_all_ok = <<END
<xpath-query>
<nvpair id="status-node-7.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-8.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-9.test.domain.local-root_free" name="root_free" value="5"/>
</xpath-query>
END
# CIB query result: node-9 reports 0 as root free space
$xml_need_fence1 = <<END
<xpath-query>
<nvpair id="status-node-7.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-8.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-9.test.domain.local-root_free" name="root_free" value="0"/>
</xpath-query>
END
# CIB query result: node-7 and node-9 report 0 as root free space
$xml_need_fence2 = <<END
<xpath-query>
<nvpair id="status-node-7.test.domain.local-root_free" name="root_free" value="0"/>
<nvpair id="status-node-8.test.domain.local-root_free" name="root_free" value="5"/>
<nvpair id="status-node-9.test.domain.local-root_free" name="root_free" value="0"/>
</xpath-query>
END
# stands in for the opened /etc/nailgun_systemtype file of a provisioned node
$fl = StringIO.new("target")
# Unit tests for FenceAgent#check_and_fence. Cluster tools are never run: the
# CIB query result comes from the XML fixtures and command execution is mocked.
describe FenceAgent do
  # parse a fixture the same way the agent parses the cibadmin output
  def cib_stanzas(xml)
    REXML::Document.new(xml).root.elements['/xpath-query']
  end

  # the command line the agent issues to fence the given node
  def fence_cmd(node)
    "/usr/sbin/crm --force node fence #{node}"
  end

  before :each do
    logger = Logger.new(STDOUT)
    logger.level = Logger::DEBUG
    @agent = FenceAgent.new(logger)
    # the agent sleeps for random(...) seconds before its commands; make it zero
    @agent.stubs(:random).returns(0)
    File.stub(:exist?).with("/etc/nailgun_systemtype").and_return(true)
    File.stub(:open).with("/etc/nailgun_systemtype", "r").and_return($fl)
  end

  describe "#new" do
    it "takes a logger and returns a fencing agent instance" do
      @agent.should be_an_instance_of FenceAgent
    end
  end

  # Fence daemon tests
  describe "#check_and_fence" do
    before :each do
      @agent.stubs(:is_corosync_up).returns(true)
      @agent.stubs(:get_system_type).returns("target")
    end

    it "Check N/A: should return 1, if system type file is missing" do
      File.stub(:exist?).with("/etc/nailgun_systemtype").and_return(false)
      @agent.check_and_fence.should eq(1)
    end

    it "Check N/A: should return 1, if fence action is not applicable because of wrong system type" do
      @agent.stubs(:get_system_type).returns("bootstrap")
      @agent.check_and_fence.should eq(1)
    end

    it "Check N/A: should return 1, if corosync is not ready" do
      @agent.stub(:is_corosync_up).and_return(false)
      @agent.check_and_fence.should eq(1)
    end

    it "Check N/A: should return 1, if none of free space monitoring ocf resources ready" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(nil)
      @agent.check_and_fence.should eq(1)
    end

    it "Check PASSED: should return 0, if nodes in the cluster don't need fencing by root free space criteria" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_all_ok))
      @agent.check_and_fence.should eq(0)
    end

    it "Check FAILED: if one node must be fenced and is online, should issue fence command to corosync and return 2" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence1))
      expected_node = "node-9.test.domain.local"
      @agent.should_receive(:is_offline).with(expected_node).exactly(1).times.and_return(false)
      # fence_nodes is not mocked: the return code comes from the real method,
      # only the command execution (cmd, sleep time) is faked
      @agent.should_receive(:exec).with(fence_cmd(expected_node), anything).exactly(1).times.and_return(0)
      @agent.check_and_fence.should eq(2)
    end

    it "Check FAILED: if some nodes must be fenced and are online, should issue fence commands to corosync and return 2" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence2))
      expected_nodes = [ "node-7.test.domain.local", "node-9.test.domain.local" ]
      expected_nodes.each do |node|
        @agent.should_receive(:is_offline).with(node).exactly(1).times.and_return(false)
        @agent.should_receive(:exec).with(fence_cmd(node), anything).exactly(1).times.and_return(0)
      end
      @agent.check_and_fence.should eq(2)
    end

    it "Check FAILED: should return 3, if some nodes are online and has been ordered to fence, but some of crm commands were not issued for some reasons" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence2))
      expected_node1 = "node-7.test.domain.local"
      expected_node2 = "node-9.test.domain.local"
      [ expected_node1, expected_node2 ].each do |node|
        @agent.should_receive(:is_offline).with(node).exactly(1).times.and_return(false)
      end
      # the first fence command succeeds, the second one fails
      @agent.should_receive(:exec).with(fence_cmd(expected_node1), anything).exactly(1).times.and_return(0)
      @agent.should_receive(:exec).with(fence_cmd(expected_node2), anything).exactly(1).times.and_return(6)
      @agent.check_and_fence.should eq(3)
    end

    it "Check consider PASSED behavior: should exclude itself from the fencing and return 0" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence1))
      @agent.stubs(:is_offline).returns(false)
      @agent.stubs(:system_info).returns({ :fqdn => 'node-9.test.domain.local', :name => 'node-9' })
      @agent.check_and_fence.should eq(0)
    end

    it "Check consider PASSED behavior: should exclude offline nodes from the fencing and return 0" do
      @agent.stubs(:get_free_root_space_from_CIB).returns(cib_stanzas($xml_need_fence1))
      @agent.stubs(:is_offline).returns(true)
      @agent.check_and_fence.should eq(0)
    end
  end
end