KATO Tomoyuki 2016-05-05 18:47:49 +09:00
@ -33,6 +33,7 @@ declare -A SPECIAL_BOOKS=(

@ -0,0 +1,30 @@
name = openstackhaguide
summary = OpenStack High Availability Guide
author = OpenStack
author-email = openstack-docs@lists.openstack.org
home-page = http://docs.openstack.org/
classifier =
Environment :: OpenStack
Intended Audience :: Information Technology
Intended Audience :: System Administrators
License :: OSI Approved :: Apache Software License
Operating System :: POSIX :: Linux
Topic :: Documentation
setup-hooks =
all_files = 1
build-dir = build
source-dir = source
universal = 1
warnerrors = True

@ -0,0 +1,30 @@
#!/usr/bin/env python
# Copyright (c) 2013 Hewlett-Packard Development Company, L.P.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import setuptools
# In python < 2.7.4, a lazy loading of package `pbr` will break
# setuptools if some other modules registered functions in `atexit`.
# solution from: http://bugs.python.org/issue15881#msg170215
import multiprocessing # noqa
except ImportError:

@ -0,0 +1 @@

@ -0,0 +1,12 @@
Configure high availability on compute nodes
The Installation Guide
gives instructions for installing multiple compute nodes.
To make them highly available,
you must configure the environment
to include multiple instances of the API
and other services.

@ -0,0 +1,10 @@
Configuring the compute node for high availability
.. toctree::
:maxdepth: 2

View File

@ -0,0 +1,290 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is execfile()d with the current directory set to its
# containing dir.
# Note that not all possible configuration values are present in this
# autogenerated file.
# All configuration values have a default; values that are commented out
# serve to show the default.
import os
# import sys
import openstackdocstheme
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = []
# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'High Availability Guide'
bug_tag = u'ha-guide'
copyright = u'2015, OpenStack contributors'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
# The short X.Y version.
version = '0.0.1'
# The full version, including alpha/beta/rc tags.
release = '0.0.1'
# A few variables have to be set for the log-a-bug feature.
# giturl: The location of conf.py on Git. Must be set manually.
# gitsha: The SHA of the most recent commit. Automatically extracted from git log.
# bug_tag: Tag for categorizing the bug. Must be set manually.
# These variables are passed to the logabug code via html_context.
# Web location of this guide's source tree; handed to the log-a-bug code
# through html_context below.
giturl = u'http://git.openstack.org/cgit/openstack/openstack-manuals/tree/doc/ha-guide/source'
# Shell pipeline: keep only the first line of `git log` output and cut out
# its second space-separated field.
git_cmd = "/usr/bin/git log | head -n1 | cut -f2 -d' '"
# Runs the pipeline when this file is executed and strips newlines from both
# ends of whatever it printed.
# NOTE(review): presumably this is an empty string when /usr/bin/git is absent
# or the build is not run from a git checkout -- confirm the theme copes.
gitsha = os.popen(git_cmd).read().strip('\n')
# Extra values made available to the HTML templates.
html_context = {"gitsha": gitsha, "bug_tag": bug_tag,
"giturl": giturl}
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
# today = ''
# Else, today_fmt is used as the format for a strftime call.
# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['common/cli*', 'common/nova*',
'common/get_started*', 'common/dashboard*']
# The reST default role (used for this markup: `text`) to use for all
# documents.
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
# keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'openstackdocs'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
html_theme_path = [openstackdocstheme.get_html_theme_path()]
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = []
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
# html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
# So that we can enable "log-a-bug" links from each output HTML page, this
# variable must be set to a format that includes year, month, day, hours and
# minutes.
html_last_updated_fmt = '%Y-%m-%d %H:%M'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
# html_additional_pages = {}
# If false, no module index is generated.
# html_domain_indices = True
# If false, no index is generated.
html_use_index = False
# If true, the index is split into individual pages for each letter.
# html_split_index = False
# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'ha-guide'
# If true, publish source files
html_copy_source = False
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
# 'preamble': '',
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'HAGuide.tex', u'High Availability Guide',
u'OpenStack contributors', 'manual'),
# The name of an image file (relative to this directory) to place at the top of
# the title page.
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
# latex_use_parts = False
# If true, show page references after internal links.
# latex_show_pagerefs = False
# If true, show URL addresses after external links.
# latex_show_urls = False
# Documents to append as an appendix to all manuals.
# latex_appendices = []
# If false, no module index is generated.
# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'haguide', u'High Availability Guide',
[u'OpenStack contributors'], 1)
# If true, show URL addresses after external links.
# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'HAGuide', u'High Availability Guide',
u'OpenStack contributors', 'HAGuide',
'This guide shows OpenStack operators and deployers how to configure'
'OpenStack Networking to be robust and fault-tolerant.', 'Miscellaneous'),
# Documents to append as an appendix to all manuals.
# texinfo_appendices = []
# If false, no module index is generated.
# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
# texinfo_no_detailmenu = False
# -- Options for Internationalization output ------------------------------
locale_dirs = ['locale/']

@ -0,0 +1,396 @@
Before you launch Galera Cluster, you need to configure the server
and the database to operate as part of the cluster.
Configuring the server
Certain services running on the underlying operating system of your
OpenStack database may block Galera Cluster from normal operation
or prevent ``mysqld`` from achieving network connectivity with the cluster.
Galera Cluster requires that you open four ports to network traffic:
- On ``3306``, Galera Cluster uses TCP for database client connections
and State Snapshot Transfer methods that require the client
(that is, ``mysqldump``).
- On ``4567`` Galera Cluster uses TCP for replication traffic. Multicast
replication uses both TCP and UDP on this port.
- On ``4568`` Galera Cluster uses TCP for Incremental State Transfers.
- On ``4444`` Galera Cluster uses TCP for all other State Snapshot Transfer
methods.
.. seealso:: For more information on firewalls, see `Firewalls and default ports
<http://docs.openstack.org/liberty/config-reference/content/firewalls-default-ports.html>`_, in the Configuration Reference.
For many Linux distributions, you can configure the firewall using
the ``iptables`` utility. To do so, complete the following steps:
#. For each cluster node, run the following commands, replacing
``NODE-IP-ADDRESS`` with the IP address of the cluster node
you want to open the firewall to:
.. code-block:: console
# iptables --append INPUT --in-interface eth0 \
--protocol tcp --match tcp --dport 3306 \
--source NODE-IP-ADDRESS --jump ACCEPT
# iptables --append INPUT --in-interface eth0 \
--protocol tcp --match tcp --dport 4567 \
--source NODE-IP-ADDRESS --jump ACCEPT
# iptables --append INPUT --in-interface eth0 \
--protocol tcp --match tcp --dport 4568 \
--source NODE-IP-ADDRESS --jump ACCEPT
# iptables --append INPUT --in-interface eth0 \
--protocol tcp --match tcp --dport 4444 \
--source NODE-IP-ADDRESS --jump ACCEPT
In the event that you also want to configure multicast replication,
run this command as well:
.. code-block:: console
# iptables --append INPUT --in-interface eth0 \
--protocol udp --match udp --dport 4567 \
--source NODE-IP-ADDRESS --jump ACCEPT
#. Make the changes persistent. For servers that use ``init``, use
the :command:`save` command:
.. code-block:: console
# service iptables save
For servers that use ``systemd``, you need to save the current packet
filtering to the path of the file that ``iptables`` reads when it starts.
This path can vary by distribution, but common locations are in the
``/etc`` directory, such as:
- ``/etc/sysconfig/iptables``
- ``/etc/iptables/iptables.rules``
When you find the correct path, run the :command:`iptables-save` command:
.. code-block:: console
# iptables-save > /etc/sysconfig/iptables
With the firewall configuration saved, the rules are restored whenever
your OpenStack database server starts.
For many Linux distributions, you can configure the firewall using the
``firewall-cmd`` utility for FirewallD. To do so, complete the following
steps on each cluster node:
#. Add the Galera Cluster service:
.. code-block:: console
# firewall-cmd --add-service=mysql
#. For each instance of OpenStack database in your cluster, run the
following commands to open the ports that Galera Cluster uses:
.. code-block:: console
# firewall-cmd --add-port=3306/tcp
# firewall-cmd --add-port=4567/tcp
# firewall-cmd --add-port=4568/tcp
# firewall-cmd --add-port=4444/tcp
In the event that you also want to configure multicast replication,
run this command as well:
.. code-block:: console
# firewall-cmd --add-port=4567/udp
#. To make this configuration persistent, repeat the above commands
with the :option:`--permanent` option.
.. code-block:: console
# firewall-cmd --add-service=mysql --permanent
# firewall-cmd --add-port=3306/tcp --permanent
# firewall-cmd --add-port=4567/tcp --permanent
# firewall-cmd --add-port=4568/tcp --permanent
# firewall-cmd --add-port=4444/tcp --permanent
# firewall-cmd --add-port=4567/udp --permanent
With the firewall configuration saved, the rules are applied whenever
your OpenStack database server starts.
Security-Enhanced Linux is a kernel module for improving security on Linux
operating systems. It is commonly enabled and configured by default on
Red Hat-based distributions. In the context of Galera Cluster, systems with
SELinux may block the database service, keep it from starting or prevent it
from establishing network connections with the cluster.
To configure SELinux to permit Galera Cluster to operate, complete
the following steps on each cluster node:
#. Using the ``semanage`` utility, open the relevant ports:
.. code-block:: console
# semanage port -a -t mysqld_port_t -p tcp 3306
# semanage port -a -t mysqld_port_t -p tcp 4567
# semanage port -a -t mysqld_port_t -p tcp 4568
# semanage port -a -t mysqld_port_t -p tcp 4444
In the event that you use multicast replication, you also need to
open ``4567`` to UDP traffic:
.. code-block:: console
# semanage port -a -t mysqld_port_t -p udp 4567
#. Set SELinux to allow the database server to run:
.. code-block:: console
# semanage permissive -a mysqld_t
With these options set, SELinux now permits Galera Cluster to operate.
.. note:: Bear in mind, leaving SELinux in permissive mode is not a good
security practice. Over the longer term, you need to develop a
security policy for Galera Cluster and then switch SELinux back
into enforcing mode.
For more information on configuring SELinux to work with
Galera Cluster, see the Galera Cluster documentation.
Application Armor is a kernel module for improving security on Linux
operating systems. It is developed by Canonical and commonly used on
Ubuntu-based distributions. In the context of Galera Cluster, systems
with AppArmor may block the database service from operating normally.
To configure AppArmor to work with Galera Cluster, complete the
following steps on each cluster node:
#. Create a symbolic link for the database server in the ``disable`` directory:
.. code-block:: console
# ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/
#. Restart AppArmor. For servers that use ``init``, run the following command:
.. code-block:: console
# service apparmor restart
For servers that use ``systemd``, instead run this command:
.. code-block:: console
# systemctl restart apparmor
AppArmor now permits Galera Cluster to operate.
Database configuration
MySQL databases, including MariaDB and Percona XtraDB, manage their
configurations using a ``my.cnf`` file, which is typically located in the
``/etc`` directory. Configuration options available in these databases are
also available in Galera Cluster, with some restrictions and several
additions.
.. code-block:: ini
# InnoDB Configuration
# Galera Cluster Configuration
Configuring ``mysqld``
While all of the configuration parameters available to the standard MySQL,
MariaDB or Percona XtraDB database server are available in Galera Cluster,
there are some that you must define at the outset to avoid conflict or
unexpected behavior.
- Ensure that the database server is not bound only to the localhost,
``127.0.0.1``. Instead, bind it to ``0.0.0.0`` to ensure it listens on
all available interfaces.
.. code-block:: ini
- Ensure that the binary log format is set to use row-level replication,
as opposed to statement-level replication:
.. code-block:: ini
Configuring InnoDB
Galera Cluster does not support non-transactional storage engines and
requires that you use InnoDB by default. There are some additional
parameters that you must define to avoid conflicts.
- Ensure that the default storage engine is set to InnoDB:
.. code-block:: ini
- Ensure that the InnoDB locking mode for generating auto-increment values
is set to ``2``, which is the interleaved locking mode.
.. code-block:: ini
Do not change this value. Other modes may cause ``INSERT`` statements
on tables with auto-increment columns to fail as well as unresolved
deadlocks that leave the system unresponsive.
- Ensure that the InnoDB log buffer is written to file once per second,
rather than on each commit, to improve performance:
.. code-block:: ini
Bear in mind, while setting this parameter to ``0`` or ``2`` can improve
performance, it introduces certain dangers. Operating system failures can
erase the last second of transactions. While you can recover this data
from another node, if the cluster goes down at the same time
(in the event of a data center power outage), you lose this data permanently.
- Define the InnoDB memory buffer pool size. The default value is 128 MB,
but to compensate for Galera Cluster's additional memory usage, scale
your usual value back by 5%:
.. code-block:: ini
Configuring wsrep replication
Galera Cluster configuration parameters all have the ``wsrep_`` prefix.
There are five that you must define for each cluster node in your
OpenStack database.
- **wsrep Provider** The Galera Replication Plugin serves as the wsrep
Provider for Galera Cluster. It is installed on your system as the
``libgalera_smm.so`` file. You must define the path to this file in
your ``my.cnf``.
.. code-block:: ini
- **Cluster Name** Define an arbitrary name for your cluster.
.. code-block:: ini
You must use the same name on every cluster node. The connection fails
when this value does not match.
- **Cluster Address** List the IP addresses for each cluster node.
.. code-block:: ini
Replace the IP addresses given here with a comma-separated list of the
IP addresses of each OpenStack database in your cluster.
- **Node Name** Define the logical name of the cluster node.
.. code-block:: ini
- **Node Address** Define the IP address of the cluster node.
.. code-block:: ini
Additional parameters
For a complete list of the available parameters, run the
``SHOW VARIABLES`` command from within the database client:
.. code-block:: mysql
| Variable_name | Value |
| wsrep_auto_increment_control | ON |
| wsrep_causal_reads | OFF |
| wsrep_certify_nonPK | ON |
| ... | ... |
| wsrep_sync_wait | 0 |
For the documentation of these parameters, wsrep Provider options, and status
variables available in Galera Cluster, see the Galera Cluster reference
documentation.

@ -0,0 +1,275 @@
Using Galera Cluster requires that you install two packages. The first is
the database server, which must include the wsrep API patch. The second
package is the Galera Replication Plugin, which enables the write-set
replication service functionality with the database server.
There are three implementations of Galera Cluster: MySQL, MariaDB and
Percona XtraDB. For each implementation, there is a software repository that
provides binary packages for Debian, Red Hat, and SUSE-based Linux
distributions.
Enabling the repository
Galera Cluster is not available in the base repositories of Linux
distributions. In order to install it with your package manager, you must
first enable the repository on your system. The particular methods for
doing so vary depending on which distribution you use for OpenStack and
which database server you want to use.
For Debian and Debian-based distributions, such as Ubuntu, complete the
following steps:
#. Add the GnuPG key for the database repository that you want to use.
.. code-block:: console
# apt-key adv --recv-keys --keyserver \
keyserver.ubuntu.com BC19DDBA
Note that the particular key value in this command varies depending on
which database software repository you want to use.
| Database | Key |
| Galera Cluster for MySQL | ``BC19DDBA`` |
| MariaDB Galera Cluster | ``0xcbcb082a1bb943db`` |
| Percona XtraDB Cluster | ``1C4CBDCDCD2EFD2A`` |
#. Add the repository to your sources list. Using your preferred text
editor, create a ``galera.list`` file in the ``/etc/apt/sources.list.d/``
directory. For the contents of this file, use the lines that pertain to
the software repository you want to install:
.. code-block:: linux-config
# Galera Cluster for MySQL
deb http://releases.galeracluster.com/DISTRO RELEASE main
# MariaDB Galera Cluster
deb http://mirror.jmu.edu/pub/mariadb/repo/VERSION/DISTRO RELEASE main
# Percona XtraDB Cluster
deb http://repo.percona.com/apt RELEASE main
For each entry: Replace all instances of ``DISTRO`` with the distribution
that you use, such as ``debian`` or ``ubuntu``. Replace all instances of
``RELEASE`` with the release of that distribution, such as ``wheezy`` or
``trusty``. Replace all instances of ``VERSION`` with the version of the
database server that you want to install, such as ``5.6`` or ``10.0``.
.. note:: In the event that you do not know the release code-name for
your distribution, you can use the following command to
find it out:
.. code-block:: console
$ lsb_release -a
#. Update the local cache.
.. code-block:: console
# apt-get update
Packages in the Galera Cluster Debian repository are now available for
installation on your system.
Red Hat
For Red Hat Enterprise Linux and Red Hat-based Linux distributions, the
process is more straightforward. In this file, only enter the text for
the repository you want to use.
- For Galera Cluster for MySQL, using your preferred text editor, create a
``Galera.repo`` file in the ``/etc/yum.repos.d/`` directory.
.. code-block:: linux-config
name = Galera Cluster for MySQL
baseurl = http://releases.galeracluster.com/DISTRO/RELEASE/ARCH
gpgkey = http://releases.galeracluster.com/GPG-KEY-galeracluster.com
gpgcheck = 1
Replace ``DISTRO`` with the name of the distribution you use, such as
``centos`` or ``fedora``. Replace ``RELEASE`` with the release number,
such as ``7`` for CentOS 7. Replace ``ARCH`` with your system
architecture, such as ``x86_64``
- For MariaDB Galera Cluster, using your preferred text editor, create a
``Galera.repo`` file in the ``/etc/yum.repos.d/`` directory.
.. code-block:: linux-config
name = MariaDB Galera Cluster
baseurl = http://yum.mariadb.org/VERSION/PACKAGE
gpgkey = https://yum.mariadb.org/RPM-GPG-KEY-MariaDB
gpgcheck = 1
Replace ``VERSION`` with the version of MariaDB you want to install, such
as ``5.6`` or ``10.0``. Replace ``PACKAGE`` with the package type and
architecture, such as ``rhel6-amd64`` for Red Hat 6 on 64-bit
architecture.
- For Percona XtraDB Cluster, run the following command:
.. code-block:: console
# yum install http://www.percona.com/downloads/percona-release/redhat/0.1-3/percona-release-0.1-3.noarch.rpm
Bear in mind that the Percona repository only supports Red Hat Enterprise
Linux and CentOS distributions.
Packages in the Galera Cluster Red Hat repository are now available for
installation on your system.
For SUSE Enterprise Linux and SUSE-based distributions, such as openSUSE,
binary installations are only available for Galera Cluster for MySQL and
MariaDB Galera Cluster.
#. Create a ``Galera.repo`` file in the local directory. For Galera Cluster
for MySQL, use the following content:
.. code-block:: linux-config
name = Galera Cluster for MySQL
baseurl = http://releases.galeracluster.com/DISTRO/RELEASE
gpgkey = http://releases.galeracluster.com/GPG-KEY-galeracluster.com
gpgcheck = 1
In the text: Replace ``DISTRO`` with the name of the distribution you
use, such as ``sles`` or ``opensuse``. Replace ``RELEASE`` with the
version number of that distribution.
For MariaDB Galera Cluster, instead use this content:
.. code-block:: linux-config
name = MariaDB Galera Cluster
baseurl = http://yum.mariadb.org/VERSION/PACKAGE
gpgkey = https://yum.mariadb.org/RPM-GPG-KEY-MariaDB
gpgcheck = 1
In the text: Replace ``VERSION`` with the version of MariaDB you want to
install, such as ``5.6`` or ``10.0``. Replace ``PACKAGE`` with the package
type and architecture you want to use, such as ``opensuse13-amd64``.
#. Add the repository to your system:
.. code-block:: console
$ sudo zypper addrepo Galera.repo
#. Refresh ``zypper``:
.. code-block:: console
$ sudo zypper refresh
Packages in the Galera Cluster SUSE repository are now available for
installation on your system.
Installing Galera Cluster
When you finish enabling the software repository for Galera Cluster, you can
install it using your package manager. The particular command and packages
you need to install varies depending on which database server you want to
install and which Linux distribution you use:
Galera Cluster for MySQL:
- For Debian and Debian-based distributions, such as Ubuntu, run the
following command:
.. code-block:: console
# apt-get install galera-3 mysql-wsrep-5.6
- For Red Hat Enterprise Linux and Red Hat-based distributions, such as
Fedora or CentOS, instead run this command:
.. code-block:: console
# yum install galera-3 mysql-wsrep-5.6
- For SUSE Enterprise Linux Server and SUSE-based distributions, such as
openSUSE, instead run this command:
.. code-block:: console
# zypper install galera-3 mysql-wsrep-5.6
MariaDB Galera Cluster:
- For Debian and Debian-based distributions, such as Ubuntu, run the
following command:
.. code-block:: console
# apt-get install galera mariadb-galera-server
- For Red Hat Enterprise Linux and Red Hat-based distributions, such as
Fedora or CentOS, instead run this command:
.. code-block:: console
# yum install galera MariaDB-Galera-server
- For SUSE Enterprise Linux Server and SUSE-based distributions, such as
openSUSE, instead run this command:
.. code-block:: console
# zypper install galera MariaDB-Galera-server
Percona XtraDB Cluster:
- For Debian and Debian-based distributions, such as Ubuntu, run the
following command:
.. code-block:: console
# apt-get install percona-xtradb-cluster
- For Red Hat Enterprise Linux and Red Hat-based distributions, such as
Fedora or CentOS, instead run this command:
.. code-block:: console
# yum install Percona-XtraDB-Cluster
Galera Cluster is now installed on your system. You must repeat this
process for each controller node in your cluster.
.. warning:: In the event that you already installed the standalone version
of MySQL, MariaDB or Percona XtraDB, this installation purges all
privileges on your OpenStack database server. You must reapply the
privileges listed in the installation guide.

@ -0,0 +1,255 @@
When you finish the installation and configuration process on each
cluster node in your OpenStack database, you can initialize Galera Cluster.
Before you attempt this, verify that you have the following ready:
- Database hosts with Galera Cluster installed. You need a
minimum of three hosts;
- No firewalls between the hosts;
- SELinux and AppArmor set to permit access to ``mysqld``;
- The correct path to ``libgalera_smm.so`` given to the
``wsrep_provider`` parameter.
Initializing the cluster
In Galera Cluster, the Primary Component is the cluster of database
servers that replicate into each other. In the event that a
cluster node loses connectivity with the Primary Component, it
defaults into a non-operational state, to avoid creating or serving
inconsistent data.
By default, cluster nodes do not start as part of a Primary
Component. Instead they assume that one exists somewhere and
attempt to establish a connection with it. To create a Primary
Component, you must start one cluster node using the
``--wsrep-new-cluster`` option. You can do this using any cluster
node, it is not important which you choose. In the Primary
Component, replication and state transfers bring all databases to
the same state.
To start the cluster, complete the following steps:
#. Initialize the Primary Component on one cluster node. For
servers that use ``init``, run the following command:
.. code-block:: console
# service mysql start --wsrep-new-cluster
For servers that use ``systemd``, instead run this command:
.. code-block:: console
# systemctl start mysql --wsrep-new-cluster
#. Once the database server starts, check the cluster status using
the ``wsrep_cluster_size`` status variable. From the database
client, run the following command:
.. code-block:: mysql
SHOW STATUS LIKE 'wsrep_cluster_size';
| Variable_name | Value |
| wsrep_cluster_size | 1 |
#. Start the database server on all other cluster nodes. For
servers that use ``init``, run the following command:
.. code-block:: console
# service mysql start
For servers that use ``systemd``, instead run this command:
.. code-block:: console
# systemctl start mysql
#. When you have all cluster nodes started, log into the database
client on one of them and check the ``wsrep_cluster_size``
status variable again.
.. code-block:: mysql
SHOW STATUS LIKE 'wsrep_cluster_size';
| Variable_name | Value |
| wsrep_cluster_size | 3 |
When each cluster node starts, it checks the IP addresses given to
the ``wsrep_cluster_address`` parameter and attempts to establish
network connectivity with a database server running there. Once it
establishes a connection, it attempts to join the Primary
Component, requesting a state transfer as needed to bring itself
into sync with the cluster.
In the event that you need to restart any cluster node, you can do
so. When the database server comes back up, it establishes
connectivity with the Primary Component and updates itself to any
changes it may have missed while down.
Restarting the cluster
Individual cluster nodes can stop and be restarted without issue.
When a database loses its connection or restarts, Galera Cluster
brings it back into sync once it reestablishes connection with the
Primary Component. In the event that you need to restart the
entire cluster, identify the most advanced cluster node and
initialize the Primary Component on that node.
To find the most advanced cluster node, you need to check the
sequence numbers, or seqnos, on the last committed transaction for
each. You can find this by viewing the ``grastate.dat`` file in the
database directory:
.. code-block:: console
$ cat /path/to/datadir/grastate.dat
# Galera saved state
version: 3.8
uuid: 5ee99582-bb8d-11e2-b8e3-23de375c1d30
seqno: 8204503945773
Alternatively, if the database server is running, use the
``wsrep_last_committed`` status variable:
.. code-block:: mysql
SHOW STATUS LIKE 'wsrep_last_committed';
| Variable_name | Value |
| wsrep_last_committed | 409745 |
This value increments with each transaction, so the most advanced
node has the highest sequence number, and therefore is the most up to date.
Configuration tips
Deployment strategies
Galera can be configured using one of the following
strategies:
- Each instance has its own IP address;
OpenStack services are configured with the list of these IP
addresses so they can select one of the addresses from those
available.
- Galera runs behind HAProxy.
HAProxy load balances incoming requests and exposes just one IP
address for all the clients.
Galera synchronous replication guarantees a zero slave lag. The
failover procedure completes once HAProxy detects that the active
back end has gone down and switches to the backup one, which is
then marked as 'UP'. If no back ends are up (in other words, the
Galera cluster is not ready to accept connections), the failover
procedure finishes only when the Galera cluster has been
successfully reassembled. The SLA is normally no more than 5
minutes.
- Use MySQL/Galera in active/passive mode to avoid deadlocks on
``SELECT ... FOR UPDATE`` type queries (used, for example, by nova
and neutron). This issue is discussed more in the following:
- http://lists.openstack.org/pipermail/openstack-dev/2014-May/035264.html
- http://www.joinfu.com/
Of these options, the second one is highly recommended. Although Galera
supports active/active configurations, we recommend active/passive
(enforced by the load balancer) in order to avoid lock contention.
Configuring HAProxy
If you use HAProxy for load-balancing client access to Galera
Cluster as described in the :doc:`controller-ha-haproxy`, you can
use the ``clustercheck`` utility to improve health checks.
#. Create a configuration file for ``clustercheck`` at
.. code-block:: ini
#. Log in to the database client and grant the ``clustercheck`` user
``PROCESS`` privileges.
.. code-block:: mysql
GRANT PROCESS ON *.* TO 'clustercheck_user'@'localhost'
IDENTIFIED BY 'my_clustercheck_password';
You only need to do this on one cluster node. Galera Cluster
replicates the user to all the others.
#. Create a configuration file for the HAProxy monitor service, at
.. code-block:: ini
service galera-monitor {
port = 9200
disable = no
socket_type = stream
protocol = tcp
wait = no
user = root
group = root
groups = yes
server = /usr/bin/clustercheck
per_source = UNLIMITED
log_on_success =
log_on_failure = HOST
flags = REUSE
#. Start the ``xinetd`` daemon for ``clustercheck``. For servers
that use ``init``, run the following commands:
.. code-block:: console
# service xinetd enable
# service xinetd start
For servers that use ``systemd``, instead run these commands:
.. code-block:: console
# systemctl daemon-reload
# systemctl enable xinetd
# systemctl start xinetd

@ -0,0 +1,33 @@
Database (Galera Cluster)
The first step is to install the database that sits at the heart of the
cluster. To implement high availability, run an instance of the database on
each controller node and use Galera Cluster to provide replication between
them. Galera Cluster is a synchronous multi-master database cluster, based
on MySQL and the InnoDB storage engine. It is a high-availability service
that provides high system uptime, no data loss, and scalability for growth.
You can achieve high availability for the OpenStack database in many
different ways, depending on the type of database that you want to use.
There are three implementations of Galera Cluster available to you:
- `Galera Cluster for MySQL <http://galeracluster.com/>`_ The MySQL
reference implementation from Codership, Oy;
- `MariaDB Galera Cluster <https://mariadb.org/>`_ The MariaDB
implementation of Galera Cluster, which is commonly supported in
environments based on Red Hat distributions;
- `Percona XtraDB Cluster <http://www.percona.com/>`_ The XtraDB
implementation of Galera Cluster from Percona.
In addition to Galera Cluster, you can also achieve high availability
through other database options, such as PostgreSQL, which has its own
replication system.
.. toctree::
:maxdepth: 2

@ -0,0 +1,229 @@
HAProxy provides a fast and reliable HTTP reverse proxy and load balancer
for TCP or HTTP applications. It is particularly suited for web sites crawling
under very high loads while needing persistence or Layer 7 processing.
It realistically supports tens of thousands of connections with recent hardware.
Each instance of HAProxy configures its front end to accept connections
only from the virtual IP (VIP) address and to terminate them as a list
of all instances of the corresponding service under load balancing,
such as any OpenStack API service.
This makes the instances of HAProxy act independently and fail over
transparently together with the network endpoints (VIP addresses),
and they therefore share the same SLA.
You can alternatively use a commercial load balancer, which is either hardware
or software. A hardware load balancer generally has good performance.
For detailed instructions about installing HAProxy on your nodes,
see its `official documentation <http://www.haproxy.org/#docs>`_.
.. note::
HAProxy should not be a single point of failure.
It is advisable to have multiple HAProxy instances running,
where the number of these instances is a small odd number like 3 or 5.
You need to ensure its availability by other means,
such as Keepalived or Pacemaker.
The common practice is to locate an HAProxy instance on each OpenStack
controller in the environment.
Once configured (see example file below), add HAProxy to the cluster
and ensure the VIPs can only run on machines where HAProxy is active:
.. code-block:: console
$ pcs resource create lb-haproxy systemd:haproxy --clone
$ pcs constraint order start p_api-ip then lb-haproxy-clone kind=Optional
$ pcs constraint colocation add p_api-ip with lb-haproxy-clone
Example Config File
Here is an example ``/etc/haproxy/haproxy.cfg`` configuration file.
You need a copy of it on each controller node.
.. note::
To implement any changes made to this file, you must restart the HAProxy service.
.. code-block:: none
chroot /var/lib/haproxy
group haproxy
maxconn 4000
pidfile /var/run/haproxy.pid
user haproxy
log global
maxconn 4000
option redispatch
retries 3
timeout http-request 10s
timeout queue 1m
timeout connect 10s
timeout client 1m
timeout server 1m
timeout check 10s
listen dashboard_cluster
bind <Virtual IP>:443
balance source
option tcpka
option httpchk
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen galera_cluster
bind <Virtual IP>:3306
balance source
option httpchk
server controller1 check port 9200 inter 2000 rise 2 fall 5
server controller2 backup check port 9200 inter 2000 rise 2 fall 5
server controller3 backup check port 9200 inter 2000 rise 2 fall 5
listen glance_api_cluster
bind <Virtual IP>:9292
balance source
option tcpka
option httpchk
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen glance_registry_cluster
bind <Virtual IP>:9191
balance source
option tcpka
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen keystone_admin_cluster
bind <Virtual IP>:35357
balance source
option tcpka
option httpchk
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen keystone_public_internal_cluster
bind <Virtual IP>:5000
balance source
option tcpka
option httpchk
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen nova_ec2_api_cluster
bind <Virtual IP>:8773
balance source
option tcpka
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen nova_compute_api_cluster
bind <Virtual IP>:8774
balance source
option tcpka
option httpchk
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen nova_metadata_api_cluster
bind <Virtual IP>:8775
balance source
option tcpka
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen cinder_api_cluster
bind <Virtual IP>:8776
balance source
option tcpka
option httpchk
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen ceilometer_api_cluster
bind <Virtual IP>:8777
balance source
option tcpka
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen nova_vncproxy_cluster
bind <Virtual IP>:6080
balance source
option tcpka
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen neutron_api_cluster
bind <Virtual IP>:9696
balance source
option tcpka
option httpchk
option tcplog
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
listen swift_proxy_cluster
bind <Virtual IP>:8080
balance source
option tcplog
option tcpka
server controller1 check inter 2000 rise 2 fall 5
server controller2 check inter 2000 rise 2 fall 5
server controller3 check inter 2000 rise 2 fall 5
.. note::
The Galera cluster configuration directive ``backup`` indicates
that two of the three controllers are standby nodes.
This ensures that only one node services write requests
because OpenStack support for multi-node writes is not yet production-ready.
.. note::
The Telemetry API service configuration does not have the ``option httpchk``
directive as it cannot process this check properly.
TODO: explain why the Telemetry API is so special
[TODO: we need more commentary about the contents and format of this file]

@ -0,0 +1,147 @@
Identity services (keystone)
OpenStack Identity (keystone)
is the Identity service in OpenStack that is used by many services.
You should be familiar with
`OpenStack identity concepts
before proceeding.
Making the OpenStack Identity service highly available
in active / passive mode involves:
- :ref:`keystone-pacemaker`
- :ref:`keystone-config-identity`
- :ref:`keystone-services-config`
.. _keystone-pacemaker:
Add OpenStack Identity resource to Pacemaker
#. You must first download the OpenStack Identity resource to Pacemaker
by running the following commands:
.. code-block:: console
# cd /usr/lib/ocf/resource.d
# mkdir openstack
# cd openstack
# wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/keystone
# chmod a+rx *
#. You can now add the Pacemaker configuration
for the OpenStack Identity resource
by running the :command:`crm configure` command
to connect to the Pacemaker cluster.
Add the following cluster resources:
primitive p_keystone ocf:openstack:keystone \
params config="/etc/keystone/keystone.conf"
os_password="secretsecret" \
os_auth_url="" \
op monitor interval="30s" timeout="30s"
This configuration creates ``p_keystone``,
a resource for managing the OpenStack Identity service.
:command:`crm configure` supports batch input
so you may copy and paste the above lines
into your live Pacemaker configuration,
and then make changes as required.
For example, you may enter ``edit p_ip_keystone``
from the :command:`crm configure` menu
and edit the resource to match your preferred virtual IP address.
#. After you add these resources,
commit your configuration changes by entering :command:`commit`
from the :command:`crm configure` menu.
Pacemaker then starts the OpenStack Identity service
and its dependent resources on one of your nodes.
.. _keystone-config-identity:
Configure OpenStack Identity service
#. Edit the :file:`keystone.conf` file
to change the values of the :manpage:`bind(2)` parameters:
.. code-block:: ini
bind_host =
public_bind_host =
admin_bind_host =
The ``admin_bind_host`` parameter
lets you use a private network for admin access.
#. To be sure that all data is highly available,
ensure that everything is stored in the MySQL database
(which is also highly available):
.. code-block:: ini
driver = keystone.catalog.backends.sql.Catalog
driver = keystone.identity.backends.sql.Identity
.. _keystone-services-config:
Configure OpenStack services to use the highly available OpenStack Identity
Your OpenStack services must now point
their OpenStack Identity configuration
to the highly available virtual cluster IP address
rather than point to the physical IP address
of an OpenStack Identity server as you would do
in a non-HA environment.
#. For OpenStack Compute, for example,
if your OpenStack Identity service IP address is,
use the following configuration in your :file:`api-paste.ini` file:
.. code-block:: ini
auth_host =
#. You also need to create the OpenStack Identity Endpoint
with this IP address.
.. note::
If you are using both private and public IP addresses,
you should create two virtual IP addresses
and define your endpoint like this:
.. code-block:: console
$ openstack endpoint create --region $KEYSTONE_REGION \
$service-type public http://PUBLIC_VIP:5000/v2.0
$ openstack endpoint create --region $KEYSTONE_REGION \
$service-type admin
$ openstack endpoint create --region $KEYSTONE_REGION \
$service-type internal
#. If you are using the horizon dashboard,
edit the :file:`local_settings.py` file
to include the following:
.. code-block:: ini

@ -0,0 +1,21 @@
Memcached is a general-purpose distributed memory caching system. It
is used to speed up dynamic database-driven websites by caching data
and objects in RAM to reduce the number of times an external data
source must be read.
Memcached is a memory cache daemon that can be used by most OpenStack
services to store ephemeral data, such as tokens.
Access to memcached is not handled by HAProxy because replicated
access is currently only in an experimental state. Instead, OpenStack
services must be supplied with the full list of hosts running memcached.
The Memcached client implements hashing to balance objects among the
instances. Failure of an instance only impacts a percentage of the
objects and the client automatically removes it from the list of
instances. The SLA is several minutes.

@ -0,0 +1,597 @@
Pacemaker cluster stack
`Pacemaker <http://clusterlabs.org/>`_ cluster stack is the state-of-the-art
high availability and load balancing stack for the Linux platform.
Pacemaker is useful to make OpenStack infrastructure highly available.
Also, it is storage- and application-agnostic, and in no way
specific to OpenStack.
Pacemaker relies on the
`Corosync <http://corosync.github.io/corosync/>`_ messaging layer
for reliable cluster communications.
Corosync implements the Totem single-ring ordering and membership protocol.
It also provides UDP and InfiniBand based messaging,
quorum, and cluster membership to Pacemaker.
Pacemaker does not inherently (need or want to) understand the
applications it manages. Instead, it relies on resource agents (RAs),
scripts that encapsulate the knowledge of how to start, stop, and
check the health of each application managed by the cluster.
These agents must conform to one of the `OCF <https://github.com/ClusterLabs/
`SysV Init <http://refspecs.linux-foundation.org/LSB_3.0.0/LSB-Core-generic/
LSB-Core-generic/iniscrptact.html>`_, Upstart, or Systemd standards.
Pacemaker ships with a large set of OCF agents (such as those managing
MySQL databases, virtual IP addresses, and RabbitMQ), but can also use
any agents already installed on your system and can be extended with
your own (see the
`developer guide <http://www.linux-ha.org/doc/dev-guides/ra-dev-guide.html>`_).
The steps to implement the Pacemaker cluster stack are:
- :ref:`pacemaker-install`
- :ref:`pacemaker-corosync-setup`
- :ref:`pacemaker-corosync-start`
- :ref:`pacemaker-start`
- :ref:`pacemaker-cluster-properties`
.. _pacemaker-install:
Install packages
On any host that is meant to be part of a Pacemaker cluster,
you must first establish cluster communications
through the Corosync messaging layer.
This involves installing the following packages
(and their dependencies, which your package manager
usually installs automatically):
- pacemaker
- pcs (CentOS or RHEL) or crmsh
- corosync
- fence-agents (CentOS or RHEL) or cluster-glue
- resource-agents
- libqb0
.. _pacemaker-corosync-setup:
Set up the cluster with `pcs`
#. Make sure the ``pcsd`` service is running and configured to start at boot time:
.. code-block:: console
$ systemctl enable pcsd
$ systemctl start pcsd
#. Set a password for the hacluster user **on each host**.
Since the cluster is a single administrative domain, it is generally
accepted to use the same password on all nodes.
.. code-block:: console
$ echo my-secret-password-no-dont-use-this-one \
| passwd --stdin hacluster
#. Use that password to authenticate to the nodes which will
make up the cluster. The :option:`-p` option is used to give
the password on command line and makes it easier to script.
.. code-block:: console
$ pcs cluster auth controller1 controller2 controller3 \
-u hacluster -p my-secret-password-no-dont-use-this-one --force
#. Create the cluster, giving it a name, and start it:
.. code-block:: console
$ pcs cluster setup --force --name my-first-openstack-cluster \
controller1 controller2 controller3
$ pcs cluster start --all
.. note::
In Red Hat Enterprise Linux or CentOS environments, this is a recommended
path to perform configuration. For more information, see the `RHEL docs
Set up the cluster with `crmsh`
After installing the Corosync package, you must create
the :file:`/etc/corosync/corosync.conf` configuration file.
.. note::
For Ubuntu, you should also enable the Corosync service
in the ``/etc/default/corosync`` configuration file.
Corosync can be configured to work
with either multicast or unicast IP addresses
or to use the votequorum library.
- :ref:`corosync-multicast`
- :ref:`corosync-unicast`
- :ref:`corosync-votequorum`
.. _corosync-multicast:
Set up Corosync with multicast
Most distributions ship an example configuration file
as part of the documentation bundled with the Corosync package.
An example Corosync configuration file is shown below:
**Example Corosync configuration file for multicast (corosync.conf)**
.. code-block:: ini
totem {
version: 2
# Time (in ms) to wait for a token (1)
token: 10000
# How many token retransmits before forming a new
# configuration
token_retransmits_before_loss_const: 10
# Turn off the virtual synchrony filter
vsftype: none
# Enable encryption (2)
secauth: on
# How many threads to use for encryption/decryption
threads: 0
# This specifies the redundant ring protocol, which may be
# none, active, or passive. (3)
rrp_mode: active
# The following is a two-ring multicast configuration. (4)
interface {
ringnumber: 0
mcastport: 5405
interface {
ringnumber: 1
mcastport: 5405
amf {
mode: disabled
service {
# Load the Pacemaker Cluster Resource Manager (5)
ver: 1
name: pacemaker
aisexec {
user: root
group: root
logging {
fileline: off
to_stderr: yes
to_logfile: no
to_syslog: yes
syslog_facility: daemon
debug: off
timestamp: on
logger_subsys {
subsys: AMF
debug: off
tags: enter|leave|trace1|trace2|trace3|trace4|trace6
Note the following:
- The ``token`` value specifies the time, in milliseconds,
during which the Corosync token is expected
to be transmitted around the ring.
When this timeout expires, the token is declared lost,
and after ``token_retransmits_before_loss_const`` lost tokens,
the non-responding processor (cluster node) is declared dead.
In other words, ``token × token_retransmits_before_loss_const``
is the maximum time a node is allowed to not respond to cluster messages
before being considered dead.
The default for token is 1000 milliseconds (1 second),
with 4 allowed retransmits.
These defaults are intended to minimize failover times,
but can cause frequent "false alarms" and unintended failovers
in case of short network interruptions. The values used here are safer,
albeit with slightly extended failover times.
- With ``secauth`` enabled,
Corosync nodes mutually authenticate using a 128-byte shared secret
stored in the :file:`/etc/corosync/authkey` file,
which may be generated with the :command:`corosync-keygen` utility.
When using ``secauth``, cluster communications are also encrypted.
- In Corosync configurations using redundant networking
(with more than one interface),
you must select a Redundant Ring Protocol (RRP) mode other than none.
``active`` is the recommended RRP mode.
Note the following about the recommended interface configuration:
- Each configured interface must have a unique ``ringnumber``,
starting with 0.
- The ``bindnetaddr`` is the network address of the interfaces to bind to.
The example uses two network addresses of /24 IPv4 subnets.
- Multicast groups (``mcastaddr``) must not be reused
across cluster boundaries.
In other words, no two distinct clusters
should ever use the same multicast group.
Be sure to select multicast addresses compliant with
`RFC 2365, "Administratively Scoped IP Multicast"
- For firewall configurations,
note that Corosync communicates over UDP only,
and uses ``mcastport`` (for receives)
and ``mcastport - 1`` (for sends).
- The service declaration for the pacemaker service
may be placed in the :file:`corosync.conf` file directly
or in its own separate file, :file:`/etc/corosync/service.d/pacemaker`.
.. note::
If you are using Corosync version 2 on Ubuntu 14.04,
remove or comment out lines under the service stanza,
which enables Pacemaker to start up. Another potential
problem is the boot and shutdown order of Corosync and
Pacemaker. To force Pacemaker to start after Corosync and
stop before Corosync, fix the start and kill symlinks manually:
.. code-block:: console
# update-rc.d pacemaker start 20 2 3 4 5 . stop 00 0 1 6 .
The Pacemaker service also requires an additional
configuration file ``/etc/corosync/uidgid.d/pacemaker``
to be created with the following content:
.. code-block:: ini
uidgid {
uid: hacluster
gid: haclient
- Once created, the :file:`corosync.conf` file
(and the :file:`authkey` file if the secauth option is enabled)
must be synchronized across all cluster nodes.
.. _corosync-unicast:
Set up Corosync with unicast
For environments that do not support multicast,
Corosync should be configured for unicast.
An example fragment of the :file:`corosync.conf` file
for unicast is shown below:
**Corosync configuration file fragment for unicast (corosync.conf)**
.. code-block:: ini
totem {
interface {
ringnumber: 0
broadcast: yes (1)
mcastport: 5405
interface {
ringnumber: 1
broadcast: yes
mcastport: 5405
transport: udpu (2)
nodelist { (3)
node {
nodeid: 1
node {
nodeid: 2
node {
nodeid: 3
Note the following:
- If the ``broadcast`` parameter is set to yes,
the broadcast address is used for communication.
If this option is set, the ``mcastaddr`` parameter should not be set.
- The ``transport`` directive controls the transport mechanism used.
To avoid the use of multicast entirely,
specify the ``udpu`` unicast transport parameter.
This requires specifying the list of members
in the ``nodelist`` directive;
this could potentially make up the membership before deployment.
The default is ``udp``.
The transport type can also be set to ``udpu`` or ``iba``.
- Within the ``nodelist`` directive,
it is possible to specify specific information
about the nodes in the cluster.
The directive can contain only the node sub-directive,
which specifies every node that should be a member of the membership,
and where non-default options are needed.
Every node must have at least the ``ring0_addr`` field filled.
.. note::
For UDPU, every node that should be a member
of the membership must be specified.
Possible options are:
- ``ring{X}_addr`` specifies the IP address of one of the nodes.
{X} is the ring number.
- ``nodeid`` is optional
when using IPv4 and required when using IPv6.
This is a 32-bit value specifying the node identifier
delivered to the cluster membership service.
If this is not specified with IPv4,
the node id is determined from the 32-bit IP address
of the system to which the system is bound with ring identifier of 0.
The node identifier value of zero is reserved and should not be used.
.. _corosync-votequorum:
Set up Corosync with votequorum library
The votequorum library is part of the corosync project.
It provides an interface to the vote-based quorum service
and it must be explicitly enabled in the Corosync configuration file.
The main role of votequorum library is to avoid split-brain situations,
but it also provides a mechanism to:
- Query the quorum status
- Get a list of nodes known to the quorum service
- Receive notifications of quorum state changes
- Change the number of votes assigned to a node
- Change the number of expected votes for a cluster to be quorate
- Connect an additional quorum device
to allow small clusters to remain quorate during node outages
The votequorum library has been created to replace and eliminate
qdisk, the disk-based quorum daemon for CMAN,
from advanced cluster configurations.
A sample votequorum service configuration
in the :file:`corosync.conf` file is:
.. code-block:: ini
quorum {
provider: corosync_votequorum (1)
expected_votes: 7 (2)
wait_for_all: 1 (3)
last_man_standing: 1 (4)
last_man_standing_window: 10000 (5)
Note the following:
- Specifying ``corosync_votequorum`` enables the votequorum library;
this is the only required option.
- The cluster is fully operational with ``expected_votes`` set to 7 nodes
(each node has 1 vote), quorum: 4.
If a list of nodes is specified as ``nodelist``,
the ``expected_votes`` value is ignored.
- Setting ``wait_for_all`` to 1 means that,
when starting up a cluster (all nodes down),
the cluster quorum is held until all nodes are online
and have joined the cluster for the first time.
This parameter is new in Corosync 2.0.
- Setting ``last_man_standing`` to 1 enables
the Last Man Standing (LMS) feature;
by default, it is disabled (set to 0).
If a cluster is on the quorum edge
(``expected_votes:`` set to 7; ``online nodes:`` set to 4)
for longer than the time specified
for the ``last_man_standing_window`` parameter,
the cluster can recalculate quorum and continue operating
even if the next node is lost.
This logic is repeated until the number of online nodes
in the cluster reaches 2.
In order to allow the cluster to step down from 2 members to only 1,
the ``auto_tie_breaker`` parameter needs to be set;
this is not recommended for production environments.
- ``last_man_standing_window`` specifies the time, in milliseconds,
required to recalculate quorum after one or more hosts
have been lost from the cluster.
To do the new quorum recalculation,
the cluster must have quorum for at least the interval
specified for ``last_man_standing_window``;
the default is 10000ms.
.. _pacemaker-corosync-start:
Start Corosync
Corosync is started as a regular system service.
Depending on your distribution, it may ship with an LSB init script,
an upstart job, or a systemd unit file.
Either way, the service is usually named corosync:
- :command:`# /etc/init.d/corosync start` (LSB)
- :command:`# service corosync start` (LSB, alternate)
- :command:`# start corosync` (upstart)
- :command:`# systemctl start corosync` (systemd)
You can now check the Corosync connectivity with two tools.
Use the :command:`corosync-cfgtool` utility with the :option:`-s` option
to get a summary of the health of the communication rings:
.. code-block:: console
# corosync-cfgtool -s
Printing ring status.
Local node ID 435324542
id =
status = ring 0 active with no faults
id =
status = ring 1 active with no faults
Use the :command:`corosync-objctl` utility
to dump the Corosync cluster member list:
.. code-block:: console
# corosync-objctl runtime.totem.pg.mrp.srp.members
runtime.totem.pg.mrp.srp.435324542.ip=r(0) ip( r(1) ip(
runtime.totem.pg.mrp.srp.983895584.ip=r(0) ip( r(1) ip(
You should see a ``status=joined`` entry
for each of your constituent cluster nodes.
[TODO: Should the main example now use corosync-cmapctl and have the note
give the command for Corosync version 1?]
.. note::
If you are using Corosync version 2, use the :command:`corosync-cmapctl`
utility instead of :command:`corosync-objctl`; it is a direct replacement.
.. _pacemaker-start:
Start Pacemaker
After the Corosync services have been started
and you have verified that the cluster is communicating properly,
you can start :command:`pacemakerd`, the Pacemaker master control process:
- :command:`# /etc/init.d/pacemaker start` (LSB)
- :command:`# service pacemaker start` (LSB, alternate)
- :command:`# start pacemaker` (upstart)
- :command:`# systemctl start pacemaker` (systemd)
After the Pacemaker services have started,
Pacemaker creates a default empty cluster configuration with no resources.
Use the :command:`crm_mon` utility to observe the status of Pacemaker:
.. code-block:: console
Last updated: Sun Oct 7 21:07:52 2012
Last change: Sun Oct 7 20:46:00 2012 via cibadmin on controller2
Stack: openais
Current DC: controller2 - partition with quorum
Version: 1.1.6-9971ebba4494012a93c03b40a2c58ec0eb60f50c
3 Nodes configured, 3 expected votes
0 Resources configured.
Online: [ controller3 controller2 controller1 ]
.. _pacemaker-cluster-properties:
Set basic cluster properties
After you set up your Pacemaker cluster,
you should set a few basic cluster properties:
.. code-block:: console
$ crm configure property pe-warn-series-max="1000" \
pe-input-series-max="1000" \
pe-error-series-max="1000" \
.. code-block:: console
$ pcs property set pe-warn-series-max=1000 \
pe-input-series-max=1000 \
pe-error-series-max=1000 \
Note the following:
- Setting the ``pe-warn-series-max``, ``pe-input-series-max``
and ``pe-error-series-max`` parameters to 1000
instructs Pacemaker to keep a longer history of the inputs processed
and errors and warnings generated by its Policy Engine.
This history is useful if you need to troubleshoot the cluster.
- Pacemaker uses an event-driven approach to cluster state processing.
The ``cluster-recheck-interval`` parameter (which defaults to 15 minutes)
defines the interval at which certain Pacemaker actions occur.
It is usually prudent to reduce this to a shorter interval,
such as 5 or 3 minutes.
After you make these changes, you may commit the updated configuration.

An AMQP (Advanced Message Queuing Protocol) compliant message bus is
required for most OpenStack components in order to coordinate the
execution of jobs entered into the system.
The most popular AMQP implementation used in OpenStack installations
is RabbitMQ.
RabbitMQ nodes fail over both on the application and the
infrastructure layers.
The application layer is controlled by the ``oslo.messaging``
configuration options for multiple AMQP hosts. If the AMQP node fails,
the application reconnects to the next one configured within the
specified reconnect interval. The specified reconnect interval
constitutes its SLA.
On the infrastructure layer, the SLA is the time it takes for the RabbitMQ
cluster to reassemble. Several cases are possible. The Mnesia keeper
node is the master of the corresponding Pacemaker resource for
RabbitMQ; when it fails, the result is a full AMQP cluster downtime
interval. Normally, its SLA is no more than several minutes. Failure
of another node that is a slave of the corresponding Pacemaker
resource for RabbitMQ results in no AMQP cluster downtime at all.
Making the RabbitMQ service highly available involves the following steps:
- :ref:`Install RabbitMQ<rabbitmq-install>`
- :ref:`Configure RabbitMQ for HA queues<rabbitmq-configure>`
- :ref:`Configure OpenStack services to use Rabbit HA queues
.. note::
Access to RabbitMQ is not normally handled by HAProxy. Instead,
consumers must be supplied with the full list of hosts running
RabbitMQ with ``rabbit_hosts`` and turn on the ``rabbit_ha_queues`` option.
Jon Eck found the `core issue
and went into some detail regarding the `history and solution
on his blog.
In summary though:
The source address for the connection from HAProxy back to the
client is the VIP address. However the VIP address is no longer
present on the host. This means that the network (IP) layer
deems the packet unroutable, and informs the transport (TCP)
layer. TCP, however, is a reliable transport. It knows how to
handle transient errors and will retry. And so it does.
In this case that is a problem though, because:
TCP generally holds on to hope for a long time. A ballpark
estimate is somewhere on the order of tens of minutes (30
minutes is commonly referenced). During this time it will keep
probing and trying to deliver the data.
It is important to note that HAProxy has no idea that any of this is
happening. As far as its process is concerned, it called
``write()`` with the data and the kernel returned success. The
resolution is already understood and just needs to make its way
through a review.
.. _rabbitmq-install:
Install RabbitMQ
The commands for installing RabbitMQ are specific to the Linux distribution
you are using:
.. list-table:: Install RabbitMQ
:widths: 15 30
:header-rows: 1
* - Distribution
- Command
* - Ubuntu, Debian
- :command:`# apt-get install rabbitmq-server`
* - RHEL, Fedora, CentOS
- :command:`# yum install rabbitmq-server`
* - openSUSE
- :command:`# zypper install rabbitmq-server`
* - SLES 12
- :command:`# zypper addrepo -f obs://Cloud:OpenStack:Kilo/SLE_12 Kilo`
[Verify fingerprint of imported GPG key; see below]
:command:`# zypper install rabbitmq-server`
.. note::
For SLES 12, the packages are signed by GPG key 893A90DAD85F9316.
You should verify the fingerprint of the imported GPG key before using it.
Key ID: 893A90DAD85F9316
Key Name: Cloud:OpenStack OBS Project <Cloud:OpenStack@build.opensuse.org>
Key Fingerprint: 35B34E18ABC1076D66D5A86B893A90DAD85F9316
Key Created: Tue Oct 8 13:34:21 2013
Key Expires: Thu Dec 17 13:34:21 2015
For more information,
see the official installation manual for the distribution:
- `Debian and Ubuntu <http://www.rabbitmq.com/install-debian.html>`_
- `RPM based <http://www.rabbitmq.com/install-rpm.html>`_
(RHEL, Fedora, CentOS, openSUSE)
.. _rabbitmq-configure:
Configure RabbitMQ for HA queues
[TODO: This section should begin with a brief mention
about what HA queues are and why they are valuable, etc]
We are building a cluster of RabbitMQ nodes to construct a RabbitMQ broker,
which is a logical grouping of several Erlang nodes.
The following components/services can work with HA queues:
[TODO: replace "currently" with specific release names]
[TODO: Does this list need to be updated? Perhaps we need a table
that shows each component and the earliest release that allows it
to work with HA queues.]
- OpenStack Compute
- OpenStack Block Storage
- OpenStack Networking
- Telemetry
We have to consider that, while exchanges and bindings
survive the loss of individual nodes,
queues and their messages do not
because a queue and its contents are located on one node.
If we lose this node, we also lose the queue.
Mirrored queues in RabbitMQ improve
the availability of service since they are resilient to failures.
Production servers should run (at least) three RabbitMQ servers;
for testing and demonstration purposes,
it is possible to run only two servers.
In this section, we configure two nodes,
called ``rabbit1`` and ``rabbit2``.
To build a broker, we need to ensure
that all nodes have the same Erlang cookie file.
[TODO: Should the example instead use a minimum of three nodes?]
#. To do so, stop RabbitMQ everywhere and copy the cookie
from the first node to each of the other node(s):
.. code-block:: console
# scp /var/lib/rabbitmq/.erlang.cookie root@NODE:/var/lib/rabbitmq/.erlang.cookie
#. On each target node, verify the correct owner,
group, and permissions of the file :file:`.erlang.cookie`.
.. code-block:: console
# chown rabbitmq:rabbitmq /var/lib/rabbitmq/.erlang.cookie
# chmod 400 /var/lib/rabbitmq/.erlang.cookie
#. Start the message queue service on all nodes and configure it to start
when the system boots.
On Ubuntu, it is configured by default.
On CentOS, RHEL, openSUSE, and SLES:
.. code-block:: console
# systemctl enable rabbitmq-server.service
# systemctl start rabbitmq-server.service
#. Verify that the nodes are running:
.. code-block:: console
# rabbitmqctl cluster_status
Cluster status of node rabbit@NODE...
#. Run the following commands on each node except the first one:
.. code-block:: console
# rabbitmqctl stop_app
Stopping node rabbit@NODE...
# rabbitmqctl join_cluster --ram rabbit@rabbit1
# rabbitmqctl start_app
Starting node rabbit@NODE ...
.. note::
The default node type is a disc node. In this guide, nodes
join the cluster as RAM nodes.
#. To verify the cluster status:
.. code-block:: console
# rabbitmqctl cluster_status
Cluster status of node rabbit@NODE...
[{nodes,[{disc,[rabbit@rabbit1]},{ram,[rabbit@NODE]}]}, \
If the cluster is working,
you can create usernames and passwords for the queues.
#. To ensure that all queues except those with auto-generated names
are mirrored across all running nodes,
set the ``ha-mode`` policy key to ``all``
by running the following command on one of the nodes:
.. code-block:: console
# rabbitmqctl set_policy ha-all '^(?!amq\.).*' '{"ha-mode": "all"}'
More information is available in the RabbitMQ documentation:
- `Highly Available Queues <http://www.rabbitmq.com/ha.html>`_
- `Clustering Guide <https://www.rabbitmq.com/clustering.html>`_
.. note::
As another option to make RabbitMQ highly available, RabbitMQ contains the
OCF scripts for the Pacemaker cluster resource agents since version 3.5.7.
It provides the active/active RabbitMQ cluster with mirrored queues.
For more information, see `Auto-configuration of a cluster with
a Pacemaker <http://www.rabbitmq.com/pacemaker.html>`_.
.. _rabbitmq-services:
Configure OpenStack services to use Rabbit HA queues
We have to configure the OpenStack components
to use at least two RabbitMQ nodes.
Do this configuration on all services using RabbitMQ:
#. RabbitMQ HA cluster host:port pairs:
#. How frequently to retry connecting with RabbitMQ:
[TODO: document the unit of measure here? Seconds?]
#. How long to back off between retries when connecting to RabbitMQ:
[TODO: document the unit of measure here? Seconds?]
#. Maximum number of retries when trying to connect to RabbitMQ (infinite by default):
#. Use durable queues in RabbitMQ:
#. Use HA queues in RabbitMQ (x-ha-policy: all):
.. note::
If you change the configuration from an old set-up
that did not use HA queues, you should restart the service:
.. code-block:: console
# rabbitmqctl stop_app
# rabbitmqctl reset
# rabbitmqctl start_app

[TODO (Add Telemetry overview)]
Telemetry central agent
The Telemetry central agent can be configured to partition its polling
workload between multiple agents, enabling high availability.
Both the central and the compute agent can run in an HA deployment,
which means that multiple instances of these services can run in
parallel with workload partitioning among these running instances.
The `Tooz <https://pypi.python.org/pypi/tooz>`__ library provides
the coordination within the groups of service instances.
It provides an API above several back ends that can be used for building
distributed applications.
Tooz supports
`various drivers <http://docs.openstack.org/developer/tooz/drivers.html>`__
including the following back end solutions:
* `Zookeeper <http://zookeeper.apache.org/>`__.
Recommended solution by the Tooz project.
* `Redis <http://redis.io/>`__.
Recommended solution by the Tooz project.
* `Memcached <http://memcached.org/>`__.
Recommended for testing.
You must configure a supported Tooz driver for the HA deployment of
the Telemetry services.
For information about the required configuration options that have
to be set in the :file:`ceilometer.conf` configuration file for both
the central and compute agents, see the `coordination section
in the OpenStack Configuration Reference.
.. note:: If the ``backend_url`` option is not set, only one
instance of both the central and compute agent service is able to run
and function correctly.
The availability check of the instances is provided by heartbeat messages.
When the connection with an instance is lost, the workload will be
reassigned within the remaining instances in the next polling cycle.
.. note:: Memcached uses a timeout value, which should always be set to
a value that is higher than the heartbeat value set for Telemetry.
For backward compatibility and supporting existing deployments, the central
agent configuration also supports using different configuration files for
groups of service instances of this type that are running in parallel.
To enable this configuration, set a value for the ``partitioning_group_prefix``
option in the `central section <http://docs.openstack.org/liberty/
in the OpenStack Configuration Reference.
.. warning:: For each sub-group of the central agent pool with the same
``partitioning_group_prefix`` a disjoint subset of meters must be polled --
otherwise samples may be missing or duplicated. The list of meters to poll
can be set in the :file:`/etc/ceilometer/pipeline.yaml` configuration file.
For more information about pipelines see the `Data collection and
To enable the compute agent to run multiple instances simultaneously with
workload partitioning, the ``workload_partitioning`` option has to be set to
``True`` under the `compute section <http://docs.openstack.org/liberty/
in the :file:`ceilometer.conf` configuration file.

View File

Configure the VIP
You must select and assign a virtual IP address (VIP)
that can freely float between cluster nodes.
This configuration creates ``vip``,
a virtual IP address for use by the API node (````):
For ``crmsh``:
.. code-block:: console
primitive vip ocf:heartbeat:IPaddr2 \
params ip="" cidr_netmask="24" op monitor interval="30s"
For ``pcs``:
.. code-block:: console
# pcs resource create vip ocf:heartbeat:IPaddr2 \
params ip="" cidr_netmask="24" op monitor interval="30s"

Configuring the controller for high availability
The cloud controller runs on the management network
and must talk to all other services.
.. toctree::
:maxdepth: 2

Width:  |  Height:  |  Size: 223 KiB

Binary file not shown.


Width:  |  Height:  |  Size: 215 KiB

Binary file not shown.


Width:  |  Height:  |  Size: 52 KiB

Hardware setup
The standard hardware requirements:
- `Provider networks <http://docs.openstack.org/liberty/install-guide-ubuntu/overview.html#networking-option-1-provider-networks>`_
- `Self-service networks <http://docs.openstack.org/liberty/install-guide-ubuntu/overview.html#networking-option-2-self-service-networks>`_
However, OpenStack does not require a significant amount of resources
and the following minimum requirements should support
a proof-of-concept high availability environment
with core services and several instances:
[TODO: Verify that these numbers are good]
| Node type | Processor | Memory | Storage | NIC |
| controller node | 1-2 | 8 GB | 100 GB | 2 |
| compute node | 2-4+ | 8+ GB | 100+ GB | 2 |
For demonstrations and studying,
you can set up a test environment on virtual machines (VMs).
This has the following benefits:
- One physical server can support multiple nodes,
each of which supports almost any number of network interfaces.
- Ability to take periodic "snapshots" throughout the installation process
and "roll back" to a working configuration in the event of a problem.
However, running an OpenStack environment on VMs
degrades the performance of your instances,
particularly if your hypervisor and/or processor lacks support
for hardware acceleration of nested VMs.
.. note::
When installing highly-available OpenStack on VMs,
be sure that your hypervisor permits promiscuous mode
and disables MAC address filtering on the external network.

Hardware considerations for high availability
[TODO: Provide a minimal architecture example for HA,
expanded on that given in
for easy comparison]
.. toctree::
:maxdepth: 2

OpenStack High Availability Guide
This guide describes how to install and configure
OpenStack for high availability.
It supplements the OpenStack Installation Guides
and assumes that you are familiar with the material in those guides.
This guide documents OpenStack Mitaka, OpenStack Liberty, and OpenStack
Kilo releases.
.. warning:: This guide is a work-in-progress and changing rapidly
while we continue to test and enhance the guidance. Please note
where there are open "to do" items and help where you are able.
.. toctree::
:maxdepth: 2
Search in this guide
* :ref:`search`

Install memcached
[TODO: Verify that Oslo supports hash synchronization;
if so, this should not take more than load balancing.]
[TODO: This hands off to two different docs for install information.
We should choose one or explain the specific purpose of each.]
Most OpenStack services can use memcached
to store ephemeral data such as tokens.
Although memcached does not support
typical forms of redundancy such as clustering,
OpenStack services can use almost any number of instances
by configuring multiple hostnames or IP addresses.
The memcached client implements hashing
to balance objects among the instances.
Failure of an instance only impacts a percentage of the objects
and the client automatically removes it from the list of instances.
To install and configure memcached, read the
`official documentation <https://code.google.com/p/memcached/wiki/NewStart>`_.
Memory caching is managed by `oslo.cache
so the way to use multiple memcached servers is the same for all projects.
[TODO: Should this show three hosts?]
Example configuration with two hosts:
memcached_servers = controller1:11211,controller2:11211
By default, `controller1` handles the caching service but,
if the host goes down, `controller2` does the job.
For more information about memcached installation,
see the `OpenStack Administrator Guide

Configure NTP
You must configure NTP to properly synchronize services among nodes.
We recommend that you configure the controller node to reference
more accurate (lower stratum) servers and other nodes to reference
the controller node. For more information, see the
`Install Guides <http://docs.openstack.org/#install-guides>`_.

Install operating system on each node
The first step in setting up your highly-available OpenStack cluster
is to install the operating system on each node.
Follow the instructions in the OpenStack Installation Guides:
- `CentOS and RHEL <http://docs.openstack.org/liberty/install-guide-rdo/environment.html>`_
- `openSUSE and SUSE Linux Enterprise Server <http://docs.openstack.org/liberty/install-guide-obs/environment.html>`_
- `Ubuntu <http://docs.openstack.org/liberty/install-guide-ubuntu/environment.html>`_
The OpenStack Installation Guides also include a list of the services
that use passwords with important notes about using them.
This guide uses the following example IP addresses:
.. code-block:: none
# controller controller # virtual IP controller1 controller2 controller3

Installing high availability packages
[TODO -- write intro to this section]
.. toctree::
:maxdepth: 2

The keepalived architecture
High availability strategies
The following diagram shows a very simplified view of the different
strategies used to achieve high availability for the OpenStack
.. image:: /figures/keepalived-arch.jpg
:width: 100%
Depending on the method used to communicate with the service, the
following availability strategies will be followed:
- Keepalived, for the HAProxy instances.
- Access via an HAProxy virtual IP, for services such as HTTPd that
are accessed via a TCP socket that can be load balanced
- Built-in application clustering, when available from the application.
Galera is one example of this.
- Starting up one instance of the service on several controller nodes,
when they can coexist and coordinate by other means. RPC in
``nova-conductor`` is one example of this.
- No high availability, when the service can only work in
active/passive mode.
There are known issues with cinder-volume that make it advisable to set it as
active-passive for now, see:
While there will be multiple neutron LBaaS agents running, each agent
will manage a set of load balancers, that cannot be failed over to
another node.
Architecture limitations
This architecture has some inherent limitations that should be kept in
mind during deployment and daily operations.
The following sections describe these limitations.
#. Keepalived and network partitions
In case of a network partitioning, there is a chance that two or
more nodes running keepalived claim to hold the same VIP, which may
lead to an undesired behaviour. Since keepalived uses VRRP over
multicast to elect a master (VIP owner), a network partition in
which keepalived nodes cannot communicate will result in the VIPs
existing on two nodes. When the network partition is resolved, the
duplicate VIPs should also be resolved. Note that this network
partition problem with VRRP is a known limitation for this
#. Cinder-volume as a single point of failure
There are currently concerns over the cinder-volume service's ability
to run as a fully active-active service. During the Mitaka
timeframe, this is being worked on, see:
Thus, cinder-volume will only be running on one of the controller
nodes, even if it will be configured on all nodes. In case of a
failure in the node running cinder-volume, it should be started in
a surviving controller node.
#. Neutron-lbaas-agent as a single point of failure
The current design of the neutron LBaaS agent using the HAProxy
driver does not allow high availability for the tenant load
balancers. The neutron-lbaas-agent service will be enabled and
running on all controllers, allowing for load balancers to be
distributed across all nodes. However, a controller node failure
will stop all load balancers running on that node until the service
is recovered or the load balancer is manually removed and created
#. Service monitoring and recovery required
An external service monitoring infrastructure is required to check
the OpenStack service health, and notify operators in case of any
failure. This architecture does not provide any facility for that,
so it would be necessary to integrate the OpenStack deployment with
any existing monitoring environment.
#. Manual recovery after a full cluster restart
Some support services used by RDO or RHEL OSP use their own form of
application clustering. Usually, these services maintain a cluster
quorum, that may be lost in case of a simultaneous restart of all
cluster nodes, for example during a power outage. Each service will
require its own procedure to regain quorum.
If you find any or all of these limitations concerning, you are
encouraged to refer to the
:doc:`Pacemaker HA architecture<intro-ha-arch-pacemaker>` instead.

The Pacemaker architecture
What is a cluster manager
At its core, a cluster is a distributed finite state machine capable
of co-ordinating the startup and recovery of inter-related services
across a set of machines.
Even a distributed and/or replicated application that is able to
survive failures on one or more machines can benefit from a
cluster manager:
#. Awareness of other applications in the stack
While SYS-V init replacements like systemd can provide
deterministic recovery of a complex stack of services, the
recovery is limited to one machine and lacks the context of what
is happening on other machines - context that is crucial to
determine the difference between a local failure, clean startup
and recovery after a total site failure.
#. Awareness of instances on other machines
Services like RabbitMQ and Galera have complicated boot-up
sequences that require co-ordination, and often serialization, of
startup operations across all machines in the cluster. This is
especially true after site-wide failure or shutdown where we must
first determine the last machine to be active.
#. A shared implementation and calculation of `quorum
It is very important that all members of the system share the same
view of who their peers are and whether or not they are in the
majority. Failure to do this leads very quickly to an internal
`split-brain <http://en.wikipedia.org/wiki/Split-brain_(computing)>`_
state - where different parts of the system are pulling in
different and incompatible directions.
#. Data integrity through fencing (a non-responsive process does not
imply it is not doing anything)
A single application does not have sufficient context to know the
difference between failure of a machine and failure of the
application on a machine. The usual practice is to assume the
machine is dead and carry on; however, this is highly risky - a
rogue process or machine could still be responding to requests and
generally causing havoc. The safer approach is to make use of
remotely accessible power switches and/or network switches and SAN
controllers to fence (isolate) the machine before continuing.
#. Automated recovery of failed instances
While the application can still run after the failure of several
instances, it may not have sufficient capacity to serve the
required volume of requests. A cluster can automatically recover
failed instances to prevent additional load induced failures.
For this reason, the use of a cluster manager like `Pacemaker
<http://clusterlabs.org>`_ is highly recommended.
Deployment flavors
It is possible to deploy three different flavors of the Pacemaker
architecture. The two extremes are **Collapsed** (where every
component runs on every node) and **Segregated** (where every
component runs in its own 3+ node cluster).
Regardless of which flavor you choose, it is recommended that the
clusters contain at least three nodes so that we can take advantage of
`quorum <quorum_>`_.
Quorum becomes important when a failure causes the cluster to split in
two or more partitions. In this situation, you want the majority to
ensure the minority are truly dead (through fencing) and continue to
host resources. For a two-node cluster, no side has the majority and
you can end up in a situation where both sides fence each other, or
both sides are running the same services - leading to data corruption.
Clusters with an even number of hosts suffer from similar issues - a
single network failure could easily cause a N:N split where neither
side retains a majority. For this reason, we recommend an odd number
of cluster members when scaling up.
You can have up to 16 cluster members (this is currently limited by
the ability of corosync to scale higher). In extreme cases, 32 and
even up to 64 nodes could be possible, however, this is not well tested.
In this configuration, there is a single cluster of 3 or more
nodes on which every component is running.
This scenario has the advantage of requiring far fewer, if more
powerful, machines. Additionally, being part of a single cluster
allows us to accurately model the ordering dependencies between
This scenario can be visualized as below.
.. image:: /figures/Cluster-deployment-collapsed.png
:width: 100%
You would choose this option if you prefer to have fewer but more
powerful boxes.
This is the most common option and the one we document here.
In this configuration, each service runs in a dedicated cluster of
3 or more nodes.
The benefits of this approach are the physical isolation between
components and the ability to add capacity to specific components.
You would choose this option if you prefer to have more but
less powerful boxes.
This scenario can be visualized as below, where each box below
represents a cluster of three or more guests.
.. image:: /figures/Cluster-deployment-segregated.png
:width: 100%
It is also possible to follow a segregated approach for one or more
components that are expected to be a bottleneck and use a collapsed
approach for the remainder.
Proxy server
Almost all services in this stack benefit from being proxied.
Using a proxy server provides:
#. Load distribution
Many services can act in an active/active capacity, however, they
usually require an external mechanism for distributing requests to
one of the available instances. The proxy server can serve this
#. API isolation
By sending all API access through the proxy, we can clearly
identify service interdependencies. We can also move them to
locations other than ``localhost`` to increase capacity if the
need arises.
#. Simplified process for adding/removing of nodes
Since all API access is directed to the proxy, adding or removing
nodes has no impact on the configuration of other services. This
can be very useful in upgrade scenarios where an entirely new set
of machines can be configured and tested in isolation before
telling the proxy to direct traffic there instead.
#. Enhanced failure detection
The proxy can be configured as a secondary mechanism for detecting
service failures. It can even be configured to look for nodes in
a degraded state (such as being 'too far' behind in the
replication) and take them out of circulation.
The following components are currently unable to benefit from the use
of a proxy server:
* RabbitMQ
* Memcached
* MongoDB
However, the reasons vary and are discussed under each component's
We recommend HAProxy as the load balancer, however, there are many
alternatives in the marketplace.
We use a check interval of 1 second, however, the timeouts vary by service.
Generally, we use round-robin to distribute load amongst instances of
active/active services, however, Galera uses the ``stick-table`` options
to ensure that incoming connections to the virtual IP (VIP) are
directed to only one of the available back ends.
In Galera's case, although it can run active/active, this helps avoid
lock contention and prevent deadlocks. It is used in combination with
the ``httpchk`` option that ensures only nodes that are in sync with their
peers are allowed to handle requests.

View File

@ -0,0 +1,4 @@
Overview of highly-available compute nodes

High availability concepts
High availability systems seek to minimize two things:
**System downtime**
Occurs when a user-facing service is unavailable
beyond a specified maximum amount of time.
**Data loss**
Accidental deletion or destruction of data.
Most high availability systems guarantee protection against system downtime
and data loss only in the event of a single failure.
However, they are also expected to protect against cascading failures,
where a single failure deteriorates into a series of consequential failures.
Many service providers guarantee :term:`Service Level Agreement (SLA)`
including uptime percentage of computing service, which is calculated based
on the available time and system downtime excluding planned outage time.
Redundancy and failover
High availability is implemented with redundant hardware
running redundant instances of each service.
If one piece of hardware running one instance of a service fails,
the system can then fail over to use another instance of a service
that is running on hardware that did not fail.
A crucial aspect of high availability
is the elimination of single points of failure (SPOFs).
A SPOF is an individual piece of equipment or software
that causes system downtime or data loss if it fails.
In order to eliminate SPOFs, check that mechanisms exist for redundancy of:
- Network components, such as switches and routers
- Applications and automatic service migration
- Storage components
- Facility services such as power, air conditioning, and fire protection
In the event that a component fails and a back-up system must take on
its load, most high availability systems will replace the failed
component as quickly as possible to maintain necessary redundancy. This
way time spent in a degraded protection state is minimized.
Most high availability systems fail in the event of multiple
independent (non-consequential) failures. In this case, most
implementations favor protecting data over maintaining availability.
High availability systems typically achieve an uptime percentage of
99.99% or more, which roughly equates to less than an hour of
cumulative downtime per year. In order to achieve this, high
availability systems should keep recovery times after a failure to
about one to two minutes, sometimes significantly less.
OpenStack currently meets such availability requirements for its own
infrastructure services, meaning that an uptime of 99.99% is feasible
for the OpenStack infrastructure proper. However, OpenStack does not
guarantee 99.99% availability for individual guest instances.
This document discusses some common methods of implementing highly
available systems, with an emphasis on the core OpenStack services and
other open source services that are closely aligned with OpenStack.
These methods are by no means the only ways to do it;
you may supplement these services with commercial hardware and software
that provides additional features and functionality.
You also need to address high availability concerns
for any applications software that you run on your OpenStack environment.
The important thing is to make sure that your services are redundant
and available; how you achieve that is up to you.
Stateless vs. stateful services
Preventing single points of failure can depend on whether or not a
service is stateless.
Stateless service
A service that provides a response after your request
and then requires no further attention.
To make a stateless service highly available,
you need to provide redundant instances and load balance them.
OpenStack services that are stateless include ``nova-api``,
``nova-conductor``, ``glance-api``, ``keystone-api``,
``neutron-api`` and ``nova-scheduler``.
Stateful service
A service where subsequent requests to the service
depend on the results of the first request.
Stateful services are more difficult to manage because a single
action typically involves more than one request, so simply providing
additional instances and load balancing does not solve the problem.
For example, if the horizon user interface reset itself every time
you went to a new page, it would not be very useful.
OpenStack services that are stateful include the OpenStack database
and message queue.
Making stateful services highly available can depend on whether you choose
an active/passive or active/active configuration.
Active/Passive vs. Active/Active
Stateful services may be configured as active/passive or active/active:
:term:`active/passive configuration`
Maintains a redundant instance
that can be brought online when the active service fails.
For example, OpenStack writes to the main database
while maintaining a disaster recovery database that can be brought online
if the main database fails.
A typical active/passive installation for a stateful service maintains
a replacement resource that can be brought online when required.
Requests are handled using a :term:`virtual IP` address (VIP) that
facilitates returning to service with minimal reconfiguration.
A separate application (such as Pacemaker or Corosync) monitors
these services, bringing the backup online as necessary.
:term:`active/active configuration`
Each service also has a backup but manages both the main and
redundant systems concurrently.
This way, if there is a failure, the user is unlikely to notice.
The backup system is already online and takes on increased load
while the main system is fixed and brought back online.
Typically, an active/active installation for a stateless service
maintains a redundant instance, and requests are load balanced using
a virtual IP address and a load balancer such as HAProxy.
A typical active/active installation for a stateful service includes
redundant services, with all instances having an identical state. In
other words, updates to one instance of a database update all other
instances. This way a request to one instance is the same as a
request to any other. A load balancer manages the traffic to these
systems, ensuring that operational systems always handle the
Clusters and quorums
The quorum specifies the minimal number of nodes
that must be functional in a cluster of redundant nodes
in order for the cluster to remain functional.
When one node fails and failover transfers control to other nodes,
the system must ensure that data and processes remain sane.
To determine this, the contents of the remaining nodes are compared
and, if there are discrepancies, a "majority rules" algorithm is implemented.
For this reason, each cluster in a high availability environment should
have an odd number of nodes and the quorum is defined as more than half
of the nodes.
If multiple nodes fail so that the cluster size falls below the quorum
value, the cluster itself fails.
For example, in a seven-node cluster, the quorum should be set to
floor(7/2) + 1 == 4. If quorum is four and four nodes fail simultaneously,
the cluster itself would fail, whereas it would continue to function, if
no more than three nodes fail. If split to partitions of three and four nodes
respectively, the quorum of four nodes would continue to operate the majority
partition and stop or fence the minority one (depending on the
no-quorum-policy cluster configuration).
And the quorum could also have been set to three, just as a configuration
.. note::
Note that setting the quorum to a value less than floor(n/2) + 1 is not
recommended and would likely cause a split-brain in the face of network
Then, for the given example when four nodes fail simultaneously,
the cluster would continue to function as well. But if split to partitions of
three and four nodes respectively, the quorum of three would have made both
sides attempt to fence the other and host resources. And without fencing
enabled, it would go straight to running two copies of each resource.
This is why setting the quorum to a value less than floor(n/2) + 1 is
dangerous. However, it may be required for some specific cases, such as a
temporary measure at a point when it is known with 100% certainty that the other
nodes are down.
When configuring an OpenStack environment for study or demonstration purposes,
it is possible to turn off the quorum checking;
this is discussed later in this guide.
Production systems should always run with quorum enabled.
Single-controller high availability mode
OpenStack supports a single-controller high availability mode
that is managed by the services that manage highly available environments
but is not actually highly available because
no redundant controllers are configured to use for failover.
This environment can be used for study and demonstration
but is not appropriate for a production environment.
It is possible to add controllers to such an environment
to convert it into a truly highly available environment.
High availability is not for every user. It presents some challenges.
High availability may be too complex for databases or
systems with large amounts of data. Replication can slow large systems
down. Different setups have different prerequisites. Read the guidelines
for each setup.
High availability is turned off by default in OpenStack setups.

Overview of highly-available controllers
OpenStack is a set of multiple services exposed to the end users
as HTTP(s) APIs. Additionally, for its own internal usage, OpenStack
requires an SQL database server and an AMQP broker. The physical servers
where all the components are running are often called controllers.
This modular OpenStack architecture allows you to duplicate all the
components and run them on different controllers.
By making all the components redundant it is possible to make
OpenStack highly-available.
In general we can divide all the OpenStack components into three categories:
- OpenStack APIs: these are stateless HTTP(s) services written in Python,
easy to duplicate and mostly easy to load balance.
- The SQL relational database server provides a stateful service consumed by
other components. Supported databases are MySQL, MariaDB, and PostgreSQL.
Making the SQL database redundant is complex.
- :term:`Advanced Message Queuing Protocol (AMQP)` provides OpenStack
internal stateful communication service.
Network components
[TODO Need discussion of network hardware, bonding interfaces,
intelligent Layer 2 switches, routers and Layer 3 switches.]
The configuration uses static routing without
Virtual Router Redundancy Protocol (VRRP)
or similar techniques implemented.
[TODO Need description of VIP failover inside Linux namespaces
and expected SLA.]
See [TODO link] for more information about configuring networking
for high availability.
Common deployment architectures
There are primarily two HA architectures in use today.
One uses a cluster manager such as Pacemaker or Veritas to co-ordinate
the actions of the various services across a set of machines. Since
we are focused on FOSS, we will refer to this as the Pacemaker
The other is optimized for Active/Active services that do not require
any inter-machine coordination. In this setup, services are started by
your init system (systemd in most modern distributions) and a tool is
used to move IP addresses between the hosts. The most common package
for doing this is keepalived.
.. toctree::
:maxdepth: 1

View File

@ -0,0 +1,4 @@
High availability for other components

View File

@ -0,0 +1,12 @@
Overview of high availability storage
Making the Block Storage (cinder) API service highly available in
active/passive mode involves:
* Configuring Block Storage to listen on the VIP address
* Managing the Block Storage API daemon with the Pacemaker cluster manager
* Configuring OpenStack services to use this IP address

View File

@ -0,0 +1,15 @@
Introduction to OpenStack high availability
.. toctree::
:maxdepth: 2

.. _dhcp-agent:
Run neutron DHCP agent
The OpenStack Networking service has a scheduler
that lets you run multiple agents across nodes;
the DHCP agent can be natively highly available.
To configure the number of DHCP agents per network,
modify the ``dhcp_agents_per_network`` parameter
in the :file:`/etc/neutron/neutron.conf` file.
By default this is set to 1.
To achieve high availability,
assign more than one DHCP agent per network.

.. _neutron-l3:
Run neutron L3 agent
The neutron L3 agent is scalable, due to the scheduler that supports
Virtual Router Redundancy Protocol (VRRP)
to distribute virtual routers across multiple nodes.
To enable high availability for configured routers,
edit the :file:`/etc/neutron/neutron.conf` file
to set the following values:
.. list-table:: /etc/neutron/neutron.conf parameters for high availability
:widths: 15 10 30
:header-rows: 1
* - Parameter
- Value
- Description
* - l3_ha
- True
- All routers are highly available by default.
* - allow_automatic_l3agent_failover
- True
- Set automatic L3 agent failover for routers
* - max_l3_agents_per_router
- 2 or more
- Maximum number of network nodes to use for the HA router.
* - min_l3_agents_per_router
- 2 or more
- Minimum number of network nodes to use for the HA router.
A new router can be created only if this number
of network nodes are available.

.. _neutron-lbaas:
Run neutron LBaaS agent
Currently, no native feature is provided
to make the LBaaS agent highly available
using the default plug-in HAProxy.
A common way to make HAProxy highly available
is to use the VRRP (Virtual Router Redundancy Protocol).
Unfortunately, this is not yet implemented
in the LBaaS HAProxy plug-in.
[TODO: update this section.]

.. _neutron-metadata:
Run neutron metadata agent
No native feature is available
to make this service highly available.
At this time, the Active/Passive solution exists
to run the neutron metadata agent
in failover mode with Pacemaker.
[TODO: Update this information.
Can this service now be made HA in active/active mode
or do we need to pull in the instructions
OpenStack network nodes
Configure networking on each node.
`Networking <http://docs.openstack.org/liberty/install-guide-ubuntu/environment-networking.html>`_
section of the *Install Guide* includes basic information
about configuring networking.
Notes from planning outline:
- Rather than configuring neutron here,
we should simply mention physical network HA methods
such as bonding and additional node/network requirements
for L3HA and DVR for planning purposes.
- Neutron agents should be described for active/active;
deprecate single agent's instances case.
- For Kilo and beyond, focus on L3HA and DVR.
- Link to `Networking Guide <http://docs.openstack.org/networking-guide/>`_
for configuration details.
[TODO: Verify that the active/passive
network configuration information from
should not be included here.
`LP1328922 <https://bugs.launchpad.net/openstack-manuals/+bug/1328922>`_
`LP1349398 <https://bugs.launchpad.net/openstack-manuals/+bug/1349398>`_
are related.]
OpenStack network nodes contain:
- :ref:`Neutron DHCP agent<dhcp-agent>`
- Neutron L2 agent.
Note that the L2 agent cannot be distributed and highly available.
Instead, it must be installed on each data forwarding node
to control the virtual network drivers
such as Open vSwitch or Linux Bridge.
One L2 agent runs per node and controls its virtual interfaces.
- :ref:`Neutron L3 agent<neutron-l3>`
- :ref:`Neutron metadata agent<neutron-metadata>`
- :ref:`Neutron LBaaS<neutron-lbaas>` (Load Balancing as a Service) agent
.. note::
For Liberty, we do not have the standalone network nodes in general.
We usually run the Networking services on the controller nodes.
In this guide, we use the term "network nodes" for convenience.
.. toctree::
:maxdepth: 2

@ -0,0 +1,85 @@
.. _storage-ha-backend:
Storage back end
Most of this guide concerns the control plane of high availability:
ensuring that services continue to run even if a component fails.
Ensuring that data is not lost
is the data plane component of high availability;
this is discussed here.
An OpenStack environment includes multiple data pools for the VMs:
- Ephemeral storage is allocated for an instance
and is deleted when the instance is deleted.
The Compute service manages ephemeral storage.
By default, Compute stores ephemeral drives as files
on local disks on the Compute node
but Ceph RBD can instead be used
as the storage back end for ephemeral storage.
- Persistent storage exists outside all instances.
Two types of persistent storage are provided:
- Block Storage service (cinder)
can use LVM or Ceph RBD as the storage back end.
- Image service (glance)
can use the Object Storage service (swift)
or Ceph RBD as the storage back end.
For more information about configuring storage back ends for
the different storage options, see the `Administrator Guide
This section discusses ways to protect against
data loss in your OpenStack environment.
RAID drives
Configuring RAID on the hard drives that implement storage
protects your data against a hard drive failure.
If, however, the node itself fails, data may be lost.
In particular, all volumes stored on an LVM node can be lost.
`Ceph RBD <http://ceph.com/>`_
is an innately high availability storage back end.
It creates a storage cluster with multiple nodes
that communicate with each other
to replicate and redistribute data dynamically.
A Ceph RBD storage cluster provides
a single shared set of storage nodes
that can handle all classes of persistent and ephemeral data
-- glance, cinder, and nova --
that are required for OpenStack instances.
Ceph RBD provides object replication capabilities
by storing Block Storage volumes as Ceph RBD objects;
Ceph RBD ensures that each replica of an object
is stored on a different node.
This means that your volumes are protected against
hard drive and node failures
or even the failure of the data center itself.
When Ceph RBD is used for ephemeral volumes
as well as block and image storage, it supports
`live migration
of VMs with ephemeral drives;
LVM only supports live migration of volume-backed VMs.
Remote backup facilities
[TODO: Add discussion of remote backup facilities
as an alternate way to secure one's data.
Include brief mention of key third-party technologies
with links to their documentation]

.. highlight:: ini
   :linenothreshold: 5
Highly available Block Storage API
Cinder provides 'block storage as a service' suitable for performance
sensitive scenarios such as databases, expandable file systems, or
providing a server with access to raw block level storage.
Persistent block storage can survive instance termination and can also
be moved across instances like any external storage device. Cinder
also has volume snapshots capability for backing up the volumes.
Making this Block Storage API service highly available in
active/passive mode involves:
- :ref:`ha-cinder-pacemaker`
- :ref:`ha-cinder-configure`
- :ref:`ha-cinder-services`
In theory, you can run the Block Storage service as active/active.
However, because of the concerns described below, we recommend running
the volume component as active/passive only.
Jon Bernard writes:
Requests are first seen by Cinder in the API service, and we have a
fundamental problem there - a standard test-and-set race condition
exists for many operations where the volume status is first checked
for an expected status and then (in a different operation) updated to
a pending status. The pending status indicates to other incoming
requests that the volume is undergoing a current operation, however it
is possible for two simultaneous requests to race here, which leads to
undefined results.
Later, the manager/driver will receive the message and carry out the
operation. At this stage there is a question of the synchronization
techniques employed by the drivers and what guarantees they make.
If cinder-volume processes exist as different processes, then the
'synchronized' decorator from the lockutils package will not be
sufficient. In this case the programmer can pass an argument to
synchronized() 'external=True'. If external is enabled, then the
locking will take place on a file located on the filesystem. By
default, this file is placed in Cinder's 'state directory' in
/var/lib/cinder so won't be visible to cinder-volume instances running
on different machines.
However, the location for file locking is configurable. So an
operator could configure the state directory to reside on shared
storage. If the shared storage in use implements unix file locking
semantics, then this could provide the requisite synchronization
needed for an active/active HA configuration.
The remaining issue is that not all drivers use the synchronization
methods, and even fewer of those use the external file locks. A
sub-concern would be whether they use them correctly.
You can read more about these concerns on the
`Red Hat Bugzilla <https://bugzilla.redhat.com/show_bug.cgi?id=1193229>`_
and there is a
`pseudo roadmap <https://etherpad.openstack.org/p/cinder-kilo-stabilisation-work>`_
for addressing them upstream.
.. _ha-cinder-pacemaker:
Add Block Storage API resource to Pacemaker
On RHEL-based systems, you should create resources for cinder's
systemd agents and create constraints to enforce startup/shutdown ordering:
.. code-block:: console
pcs resource create openstack-cinder-api systemd:openstack-cinder-api --clone interleave=true
pcs resource create openstack-cinder-scheduler systemd:openstack-cinder-scheduler --clone interleave=true
pcs resource create openstack-cinder-volume systemd:openstack-cinder-volume
pcs constraint order start openstack-cinder-api-clone then openstack-cinder-scheduler-clone
pcs constraint colocation add openstack-cinder-scheduler-clone with openstack-cinder-api-clone
pcs constraint order start openstack-cinder-scheduler-clone then openstack-cinder-volume
pcs constraint colocation add openstack-cinder-volume with openstack-cinder-scheduler-clone
If the Block Storage service runs on the same nodes as the other services,
then it is advisable to also include:
.. code-block:: console
pcs constraint order start openstack-keystone-clone then openstack-cinder-api-clone
Alternatively, instead of using systemd agents, download and
install the OCF resource agent:
.. code-block:: console
# cd /usr/lib/ocf/resource.d/openstack
# wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/cinder-api
# chmod a+rx *
You can now add the Pacemaker configuration for Block Storage API resource.
Connect to the Pacemaker cluster with the :command:`crm configure` command
and add the following cluster resources:
primitive p_cinder-api ocf:openstack:cinder-api \
params config="/etc/cinder/cinder.conf" \
os_username="admin" \
keystone_get_token_url="" \
op monitor interval="30s" timeout="30s"
This configuration creates ``p_cinder-api``,
a resource for managing the Block Storage API service.
The command :command:`crm configure` supports batch input,
so you may copy and paste the lines above
into your live pacemaker configuration and then make changes as required.
For example, you may enter ``edit p_ip_cinder-api``
from the :command:`crm configure` menu
and edit the resource to match your preferred virtual IP address.
Once completed, commit your configuration changes
by entering :command:`commit` from the :command:`crm configure` menu.
Pacemaker then starts the Block Storage API service
and its dependent resources on one of your nodes.
.. _ha-cinder-configure:
Configure Block Storage API service
Edit the ``/etc/cinder/cinder.conf`` file:
On a RHEL-based system, it should look something like:
.. code-block:: ini
# This is the name which we should advertise ourselves as and for
# A/P installations it should be the same everywhere
host = cinder-cluster-1
# Listen on the Block Storage VIP
osapi_volume_listen =
auth_strategy = keystone
control_exchange = cinder
volume_driver = cinder.volume.drivers.nfs.NfsDriver
nfs_shares_config = /etc/cinder/nfs_exports
nfs_sparsed_volumes = true
nfs_mount_options = v3
sql_connection = mysql://cinder:CINDER_DBPASS@
max_retries = -1
# is the Keystone VIP
identity_uri =
auth_uri =
admin_tenant_name = service
admin_user = cinder
admin_password = CINDER_PASS
# Explicitly list the rabbit hosts as it doesn't play well with HAProxy
rabbit_hosts =,,
# As a consequence, we also need HA queues
rabbit_ha_queues = True
heartbeat_timeout_threshold = 60
heartbeat_rate = 2
Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage
database. Replace ``CINDER_PASS`` with the password you chose for the
``cinder`` user in the Identity service.
This example assumes that you are using NFS for the physical storage, which
will almost never be true in a production installation.
If you are using the Block Storage service OCF agent, some settings will
be filled in for you, resulting in a shorter configuration file:
.. code-block:: ini
# We have to use MySQL connection to store data:
sql_connection = mysql://cinder:CINDER_DBPASS@
# Alternatively, you can switch to pymysql,
# a new Python 3 compatible library and use
# sql_connection = mysql+pymysql://cinder:CINDER_DBPASS@
# and be ready when everything moves to Python 3.
# Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation
# We bind Block Storage API to the VIP:
osapi_volume_listen =
# We send notifications to High Available RabbitMQ:
notifier_strategy = rabbit
rabbit_host =
Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage
database.
.. _ha-cinder-services:
Configure OpenStack services to use highly available Block Storage API
Your OpenStack services must now point their
Block Storage API configuration to the highly available,
virtual cluster IP address
rather than a Block Storage API server's physical IP address
as you would for a non-HA environment.
You must create the Block Storage API endpoint with this IP.
If you are using both private and public IP addresses,
you should create two virtual IPs and define your endpoint like this:
.. code-block:: console
$ keystone endpoint-create --region $KEYSTONE_REGION \
--service-id $service-id \
--publicurl 'http://PUBLIC_VIP:8776/v1/%(tenant_id)s' \
--adminurl '' \
--internalurl ''

Highly available OpenStack Image API
The OpenStack Image service offers a service for discovering,
registering, and retrieving virtual machine images.
To make the OpenStack Image API service highly available
in active / passive mode, you must:
- :ref:`glance-api-pacemaker`
- :ref:`glance-api-configure`
- :ref:`glance-services`
This section assumes that you are familiar with the documentation
for installing the OpenStack Image API service.
.. _glance-api-pacemaker:
Add OpenStack Image API resource to Pacemaker
You must first download the resource agent to your system:
.. code-block:: console
# cd /usr/lib/ocf/resource.d/openstack
# wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/glance-api
# chmod a+rx *
You can now add the Pacemaker configuration
for the OpenStack Image API resource.
Use the :command:`crm configure` command
to connect to the Pacemaker cluster
and add the following cluster resources:
primitive p_glance-api ocf:openstack:glance-api \
params config="/etc/glance/glance-api.conf" \
os_password="secretsecret" \
os_username="admin" os_tenant_name="admin" \
os_auth_url="" \
op monitor interval="30s" timeout="30s"
This configuration creates ``p_glance-api``,
a resource for managing the OpenStack Image API service.
The :command:`crm configure` command supports batch input,
so you may copy and paste the above into your live Pacemaker configuration
and then make changes as required.
For example, you may enter ``edit p_ip_glance-api``
from the :command:`crm configure` menu
and edit the resource to match your preferred virtual IP address.
After completing these steps,
commit your configuration changes by entering :command:`commit`
from the :command:`crm configure` menu.
Pacemaker then starts the OpenStack Image API service
and its dependent resources on one of your nodes.
.. _glance-api-configure:
Configure OpenStack Image service API
Edit the :file:`/etc/glance/glance-api.conf` file
to configure the OpenStack image service:
.. code-block:: ini
# We have to use MySQL connection to store data:
# Alternatively, you can switch to pymysql,
# a new Python 3 compatible library and use
# sql_connection=mysql+pymysql://glance:password@
# and be ready when everything moves to Python 3.
# Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation
# We bind OpenStack Image API to the VIP:
bind_host =
# Connect to OpenStack Image registry service:
registry_host =
# We send notifications to High Available RabbitMQ:
notifier_strategy = rabbit
rabbit_host =
[TODO: need more discussion of these parameters]
.. _glance-services:
Configure OpenStack services to use highly available OpenStack Image API
Your OpenStack services must now point
their OpenStack Image API configuration to the highly available,
virtual cluster IP address
instead of pointing to the physical IP address
of an OpenStack Image API server
as you would in a non-HA cluster.
For OpenStack Compute, for example,
if your OpenStack Image API service IP address is
(as in the configuration explained here),
you would use the following configuration in your :file:`nova.conf` file:
.. code-block:: ini
api_servers =
You must also create the OpenStack Image API endpoint with this IP address.
If you are using both private and public IP addresses,
you should create two virtual IP addresses
and define your endpoint like this:
.. code-block:: console
$ keystone endpoint-create --region $KEYSTONE_REGION \
--service-id $service-id --publicurl 'http://PUBLIC_VIP:9292' \
--adminurl '' \
--internalurl ''

.. highlight:: ini
   :linenothreshold: 5
Highly available Shared File Systems API
Making the Shared File Systems (manila) API service highly available
in active/passive mode involves:
- :ref:`ha-manila-pacemaker`
- :ref:`ha-manila-configure`
- :ref:`ha-manila-services`
.. _ha-manila-pacemaker:
Add Shared File Systems API resource to Pacemaker
You must first download the resource agent to your system:
.. code-block:: console
# cd /usr/lib/ocf/resource.d/openstack
# wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/manila-api
# chmod a+rx *
You can now add the Pacemaker configuration for the Shared File Systems
API resource. Connect to the Pacemaker cluster with the
:command:`crm configure` command and add the following cluster resources:
primitive p_manila-api ocf:openstack:manila-api \
params config="/etc/manila/manila.conf" \
os_username="admin" \
keystone_get_token_url="" \
op monitor interval="30s" timeout="30s"
This configuration creates ``p_manila-api``, a resource for managing the
Shared File Systems API service.
The :command:`crm configure` command supports batch input, so you may copy and paste
the lines above into your live Pacemaker configuration and then make changes
as required. For example, you may enter ``edit p_ip_manila-api`` from the
:command:`crm configure` menu and edit the resource to match your preferred
virtual IP address.
Once completed, commit your configuration changes by entering :command:`commit`
from the :command:`crm configure` menu. Pacemaker then starts the
Shared File Systems API service and its dependent resources on one of your
nodes.
.. _ha-manila-configure:
Configure Shared File Systems API service
Edit the :file:`/etc/manila/manila.conf` file:
.. code-block:: ini
# We have to use MySQL connection to store data:
sql_connection = mysql+pymysql://manila:password@
# We bind Shared File Systems API to the VIP:
osapi_volume_listen =
# We send notifications to High Available RabbitMQ:
notifier_strategy = rabbit
rabbit_host =
.. _ha-manila-services:
Configure OpenStack services to use HA Shared File Systems API
Your OpenStack services must now point their Shared File Systems API
configuration to the highly available, virtual cluster IP address rather than
a Shared File Systems API server's physical IP address as you would
for a non-HA environment.
You must create the Shared File Systems API endpoint with this IP.
If you are using both private and public IP addresses, you should create two
virtual IPs and define your endpoints like this:
.. code-block:: console
$ openstack endpoint create --region RegionOne \
sharev2 public 'http://PUBLIC_VIP:8786/v2/%(tenant_id)s'
$ openstack endpoint create --region RegionOne \
sharev2 internal ''
$ openstack endpoint create --region RegionOne \
sharev2 admin ''

Configuring Storage for high availability
.. toctree::
:maxdepth: 2

for guide in user-guide admin-guide \
contributor-guide image-guide arch-design cli-reference; do
for guide in admin-guide arch-design cli-reference contributor-guide \
ha-guide image-guide user-guide; do
tools/build-rst.sh doc/$guide --build build \
--target $guide $LINKCHECK