[arch-design] Move RST guide to arch-design folder
- Moved RST guide to the arch-design folder - Deleted XML files - Updated scripts Change-Id: Id0e38a9cada9dd75cb9c8f3bd2d88ce2f4fd3eac Implements: blueprint archguide-mitaka-rst
@ -35,6 +35,11 @@ Virtual Machine Image Guide
|
||||
|
||||
* RST conversion finished.
|
||||
|
||||
Architecture Design Guide
|
||||
-------------------------
|
||||
|
||||
* Completed RST conversion.
|
||||
|
||||
Translations
|
||||
------------
|
||||
|
||||
|
@ -30,9 +30,9 @@ declare -A SPECIAL_BOOKS=(
|
||||
["networking-guide"]="RST"
|
||||
["user-guide"]="RST"
|
||||
["user-guide-admin"]="RST"
|
||||
["arch-design"]="RST"
|
||||
# Skip in-progress guides
|
||||
["contributor-guide"]="skip"
|
||||
["arch-design-rst"]="skip"
|
||||
["config-ref-rst"]="skip"
|
||||
# This needs special handling, handle it with the RST tools.
|
||||
["common-rst"]="RST"
|
||||
|
@ -1,64 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<book xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="openstack-arch-design">
|
||||
<title>OpenStack Architecture Design Guide</title>
|
||||
<?rax title.font.size="28px" subtitle.font.size="28px"?>
|
||||
<titleabbrev>Architecture Guide</titleabbrev>
|
||||
<info>
|
||||
<author>
|
||||
<personname>
|
||||
<firstname/>
|
||||
<surname/>
|
||||
</personname>
|
||||
<affiliation>
|
||||
<orgname>OpenStack Foundation</orgname>
|
||||
</affiliation>
|
||||
</author>
|
||||
<copyright>
|
||||
<year>2014</year>
|
||||
<year>2015</year>
|
||||
<holder>OpenStack Foundation</holder>
|
||||
</copyright>
|
||||
<releaseinfo>current</releaseinfo>
|
||||
<productname>OpenStack</productname>
|
||||
<pubdate/>
|
||||
<legalnotice role="apache2">
|
||||
<annotation>
|
||||
<remark>Copyright details are filled in by the
|
||||
template.</remark>
|
||||
</annotation>
|
||||
</legalnotice>
|
||||
<legalnotice role="cc-by">
|
||||
<annotation>
|
||||
<remark>Remaining licensing details are filled in by
|
||||
the template.</remark>
|
||||
</annotation>
|
||||
</legalnotice>
|
||||
<abstract>
|
||||
<para>To reap the benefits of OpenStack, you should
|
||||
plan, design, and architect your cloud properly,
|
||||
taking users' needs into account and understanding the
|
||||
use cases.</para>
|
||||
</abstract>
|
||||
</info>
|
||||
<!-- Chapters are referred from the book file through these
|
||||
include statements. You can add additional chapters using
|
||||
these types of statements. -->
|
||||
<xi:include href="../common/ch_preface.xml"/>
|
||||
<xi:include href="ch_introduction.xml"/>
|
||||
<xi:include href="ch_legal-security-requirements.xml"/>
|
||||
<xi:include href="ch_generalpurpose.xml"/>
|
||||
<xi:include href="ch_compute_focus.xml"/>
|
||||
<xi:include href="ch_storage_focus.xml"/>
|
||||
<xi:include href="ch_network_focus.xml"/>
|
||||
<xi:include href="ch_multi_site.xml"/>
|
||||
<xi:include href="ch_hybrid.xml"/>
|
||||
<xi:include href="ch_massively_scalable.xml"/>
|
||||
<xi:include href="ch_specialized.xml"/>
|
||||
<xi:include href="ch_references.xml"/>
|
||||
<xi:include href="../common/app_support.xml"/>
|
||||
<glossary role="auto"/>
|
||||
</book>
|
@ -1,45 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="compute_focus">
|
||||
<title>Compute focused</title>
|
||||
<para>Compute-focused clouds are a specialized subset of the general purpose
|
||||
OpenStack cloud architecture. A compute-focused cloud specifically supports
|
||||
compute intensive workloads.</para>
|
||||
<note>
|
||||
<para>Compute intensive workloads may be CPU intensive, RAM intensive,
|
||||
or both; they are not typically storage or network intensive.</para>
|
||||
</note>
|
||||
<para>Compute-focused workloads may include the following use cases:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>High performance computing (HPC)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Big data analytics using Hadoop or other distributed data
|
||||
stores</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Continuous integration/continuous deployment (CI/CD)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Platform-as-a-Service (PaaS)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Signal processing for network function virtualization (NFV)</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<note>
|
||||
<para>A compute-focused OpenStack cloud does not typically use raw block storage
|
||||
services as it does not host applications that require
|
||||
persistent block storage.</para>
|
||||
</note>
|
||||
|
||||
<xi:include href="compute_focus/section_tech_considerations_compute_focus.xml"/>
|
||||
<xi:include href="compute_focus/section_operational_considerations_compute_focus.xml"/>
|
||||
<xi:include href="compute_focus/section_architecture_compute_focus.xml"/>
|
||||
<xi:include href="compute_focus/section_prescriptive_examples_compute_focus.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,95 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="generalpurpose">
|
||||
<title>General purpose</title>
|
||||
<para>OpenStack general purpose clouds are often considered a
|
||||
starting point for building a cloud deployment. They are designed
|
||||
to balance the components and do not emphasize any particular aspect
|
||||
of the overall computing environment.
|
||||
Cloud design must give equal weight to the compute, network, and
|
||||
storage components. General purpose clouds are
|
||||
found in private, public, and hybrid environments, lending
|
||||
themselves to many different use cases.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
General purpose clouds are homogeneous deployments. They are
|
||||
not suited to specialized environments or edge case situations.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
Common uses of a general purpose cloud include:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Providing a simple database
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
A web application runtime environment
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
A shared application development platform
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Lab test bed
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Use cases that benefit from scale-out rather than scale-up approaches
|
||||
are good candidates for general purpose cloud architecture.
|
||||
</para>
|
||||
<para>A general purpose cloud is designed to have a range of potential
|
||||
uses or functions; not specialized for specific use cases. General
|
||||
purpose architecture is designed to address 80% of potential use
|
||||
cases available. The infrastructure, in itself, is a specific use case,
|
||||
enabling it to be used as a base model for the design process.
|
||||
General purpose clouds are designed to be platforms that are suited
|
||||
for general purpose applications.</para>
|
||||
<para>General purpose clouds are limited to the most basic
|
||||
components, but they can include additional resources such
|
||||
as:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Virtual-machine disk image library</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Raw block storage</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>File or object storage</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Firewalls</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Load balancers</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>IP addresses</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Network overlays or virtual local area networks
|
||||
(VLANs)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Software bundles</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<xi:include href="generalpurpose/section_user_requirements_general_purpose.xml"/>
|
||||
<xi:include href="generalpurpose/section_tech_considerations_general_purpose.xml"/>
|
||||
<xi:include href="generalpurpose/section_operational_considerations_general_purpose.xml"/>
|
||||
<xi:include href="generalpurpose/section_architecture_general_purpose.xml"/>
|
||||
<xi:include href="generalpurpose/section_prescriptive_example_general_purpose.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,59 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="hybrid">
|
||||
<title>Hybrid</title>
|
||||
<para>A <glossterm baseform="hybrid cloud">hybrid cloud</glossterm> design
|
||||
is one that uses more than one cloud. For example, designs that use
|
||||
both an OpenStack-based private cloud and an OpenStack-based public
|
||||
cloud, or that use an OpenStack cloud and a non-OpenStack cloud,
|
||||
are hybrid clouds.</para>
|
||||
<para><glossterm baseform="bursting">Bursting</glossterm> describes the
|
||||
practice of creating new instances in an external cloud to alleviate
|
||||
capacity issues in a private cloud.</para>
|
||||
<itemizedlist>
|
||||
<title>Example scenarios suited to hybrid clouds</title>
|
||||
<listitem>
|
||||
<para>Bursting from a private cloud to a public
|
||||
cloud</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Disaster recovery</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Development and testing</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Federated cloud, enabling users to choose resources
|
||||
from multiple providers</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Supporting legacy systems as they transition to the
|
||||
cloud</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Hybrid clouds interact with systems that are outside
|
||||
the control of the private cloud administrator, and require careful
|
||||
architecture to prevent conflicts with hardware, software,
|
||||
and APIs under external control.</para>
|
||||
<para>The degree to which the architecture is OpenStack-based
|
||||
affects your ability to accomplish tasks with native
|
||||
OpenStack tools. By definition, this is a situation in which
|
||||
no single cloud can provide all of the necessary
|
||||
functionality. In order to manage the entire system, we recommend
|
||||
using a cloud management platform (CMP).</para>
|
||||
<para>There are several commercial and open source CMPs available,
|
||||
but there is no single CMP that can address all needs in all scenarios,
|
||||
and sometimes a manually-built solution is the best option.
|
||||
This chapter includes discussion of using CMPs for managing a hybrid
|
||||
cloud.</para>
|
||||
|
||||
<xi:include href="hybrid/section_user_requirements_hybrid.xml"/>
|
||||
<xi:include href="hybrid/section_tech_considerations_hybrid.xml"/>
|
||||
<xi:include href="hybrid/section_operational_considerations_hybrid.xml"/>
|
||||
<xi:include href="hybrid/section_architecture_hybrid.xml"/>
|
||||
<xi:include href="hybrid/section_prescriptive_examples_hybrid.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,18 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="introduction">
|
||||
<title>Introduction</title>
|
||||
|
||||
<para><glossterm>OpenStack</glossterm> is a fully-featured, self-service
|
||||
cloud. This book takes you through some of the considerations you have to make
|
||||
when designing your cloud.</para>
|
||||
|
||||
<xi:include href="introduction/section_intended_audience.xml"/>
|
||||
<xi:include href="introduction/section_how_this_book_is_organized.xml"/>
|
||||
<xi:include href="introduction/section_how_this_book_was_written.xml"/>
|
||||
<xi:include href="introduction/section_methodology.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,260 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="security-legal-requirements">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Security and legal requirements</title>
|
||||
<para>This chapter discusses the legal and security requirements you
|
||||
need to consider for the different OpenStack scenarios.</para>
|
||||
<section xml:id="legal-requirements">
|
||||
<title>Legal requirements</title>
|
||||
<para>Many jurisdictions have legislative and regulatory
|
||||
requirements governing the storage and management of data in
|
||||
cloud environments. Common areas of regulation include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Data retention policies ensuring storage of
|
||||
persistent data and records management to meet data
|
||||
archival requirements.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Data ownership policies governing the possession and
|
||||
responsibility for data.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Data sovereignty policies governing the storage of
|
||||
data in foreign countries or otherwise separate
|
||||
jurisdictions.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Data compliance policies governing certain types of
|
||||
information needing to reside in certain locations due to
|
||||
regulatory issues - and more importantly, cannot reside in
|
||||
other locations for the same reason.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Examples of such legal frameworks include the <link
|
||||
xlink:href="http://ec.europa.eu/justice/data-protection/">data
|
||||
protection framework</link> of the European Union and the
|
||||
requirements of the <link
|
||||
xlink:href="http://www.finra.org/Industry/Regulation/FINRARules/">
|
||||
Financial Industry Regulatory Authority</link> in the United
|
||||
States. Consult a local regulatory body for more information.
|
||||
</para>
|
||||
</section>
|
||||
<section xml:id="security-overview">
|
||||
<title>Security</title>
|
||||
<para>When deploying OpenStack in an enterprise as a private
|
||||
cloud, despite activating a firewall and binding
|
||||
employees with security agreements, cloud architecture
|
||||
should not make assumptions about safety and protection.
|
||||
In addition to considering the users, operators, or administrators
|
||||
who will use the environment, consider also negative or hostile users who
|
||||
would attack or compromise the security of your deployment regardless
|
||||
of firewalls or security agreements.</para>
|
||||
<para>Attack vectors increase further in a public facing OpenStack
|
||||
deployment. For example, the API endpoints and the
|
||||
software behind them become vulnerable to hostile
|
||||
entities attempting to gain unauthorized access or prevent access
|
||||
to services. This can result in loss of reputation and you must
|
||||
protect against it through auditing and appropriate
|
||||
filtering.</para>
|
||||
<para>It is important to understand that user authentication
|
||||
requests encase sensitive information such as user names,
|
||||
passwords, and authentication tokens. For this reason, place
|
||||
the API services behind hardware that performs SSL termination.</para>
|
||||
<warning>
|
||||
<para>Be mindful of consistency when utilizing third party
|
||||
clouds to explore authentication options.</para>
|
||||
</warning>
|
||||
</section>
|
||||
<section xml:id="security-domains">
|
||||
<title>Security domains</title>
|
||||
<para>A security domain comprises users, applications, servers or
|
||||
networks that share common trust requirements and expectations
|
||||
within a system. Typically, security domains have the same
|
||||
authentication and authorization requirements and users.</para>
|
||||
<para>You can map security domains individually to the
|
||||
installation, or combine them. For example, some
|
||||
deployment topologies combine both guest and data domains onto
|
||||
one physical network. In other cases these networks
|
||||
are physically separate. Map out the security domains against
|
||||
the needs of specific OpenStack topologies. The domains and their trust requirements
|
||||
depend on whether the cloud instance is public, private, or
|
||||
hybrid.</para>
|
||||
<simplesect>
|
||||
<title>Public security domains</title>
|
||||
<para>The public security domain is an untrusted area of
|
||||
the cloud infrastructure. It can refer to the internet as a
|
||||
whole or simply to networks over which the user has no
|
||||
authority. Always consider this domain untrusted. For example,
|
||||
in a hybrid cloud deployment, any information traversing
|
||||
between and beyond the clouds is in the public domain and
|
||||
untrustworthy.</para>
|
||||
</simplesect>
|
||||
<simplesect>
|
||||
<title>Guest security domains</title>
|
||||
<para>Typically used for compute instance-to-instance traffic, the
|
||||
guest security domain handles compute data generated by
|
||||
instances on the cloud but not services that support the
|
||||
operation of the cloud, such as API calls. Public cloud
|
||||
providers and private cloud providers who do not have
|
||||
stringent controls on instance use or who allow unrestricted
|
||||
internet access to instances should consider this domain to be
|
||||
untrusted. Private cloud providers may want to consider this
|
||||
network as internal and therefore trusted only if they have
|
||||
controls in place to assert that they trust instances and all
|
||||
their tenants.</para>
|
||||
</simplesect>
|
||||
<simplesect>
|
||||
<title>Management security domains</title>
|
||||
<para>The management security domain is where services interact.
|
||||
The networks in this domain transport confidential data such as configuration
|
||||
parameters, user names, and passwords. Trust this domain when it is
|
||||
behind an organization's firewall in deployments.</para>
|
||||
</simplesect>
|
||||
<simplesect>
|
||||
<title>Data security domains</title>
|
||||
<para>The data security domain is concerned primarily with
|
||||
information pertaining to the storage services within
|
||||
OpenStack. The data that crosses this network has integrity and
|
||||
confidentiality requirements. Depending on the type of deployment there
|
||||
may also be availability requirements. The trust level of this network
|
||||
is heavily dependent on deployment decisions and does not have a default
|
||||
level of trust.</para>
|
||||
</simplesect>
|
||||
</section>
|
||||
<section xml:id="hypervisor-security">
|
||||
<title>Hypervisor security</title>
|
||||
<para>The hypervisor also requires a security assessment. In a
|
||||
public cloud, organizations typically do not have control
|
||||
over the choice of hypervisor. Properly securing your
|
||||
hypervisor is important. An attack made upon an
|
||||
unsecured hypervisor is called a
|
||||
<firstterm>hypervisor breakout</firstterm>.
|
||||
Hypervisor breakout describes the event of a
|
||||
compromised or malicious instance breaking out of the resource
|
||||
controls of the hypervisor and gaining access to the bare
|
||||
metal operating system and hardware resources.</para>
|
||||
<para>This is not an issue if the security of instances is not important.
|
||||
However, enterprises need to avoid vulnerability. The only way to
|
||||
do this is to avoid the situation where the instances are running
|
||||
on a public cloud. That does not mean that there is a
|
||||
need to own all of the infrastructure on which an OpenStack
|
||||
installation operates; it suggests avoiding situations in which
|
||||
sharing hardware with others occurs.</para>
|
||||
</section>
|
||||
<section xml:id="security-baremetal">
|
||||
<title>Bare metal security</title>
|
||||
<para>There are other services worth considering that provide a
|
||||
bare metal instance instead of a cloud. In other cases, it is
|
||||
possible to replicate a second private cloud by integrating
|
||||
with a private Cloud-as-a-Service deployment. The
|
||||
organization does not buy the hardware, but also does not share
|
||||
with other tenants. It is also possible to use a provider that
|
||||
hosts a bare-metal public cloud instance for which the
|
||||
hardware is dedicated only to one customer, or a provider that
|
||||
offers private Cloud-as-a-Service.</para>
|
||||
<important>
|
||||
<para>Each cloud implements services differently.
|
||||
What keeps data secure in one
|
||||
cloud may not do the same in another. Be sure to know the
|
||||
security requirements of every cloud that handles the
|
||||
organization's data or workloads.</para>
|
||||
</important>
|
||||
<para>More information on OpenStack Security can be found in the
|
||||
<link xlink:href="http://docs.openstack.org/security-guide"><citetitle>OpenStack
|
||||
Security Guide</citetitle></link>.</para>
|
||||
</section>
|
||||
<section xml:id="networking-security">
|
||||
<title>Networking security</title>
|
||||
<para>Consider security implications and requirements before designing the
|
||||
physical and logical network topologies. Make sure that the networks are
|
||||
properly segregated and traffic flows are going to the correct
|
||||
destinations without crossing through locations that are undesirable.
|
||||
Consider the following example factors:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Firewalls</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Overlay interconnects for joining separated tenant networks</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Routing through or avoiding specific networks</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>How networks attach to hypervisors can expose security
|
||||
vulnerabilities. To mitigate the risk of hypervisor breakouts,
|
||||
separate networks from other systems and schedule instances for the
|
||||
network onto dedicated compute nodes. This prevents attackers
|
||||
from having access to the networks from a compromised instance.</para>
|
||||
</section>
|
||||
<section xml:id="security-multi-site">
|
||||
<title>Multi-site security</title>
|
||||
<para>Securing a multi-site OpenStack installation brings
|
||||
extra challenges. Tenants may expect a tenant-created network
|
||||
to be secure. In a multi-site installation the use of a
|
||||
non-private connection between sites may be required. This may
|
||||
mean that traffic would be visible to third parties and, in
|
||||
cases where an application requires security, this issue
|
||||
requires mitigation. In these instances, install a VPN or
|
||||
encrypted connection between sites to conceal sensitive traffic.</para>
|
||||
<para>Another security consideration with regard to multi-site
|
||||
deployments is Identity. Centralize authentication within a
|
||||
multi-site deployment. Centralization provides a
|
||||
single authentication point for users across the deployment,
|
||||
as well as a single point of administration for traditional
|
||||
create, read, update, and delete operations. Centralized
|
||||
authentication is also useful for auditing purposes because
|
||||
all authentication tokens originate from the same
|
||||
source.</para>
|
||||
<para>Just as tenants in a single-site deployment need isolation
|
||||
from each other, so do tenants in multi-site installations.
|
||||
The extra challenges in multi-site designs revolve around
|
||||
ensuring that tenant networks function across regions.
|
||||
OpenStack Networking (neutron) does not presently support
|
||||
a mechanism to provide this functionality, therefore an
|
||||
external system may be necessary to manage these mappings.
|
||||
Tenant networks may contain sensitive information requiring
|
||||
that this mapping be accurate and consistent to ensure that a
|
||||
tenant in one site does not connect to a different tenant in
|
||||
another site.</para>
|
||||
</section>
|
||||
<section xml:id="openstack-components-multi-site">
|
||||
<title>OpenStack components</title>
|
||||
<para>Most OpenStack installations require a bare minimum set of
|
||||
pieces to function. These include OpenStack Identity
|
||||
(keystone) for authentication, OpenStack Compute
|
||||
(nova) for compute, OpenStack Image service (glance) for image
|
||||
storage, OpenStack Networking (neutron) for networking, and
|
||||
potentially an object store in the form of OpenStack Object
|
||||
Storage (swift). Bringing multi-site into play also demands extra
|
||||
components in order to coordinate between regions. A centralized
|
||||
Identity service is necessary to provide the single authentication
|
||||
point. A centralized dashboard is also recommended to provide a
|
||||
single login point and a mapped experience to the API and CLI
|
||||
options available. If needed, use a centralized Object Storage service,
|
||||
installing the required swift proxy service alongside the Object
|
||||
Storage service.</para>
|
||||
<para>It may also be helpful to install a few extra options in
|
||||
order to facilitate certain use cases. For instance,
|
||||
installing a DNS service may assist in automatically generating
|
||||
DNS domains for each region with an automatically-populated
|
||||
zone full of resource records for each instance. This
|
||||
facilitates using DNS as a mechanism for determining which
|
||||
region would be selected for certain applications.</para>
|
||||
<para>Another useful tool for managing a multi-site installation
|
||||
is Orchestration (heat). The Orchestration service
|
||||
allows the use of templates to define a set of instances to
|
||||
be launched together or for scaling existing sets. It can
|
||||
set up matching or differentiated groupings based on
|
||||
regions. For instance, if an application requires an equally
|
||||
balanced number of nodes across sites, the same heat template
|
||||
can be used to cover each site with small alterations to only
|
||||
the region name.</para>
|
||||
</section>
|
||||
</chapter>
|
||||
|
@ -1,79 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="massively_scalable">
|
||||
<title>Massively scalable</title>
|
||||
|
||||
<para>A massively scalable architecture is a cloud
|
||||
implementation that is either a very large deployment, such as
|
||||
a commercial service provider might build, or
|
||||
one that has the capability to support user requests for large
|
||||
amounts of cloud resources.</para>
|
||||
<para>An example is an infrastructure in which requests to service
|
||||
500 or more instances at a time are common. A massively scalable
|
||||
infrastructure fulfills such a request without exhausting the
|
||||
available cloud infrastructure resources. While the high capital
|
||||
cost of implementing such a cloud architecture means that it
|
||||
is currently in limited use, many organizations are planning
|
||||
for massive scalability in the future.</para>
|
||||
<para>A massively scalable OpenStack cloud design presents a
|
||||
unique set of challenges and considerations. For the most part
|
||||
it is similar to a general purpose cloud architecture, as it
|
||||
is built to address a non-specific range of potential use
|
||||
cases or functions. Typically, it is rare that particular
|
||||
workloads determine the design or configuration of massively
|
||||
scalable clouds. The massively scalable cloud is most often
|
||||
built as a platform for a variety of workloads. Because private
|
||||
organizations rarely require or have the resources for them,
|
||||
massively scalable OpenStack clouds are generally built as
|
||||
commercial, public cloud offerings.</para>
|
||||
<para>Services provided by a massively scalable OpenStack cloud
|
||||
include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Virtual-machine disk image library</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Raw block storage</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>File or object storage</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Firewall functionality</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Load balancing functionality</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Private (non-routable) and public (floating) IP
|
||||
addresses</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Virtualized network topologies</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Software bundles</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Virtual compute resources</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Like a general purpose cloud, the instances deployed in a
|
||||
massively scalable OpenStack cloud do not necessarily use
|
||||
any specific aspect of the cloud offering (compute, network,
|
||||
or storage). As the cloud grows in scale, the number of
|
||||
workloads can cause stress on all the cloud
|
||||
components. This adds further stresses to supporting
|
||||
infrastructure such as databases and message brokers. The
|
||||
architecture design for such a cloud must account for these
|
||||
performance pressures without negatively impacting user
|
||||
experience.</para>
|
||||
|
||||
<xi:include href="massively_scalable/section_user_requirements_massively_scalable.xml"/>
|
||||
<xi:include href="massively_scalable/section_tech_considerations_massively_scalable.xml"/>
|
||||
<xi:include href="massively_scalable/section_operational_considerations_massively_scalable.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,34 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="multi_site">
|
||||
<title>Multi-site</title>
|
||||
|
||||
<para>OpenStack is capable of running in a multi-region
|
||||
configuration. This enables some parts of OpenStack to
|
||||
effectively manage a group of sites as a single cloud.</para>
|
||||
<para>Some use cases that might indicate a need for a multi-site
|
||||
deployment of OpenStack include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>An organization with a diverse geographic
|
||||
footprint.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Geo-location sensitive data.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Data locality, in which specific data or
|
||||
functionality should be close to users.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<xi:include href="multi_site/section_user_requirements_multi_site.xml"/>
|
||||
<xi:include href="multi_site/section_tech_considerations_multi_site.xml"/>
|
||||
<xi:include href="multi_site/section_operational_considerations_multi_site.xml"/>
|
||||
<xi:include href="multi_site/section_architecture_multi_site.xml"/>
|
||||
<xi:include href="multi_site/section_prescriptive_examples_multi_site.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,152 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="network_focus">
|
||||
<title>Network focused</title>
|
||||
<para>All OpenStack deployments depend on network communication in order
|
||||
to function properly due to its service-based nature. In some cases,
|
||||
however, the network elevates beyond simple
|
||||
infrastructure. This chapter discusses architectures that are more
|
||||
reliant or focused on network services. These architectures depend
|
||||
on the network infrastructure and require
|
||||
network services that perform reliably in order to satisfy user and
|
||||
application requirements.</para>
|
||||
<para>Some possible use cases include:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Content delivery network</term>
|
||||
<listitem>
|
||||
<para>This includes streaming video, viewing photographs, or
|
||||
accessing any other cloud-based data repository distributed to
|
||||
a large number of end users. Network configuration affects
|
||||
latency, bandwidth, and the distribution of instances. Therefore,
|
||||
it impacts video streaming. Not all video streaming is
|
||||
consumer-focused. For example, multicast videos (used for media,
|
||||
press conferences, corporate presentations, and web conferencing
|
||||
services) can also use a content delivery network.
|
||||
The location of the video repository and its relationship to end
|
||||
users affects content delivery. Network throughput of the back-end
|
||||
systems, as well as the WAN architecture and the cache methodology,
|
||||
also affect performance.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Network management functions</term>
|
||||
<listitem>
|
||||
<para>Use this cloud to provide network service functions built to
|
||||
support the delivery of back-end network services such as DNS,
|
||||
NTP, or SNMP.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Network service offerings</term>
|
||||
<listitem>
|
||||
<para>Use this cloud to run customer-facing network tools to
|
||||
support services. Examples include VPNs, MPLS private networks,
|
||||
and GRE tunnels.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Web portals or web services</term>
|
||||
<listitem>
|
||||
<para>Web servers are a common application for cloud services,
|
||||
and we recommend an understanding of their network requirements.
|
||||
The network requires scaling out to meet user demand and deliver
|
||||
web pages with a minimum latency. Depending on the details of
|
||||
the portal architecture, consider the internal east-west and
|
||||
north-south network bandwidth.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>High speed and high volume transactional systems</term>
|
||||
<listitem>
|
||||
<para>
|
||||
These types of applications are sensitive to network
|
||||
configurations. Examples include financial systems,
|
||||
credit card transaction applications, and trading and other
|
||||
extremely high volume systems. These systems are sensitive
|
||||
to network jitter and latency. They must balance a high volume
|
||||
of east-west and north-south network traffic to
|
||||
maximize efficiency of the data delivery.
|
||||
Many of these systems must access large, high performance
|
||||
database back ends.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>High availability</term>
|
||||
<listitem>
|
||||
<para>These types of use cases are dependent on the proper sizing
|
||||
of the network to maintain replication of data between sites for
|
||||
high availability. If one site becomes unavailable, the extra
|
||||
sites can serve the displaced load until the original site
|
||||
returns to service. It is important to size network capacity
|
||||
to handle the desired loads.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Big data</term>
|
||||
<listitem>
|
||||
<para>Clouds used for the management and collection of big data
|
||||
(data ingest) have a significant demand on network resources.
|
||||
Big data often uses partial replicas of the data to maintain
|
||||
integrity over large distributed clouds. Other big data
|
||||
applications that require a large amount of network resources
|
||||
are Hadoop, Cassandra, NuoDB, Riak, and other NoSQL and
|
||||
distributed databases.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Virtual desktop infrastructure (VDI)</term>
|
||||
<listitem>
|
||||
<para>This use case is sensitive to network congestion, latency,
|
||||
jitter, and other network characteristics. Like video streaming,
|
||||
the user experience is important. However, unlike video
|
||||
streaming, caching is not an option to offset the network issues.
|
||||
VDI requires both upstream and downstream traffic and cannot rely
|
||||
on caching for the delivery of the application to the end user.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Voice over IP (VoIP)</term>
|
||||
<listitem>
|
||||
<para>This is sensitive to network congestion, latency, jitter,
|
||||
and other network characteristics. VoIP has a symmetrical traffic
|
||||
pattern and it requires network quality of service (QoS) for best
|
||||
performance. In addition, you can implement active queue management
|
||||
to deliver voice and multimedia content. Users are sensitive to
|
||||
latency and jitter fluctuations and can detect them at very low
|
||||
levels.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Video conference or web conference</term>
|
||||
<listitem>
|
||||
<para>This is sensitive to network congestion, latency, jitter,
|
||||
and other network characteristics. Video conferencing has a
|
||||
symmetrical traffic pattern, but unless the network is on an
|
||||
MPLS private network, it cannot use network quality of service
|
||||
(QoS) to improve performance. Similar to VoIP, users are
|
||||
sensitive to network performance issues even at low levels.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>High performance computing (HPC)</term>
|
||||
<listitem>
|
||||
<para>This is a complex use case that requires careful
|
||||
consideration of the traffic flows and usage patterns to address
|
||||
the needs of cloud clusters. It has high east-west traffic
|
||||
patterns for distributed computing, but there can be substantial
|
||||
north-south traffic depending on the specific application.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
<xi:include href="network_focus/section_user_requirements_network_focus.xml"/>
|
||||
<xi:include href="network_focus/section_tech_considerations_network_focus.xml"/>
|
||||
<xi:include href="network_focus/section_operational_considerations_network_focus.xml"/>
|
||||
<xi:include href="network_focus/section_architecture_network_focus.xml"/>
|
||||
<xi:include href="network_focus/section_prescriptive_examples_network_focus.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,128 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-design-references">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>References</title>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://ec.europa.eu/justice/data-protection/">Data
|
||||
Protection framework of the European Union</link>: Guidance on
|
||||
Data Protection laws governed by the EU.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://www.internetsociety.org/deploy360/blog/2014/05/goodbye-ipv4-iana-starts-allocating-final-address-blocks/">Depletion
|
||||
of IPv4 Addresses</link>: describes how the depletion of IPv4 addresses
|
||||
makes the migration to IPv6 inevitable.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://www.garrettcom.com/techsupport/papers/ethernet_switch_reliability.pdf">Ethernet
|
||||
Switch Reliability</link>: Research white paper on Ethernet Switch
|
||||
reliability.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://www.finra.org/Industry/Regulation/FINRARules/">Financial
|
||||
Industry Regulatory Authority</link>: Requirements of the
|
||||
Financial Industry Regulatory Authority in the USA.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/cli-reference/content/chapter_cli-glance-property.html">Image
|
||||
Service property keys</link>: Glance API property keys allow the
|
||||
administrator to attach custom characteristics to images.
|
||||
</para>
|
||||
<para>
|
||||
<link xlink:href="http://libguestfs.org">LibGuestFS
|
||||
Documentation</link>: Official LibGuestFS documentation.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops/content/logging_monitoring.html">Logging
|
||||
and Monitoring</link>: Official OpenStack Operations
|
||||
documentation.
|
||||
</para>
|
||||
<para>
|
||||
<link xlink:href="http://manageiq.org/">ManageIQ Cloud Management
|
||||
Platform</link>: An Open Source Cloud Management Platform for
|
||||
managing multiple clouds.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://www.n-tron.com/pdf/network_availability.pdf">N-Tron
|
||||
Network Availability</link>: Research white paper on network
|
||||
availability.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://davejingtian.org/2014/03/30/nested-kvm-just-for-fun">Nested
|
||||
KVM</link>: Post on how to nest KVM under KVM.
|
||||
</para>
|
||||
<para>
|
||||
<link xlink:href="http://www.opencompute.org/">Open Compute
|
||||
Project</link>: The Open Compute Project Foundation's mission is
|
||||
to design and enable the delivery of the most efficient server,
|
||||
storage and data center hardware designs for scalable
|
||||
computing.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops/content/flavors.html">OpenStack
|
||||
Flavors</link>: Official OpenStack documentation.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/ha-guide/">OpenStack
|
||||
High Availability Guide</link>: Information on how to provide
|
||||
redundancy for the OpenStack components.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="https://wiki.openstack.org/wiki/HypervisorSupportMatrix">OpenStack
|
||||
Hypervisor Support Matrix</link>: Matrix of supported hypervisors
|
||||
and capabilities when used with OpenStack.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/developer/swift/replication_network.html">OpenStack
|
||||
Object Store (Swift) Replication Reference</link>: Developer
|
||||
documentation of Swift replication.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops/">OpenStack
|
||||
Operations Guide</link>: The OpenStack Operations Guide provides
|
||||
information on setting up and installing OpenStack.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/security-guide/">OpenStack
|
||||
Security Guide</link>: The OpenStack Security Guide provides
|
||||
information on securing OpenStack deployments.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="http://www.openstack.org/marketplace/training">OpenStack
|
||||
Training Marketplace</link>: The OpenStack Market for training and
|
||||
vendors providing training on OpenStack.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="https://wiki.openstack.org/wiki/Pci_passthrough#How_to_check_PCI_status_with_PCI_api_paches">PCI
|
||||
passthrough</link>: The PCI API patches extend the
|
||||
servers/os-hypervisor to show PCI information for instance and
|
||||
compute node, and also provide a resource endpoint to show PCI
|
||||
information.
|
||||
</para>
|
||||
<para>
|
||||
<link
|
||||
xlink:href="https://wiki.openstack.org/wiki/TripleO">TripleO</link>:
|
||||
TripleO is a program aimed at installing, upgrading and operating
|
||||
OpenStack clouds using OpenStack's own cloud facilities as the
|
||||
foundation.
|
||||
</para>
|
||||
</chapter>
|
@ -1,67 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="specialized">
|
||||
<title>Specialized cases</title>
|
||||
<para>Although most OpenStack architecture designs fall into one
|
||||
of the seven major scenarios outlined in other sections
|
||||
(compute focused, network focused, storage focused, general
|
||||
purpose, multi-site, hybrid cloud, and massively scalable),
|
||||
there are a few use cases that do not fit into these categories.
|
||||
This section discusses these specialized cases and provides
|
||||
some additional details and design considerations
|
||||
for each use case:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<link
|
||||
linkend="specialized-networking-example">Specialized
|
||||
networking</link>: describes running
|
||||
networking-oriented software that may involve reading
|
||||
packets directly from the wire or participating in
|
||||
routing protocols.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link
|
||||
linkend="software-defined-networking-sdn">Software-defined
|
||||
networking (SDN)</link>: describes both
|
||||
running an SDN controller from within OpenStack as well
|
||||
as participating in a software-defined network.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link
|
||||
linkend="desktop-as-a-service">Desktop-as-a-Service</link>:
|
||||
describes running a virtualized desktop environment
|
||||
in a cloud (<glossterm>Desktop-as-a-Service</glossterm>).
|
||||
This applies to private and public clouds.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link
|
||||
linkend="arch-guide-openstack-on-openstack">OpenStack on
|
||||
OpenStack</link>: describes building a multi-tiered cloud by
|
||||
running OpenStack on top of an OpenStack installation.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="specialized-hardware">Specialized
|
||||
hardware</link>: describes the use of specialized
|
||||
hardware devices from within the OpenStack environment.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<xi:include href="specialized/section_multi_hypervisor_specialized.xml"/>
|
||||
<xi:include href="specialized/section_networking_specialized.xml"/>
|
||||
<xi:include href="specialized/section_software_defined_networking_specialized.xml"/>
|
||||
<xi:include href="specialized/section_desktop_as_a_service_specialized.xml"/>
|
||||
<xi:include href="specialized/section_openstack_on_openstack_specialized.xml"/>
|
||||
<xi:include href="specialized/section_hardware_specialized.xml"/>
|
||||
</chapter>
|
@ -1,78 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<chapter xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="storage_focus">
|
||||
<title>Storage focused</title>
|
||||
|
||||
<para>Cloud storage is a model of data storage that stores digital
|
||||
data in logical pools and physical storage that spans
|
||||
across multiple servers and locations. Cloud storage commonly
|
||||
refers to a hosted object storage service, however the term
|
||||
also includes other types of data storage that are
|
||||
available as a service, for example block storage.</para>
|
||||
<para>Cloud storage runs on virtualized infrastructure and
|
||||
resembles broader cloud computing in terms of accessible
|
||||
interfaces, elasticity, scalability, multi-tenancy, and
|
||||
metered resources. You can use cloud storage services from
|
||||
an off-premises service or deploy on-premises.</para>
|
||||
<para>Cloud storage consists of many distributed, synonymous
|
||||
resources, which are often referred to as integrated
|
||||
storage clouds. Cloud storage is highly fault tolerant through
|
||||
redundancy and the distribution of data. It is highly durable
|
||||
through the creation of versioned copies, and can be
|
||||
consistent with regard to data replicas.</para>
|
||||
<para>At large scale, management of data operations is
|
||||
a resource intensive process for an organization. Hierarchical
|
||||
storage management (HSM) systems and data grids help
|
||||
annotate and report a baseline data valuation to make
|
||||
intelligent decisions and automate data decisions. HSM enables
|
||||
automated tiering and movement, as well as orchestration
|
||||
of data operations. A data grid is an architecture, or an evolving set of
|
||||
services and technologies, that brings together sets of
|
||||
services enabling users to manage large data sets.</para>
|
||||
<para>Example applications deployed with cloud
|
||||
storage characteristics:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Active archive, backups and hierarchical storage
|
||||
management.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>General content storage and synchronization. An
|
||||
example of this is a private Dropbox-like service.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Data analytics with parallel file systems.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Unstructured data store for services. For example,
|
||||
social media back-end storage.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Persistent block storage.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Operating system and application image store.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Media streaming.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Databases.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Content distribution.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Cloud storage peering.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<xi:include href="storage_focus/section_tech_considerations_storage_focus.xml"/>
|
||||
<xi:include href="storage_focus/section_operational_considerations_storage_focus.xml"/>
|
||||
<xi:include href="storage_focus/section_architecture_storage_focus.xml"/>
|
||||
<xi:include href="storage_focus/section_prescriptive_examples_storage_focus.xml"/>
|
||||
|
||||
</chapter>
|
@ -1,268 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-design-architecture-hardware">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Architecture</title>
|
||||
<para>The hardware selection covers three areas:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Compute</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Network</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Storage</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Compute-focused OpenStack clouds have high demands on processor and
|
||||
memory resources, and require hardware that can handle these demands.
|
||||
Consider the following factors when selecting compute (server) hardware:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Server density</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Resource capacity</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Expandability</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Cost</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Weigh these considerations against each other to determine the
|
||||
best design for the desired purpose. For example, increasing server density
|
||||
means sacrificing resource capacity or expandability.</para>
|
||||
<para>A compute-focused cloud should have an emphasis on server hardware
|
||||
that can offer more CPU sockets, more CPU cores, and more RAM. Network
|
||||
connectivity and storage capacity are less critical.</para>
|
||||
<para>When designing a compute-focused OpenStack architecture, you must
|
||||
consider whether you intend to scale up or scale out.
|
||||
Selecting a smaller number of larger hosts, or a
|
||||
larger number of smaller hosts, depends on a combination of factors:
|
||||
cost, power, cooling, physical rack and floor space, support-warranty,
|
||||
and manageability.</para>
|
||||
<para>Considerations for selecting hardware:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Most blade servers can support dual-socket multi-core CPUs. To
|
||||
avoid this CPU limit, select <literal>full width</literal>
|
||||
or <literal>full height</literal> blades.
|
||||
Be aware, however, that this also decreases server density. For example,
|
||||
high density blade servers such as HP BladeSystem or Dell PowerEdge
|
||||
M1000e support up to 16 servers in only ten rack units. Using
|
||||
half-height blades is twice as dense as using full-height blades,
|
||||
which fit only eight servers per ten rack units.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>1U rack-mounted servers that occupy only a single rack
|
||||
unit may offer greater server density than a blade server
|
||||
solution. It is possible to place forty 1U servers in a rack, providing
|
||||
space for the top of rack (ToR) switches, compared to 32 full width
|
||||
blade servers.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>2U rack-mounted servers provide quad-socket, multi-core CPU
|
||||
support, but with a corresponding decrease in server density (half
|
||||
the density that 1U rack-mounted servers offer).</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Larger rack-mounted servers, such as 4U servers, often provide
|
||||
even greater CPU capacity, commonly supporting four or even eight CPU
|
||||
sockets. These servers have greater expandability, but such servers
|
||||
have much lower server density and are often more expensive.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para><literal>Sled servers</literal> are rack-mounted servers that
|
||||
support multiple
|
||||
independent servers in a single 2U or 3U enclosure. These deliver higher
|
||||
density as compared to typical 1U or 2U rack-mounted servers. For
|
||||
example, many sled servers offer four independent dual-socket
|
||||
nodes in 2U for a total of eight CPU sockets in 2U.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Consider these when choosing server hardware for a
|
||||
compute-focused OpenStack design architecture:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Instance density</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Host density</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Power and cooling density</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<section xml:id="selecting-networking-hardware-arch">
|
||||
<title>Selecting networking hardware</title>
|
||||
<para>Some of the key considerations for networking hardware selection
|
||||
include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Port count</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Port density</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Port speed</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Redundancy</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Power requirements</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>We recommend designing the network architecture using
|
||||
a scalable network model that makes it easy to add capacity and
|
||||
bandwidth. A good example of such a model is the leaf-spine model. In
|
||||
this type of network design, it is possible to easily add additional
|
||||
bandwidth as well as scale out to additional racks of gear. It is
|
||||
important to select network hardware that supports the required
|
||||
port count, port speed, and port density while also allowing for future
|
||||
growth as workload demands increase. It is also important to evaluate
|
||||
where in the network architecture it is valuable to provide redundancy.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="os-and-hypervisor-arch">
|
||||
<title>Operating system and hypervisor</title>
|
||||
<para>The selection of operating system (OS) and hypervisor has a
|
||||
significant impact on the end point design.</para>
|
||||
<para>OS and hypervisor selection impact the following areas:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Cost</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Supportability</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Management tools</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Scale and performance</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Security</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Supported features</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Interoperability</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="openstack-components-arch">
|
||||
<title>OpenStack components</title>
|
||||
<para>The selection of OpenStack components is important.
|
||||
There are certain components that are required, for example the compute
|
||||
and image services, but others, such as the Orchestration service, may not
|
||||
be present.</para>
|
||||
<para>For a compute-focused OpenStack design architecture, the
|
||||
following components may be present:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Identity (keystone)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Dashboard (horizon)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Compute (nova)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Object Storage (swift)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Image (glance)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Networking (neutron)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Orchestration (heat)</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<note>
|
||||
<para>A compute-focused design is less likely to include OpenStack Block
|
||||
Storage. However, there may be some situations where the need for
|
||||
performance requires a block storage component to improve data I/O.</para>
|
||||
</note>
|
||||
<para>The exclusion of certain OpenStack components might also limit the
|
||||
functionality of other components. If a design includes
|
||||
the Orchestration service but excludes the Telemetry service, then
|
||||
the design cannot take advantage of Orchestration's auto
|
||||
scaling functionality as this relies on information from Telemetry.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="networking-software-arch">
|
||||
<title>Networking software</title>
|
||||
<para>OpenStack Networking provides a wide variety of networking services
|
||||
for instances. There are many additional networking software packages
|
||||
that might be useful to manage the OpenStack components themselves.
|
||||
The <citetitle>OpenStack High Availability Guide</citetitle>
|
||||
(<link xlink:href="http://docs.openstack.org/ha-guide/">http://docs.openstack.org/ha-guide/</link>)
|
||||
describes some of these software packages in more detail.
|
||||
</para>
|
||||
<para>For a compute-focused OpenStack cloud, the OpenStack infrastructure
|
||||
components must be highly available. If the design does not
|
||||
include hardware load balancing, you must add networking software packages,
|
||||
for example, HAProxy.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="management-software-arch">
|
||||
<title>Management software</title>
|
||||
<para>The selected supplemental software solution affects
|
||||
the overall OpenStack cloud design. This includes software for
|
||||
providing clustering, logging, monitoring and alerting.</para>
|
||||
<para>The availability requirements of the design are the main determinant
|
||||
for the inclusion of clustering software, such as Corosync or Pacemaker.</para>
|
||||
<para>Operational considerations determine the requirements for logging,
|
||||
monitoring, and alerting. Each of these sub-categories includes
|
||||
various options.</para>
|
||||
<para>Some other potential design impacts include:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>OS-hypervisor combination</term>
|
||||
<listitem>
|
||||
<para>Ensure that the selected logging,
|
||||
monitoring, or alerting tools support the proposed OS-hypervisor
|
||||
combination.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Network hardware</term>
|
||||
<listitem>
|
||||
<para>The logging, monitoring, and alerting software
|
||||
must support the network hardware selection.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="database-software-arch">
|
||||
<title>Database software</title>
|
||||
<para>A large majority of OpenStack components require access to
|
||||
back-end database services to store state and configuration
|
||||
information. Select an appropriate back-end database that
|
||||
satisfies the availability and fault tolerance requirements of the
|
||||
OpenStack services. OpenStack services support connecting
|
||||
to any database that the SQLAlchemy Python drivers support,
|
||||
however most common database deployments make use of MySQL or some
|
||||
variation of it. We recommend that you make the database that provides
|
||||
back-end services within a compute-focused cloud highly
|
||||
available. Some of the more common software solutions include Galera,
|
||||
MariaDB, and MySQL with multi-master replication.</para>
|
||||
</section>
|
||||
|
||||
</section>
|
@ -1,84 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="operational-considerations-compute-focus">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Operational considerations</title>
|
||||
<para>There are a number of operational considerations that affect the
|
||||
design of compute-focused OpenStack clouds, including:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Enforcing strict API availability requirements
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Understanding and dealing with failure scenarios
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Managing host maintenance schedules
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Service-level agreements (SLAs) are contractual obligations that
|
||||
ensure the availability of a service. When designing an OpenStack cloud,
|
||||
factoring in promises of availability implies a certain level of
|
||||
redundancy and resiliency.</para>
|
||||
|
||||
<section xml:id="montioring-compute-focus">
|
||||
<title>Monitoring</title>
|
||||
<para>OpenStack clouds require appropriate monitoring platforms
|
||||
to catch and manage errors.</para>
|
||||
<note>
|
||||
<para>We recommend leveraging existing monitoring systems
|
||||
to see if they are able to effectively monitor an
|
||||
OpenStack environment.</para>
|
||||
</note>
|
||||
<para>Specific meters that are critically important to capture
|
||||
include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Image disk utilization</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Response time to the Compute API</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="capacity-planning-operational">
|
||||
<title>Capacity planning</title>
|
||||
<para>Adding extra capacity to an OpenStack cloud is a
|
||||
horizontal scaling process.</para>
|
||||
<para>We recommend similar (or the same) CPUs
|
||||
when adding extra nodes to the environment. This reduces
|
||||
the chance of breaking live-migration features if they are
|
||||
present. Scaling out hypervisor hosts also has a direct effect
|
||||
on network and other data center resources. We recommend you
|
||||
factor in this increase when reaching rack capacity or when requiring
|
||||
extra network switches.</para>
|
||||
<para>Changing the internal components of a Compute host to account for
|
||||
increases in demand is a process known as vertical scaling.
|
||||
Swapping a CPU for one with more cores, or
|
||||
increasing the memory in a server, can help add extra
|
||||
capacity for running applications.</para>
|
||||
<para>Another option is to assess the average workloads and
|
||||
increase the number of instances that can run within the
|
||||
compute environment by adjusting the overcommit ratio.</para>
|
||||
<note>
|
||||
<para>It is important to remember that changing the CPU
|
||||
overcommit ratio can have a detrimental effect and cause
|
||||
a potential increase in noisy neighbor issues.</para>
|
||||
</note>
|
||||
<para>The added risk of increasing the overcommit ratio is that
|
||||
more instances fail when a compute host fails. We do not recommend
|
||||
that you increase the CPU overcommit ratio in compute-focused
|
||||
OpenStack design architecture, as it can increase the potential
|
||||
for noisy neighbor issues.</para>
|
||||
</section>
|
||||
</section>
|
@ -1,162 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="prescriptive-example-compute-focus">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Prescriptive examples</title>
|
||||
<para>The Conseil Européen pour la Recherche Nucléaire (CERN),
|
||||
also known as the European Organization for Nuclear Research,
|
||||
provides particle accelerators and other infrastructure for
|
||||
high-energy physics research.</para>
|
||||
<para>As of 2011 CERN operated these two compute centers in Europe
|
||||
with plans to add a third.</para>
|
||||
<informaltable rules="all">
|
||||
<col width="40%" />
|
||||
<col width="60%" />
|
||||
<thead>
|
||||
<tr><th>Data center</th><th>Approximate capacity</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Geneva, Switzerland</td>
|
||||
<td>
|
||||
<itemizedlist>
|
||||
<listitem><para>3.5 megawatts</para></listitem>
|
||||
<listitem><para>91000 cores</para></listitem>
|
||||
<listitem><para>120 PB HDD</para></listitem>
|
||||
<listitem><para>100 PB Tape</para></listitem>
|
||||
<listitem><para>310 TB Memory</para></listitem>
|
||||
</itemizedlist>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Budapest, Hungary</td>
|
||||
<td>
|
||||
<itemizedlist>
|
||||
<listitem><para>2.5 megawatts</para></listitem>
|
||||
<listitem><para>20000 cores</para></listitem>
|
||||
<listitem><para>6 PB HDD</para></listitem>
|
||||
</itemizedlist>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</informaltable>
|
||||
<para>To support a growing number of compute-heavy users of
|
||||
experiments related to the Large Hadron Collider (LHC), CERN
|
||||
ultimately elected to deploy an OpenStack cloud using
|
||||
Scientific Linux and RDO. This effort aimed to simplify the
|
||||
management of the center's compute resources with a view to
|
||||
doubling compute capacity through the addition of a
|
||||
data center in 2013 while maintaining the same
|
||||
levels of compute staff.</para>
|
||||
<para>The CERN solution uses <glossterm baseform="cell">cells</glossterm>
|
||||
for segregation of compute
|
||||
resources and for transparently scaling between different data
|
||||
centers. This decision meant trading off support for security
|
||||
groups and live migration. In addition, they must manually replicate
|
||||
some details, like flavors, across cells. In
|
||||
spite of these drawbacks, cells provide the
|
||||
required scale while exposing a single public API endpoint to
|
||||
users.</para>
|
||||
<para>CERN created a compute cell for each of the two original data
|
||||
centers and created a third when it added a new data center
|
||||
in 2013. Each cell contains three availability zones to
|
||||
further segregate compute resources and at least three
|
||||
RabbitMQ message brokers configured for clustering with
|
||||
mirrored queues for high availability.</para>
|
||||
<para>The API cell, which resides behind a HAProxy load balancer,
|
||||
is in the data center in Switzerland and directs API
|
||||
calls to compute cells using a customized variation of the
|
||||
cell scheduler. The customizations allow certain workloads to
|
||||
route to a specific data center or all data centers,
|
||||
with cell RAM availability determining cell selection in the
|
||||
latter case.</para>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in" fileref="../figures/Generic_CERN_Example.png"/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
<para>There is also some customization of the filter scheduler
|
||||
that handles placement within the cells:</para>
|
||||
<variablelist>
|
||||
<varlistentry><term>ImagePropertiesFilter</term>
|
||||
<listitem>
|
||||
<para>Provides special handling
|
||||
depending on the guest operating system in use
|
||||
(Linux-based or Windows-based).</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry><term>ProjectsToAggregateFilter</term>
|
||||
<listitem><para>Provides special
|
||||
handling depending on which project the instance is
|
||||
associated with.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry><term>default_schedule_zones</term>
|
||||
<listitem><para>Allows the selection of
|
||||
multiple default availability zones, rather than a
|
||||
single default.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>A central database team manages the MySQL database server in each cell
|
||||
in an active/passive configuration with a NetApp storage back end.
|
||||
Backups run every 6 hours.</para>
|
||||
|
||||
<section xml:id="network-architecture">
|
||||
<title>Network architecture</title>
|
||||
<para>To integrate with existing networking infrastructure, CERN
|
||||
made customizations to legacy networking (nova-network). This was in the
|
||||
form of a driver to integrate with CERN's existing database
|
||||
for tracking MAC and IP address assignments.</para>
|
||||
<para>The driver facilitates selection of a MAC address and IP for
|
||||
new instances based on the compute node where the scheduler places
|
||||
the instance.</para>
|
||||
<para>The driver considers the compute node where the scheduler
|
||||
placed an instance and selects a MAC address and IP
|
||||
from the pre-registered list associated with that node in the
|
||||
database. The database updates to reflect the address assignment to
|
||||
that instance.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="storage-architecture">
|
||||
<title>Storage architecture</title>
|
||||
<para>CERN deploys the OpenStack Image service in the API cell and
|
||||
configures it to expose version 1 (V1) of the API. This also requires
|
||||
the image registry. The storage back end in
|
||||
use is a 3 PB Ceph cluster.</para>
|
||||
<para>CERN maintains a small set of Scientific Linux 5 and 6 images onto
|
||||
which orchestration tools can place applications. Puppet manages
|
||||
instance configuration and customization.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="monitoring">
|
||||
<title>Monitoring</title>
|
||||
<para>CERN does not require direct billing, but uses the Telemetry service
|
||||
to perform metering for the purposes of adjusting
|
||||
project quotas. CERN uses a sharded, replicated MongoDB back end.
|
||||
To spread API load, CERN deploys instances of the nova-api service
|
||||
within the child cells for Telemetry to query
|
||||
against. This also requires the configuration of supporting services
|
||||
such as keystone, glance-api, and glance-registry in the child cells.
|
||||
</para>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Generic_CERN_Architecture.png"/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
<para>
|
||||
Additional monitoring tools in use include <link
|
||||
xlink:href="http://flume.apache.org/">Flume</link>, <link
|
||||
xlink:href="http://www.elasticsearch.org/">Elasticsearch</link>,
|
||||
<link
|
||||
xlink:href="http://www.elasticsearch.org/overview/kibana/">Kibana</link>,
|
||||
and the CERN-developed <link
|
||||
xlink:href="http://lemon.web.cern.ch/lemon/index.shtml">Lemon</link>
|
||||
project.
|
||||
</para>
|
||||
</section>
|
||||
</section>
|
@ -1,275 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE section [
|
||||
<!ENTITY % openstack SYSTEM "../../common/entities/openstack.ent">
|
||||
%openstack;
|
||||
]>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="technical-considerations-compute-focus">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Technical considerations</title>
|
||||
<para>In a compute-focused OpenStack cloud, the type of instance
|
||||
workloads you provision heavily influences technical
|
||||
decision making.</para>
|
||||
<para>Public and private clouds require deterministic capacity
|
||||
planning to support elastic growth in order to meet user SLA
|
||||
expectations. Deterministic capacity planning is the path to
|
||||
predicting the effort and expense of making a given process
|
||||
perform consistently. This process is important because,
|
||||
when a service becomes a critical part of a user's
|
||||
infrastructure, the user's experience links directly to the SLAs of
|
||||
the cloud itself.</para>
|
||||
<para>There are two aspects of capacity planning to consider:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Planning the initial deployment footprint</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Planning expansion of the environment to stay ahead of the
|
||||
demands of cloud users</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Begin planning an initial OpenStack deployment footprint with
|
||||
estimations of expected uptake, and existing infrastructure workloads.</para>
|
||||
<para>The starting point is the core count of the cloud. By
|
||||
applying relevant ratios, the user can gather information
|
||||
about:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>The number of expected concurrent instances:
|
||||
(overcommit fraction × cores) / virtual cores per instance</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Required storage: flavor disk size × number of instances</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>These ratios determine the amount of
|
||||
additional infrastructure needed to support the cloud. For
|
||||
example, consider a situation in which you require 1600
|
||||
instances, each with 2 vCPU and 50 GB of storage. Assuming the
|
||||
default overcommit rate of 16:1, working out the math provides
|
||||
an equation of:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>1600 = (16 × (number of physical cores)) / 2</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Storage required = 50 GB × 1600</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>On the surface, the equations reveal the need for 200
|
||||
physical cores and 80 TB of storage for
|
||||
<filename>/var/lib/nova/instances/</filename>. However,
|
||||
it is also important to
|
||||
look at patterns of usage to estimate the load that the API
|
||||
services, database servers, and queue servers are likely to
|
||||
encounter.</para>
|
||||
<para>Aside from the creation and termination of instances, consider the
|
||||
impact of users accessing the service,
|
||||
particularly on nova-api and its associated database. Listing
|
||||
instances gathers a great deal of information and given the
|
||||
frequency with which users run this operation, a cloud with a
|
||||
large number of users can increase the load significantly.
|
||||
This can even occur unintentionally. For example, the
|
||||
OpenStack Dashboard instances tab refreshes the list of
|
||||
instances every 30 seconds, so leaving it open in a browser
|
||||
window can cause unexpected load.</para>
|
||||
<para>Consideration of these factors can help determine how many
|
||||
cloud controller cores you require. A server with 8 CPU cores
|
||||
and 8 GB of RAM would be sufficient for a rack of
|
||||
compute nodes, given the above caveats.</para>
|
||||
<para>Key hardware specifications are also crucial to the
|
||||
performance of user instances. Be sure to consider budget and
|
||||
performance needs, including storage performance
|
||||
(spindles/core), memory availability (RAM/core), network
|
||||
bandwidth (Gbps/core), and overall CPU performance
|
||||
(CPU/core).</para>
|
||||
<para>The cloud resource calculator is a useful tool in examining
|
||||
the impacts of different hardware and instance load outs. See:
|
||||
<link xlink:href="https://github.com/noslzzp/cloud-resource-calculator/blob/master/cloud-resource-calculator.ods">https://github.com/noslzzp/cloud-resource-calculator/blob/master/cloud-resource-calculator.ods</link>
|
||||
</para>
|
||||
|
||||
<section xml:id="expansion-planning-compute-focus">
|
||||
<title>Expansion planning</title>
|
||||
<para>A key challenge for planning the expansion of cloud
|
||||
compute services is the elastic nature of cloud infrastructure
|
||||
demands.</para>
|
||||
<para>Planning for expansion is a balancing act.
|
||||
Planning too conservatively can lead to unexpected
|
||||
oversubscription of the cloud and dissatisfied users. Planning
|
||||
for cloud expansion too aggressively can lead to unexpected
|
||||
underutilization of the cloud and funds spent unnecessarily on operating
|
||||
infrastructure.</para>
|
||||
<para>The key is to carefully monitor the trends in
|
||||
cloud usage over time. The intent is to measure the
|
||||
consistency with which you deliver services, not the
|
||||
average speed or capacity of the cloud. Using this information
|
||||
to model capacity performance enables users to more
|
||||
accurately determine the current and future capacity of the
|
||||
cloud.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="cpu-and-ram-compute-focus">
|
||||
<title>CPU and RAM</title>
|
||||
<para>OpenStack enables users to overcommit CPU and RAM on
|
||||
compute nodes. This allows an increase in the number of
|
||||
instances running on the cloud at the cost of reducing the
|
||||
performance of the instances. OpenStack Compute uses the
|
||||
following ratios by default:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>CPU allocation ratio: 16:1</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>RAM allocation ratio: 1.5:1</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>The default CPU allocation ratio of 16:1 means that the
|
||||
scheduler allocates up to 16 virtual cores per physical core.
|
||||
For example, if a physical node has 12 cores, the scheduler
|
||||
sees 192 available virtual cores. With typical flavor
|
||||
definitions of 4 virtual cores per instance, this ratio would
|
||||
provide 48 instances on a physical node.</para>
|
||||
<para>Similarly, the default RAM allocation ratio of 1.5:1 means
|
||||
that the scheduler allocates instances to a physical node as
|
||||
long as the total amount of RAM associated with the instances
|
||||
is less than 1.5 times the amount of RAM available on the
|
||||
physical node.</para>
|
||||
<para>You must select the appropriate CPU and RAM allocation ratio
|
||||
based on particular use cases.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="additional-hardware-compute-focus">
|
||||
<title>Additional hardware</title>
|
||||
<para>Certain use cases may benefit from exposure to additional
|
||||
devices on the compute node. Examples might include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>High performance computing jobs that benefit from
|
||||
the availability of graphics processing units (GPUs)
|
||||
for general-purpose computing.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Cryptographic routines that benefit from the
|
||||
availability of hardware random number generators to
|
||||
avoid entropy starvation.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Database management systems that benefit from the
|
||||
availability of SSDs for ephemeral storage to maximize
|
||||
read/write performance.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Host aggregates group hosts that share similar
|
||||
characteristics, which can include hardware similarities. The
|
||||
addition of specialized hardware to a cloud deployment is
|
||||
likely to add to the cost of each node, so consider carefully
|
||||
whether all compute nodes, or
|
||||
just a subset targeted by flavors, need the
|
||||
additional customization to support the desired
|
||||
workloads.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="utilization">
|
||||
<title>Utilization</title>
|
||||
<para>Infrastructure-as-a-Service offerings, including OpenStack,
|
||||
use flavors to provide standardized views of virtual machine
|
||||
resource requirements that simplify the problem of scheduling
|
||||
instances while making the best use of the available physical
|
||||
resources.</para>
|
||||
<para>In order to facilitate packing of virtual machines onto
|
||||
physical hosts, the default selection of flavors provides a
|
||||
second largest flavor that is half the size
|
||||
of the largest flavor in every dimension. It has half the
|
||||
vCPUs, half the vRAM, and half the ephemeral disk space. The
|
||||
next largest flavor is half that size again. The following figure
|
||||
provides a visual representation of this concept for a general
|
||||
purpose computing design:
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Compute_Tech_Bin_Packing_General1.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject></para>
|
||||
<para>The following figure displays a CPU-optimized, packed server:
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Compute_Tech_Bin_Packing_CPU_optimized1.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject></para>
|
||||
<para>These default flavors are well suited to typical configurations
|
||||
of commodity server hardware. To maximize utilization,
|
||||
however, it may be necessary to customize the flavors or
|
||||
create new ones in order to better align instance sizes to the
|
||||
available hardware.</para>
|
||||
<para>Workload characteristics may also influence hardware choices
|
||||
and flavor configuration, particularly where they present
|
||||
different ratios of CPU versus RAM versus HDD
|
||||
requirements.</para>
|
||||
<para>For more information on flavors, see:
|
||||
<link xlink:href="http://docs.openstack.org/openstack-ops/content/flavors.html">OpenStack Operations Guide: Flavors</link></para>
|
||||
</section>
|
||||
|
||||
<section xml:id="openstack-components-compute-focus">
|
||||
<title>OpenStack components</title>
|
||||
<para>Due to the nature of the workloads in this
|
||||
scenario, a number of components are highly beneficial for
|
||||
a Compute-focused cloud. This includes the typical OpenStack
|
||||
components:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>OpenStack Compute (nova)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Image service (glance)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Identity (keystone)</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Also consider several specialized components:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para><glossterm>Orchestration</glossterm> (heat)</para>
|
||||
<para>Given the nature of the
|
||||
applications involved in this scenario, these are heavily
|
||||
automated deployments. Making use of Orchestration is highly
|
||||
beneficial in this case. You can script the deployment of a
|
||||
batch of instances and the running of tests, but it
|
||||
makes sense to use the Orchestration service
|
||||
to handle all these actions.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Telemetry (ceilometer)</para>
|
||||
<para>Telemetry and the alarms it generates support autoscaling
|
||||
of instances using Orchestration. Users that are not using the
|
||||
Orchestration service do not need to deploy the Telemetry
|
||||
service and may choose to use external solutions to fulfill
|
||||
their metering and monitoring requirements.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Block Storage (cinder)</para>
|
||||
<para>Due to the burst-able nature of the workloads and the
|
||||
applications and instances that perform batch
|
||||
processing, this cloud mainly uses memory or CPU, so
|
||||
the need for add-on storage to each instance is not a likely
|
||||
requirement. This does not mean that you do not use
|
||||
OpenStack Block Storage (cinder) in the infrastructure, but
|
||||
typically it is not a central component.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Networking</para>
|
||||
<para>When choosing a networking platform, ensure that it either
|
||||
works with all desired hypervisor and container technologies
|
||||
and their OpenStack drivers, or that it includes an implementation of
|
||||
an ML2 mechanism driver. You can mix networking platforms
|
||||
that provide ML2 mechanism drivers.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
</section>
|
Before Width: | Height: | Size: 52 KiB |
Before Width: | Height: | Size: 39 KiB |
Before Width: | Height: | Size: 35 KiB |
Before Width: | Height: | Size: 118 KiB |
Before Width: | Height: | Size: 83 KiB |
Before Width: | Height: | Size: 79 KiB |
Before Width: | Height: | Size: 77 KiB |
Before Width: | Height: | Size: 79 KiB |
Before Width: | Height: | Size: 70 KiB |
Before Width: | Height: | Size: 24 KiB |
Before Width: | Height: | Size: 42 KiB |
Before Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 60 KiB |
Before Width: | Height: | Size: 52 KiB |
Before Width: | Height: | Size: 59 KiB |
Before Width: | Height: | Size: 49 KiB |
Before Width: | Height: | Size: 54 KiB |
Before Width: | Height: | Size: 72 KiB |
Before Width: | Height: | Size: 54 KiB |
Before Width: | Height: | Size: 68 KiB |
Before Width: | Height: | Size: 54 KiB |
Before Width: | Height: | Size: 53 KiB |
Before Width: | Height: | Size: 50 KiB |
Before Width: | Height: | Size: 54 KiB |
Before Width: | Height: | Size: 55 KiB |
Before Width: | Height: | Size: 52 KiB |
Before Width: | Height: | Size: 75 KiB |
Before Width: | Height: | Size: 40 KiB |
Before Width: | Height: | Size: 37 KiB |
Before Width: | Height: | Size: 56 KiB |
Before Width: | Height: | Size: 20 KiB |
Before Width: | Height: | Size: 30 KiB |
Before Width: | Height: | Size: 22 KiB |
Before Width: | Height: | Size: 46 KiB |
Before Width: | Height: | Size: 56 KiB |
Before Width: | Height: | Size: 25 KiB |
Before Width: | Height: | Size: 56 KiB |
Before Width: | Height: | Size: 52 KiB |
Before Width: | Height: | Size: 50 KiB |
Before Width: | Height: | Size: 46 KiB |
Before Width: | Height: | Size: 50 KiB |
Before Width: | Height: | Size: 35 KiB |
Before Width: | Height: | Size: 21 KiB |
Before Width: | Height: | Size: 1.3 MiB |
Before Width: | Height: | Size: 5.0 KiB |
Before Width: | Height: | Size: 39 KiB |
@ -1,720 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE section [
|
||||
<!ENTITY % openstack SYSTEM "../../common/entities/openstack.ent">
|
||||
%openstack;
|
||||
]>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-guide-architecture-overview">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Architecture</title>
|
||||
<para>Hardware selection involves three key areas:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Compute</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Network</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Storage</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Hardware for a general purpose OpenStack cloud
|
||||
should reflect a cloud with no pre-defined usage model,
|
||||
designed to run a wide variety of applications with
|
||||
varying resource usage requirements.
|
||||
These applications include any of the following:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
RAM-intensive
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
CPU-intensive
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Storage-intensive
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Certain hardware form factors may better suit a general
|
||||
purpose OpenStack cloud due to the requirement for equal (or
|
||||
nearly equal) balance of resources. Server hardware must provide
|
||||
the following:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Equal (or nearly equal) balance of compute capacity (RAM and CPU)
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Network capacity (number and speed of links)
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Storage capacity (gigabytes or terabytes as well as Input/Output
|
||||
Operations Per Second (<glossterm>IOPS</glossterm>))
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Evaluate server hardware around four conflicting
|
||||
dimensions:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Server density</term>
|
||||
<listitem>
|
||||
<para>A measure of how many servers can
|
||||
fit into a given measure of physical space, such as a
|
||||
rack unit [U].</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Resource capacity</term>
|
||||
<listitem>
|
||||
<para>The number of CPU cores, amount of RAM,
|
||||
or amount of deliverable storage.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Expandability</term>
|
||||
<listitem>
|
||||
<para>Limit of additional resources you can add to
|
||||
a server.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Cost</term>
|
||||
<listitem>
|
||||
<para>The relative purchase price of the hardware
|
||||
weighted against the level of design effort needed to
|
||||
build the system.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>Increasing server density means sacrificing resource
|
||||
capacity or expandability; however, increasing resource
|
||||
capacity and expandability increases cost and decreases server
|
||||
density. As a result, determining the best server hardware for
|
||||
a general purpose OpenStack architecture means understanding
|
||||
how choice of form factor will impact the rest of the
|
||||
design. The following list outlines the form factors to
|
||||
choose from:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Blade servers typically support dual-socket
|
||||
multi-core CPUs. Blades also offer
|
||||
outstanding density.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>1U rack-mounted servers occupy only a single rack
|
||||
unit. Their benefits include high density, support for
|
||||
dual-socket multi-core CPUs, and support for
|
||||
reasonable RAM amounts. This form factor offers
|
||||
limited storage capacity, limited network capacity,
|
||||
and limited expandability.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>2U rack-mounted servers offer the expanded storage
|
||||
and networking capacity that 1U servers tend to lack,
|
||||
but with a corresponding decrease in server density
|
||||
(half the density offered by 1U rack-mounted
|
||||
servers).</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Larger rack-mounted servers, such as 4U servers,
|
||||
will tend to offer even greater CPU capacity, often
|
||||
supporting four or even eight CPU sockets. These
|
||||
servers often have much greater expandability so will
|
||||
provide the best option for upgradability. This means,
|
||||
however, that the servers have a much lower server
|
||||
density and a much greater hardware cost.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para><emphasis>Sled servers</emphasis> are rack-mounted servers that support
|
||||
multiple independent servers in a single 2U or 3U
|
||||
enclosure. This form factor offers increased density
|
||||
over typical 1U-2U rack-mounted servers but tends to
|
||||
suffer from limitations in the amount of storage or
|
||||
network capacity each individual server
|
||||
supports.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>The best form factor for server hardware
|
||||
supporting a general purpose OpenStack cloud is driven by
|
||||
outside business and cost factors. No single reference
|
||||
architecture applies to all implementations; the decision
|
||||
must flow from user requirements, technical
|
||||
considerations, and operational considerations. Here are some
|
||||
of the key factors that influence the selection of server
|
||||
hardware:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Instance density</term>
|
||||
<listitem>
|
||||
<para>Sizing is an important
|
||||
consideration for a general purpose OpenStack cloud.
|
||||
The expected or anticipated number of instances that
|
||||
each hypervisor can host is a common metric used in
|
||||
sizing the deployment. The selected server hardware
|
||||
needs to support the expected or anticipated instance
|
||||
density.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Host density</term>
|
||||
<listitem>
|
||||
<para>Physical data centers have limited
|
||||
physical space, power, and cooling. The number of
|
||||
hosts (or hypervisors) that can be fitted into a given
|
||||
metric (rack, rack unit, or floor tile) is another
|
||||
important method of sizing. Floor weight is an often
|
||||
overlooked consideration. The data center floor must
|
||||
be able to support the weight of the proposed number
|
||||
of hosts within a rack or set of racks. These factors
|
||||
need to be applied as part of the host density
|
||||
calculation and server hardware selection.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Power density</term>
|
||||
<listitem>
|
||||
<para>Data centers have a specified amount
|
||||
of power fed to a given rack or set of racks. Older
|
||||
data centers may have a power density as low
|
||||
as 20 AMPs per rack, while more recent data centers
|
||||
can be architected to support power densities as high
|
||||
as 120 AMPs per rack. The selected server hardware must
|
||||
take power density into account.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Network connectivity</term>
|
||||
<listitem>
|
||||
<para>The selected server hardware
|
||||
must have the appropriate number of network
|
||||
connections, as well as the right type of network
|
||||
connections, in order to support the proposed
|
||||
architecture. Ensure that, at a minimum, there are at
|
||||
least two diverse network connections coming into each
|
||||
rack.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>The selection of form factors or architectures affects the selection
|
||||
of server hardware. Ensure that the selected server hardware
|
||||
is configured to support enough storage capacity (or storage
|
||||
expandability) to match the requirements of the selected scale-out
|
||||
storage solution. Similarly, the network architecture impacts
|
||||
the server hardware selection and vice versa.</para>
|
||||
|
||||
<section xml:id="selecting-storage-hardware">
|
||||
<title>Selecting storage hardware</title>
|
||||
<para>Determine storage hardware architecture by
|
||||
selecting a specific storage architecture. Determine the selection of
|
||||
storage architecture by evaluating possible solutions against the
|
||||
critical factors, the user requirements, technical
|
||||
considerations, and operational considerations.
|
||||
Incorporate the following factors into your storage architecture:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Cost</term>
|
||||
<listitem>
|
||||
<para>Storage can be a significant portion of the
|
||||
overall system cost. For an organization that is concerned
|
||||
with vendor support, a commercial storage solution is
|
||||
advisable, although it comes with a higher price
|
||||
tag. If initial capital expenditure requires
|
||||
minimization, designing a system based on commodity
|
||||
hardware would apply. The trade-off is potentially
|
||||
higher support costs and a greater risk of
|
||||
incompatibility and interoperability issues.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Scalability</term>
|
||||
<listitem>
|
||||
<para>Scalability, along with expandability, is a major
|
||||
consideration in a general purpose OpenStack cloud. It
|
||||
might be difficult to predict the final intended size
|
||||
of the implementation as there are no established
|
||||
usage patterns for a general purpose cloud. It might
|
||||
become necessary to expand the initial deployment in
|
||||
order to accommodate growth and user demand.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Expandability</term>
|
||||
<listitem>
|
||||
<para>Expandability is a major architecture factor for
|
||||
storage solutions in a general purpose OpenStack
|
||||
cloud. A storage solution that expands
|
||||
to 50 PB is considered more expandable than a
|
||||
solution that only scales to 10 PB. This metric
|
||||
is related to scalability, which is the measure of a
|
||||
solution's performance as it expands.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>Using a scale-out storage solution with direct-attached
|
||||
storage (DAS) in the servers is well suited for a general
|
||||
purpose OpenStack cloud. Cloud services requirements determine
|
||||
your choice of scale-out solution. You need to determine if
|
||||
a single, highly expandable and highly vertically scalable,
|
||||
centralized storage array is suitable for your design.
|
||||
After determining an approach, select the storage hardware
|
||||
based on these criteria.</para>
|
||||
<para>This list expands upon the potential impacts for including a
|
||||
particular storage architecture (and corresponding storage
|
||||
hardware) into the design for a general purpose OpenStack
|
||||
cloud:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Connectivity</term>
|
||||
<listitem>
|
||||
<para>Ensure that, if storage protocols
|
||||
other than Ethernet are part of the storage solution,
|
||||
the appropriate hardware has been selected.
|
||||
If a centralized storage array is selected, ensure
|
||||
that the hypervisor will be able to connect to that
|
||||
storage array for image storage.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Usage</term>
|
||||
<listitem>
|
||||
<para>How the particular storage architecture will
|
||||
be used is critical for determining the architecture.
|
||||
Some of the configurations that will influence the
|
||||
architecture include whether it will be used by the
|
||||
hypervisors for ephemeral instance storage or if
|
||||
OpenStack Object Storage will use it for object storage.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Instance and image locations</term>
|
||||
<listitem>
|
||||
<para>
|
||||
Where instances and images will be stored will influence
|
||||
the architecture.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Server hardware</term>
|
||||
<listitem>
|
||||
<para>If the solution is a scale-out
|
||||
storage architecture that includes DAS, it
|
||||
will affect the server hardware selection. This could
|
||||
ripple into the decisions that affect host density,
|
||||
instance density, power density, OS-hypervisor,
|
||||
management tools and others.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>A general purpose OpenStack cloud has multiple options.
|
||||
The key factors that will have an influence
|
||||
on selection of storage hardware for a general purpose
|
||||
OpenStack cloud are as follows:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Capacity</term>
|
||||
<listitem>
|
||||
<para>Hardware resources selected for the resource nodes
|
||||
should be capable of supporting enough storage for the
|
||||
cloud services. Defining the initial requirements and
|
||||
ensuring the design can support adding capacity is
|
||||
important. Hardware nodes selected for object storage
|
||||
should be capable of supporting a large number of inexpensive
|
||||
disks with no reliance on RAID controller cards.
|
||||
Hardware nodes selected for block storage should be capable
|
||||
of supporting high speed storage solutions and RAID controller
|
||||
cards to provide performance and redundancy to storage at a
|
||||
hardware level.
|
||||
Selecting hardware RAID controllers that automatically repair
|
||||
damaged arrays will assist with the replacement and repair of
|
||||
degraded or deleted storage devices.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Performance</term>
|
||||
<listitem>
|
||||
<para>Disks selected for object storage services do not need
|
||||
to be fast performing disks. We recommend that object storage
|
||||
nodes take advantage of the best cost per terabyte available
|
||||
for storage. Contrastingly, disks chosen for block storage
|
||||
services should take advantage of performance boosting
|
||||
features that may entail the use of SSDs or flash storage
|
||||
to provide high performance block storage pools. Storage
|
||||
performance of ephemeral disks used for instances should
|
||||
also be taken into consideration.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Fault tolerance</term>
|
||||
<listitem>
|
||||
<para>Object storage resource nodes have
|
||||
no requirements for hardware fault tolerance or RAID
|
||||
controllers. It is not necessary to plan for fault
|
||||
tolerance within the object storage hardware because
|
||||
the object storage service provides replication
|
||||
between zones as a feature of the service. Block
|
||||
storage nodes, compute nodes, and cloud controllers
|
||||
should all have fault tolerance built in at the
|
||||
hardware level by making use of hardware RAID
|
||||
controllers and varying levels of RAID configuration.
|
||||
The level of RAID chosen should be consistent with the
|
||||
performance and availability requirements of the
|
||||
cloud.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="selecting-networking-hardware">
|
||||
<title>Selecting networking hardware</title>
|
||||
<para>Selecting network architecture determines which network
|
||||
hardware will be used. Networking software is determined by
|
||||
the selected networking hardware.</para>
|
||||
<para>There are more subtle design impacts that need to be considered.
|
||||
The selection of certain networking hardware (and the networking software)
|
||||
affects the management tools that can be used. There are
|
||||
exceptions to this; the rise of <emphasis>open</emphasis> networking software
|
||||
that supports a range of networking hardware means that there
|
||||
are instances where the relationship between networking
|
||||
hardware and networking software is not as tightly defined.</para>
|
||||
<para>Some of the key considerations that should be included in
|
||||
the selection of networking hardware include:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Port count</term>
|
||||
<listitem>
|
||||
<para>The design will require networking
|
||||
hardware that has the requisite port count.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Port density</term>
|
||||
<listitem>
|
||||
<para>The network design will be affected by
|
||||
the physical space that is required to provide the
|
||||
requisite port count. A higher port density is preferred,
|
||||
as it leaves more rack space for compute or storage components
|
||||
that may be required by the design. This can also lead into
|
||||
concerns about fault domains and power density that
|
||||
should be considered. Higher density switches are more
|
||||
expensive and should also be considered, as it is
|
||||
important not to over design the network if it is not
|
||||
required.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Port speed</term>
|
||||
<listitem>
|
||||
<para>
|
||||
The networking hardware must support the proposed
|
||||
network speed, for example: 1 GbE, 10 GbE, or
|
||||
40 GbE (or even 100 GbE).</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Redundancy</term>
|
||||
<listitem>
|
||||
<para>The level of network hardware redundancy
|
||||
required is influenced by the user requirements for
|
||||
high availability and cost considerations. Network
|
||||
redundancy can be achieved by adding redundant power
|
||||
supplies or paired switches. If this is a requirement,
|
||||
the hardware will need to support this configuration.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Power requirements</term>
|
||||
<listitem>
|
||||
<para>Ensure that the physical data
|
||||
center provides the necessary power for the selected
|
||||
network hardware.</para>
|
||||
<note>
|
||||
<para>
|
||||
This may be an issue for spine switches in a leaf and
|
||||
spine fabric, or end of row (EoR) switches.</para>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>There is no single best practice architecture for the
|
||||
networking hardware supporting a general purpose OpenStack
|
||||
cloud that will apply to all implementations. Some of the key
|
||||
factors that will have a strong influence on selection of
|
||||
networking hardware include:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Connectivity</term>
|
||||
<listitem>
|
||||
<para>All nodes within an OpenStack cloud
|
||||
require network connectivity. In some
|
||||
cases, nodes require access to more than one network
|
||||
segment. The design must encompass sufficient network
|
||||
capacity and bandwidth to ensure that all
|
||||
communications within the cloud, both north-south and
|
||||
east-west traffic, have sufficient resources
|
||||
available.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Scalability</term>
|
||||
<listitem>
|
||||
<para>The network design should
|
||||
encompass a physical and logical network design that
|
||||
can be easily expanded upon. Network hardware should
|
||||
offer the appropriate types of interfaces and speeds
|
||||
that are required by the hardware nodes.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Availability</term>
|
||||
<listitem>
|
||||
<para>To ensure that access to nodes within
|
||||
the cloud is not interrupted, we recommend that
|
||||
the network architecture identify any single points of
|
||||
failure and provide some level of redundancy or fault
|
||||
tolerance. With regard to the network infrastructure
|
||||
itself, this often involves use of networking
|
||||
protocols such as LACP, VRRP or others to achieve a
|
||||
highly available network connection. In addition, it
|
||||
is important to consider the networking implications
|
||||
on API availability. In order to ensure that the APIs,
|
||||
and potentially other services in the cloud, are highly
|
||||
available, we recommend you design a load balancing
|
||||
solution within the network architecture to
|
||||
accommodate these requirements.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="software-selection">
|
||||
<title>Software selection</title>
|
||||
<para>Software selection for a general purpose OpenStack
|
||||
architecture design needs to include these three areas:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Operating system (OS) and hypervisor</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack components</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Supplemental software</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="os-and-hypervisor">
|
||||
<title>Operating system and hypervisor</title>
|
||||
<para>The operating system (OS) and hypervisor have a
|
||||
significant impact on the overall design. Selecting a particular
|
||||
operating system and hypervisor can directly affect server
|
||||
hardware selection. Make sure the storage
|
||||
hardware and topology support the selected operating
|
||||
system and hypervisor combination. Also ensure the networking
|
||||
hardware selection and topology will work with the chosen operating
|
||||
system and hypervisor combination.</para>
|
||||
<para>Some areas that could be impacted by the selection of OS and
|
||||
hypervisor include:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Cost</term>
|
||||
<listitem>
|
||||
<para>Selecting a commercially supported hypervisor,
|
||||
such as Microsoft Hyper-V, will result in a different
|
||||
cost model than community-supported open source
|
||||
hypervisors including <glossterm
|
||||
baseform="kernel-based VM (KVM)">KVM</glossterm>
|
||||
or <glossterm>Xen</glossterm>. When
|
||||
comparing open source OS solutions, choosing Ubuntu
|
||||
over Red Hat (or vice versa) will have an impact on
|
||||
cost due to support contracts.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Supportability</term>
|
||||
<listitem>
|
||||
<para>Depending on the selected
|
||||
hypervisor, staff should have the appropriate
|
||||
training and knowledge to support the selected OS and
|
||||
hypervisor combination. If they do not, training will
|
||||
need to be provided which could have a cost impact on
|
||||
the design.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Management tools</term>
|
||||
<listitem>
|
||||
<para>The management tools used for
|
||||
Ubuntu and KVM differ from the management tools
|
||||
for VMware vSphere. Although both OS and hypervisor
|
||||
combinations are supported by OpenStack, there will be
|
||||
very different impacts to the rest of the design as a
|
||||
result of the selection of one combination versus the
|
||||
other.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Scale and performance</term>
|
||||
<listitem>
|
||||
<para>Ensure that selected OS and
|
||||
hypervisor combinations meet the appropriate scale and
|
||||
performance requirements. The chosen architecture will
|
||||
need to meet the targeted instance-host ratios with
|
||||
the selected OS-hypervisor combinations.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Security</term>
|
||||
<listitem>
|
||||
<para>Ensure that the design can accommodate
|
||||
regular periodic installations of application security
|
||||
patches while maintaining required workloads. The
|
||||
frequency of security patches for the proposed
|
||||
OS-hypervisor combination will have an impact on
|
||||
performance and the patch installation process could
|
||||
affect maintenance windows.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Supported features</term>
|
||||
<listitem>
|
||||
<para>Determine which features of OpenStack are required.
|
||||
This will often determine the selection of the OS-hypervisor combination.
|
||||
Some features are only available with specific operating systems or
|
||||
hypervisors.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Interoperability</term>
|
||||
<listitem>
|
||||
<para>You will need to consider how the OS and hypervisor combination
|
||||
interacts with other operating systems and hypervisors, including
|
||||
other software solutions.
|
||||
Operational troubleshooting tools for one OS-hypervisor
|
||||
combination may differ from the tools used for another OS-hypervisor
|
||||
combination and, as a result, the design will need to
|
||||
address if the two sets of tools need to interoperate.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="openstack-components">
|
||||
<title>OpenStack components</title>
|
||||
<para>Selecting which OpenStack components are included in the overall
|
||||
design is important. Some OpenStack components, like
|
||||
Compute and Image service, are required in every architecture. Other
|
||||
components, like Orchestration, are not always required.</para>
|
||||
<para>Excluding certain OpenStack components can limit or constrain
|
||||
the functionality of other components. For example, if the architecture includes
|
||||
Orchestration but excludes Telemetry, then the design will not be able
|
||||
to take advantage of Orchestration's auto scaling functionality.
|
||||
It is important to research the component interdependencies
|
||||
in conjunction with the technical requirements before deciding
|
||||
on the final architecture.</para>
|
||||
|
||||
<section xml:id="networking-software">
|
||||
<title>Networking software</title>
|
||||
<para>OpenStack Networking (neutron) provides a wide variety of networking
|
||||
services for instances. There are many additional networking
|
||||
software packages that can be useful when managing OpenStack
|
||||
components. Some examples include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Software to provide load balancing
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Network redundancy protocols
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Routing daemons
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Some of these software packages are described
|
||||
in more detail in the <citetitle>OpenStack High Availability
|
||||
Guide</citetitle> (refer to the <link
|
||||
xlink:href="http://docs.openstack.org/ha-guide/networking-ha.html">Network
|
||||
controller cluster stack chapter</link> of the OpenStack High
|
||||
Availability Guide).</para>
|
||||
<para>For a general purpose OpenStack cloud, the OpenStack
|
||||
infrastructure components need to be highly available. If
|
||||
the design does not include hardware load balancing,
|
||||
networking software packages like HAProxy will need to be
|
||||
included.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="management-software">
|
||||
<title>Management software</title>
|
||||
<para>The selected supplemental software solution
|
||||
affects the overall OpenStack cloud design. This includes
|
||||
software for providing clustering, logging, monitoring and
|
||||
alerting.</para>
|
||||
<para>Inclusion of clustering software, such as Corosync or
|
||||
Pacemaker, is determined primarily by the availability
|
||||
requirements. The impact of including (or not
|
||||
including) these software packages is primarily determined by
|
||||
the availability of the cloud infrastructure and the
|
||||
complexity of supporting the configuration after it is
|
||||
deployed. The <link xlink:href="http://docs.openstack.org/ha-guide/"><citetitle>OpenStack High Availability Guide</citetitle></link>
|
||||
provides more
|
||||
details on the installation and configuration of Corosync and
|
||||
Pacemaker, should these packages need to be included in the
|
||||
design.</para>
|
||||
<para>Requirements for logging, monitoring, and alerting are
|
||||
determined by operational considerations. Each of these
|
||||
sub-categories includes a number of various options.</para>
|
||||
<para>If these software packages are required, the
|
||||
design must account for the additional resource consumption
|
||||
(CPU, RAM, storage, and network bandwidth). Some other potential
|
||||
design impacts include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>OS-hypervisor combination: Ensure that the
|
||||
selected logging, monitoring, or alerting tools
|
||||
support the proposed OS-hypervisor combination.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Network hardware: The network hardware selection
|
||||
needs to be supported by the logging, monitoring, and
|
||||
alerting software.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="database-software">
|
||||
<title>Database software</title>
|
||||
<para>OpenStack components often require access
|
||||
to back-end database services to store state and configuration
|
||||
information. Selecting an appropriate back-end database
|
||||
that satisfies the availability and fault tolerance
|
||||
requirements of the OpenStack services is required. OpenStack
|
||||
services support connecting to a database that is supported
|
||||
by the SQLAlchemy Python drivers; however, most common
|
||||
database deployments make use of MySQL or variations of it. We
|
||||
recommend that the database, which provides back-end
|
||||
service within a general purpose cloud, be made highly
|
||||
available when using an available technology which can
|
||||
accomplish that goal.</para>
|
||||
</section>
|
||||
</section>
|
||||
</section>
|
@ -1,156 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="operational-considerations-general-purpose">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Operational considerations</title>
|
||||
<para>In the planning and design phases of the build out, it is
|
||||
important to include the operations function. Operational
|
||||
factors affect the design choices for a general purpose cloud,
|
||||
and operations staff are often tasked with the maintenance of
|
||||
cloud environments for larger installations.</para>
|
||||
<para>Expectations set by the Service Level Agreements (SLAs) directly
|
||||
affect knowing when and where you should implement redundancy and
|
||||
high availability. SLAs are contractual
|
||||
obligations that provide assurances for service availability.
|
||||
They define the levels of availability that drive the technical
|
||||
design, often with penalties for not meeting contractual obligations.</para>
|
||||
<para>SLA terms that affect design include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>API availability guarantees implying multiple
|
||||
infrastructure services and highly available
|
||||
load balancers.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Network uptime guarantees affecting switch
|
||||
design, which might require redundant switching and
|
||||
power.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Factor networking security policy requirements
|
||||
into your deployments.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<section xml:id="support-and-maintainability-general-purpose">
|
||||
<title>Support and maintainability</title>
|
||||
<para>To be able to support and maintain an installation, OpenStack
|
||||
cloud management requires operations staff to understand and
|
||||
comprehend design architecture content. The operations and engineering
|
||||
staff skill level, and level of separation, are dependent on size and
|
||||
purpose of the installation. Large cloud service providers, or telecom
|
||||
providers, are more likely to be managed by specially trained, dedicated
|
||||
operations organizations. Smaller implementations are more likely to rely
|
||||
on support staff that need to take on combined engineering, design and
|
||||
operations functions.</para>
|
||||
<para>Maintaining OpenStack installations requires a
|
||||
variety of technical skills. You may want to consider using a third-party
|
||||
management company with special expertise in managing
|
||||
OpenStack deployments.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="monitoring-general-purpose">
|
||||
<title>Monitoring</title>
|
||||
<para>OpenStack clouds require appropriate monitoring platforms to
|
||||
ensure errors are caught and managed appropriately. Specific
|
||||
meters that are critically important to monitor include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Image disk utilization
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Response time to the Compute API
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Leveraging existing monitoring systems is an effective check to
|
||||
ensure OpenStack environments can be monitored.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="downtime-general-purpose">
|
||||
<title>Downtime</title>
|
||||
<para>To effectively run cloud installations, initial downtime planning
|
||||
includes creating processes and architectures that support
|
||||
the following:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Planned (maintenance)
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Unplanned (system faults)
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Resiliency of the overall system and individual components is going
|
||||
to be dictated by the requirements of the SLA, meaning designing
|
||||
for high availability (HA) can have cost ramifications.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="capacity-planning">
|
||||
<title>Capacity planning</title>
|
||||
<para>Capacity constraints for a general purpose cloud environment
|
||||
include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Compute limits
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Storage limits
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>A relationship exists between the size of the compute environment
|
||||
and the supporting OpenStack infrastructure controller nodes requiring
|
||||
support.</para>
|
||||
<para>Increasing the size of the supporting compute environment increases
|
||||
the network traffic and messages, adding load to the controller or
|
||||
networking nodes. Effective monitoring of the environment will help
|
||||
with capacity decisions on scaling.</para>
|
||||
<para>Compute nodes automatically attach to OpenStack clouds, resulting in
|
||||
a horizontally scaling process when adding extra compute capacity to an
|
||||
OpenStack cloud. Additional processes are required to place nodes into
|
||||
appropriate availability zones and host aggregates. When adding additional
|
||||
compute nodes to environments, ensure identical or functionally compatible
|
||||
CPUs are used, otherwise live migration features will break. It is necessary
|
||||
to add rack capacity or network switches as scaling out compute hosts directly
|
||||
affects network and datacenter resources.</para>
|
||||
<para>Assessing the average workloads and increasing the number of instances
|
||||
that can run within the compute environment by adjusting the overcommit
|
||||
ratio is another option. It is important to remember that changing the CPU overcommit
|
||||
ratio can have a detrimental effect and cause a potential increase in
|
||||
noisy neighbor activity. The additional risk of increasing the overcommit ratio is
|
||||
more instances failing when a compute host fails.</para>
|
||||
<para>Compute host components can also be upgraded to account for
|
||||
increases in demand; this is known as vertical scaling.
|
||||
Upgrading CPUs with more cores, or increasing the overall
|
||||
server memory, can add extra needed capacity depending on
|
||||
whether the running applications are more CPU intensive or
|
||||
memory intensive.</para>
|
||||
<para>Insufficient disk capacity could also have a negative effect
|
||||
on overall performance including CPU and memory usage.
|
||||
Depending on the back-end architecture of the OpenStack Block
|
||||
Storage layer, capacity includes adding disk shelves to
|
||||
enterprise storage systems or installing additional block
|
||||
storage nodes. Upgrading directly attached storage installed in
|
||||
compute hosts, and adding capacity to the shared storage for
|
||||
additional ephemeral storage to instances, may be necessary.</para>
|
||||
<para>
|
||||
For a deeper discussion on many of these topics, refer to the
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/ops"><citetitle>OpenStack
|
||||
Operations Guide</citetitle></link>.
|
||||
</para>
|
||||
</section>
|
||||
</section>
|
@ -1,101 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="prescriptive-example-online-classifieds">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Prescriptive example</title>
|
||||
<para>An online classified advertising company wants to run web applications
|
||||
consisting of Tomcat, Nginx and MariaDB in a private cloud. To be able
|
||||
to meet policy requirements, the cloud infrastructure will run in their
|
||||
own data center. The company has predictable load requirements, but requires
|
||||
scaling to cope with nightly increases in demand. Their current environment
|
||||
does not have the flexibility to align with their goal of running an open
|
||||
source API environment. The current environment consists of the following:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Between 120 and 140 installations of Nginx and
|
||||
Tomcat, each with 2 vCPUs and 4 GB of RAM</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A three-node MariaDB and Galera cluster, each with 4
|
||||
vCPUs and 8 GB RAM</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>The company runs hardware load balancers and multiple web
|
||||
applications serving their websites, and orchestrates environments
|
||||
using combinations of scripts and Puppet. The website generates large amounts of
|
||||
log data daily that requires archiving.</para>
|
||||
<para>The solution would consist of the following OpenStack
|
||||
components:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>A firewall, switches and load balancers on the
|
||||
public facing network connections.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Controller service running Image,
|
||||
Identity, Networking, combined with support services such as
|
||||
MariaDB and RabbitMQ, configured for high availability on at
|
||||
least three controller nodes.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Compute nodes running the KVM
|
||||
hypervisor.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Block Storage for use by compute instances,
|
||||
requiring persistent storage (such as databases for
|
||||
dynamic sites).</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Object Storage for serving static objects
|
||||
(such as images).</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<mediaobject><imageobject><imagedata contentwidth="4in"
|
||||
fileref="../figures/General_Architecture3.png"
|
||||
/></imageobject></mediaobject>
|
||||
<para>Running up to 140
|
||||
web instances and the small number of MariaDB instances
|
||||
requires 292 vCPUs available, as well as 584 GB RAM. On a
|
||||
typical 1U server using dual-socket hex-core Intel CPUs with
|
||||
Hyperthreading, and assuming 2:1 CPU overcommit ratio, this
|
||||
would require 8 OpenStack Compute nodes.</para>
|
||||
<para>The web application instances run from local storage on each
|
||||
of the OpenStack Compute nodes. The web application instances
|
||||
are stateless, meaning that any of the instances can fail and
|
||||
the application will continue to function.</para>
|
||||
<para>MariaDB server instances store their data on shared
|
||||
enterprise storage, such as NetApp or Solidfire devices. If a
|
||||
MariaDB instance fails, storage would be expected to be
|
||||
re-attached to another instance and rejoined to the Galera
|
||||
cluster.</para>
|
||||
<para>Logs from the web application servers are shipped to
|
||||
OpenStack Object Storage for processing and
|
||||
archiving.</para>
|
||||
<para>Additional capabilities can be realized by
|
||||
moving static web content to be served from OpenStack Object
|
||||
Storage containers, and backing the OpenStack Image service
|
||||
with OpenStack Object Storage.</para>
|
||||
<note>
|
||||
<para>
|
||||
Increasing OpenStack Object Storage means network bandwidth
|
||||
needs to be taken into consideration. Running OpenStack Object
|
||||
Storage with network connections offering 10 GbE or better connectivity
|
||||
is advised.
|
||||
</para>
|
||||
</note>
|
||||
<para>Leveraging Orchestration and Telemetry services is also a potential issue when
|
||||
providing auto-scaling, orchestrated web application environments.
|
||||
Defining the web applications in <glossterm
|
||||
baseform="Heat Orchestration Template (HOT)">Heat Orchestration Templates (HOT)</glossterm>
|
||||
negates the reliance on the current scripted Puppet solution.</para>
|
||||
<para>OpenStack Networking can be used to control hardware load
|
||||
balancers through the use of plug-ins and the Networking API.
|
||||
This allows users to control hardware load balancer pools
|
||||
and instances as members in these pools, but their use in
|
||||
production environments must be carefully weighed against
|
||||
current stability.</para>
|
||||
</section>
|
@ -1,738 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE section [
|
||||
<!ENTITY % openstack SYSTEM "../../common/entities/openstack.ent">
|
||||
%openstack;
|
||||
]>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="technical-considerations-general-purpose">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Technical considerations</title>
|
||||
<para>General purpose clouds are expected to
|
||||
include these base services:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Compute
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Network
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Storage
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Each of these services has different resource requirements.
|
||||
As a result, you must make design decisions relating directly
|
||||
to the service, as well as provide a balanced infrastructure for
|
||||
all services.</para>
|
||||
<para>Take into consideration the unique aspects of each service, as
|
||||
individual characteristics and service mass can impact the hardware
|
||||
selection process. Hardware designs should be generated for each of the
|
||||
services.</para>
|
||||
<para>Hardware decisions are also made in relation to network architecture
|
||||
and facilities planning. These factors play heavily into
|
||||
the overall architecture of an OpenStack cloud.</para>
|
||||
|
||||
<section xml:id="designing-compute-resources-tech-considerations">
|
||||
<title>Compute resource design</title>
|
||||
<para>When designing compute resource pools, a number of factors
|
||||
can impact your design decisions. Factors such as number of processors,
|
||||
amount of memory, and the quantity of storage required for each hypervisor
|
||||
must be taken into account.</para>
|
||||
<para>You will also need to decide whether to provide compute resources
|
||||
in a single pool or in multiple pools. In most cases, multiple pools
|
||||
of resources can be allocated and addressed on demand. A compute design
|
||||
that allocates multiple pools of resources makes best use of application
|
||||
resources, and is commonly referred to as
|
||||
<firstterm>bin packing</firstterm>.</para>
|
||||
<para>In a bin packing design, each independent resource pool provides service
|
||||
for specific flavors. This helps to ensure that, as instances are scheduled
|
||||
onto compute hypervisors, each independent node's resources will be allocated
|
||||
in a way that makes the most efficient use of the available hardware. Bin
|
||||
packing also requires a common hardware design, with all hardware nodes within
|
||||
a compute resource pool sharing a common processor, memory, and storage layout.
|
||||
This makes it easier to deploy, support, and maintain nodes throughout their
|
||||
life cycle.</para>
|
||||
<para>An <firstterm>overcommit ratio</firstterm> is the ratio of available
|
||||
virtual resources to available physical resources. This ratio is
|
||||
configurable for CPU and memory. The default CPU overcommit ratio is 16:1, and
|
||||
the default memory overcommit ratio is 1.5:1. Determining the tuning of the
|
||||
overcommit ratios during the design phase is important as it has a direct
|
||||
impact on the hardware layout of your compute nodes.</para>
|
||||
<para>When selecting a processor, compare features and performance
|
||||
characteristics. Some processors include features specific to virtualized
|
||||
compute hosts, such as hardware-assisted virtualization, and technology
|
||||
related to memory paging (also known as EPT shadowing). These types of features
|
||||
can have a significant impact on the performance of your virtual machine.</para>
|
||||
<para>You will also need to consider the compute requirements of non-hypervisor
|
||||
nodes (sometimes referred to as resource nodes). This includes controller, object
|
||||
storage, and block storage nodes, and networking services.</para>
|
||||
<para>The number of processor cores and threads impacts the number of worker
|
||||
threads which can be run on a resource node. Design decisions must relate
|
||||
directly to the service being run on it, as well as provide a balanced
|
||||
infrastructure for all services.</para>
|
||||
<para>Workload can be unpredictable in a general purpose cloud, so consider
|
||||
including the ability to add additional compute resource pools on demand.
|
||||
In some cases, however, the demand for certain instance types or flavors may not
|
||||
justify individual hardware design. In either case, start by allocating
|
||||
hardware designs that are capable of servicing the most common instance
|
||||
requests. If you want to add additional hardware to the overall architecture,
|
||||
this can be done later.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="designing-network-resources-tech-considerations">
|
||||
<title>Designing network resources</title>
|
||||
<para>OpenStack clouds generally have multiple network segments, with
|
||||
each segment providing access to particular resources. The network services
|
||||
themselves also require network communication paths which should
|
||||
be separated from the other networks. When designing network services
|
||||
for a general purpose cloud, plan for either a physical or logical
|
||||
separation of network segments used by operators and tenants. You can also
|
||||
create an additional network segment for access to internal services such as
|
||||
the message bus and database used by various services. Segregating these
|
||||
services onto separate networks helps to protect sensitive data and protects
|
||||
against unauthorized access to services.</para>
|
||||
<para>Choose a networking service based on the requirements of your instances.
|
||||
The architecture and design of your cloud will impact whether you choose
|
||||
OpenStack Networking (neutron), or legacy networking (nova-network).</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Legacy networking (nova-network)</term>
|
||||
<listitem>
|
||||
<para>The legacy networking (nova-network) service is primarily a
|
||||
layer-2 networking service that functions in two modes, which
|
||||
use VLANs in different ways. In a flat network mode, all
|
||||
network hardware nodes and devices throughout the cloud are connected
|
||||
to a single layer-2 network segment that provides access to
|
||||
application data.</para>
|
||||
<para>When the network devices in the cloud support segmentation
|
||||
using VLANs, legacy networking can operate in the second mode. In
|
||||
this design model, each tenant within the cloud is assigned a
|
||||
network subnet which is mapped to a VLAN on the physical
|
||||
network. It is especially important to remember the maximum
|
||||
number of 4096 VLANs which can be used within a spanning tree
|
||||
domain. This places a hard limit on the amount of
|
||||
growth possible within the data center. When designing a
|
||||
general purpose cloud intended to support multiple tenants, we
|
||||
recommend the use of legacy networking with VLANs, and
|
||||
not in flat network mode.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>Another consideration regarding network is the fact that
|
||||
legacy networking is entirely managed by the cloud operator;
|
||||
tenants do not have control over network resources. If tenants
|
||||
require the ability to manage and create network resources
|
||||
such as network segments and subnets, it will be necessary to
|
||||
install the OpenStack Networking service to provide network
|
||||
access to instances.</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>OpenStack Networking (neutron)</term>
|
||||
<listitem>
|
||||
<para>OpenStack Networking (neutron) is a first class networking
|
||||
service that gives full control over creation of virtual
|
||||
network resources to tenants. This is often accomplished in
|
||||
the form of tunneling protocols which will establish
|
||||
encapsulated communication paths over existing network
|
||||
infrastructure in order to segment tenant traffic. These
|
||||
methods vary depending on the specific implementation, but
|
||||
some of the more common methods include tunneling over GRE,
|
||||
encapsulating with VXLAN, and VLAN tags.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>We recommend you design at least three network segments:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>The first segment is a public network, used for access to REST APIs
|
||||
by tenants and operators. The controller nodes and swift
|
||||
proxies are the only devices connecting to this network segment. In some
|
||||
cases, this network might also be serviced by hardware load balancers
|
||||
and other network devices.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The second segment is used by administrators to manage hardware resources.
|
||||
Configuration management tools also use this for deploying software and
|
||||
services onto new hardware. In some cases, this network segment might also be
|
||||
used for internal services, including the message bus and database services.
|
||||
This network needs to communicate with every hardware node.
|
||||
Due to the highly sensitive nature of this network segment, you also need to
|
||||
secure this network from unauthorized access.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The third network segment is used by applications and consumers to access
|
||||
the physical network, and for users to access applications. This network is
|
||||
segregated from the one used to access the cloud APIs and is not
|
||||
capable of communicating directly with the hardware resources in the cloud.
|
||||
Compute resource nodes and network gateway services which allow application
|
||||
data to access the physical network from outside of the cloud need to
|
||||
communicate on this network segment.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="designing-openstack-object-storage-tech-considerations">
|
||||
<title>Designing OpenStack Object Storage</title>
|
||||
<para>When designing hardware resources for OpenStack Object
|
||||
Storage, the primary goal is to maximize the amount of storage
|
||||
in each resource node while also ensuring that the cost per
|
||||
terabyte is kept to a minimum. This often involves utilizing
|
||||
servers which can hold a large number of spinning disks.
|
||||
Whether choosing to use 2U server form factors with directly
|
||||
attached storage or an external chassis that holds a larger
|
||||
number of drives, the main goal is to maximize the storage
|
||||
available in each node.</para>
|
||||
<note>
|
||||
<para>We do not recommend investing in enterprise class drives
|
||||
for an OpenStack Object Storage cluster. The consistency and
|
||||
partition tolerance characteristics of OpenStack Object
|
||||
Storage ensure that data stays up to date and survives
|
||||
hardware faults without the use of any specialized data
|
||||
replication devices.</para>
|
||||
</note>
|
||||
<para>One of the benefits of OpenStack Object Storage is the ability
|
||||
to mix and match drives by making use of weighting within the
|
||||
swift ring. When designing your swift storage cluster, we
|
||||
recommend making use of the most cost effective storage
|
||||
solution available at the time.</para>
|
||||
<para>To achieve durability and availability of data stored as objects
|
||||
it is important to design object storage resource pools to ensure they can
|
||||
provide the suggested availability. Considering rack-level and zone-level
|
||||
designs to accommodate the number of replicas configured to be stored in the
|
||||
Object Storage service (the default number of replicas is three) is important
|
||||
when designing beyond the hardware node level. Each replica of
|
||||
data should exist in its own availability zone with its own
|
||||
power, cooling, and network resources available to service
|
||||
that specific zone.</para>
|
||||
<para>Object storage nodes should be designed so that the number
|
||||
of requests does not hinder the performance of the cluster.
|
||||
The object storage service is a chatty protocol, therefore
|
||||
making use of multiple processors that have higher core counts
|
||||
will ensure the IO requests do not inundate the server.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="designing-openstack-block-storage">
|
||||
<title>Designing OpenStack Block Storage</title>
|
||||
<para>When designing OpenStack Block Storage resource nodes, it is
|
||||
helpful to understand the workloads and requirements that will
|
||||
drive the use of block storage in the cloud. We recommend designing
|
||||
block storage pools so that tenants can choose appropriate storage
|
||||
solutions for their applications. By creating multiple storage pools of different
|
||||
types, in conjunction with configuring an advanced storage
|
||||
scheduler for the block storage service, it is possible to
|
||||
provide tenants with a large catalog of storage services with
|
||||
a variety of performance levels and redundancy options.</para>
|
||||
<para>Block storage also takes advantage of a number of enterprise storage
|
||||
solutions. These are addressed via a plug-in driver developed by the
|
||||
hardware vendor. A large number of
|
||||
enterprise storage plug-in drivers ship out-of-the-box with
|
||||
OpenStack Block Storage (and many more available via third
|
||||
party channels). General purpose clouds are more likely to use
|
||||
directly attached storage in the majority of block storage nodes,
|
||||
deeming it necessary to provide additional levels of service to tenants
|
||||
which can only be provided by enterprise class storage solutions.</para>
|
||||
<para>Redundancy and availability requirements impact the decision to use
|
||||
a RAID controller card in block storage nodes. The input-output per second (IOPS)
|
||||
demand of your application will influence whether or not you should use a RAID
|
||||
controller, and which level of RAID is required.
|
||||
Making use of higher performing RAID volumes is suggested when
|
||||
considering performance. However, where redundancy of
|
||||
block storage volumes is more important we recommend
|
||||
making use of a redundant RAID configuration such as RAID 5 or
|
||||
RAID 6. Some specialized features, such as automated
|
||||
replication of block storage volumes, may require the use of
|
||||
third-party plug-ins and enterprise block storage solutions in
|
||||
order to meet the high demand on storage. Furthermore,
|
||||
where extreme performance is a requirement it may also be
|
||||
necessary to make use of high speed SSD disk drives or high
|
||||
performing flash storage solutions.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="software-selection-tech-considerations">
|
||||
<title>Software selection</title>
|
||||
<para>The software selection process plays a large role in the
|
||||
architecture of a general purpose cloud. The following have
|
||||
a large impact on the design of the cloud:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Choice of operating system
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Selection of OpenStack software components
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Choice of hypervisor
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Selection of supplemental software
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Operating system (OS) selection plays a large role in the
|
||||
design and architecture of a cloud. There are a number of OSes
|
||||
which have native support for OpenStack including:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Ubuntu
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Red Hat Enterprise Linux (RHEL)
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
CentOS
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
SUSE Linux Enterprise Server (SLES)
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<note>
|
||||
<para>Native support is not a constraint on the choice of OS; users are
|
||||
free to choose just about any Linux distribution (or even
|
||||
Microsoft Windows) and install OpenStack directly from source
|
||||
(or compile their own packages). However, many organizations will
|
||||
prefer to install OpenStack from distribution-supplied packages or
|
||||
repositories (although using the distribution vendor's OpenStack
|
||||
packages might be a requirement for support).
|
||||
</para>
|
||||
</note>
|
||||
<para>OS selection also directly influences hypervisor selection.
|
||||
A cloud architect who selects Ubuntu, RHEL, or SLES has some
|
||||
flexibility in hypervisor; KVM, Xen, and LXC are supported
|
||||
virtualization methods available under OpenStack Compute
|
||||
(nova) on these Linux distributions. However, a cloud architect
|
||||
who selects Hyper-V is limited to Windows Servers. Similarly, a
|
||||
cloud architect who selects XenServer is limited to the CentOS-based
|
||||
dom0 operating system provided with XenServer.</para>
|
||||
<para>The primary factors that play into OS-hypervisor selection
|
||||
include:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>User requirements</term>
|
||||
<listitem>
|
||||
<para>The selection of OS-hypervisor
|
||||
combination first and foremost needs to support the
|
||||
user requirements.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Support</term>
|
||||
<listitem>
|
||||
<para>The selected OS-hypervisor combination
|
||||
needs to be supported by OpenStack.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Interoperability</term>
|
||||
<listitem>
|
||||
<para>The OS-hypervisor needs to be
|
||||
interoperable with other features and services in the
|
||||
OpenStack design in order to meet the user
|
||||
requirements.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="hypervisor-tech-considerations">
|
||||
<title>Hypervisor</title>
|
||||
<para>OpenStack supports a wide variety of hypervisors, one or
|
||||
more of which can be used in a single cloud. These hypervisors
|
||||
include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>KVM (and QEMU)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>XCP/XenServer</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>vSphere (vCenter and ESXi)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Hyper-V</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>LXC</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Docker</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Bare-metal</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>A complete list of supported hypervisors and their
|
||||
capabilities can be found at
|
||||
<link xlink:href="https://wiki.openstack.org/wiki/HypervisorSupportMatrix">OpenStack Hypervisor Support Matrix</link>.
|
||||
</para>
|
||||
<para>We recommend general purpose clouds use hypervisors that
|
||||
support the most general purpose use cases, such as KVM and
|
||||
Xen. More specific hypervisors should be chosen to account
|
||||
for specific functionality or a supported feature requirement.
|
||||
In some cases, there may also be a mandated
|
||||
requirement to run software on a certified hypervisor
|
||||
including solutions from VMware, Microsoft, and Citrix.</para>
|
||||
<para>The features offered through the OpenStack cloud platform
|
||||
determine the best choice of a hypervisor. Each hypervisor
|
||||
has its own hardware requirements which may affect the decisions
|
||||
around designing a general purpose cloud.</para>
|
||||
<para>In a mixed hypervisor environment, specific aggregates of
|
||||
compute resources, each with defined capabilities, enable
|
||||
workloads to utilize software and hardware specific to their
|
||||
particular requirements. This functionality can be exposed
|
||||
explicitly to the end user, or accessed through defined
|
||||
metadata within a particular flavor of an instance.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="openstack-components-tech-considerations">
|
||||
<title>OpenStack components</title>
|
||||
<para>A general purpose OpenStack cloud design should incorporate
|
||||
the core OpenStack services to provide a wide range of
|
||||
services to end-users. The OpenStack core services recommended
|
||||
in a general purpose cloud are:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>OpenStack <glossterm>Compute</glossterm>
|
||||
(<glossterm>nova</glossterm>)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack <glossterm>Networking</glossterm>
|
||||
(<glossterm>neutron</glossterm>)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack <glossterm>Image service</glossterm>
|
||||
(<glossterm>glance</glossterm>)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack <glossterm>Identity</glossterm>
|
||||
(<glossterm>keystone</glossterm>)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack <glossterm>dashboard</glossterm>
|
||||
(<glossterm>horizon</glossterm>)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para><glossterm>Telemetry</glossterm>
|
||||
(<glossterm>ceilometer</glossterm>)</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>A general purpose cloud may also include OpenStack
|
||||
<glossterm>Object Storage</glossterm> (<glossterm>swift</glossterm>) and
|
||||
OpenStack <glossterm>Block Storage</glossterm>
|
||||
(<glossterm>cinder</glossterm>). These may be
|
||||
selected to provide storage to applications and
|
||||
instances.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="supplemental-software-tech-considerations">
|
||||
<title>Supplemental software</title>
|
||||
<para>A general purpose OpenStack deployment consists of more than
|
||||
just OpenStack-specific components. A typical deployment
|
||||
involves services that provide supporting functionality,
|
||||
including databases and message queues, and may also involve
|
||||
software to provide high availability of the OpenStack
|
||||
environment. Design decisions around the underlying message
|
||||
queue might affect the required number of controller services,
|
||||
as well as the technology to provide highly resilient database
|
||||
functionality, such as MariaDB with Galera. In such a
|
||||
scenario, replication of services relies on quorum.</para>
|
||||
<para>Where many general purpose deployments use hardware load
|
||||
balancers to provide highly available API access and SSL
|
||||
termination, software solutions, for example HAProxy, can also
|
||||
be considered. It is vital to ensure that such software
|
||||
implementations are also made highly available. High
|
||||
availability can be achieved by using software such as
|
||||
Keepalived or Pacemaker with Corosync. Pacemaker and Corosync
|
||||
can provide active-active or active-passive highly available
|
||||
configuration depending on the specific service in the
|
||||
OpenStack environment. Using this software can affect the
|
||||
design as it assumes at least a 2-node controller
|
||||
infrastructure where one of those nodes may be running certain
|
||||
services in standby mode.</para>
|
||||
<para>Memcached is a distributed memory object caching system, and
|
||||
Redis is a key-value store. Both are deployed on
|
||||
general purpose clouds to assist in alleviating load to the
|
||||
Identity service. The memcached service caches tokens, and due
|
||||
to its distributed nature it can help alleviate some
|
||||
bottlenecks to the underlying authentication system. Using
|
||||
memcached or Redis does not affect the overall design of your
|
||||
architecture as they tend to be deployed onto the
|
||||
infrastructure nodes providing the OpenStack services.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="controller-infrastructure-tech-considerations">
|
||||
<title>Controller infrastructure</title>
|
||||
<para>The Controller infrastructure nodes provide management
|
||||
services to the end-user as well as providing services
|
||||
internally for the operation of the cloud. The Controllers
|
||||
run message queuing services that carry system
|
||||
messages between each service. Performance issues related to
|
||||
the message bus would lead to delays in sending that message
|
||||
to where it needs to go. The result of this condition would be
|
||||
delays in operation functions such as spinning up and deleting
|
||||
instances, provisioning new storage volumes and managing
|
||||
network resources. Such delays could adversely affect an
|
||||
application’s ability to react to certain conditions,
|
||||
especially when using auto-scaling features. It is important
|
||||
to properly design the hardware used to run the controller
|
||||
infrastructure as outlined above in the Hardware Selection
|
||||
section.</para>
|
||||
<para>Performance of the controller services is not limited
|
||||
to processing power, but restrictions may emerge in serving
|
||||
concurrent users. Ensure that the APIs and Horizon services
|
||||
are load tested to ensure that you are able to serve your
|
||||
customers. Particular attention should be paid to the
|
||||
OpenStack Identity Service (Keystone), which provides the
|
||||
authentication and authorization for all services, both
|
||||
internally to OpenStack itself and to end-users. This service
|
||||
can lead to a degradation of overall performance if this is
|
||||
not sized appropriately.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="network-performance-tech-considerations">
|
||||
<title>Network performance</title>
|
||||
<para>In a general purpose OpenStack cloud, the requirements of
|
||||
the network help determine performance capabilities.
|
||||
It is possible to design OpenStack
|
||||
environments that run a mix of networking capabilities. By
|
||||
utilizing the different interface speeds, the users of the
|
||||
OpenStack environment can choose networks that are fit for
|
||||
their purpose.</para>
|
||||
<para>Network performance can be boosted considerably by
|
||||
implementing hardware load balancers to provide front-end
|
||||
service to the cloud APIs. The hardware load balancers also
|
||||
perform SSL termination if that is a requirement of your
|
||||
environment. When implementing SSL offloading, it is important
|
||||
to understand the SSL offloading capabilities of the devices
|
||||
selected.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="compute-host-tech-considerations">
|
||||
<title>Compute host</title>
|
||||
<para>The choice of hardware specifications used in compute nodes
|
||||
including CPU, memory and disk type directly affects the
|
||||
performance of the instances. Other factors which can directly
|
||||
affect performance include tunable parameters within the
|
||||
OpenStack services, for example the overcommit ratio applied
|
||||
to resources. The defaults in OpenStack Compute set a 16:1
|
||||
over-commit of the CPU and 1.5:1 over-commit of the memory.
|
||||
Running at such high ratios leads to an increase in
|
||||
"noisy-neighbor" activity. Care must be taken when sizing your
|
||||
Compute environment to avoid this scenario. For running
|
||||
general purpose OpenStack environments it is possible to keep
|
||||
to the defaults, but make sure to monitor your environment as
|
||||
usage increases.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="storage-performance-tech-considerations">
|
||||
<title>Storage performance</title>
|
||||
<para>When considering performance of OpenStack Block Storage,
|
||||
hardware and architecture choice is important. Block Storage
|
||||
can use enterprise back-end systems such as NetApp or EMC,
|
||||
scale out storage such as GlusterFS and Ceph, or simply use
|
||||
the capabilities of directly attached storage in the nodes
|
||||
themselves. Block Storage may be deployed so that traffic
|
||||
traverses the host network, which could affect, and be
|
||||
adversely affected by, the front-side API traffic performance.
|
||||
As such, consider using a dedicated data storage network with
|
||||
dedicated interfaces on the Controller and Compute
|
||||
hosts.</para>
|
||||
<para>When considering performance of OpenStack Object Storage, a
|
||||
number of design choices will affect performance. A user’s
|
||||
access to the Object Storage is through the proxy services,
|
||||
which sit behind hardware load balancers. By the
|
||||
very nature of a highly resilient storage system, replication
|
||||
of the data would affect performance of the overall system. In
|
||||
this case, 10 GbE (or better) networking is recommended
|
||||
throughout the storage network architecture.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="availability-tech-considerations">
|
||||
<title>Availability</title>
|
||||
<para>In OpenStack, the infrastructure is integral to providing
|
||||
services and should always be available, especially when
|
||||
operating with SLAs. Ensuring network availability is
|
||||
accomplished by designing the network architecture so that no
|
||||
single point of failure exists. A consideration of the number
|
||||
of switches, routes and redundancies of power should be
|
||||
factored into core infrastructure, as well as the associated
|
||||
bonding of networks to provide diverse routes to your highly
|
||||
available switch infrastructure.</para>
|
||||
<para>The OpenStack services themselves should be deployed across
|
||||
multiple servers that do not represent a single point of
|
||||
failure. Ensuring API availability can be achieved by placing
|
||||
these services behind highly available load balancers that
|
||||
have multiple OpenStack servers as members.</para>
|
||||
<para>OpenStack lends itself to deployment in a highly available
|
||||
manner where it is expected that at least 2 servers be
|
||||
utilized. These can run all the services involved from the
|
||||
message queuing service, for example RabbitMQ or QPID, to an
|
||||
appropriately deployed database service such as MySQL or
|
||||
MariaDB. As services in the cloud are scaled out, back-end
|
||||
services will need to scale too. Monitoring and reporting on
|
||||
server utilization and response times, as well as load testing
|
||||
your systems, will help determine scale out decisions.</para>
|
||||
<para>Care must be taken when deciding network functionality.
|
||||
Currently, OpenStack supports both the legacy networking (nova-network)
|
||||
system and the newer, extensible OpenStack Networking (neutron). Both
|
||||
have their pros and cons when it comes to providing highly
|
||||
available access. Legacy networking, which provides networking
|
||||
access maintained in the OpenStack Compute code, provides a
|
||||
feature that removes a single point of failure when it comes
|
||||
to routing, and this feature is currently missing in OpenStack
|
||||
Networking. The effect of legacy networking’s multi-host
|
||||
functionality restricts failure domains to the host running
|
||||
that instance.</para>
|
||||
<para>When using OpenStack Networking, the
|
||||
OpenStack controller servers or separate Networking
|
||||
hosts handle routing. For a deployment that requires features
|
||||
available only in Networking, it is possible to
|
||||
remove this restriction by using third party software that
|
||||
helps maintain highly available L3 routes. Doing so allows for
|
||||
common APIs to control network hardware, or to provide complex
|
||||
multi-tier web applications in a secure manner. It is also
|
||||
possible to completely remove routing from
|
||||
Networking, and instead rely on hardware routing capabilities.
|
||||
In this case, the switching infrastructure must support L3
|
||||
routing.</para>
|
||||
<para>OpenStack Networking and legacy networking
|
||||
both have their advantages and
|
||||
disadvantages. They are both valid and supported options that
|
||||
fit different network deployment models described in the
|
||||
<citetitle><link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops/content/network_design.html#network_deployment_options"
|
||||
>OpenStack Operations Guide</link></citetitle>.</para>
|
||||
<para>Ensure your deployment has adequate back-up capabilities.</para>
|
||||
<para>Application design must also be factored into the
|
||||
capabilities of the underlying cloud infrastructure. If the
|
||||
compute hosts do not provide a seamless live migration
|
||||
capability, then it must be expected that when a compute host
|
||||
fails, that instance and any data local to that instance will
|
||||
be deleted. However, when providing an expectation to users
|
||||
that instances have a high level of uptime guarantees, the
|
||||
infrastructure must be deployed in a way that eliminates any
|
||||
single point of failure when a compute host disappears. This
|
||||
may include utilizing shared file systems on enterprise
|
||||
storage or OpenStack Block Storage to provide a level of
|
||||
guarantee to match service features.</para>
|
||||
<para>For more information on high availability in OpenStack, see the <link
|
||||
xlink:href="http://docs.openstack.org/ha-guide/"><citetitle>OpenStack
|
||||
High Availability Guide</citetitle></link>.
|
||||
</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="security-tech-considerations">
|
||||
<title>Security</title>
|
||||
<para>A security domain comprises users, applications, servers or
|
||||
networks that share common trust requirements and expectations
|
||||
within a system. Typically they have the same authentication
|
||||
and authorization requirements and users.</para>
|
||||
<para>These security domains are:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Public</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Guest</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Management</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Data</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>These security domains can be mapped to an OpenStack
|
||||
deployment individually, or combined. In each case, the cloud operator
|
||||
should be aware of the appropriate security concerns. Security
|
||||
domains should be mapped out against your specific OpenStack
|
||||
deployment topology. The domains and their trust requirements
|
||||
depend upon whether the cloud instance is public, private, or
|
||||
hybrid.</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>The public security domain is an entirely untrusted area of
|
||||
the cloud infrastructure. It can refer to the internet as a
|
||||
whole or simply to networks over which you have no authority.
|
||||
This domain should always be considered untrusted.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The guest security domain handles compute data generated by
|
||||
instances on the cloud but not services that support the
|
||||
operation of the cloud, such as API calls. Public cloud
|
||||
providers and private cloud providers who do not have
|
||||
stringent controls on instance use or who allow unrestricted
|
||||
internet access to instances should consider this domain to be
|
||||
untrusted. Private cloud providers may want to consider this
|
||||
network as internal and therefore trusted only if they have
|
||||
controls in place to assert that they trust instances and all
|
||||
their tenants.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The management security domain is where services interact.
|
||||
Sometimes referred to as the <emphasis>control plane</emphasis>, the networks
|
||||
in this domain transport confidential data such as configuration
|
||||
parameters, user names, and passwords. In most deployments this
|
||||
domain is considered trusted.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The data security domain is concerned primarily with
|
||||
information pertaining to the storage services within
|
||||
OpenStack. Much of the data that crosses this network has high
|
||||
integrity and confidentiality requirements and, depending on
|
||||
the type of deployment, may also have strong availability
|
||||
requirements. The trust level of this network is heavily
|
||||
dependent on other deployment decisions.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>When deploying OpenStack in an enterprise as a private cloud
|
||||
it is usually behind the firewall and within the trusted
|
||||
network alongside existing systems. Users of the cloud are
|
||||
employees that are bound by the security
|
||||
requirements set forth by the company. This tends to push most
|
||||
of the security domains towards a more trusted model. However,
|
||||
when deploying OpenStack in a public facing role, no
|
||||
assumptions can be made and the attack vectors significantly
|
||||
increase.</para>
|
||||
<para>Consideration must be taken when managing the users of the
|
||||
system for both public and private clouds. The identity
|
||||
service allows for LDAP to be part of the authentication
|
||||
process. Including such systems in an OpenStack deployment may
|
||||
ease user management if integrating into existing
|
||||
systems.</para>
|
||||
<para>It is important to understand that user authentication
|
||||
requests include sensitive information including user names,
|
||||
passwords, and authentication tokens. For this reason, placing
|
||||
the API services behind hardware that performs SSL termination
|
||||
is strongly recommended.</para>
|
||||
<para>
|
||||
For more information on OpenStack Security, see the <link
|
||||
xlink:href="http://docs.openstack.org/security-guide/"><citetitle>OpenStack
|
||||
Security Guide</citetitle></link>.
|
||||
</para>
|
||||
</section>
|
||||
</section>
|
@ -1,155 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="user-requirements-general-purpose">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>User requirements</title>
|
||||
<para>When building a general purpose cloud, you should follow the
|
||||
<glossterm baseform="IaaS">Infrastructure-as-a-Service (IaaS)</glossterm>
|
||||
model; a platform best suited for use cases with simple requirements.
|
||||
General purpose cloud user requirements are not complex.
|
||||
However, it is important to capture them even
|
||||
if the project has minimum business and technical requirements, such as a
|
||||
proof of concept (PoC), or a small lab platform.</para>
|
||||
<note>
|
||||
<para>
|
||||
The following user considerations are written from the perspective of
|
||||
the cloud builder, not from the perspective of the end user.
|
||||
</para>
|
||||
</note>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Cost</term>
|
||||
<listitem>
|
||||
<para>Financial factors are a primary concern for
|
||||
any organization. Cost is an important criterion
|
||||
as general purpose clouds are considered the baseline
|
||||
from which all other cloud architecture environments
|
||||
derive. General purpose clouds do not always provide
|
||||
the most cost-effective environment for specialized
|
||||
applications or situations. Unless razor-thin margins and costs have
|
||||
been mandated as a critical factor, cost should not be
|
||||
the sole consideration when choosing or designing a
|
||||
general purpose architecture.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Time to market</term>
|
||||
<listitem>
|
||||
<para>The ability to deliver services or products within
|
||||
a flexible time frame is a common business factor
|
||||
when building a general purpose cloud.
|
||||
Delivering a product in six months instead
|
||||
of two years is a driving force behind the
|
||||
decision to build general purpose clouds. General
|
||||
purpose clouds allow users to self-provision and gain
|
||||
access to compute, network, and storage resources
|
||||
on demand, thus decreasing time to market.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Revenue opportunity</term>
|
||||
<listitem>
|
||||
<para>Revenue opportunities for a
|
||||
cloud will vary greatly based on the intended
|
||||
use case of that particular cloud. Some general
|
||||
purpose clouds are built for commercial customer
|
||||
facing products, but there are alternatives
|
||||
that might make the general purpose cloud the right
|
||||
choice.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<section xml:id="technical-requirements">
|
||||
<title>Technical requirements</title>
|
||||
<para>Technical cloud architecture requirements should be weighed
|
||||
against the business requirements.
|
||||
</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Performance</term>
|
||||
<listitem>
|
||||
<para>As a baseline product, general purpose
|
||||
clouds do not provide optimized performance for any
|
||||
particular function. While a general purpose cloud
|
||||
should provide enough performance to satisfy average
|
||||
user considerations, performance is not a general
|
||||
purpose cloud customer driver.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>No predefined usage model</term>
|
||||
<listitem>
|
||||
<para>The lack of a pre-defined
|
||||
usage model enables the user to run a wide variety of
|
||||
applications without having to know the application
|
||||
requirements in advance. This provides a degree of
|
||||
independence and flexibility that no other cloud
|
||||
scenarios are able to provide.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>On-demand and self-service application</term>
|
||||
<listitem>
|
||||
<para>By
|
||||
definition, a cloud provides end users with the
|
||||
ability to self-provision computing power, storage,
|
||||
networks, and software in a simple and flexible way.
|
||||
The user must be able to scale their resources up to a
|
||||
substantial level without disrupting the underlying
|
||||
host operations. One of the benefits of using a
|
||||
general purpose cloud architecture is the ability to
|
||||
start with limited resources and increase them over
|
||||
time as the user demand grows.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Public cloud</term>
|
||||
<listitem>
|
||||
<para>For a company interested in building a
|
||||
commercial public cloud offering based on OpenStack,
|
||||
the general purpose architecture model might be the
|
||||
best choice. Designers are not always going to
|
||||
know the purposes or workloads for which the end users
|
||||
will use the cloud.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Internal consumption (private) cloud</term>
|
||||
<listitem>
|
||||
<para>Organizations need to determine if it is logical to
|
||||
create their own clouds internally. Using a private cloud,
|
||||
organizations are able to maintain complete control over
|
||||
architectural and cloud components.</para>
|
||||
<note>
|
||||
<para>Users will want to combine
|
||||
using the internal cloud with access to an external
|
||||
cloud. If that case is likely, it might be worth
|
||||
exploring the possibility of taking a multi-cloud
|
||||
approach with regard to at least some of the
|
||||
architectural elements.
|
||||
</para>
|
||||
</note>
|
||||
<para>Designs that incorporate the
|
||||
use of multiple clouds, such as a private cloud and a
|
||||
public cloud offering, are described in the
|
||||
"Multi-Cloud" scenario, see <xref linkend="multi_site"/>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Security</term>
|
||||
<listitem>
|
||||
<para>Security should be implemented according
|
||||
to asset, threat, and vulnerability risk assessment
|
||||
matrices. For cloud domains that require increased
|
||||
computer security, network security, or information
|
||||
security, a general purpose cloud is not considered an
|
||||
appropriate choice.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
</section>
|
@ -1,190 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-guide-architecture-hybrid">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Architecture</title>
|
||||
<para>Map out the dependencies of the expected workloads
|
||||
and the cloud infrastructures required to support them to architect a
|
||||
solution for the broadest compatibility between cloud platforms,
|
||||
minimizing the need to create workarounds and processes to fill
|
||||
identified gaps.</para>
|
||||
<para>For your chosen cloud management platform, note the relative
|
||||
levels of support for both monitoring and orchestration.</para>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Multi-Cloud_Priv-AWS4.png"/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
|
||||
<section xml:id="image-portability">
|
||||
<title>Image portability</title>
|
||||
<para>The majority of cloud workloads currently run on instances
|
||||
using hypervisor technologies. The challenge is that each of these
|
||||
hypervisors uses an image format that may not be compatible with the
|
||||
others. When possible, standardize on a single hypervisor and instance
|
||||
image format. This may not be possible when using externally-managed
|
||||
public clouds.</para>
|
||||
<para>Conversion tools exist to address image format compatibility.
|
||||
Examples include <link
|
||||
xlink:href="http://libguestfs.org/virt-v2v">virt-p2v/virt-v2v</link>
|
||||
and <link
|
||||
xlink:href="http://libguestfs.org/virt-edit.1.html">
|
||||
virt-edit</link>. These tools cannot serve beyond basic cloud instance
|
||||
specifications.</para>
|
||||
<para>Alternatively, build a thin operating system image as
|
||||
the base for new instances. This facilitates rapid creation of cloud
|
||||
instances using cloud orchestration or configuration management tools
|
||||
for more specific templating. Remember if you intend to use portable
|
||||
images for disaster recovery, application diversity, or high
|
||||
availability, your users could move the images and instances between
|
||||
cloud platforms regularly.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="upper-layer-services">
|
||||
<title>Upper-layer services</title>
|
||||
<para>Many clouds offer complementary services beyond the
|
||||
basic compute, network, and storage components. These
|
||||
additional services often simplify the deployment
|
||||
and management of applications on a cloud platform.</para>
|
||||
<para>When moving workloads from the source to the destination
|
||||
cloud platforms, consider that the destination cloud platform
|
||||
may not have comparable services. Implement workloads in a
|
||||
different way or by using a different technology.</para>
|
||||
<para>For example, moving an application that uses a NoSQL database
|
||||
service such as MongoDB could cause difficulties in maintaining
|
||||
the application between the platforms.</para>
|
||||
<para>There are a number of options that are appropriate for
|
||||
the hybrid cloud use case:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Implementing a baseline of upper-layer services
|
||||
across all of the cloud platforms. For
|
||||
platforms that do not support a given service, create
|
||||
a service on top of that platform and apply it to the
|
||||
workloads as they are launched on that cloud.</para>
|
||||
<para>For example, through the <glossterm>Database service</glossterm>
|
||||
for OpenStack (<glossterm>trove</glossterm>),
|
||||
OpenStack supports MySQL-as-a-Service but not NoSQL
|
||||
databases in production. To move from or run
|
||||
alongside AWS, a NoSQL workload must use an automation
|
||||
tool, such as the Orchestration service (heat), to
|
||||
recreate the NoSQL database on top of OpenStack.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Deploying a <glossterm>Platform-as-a-Service (PaaS)</glossterm>
|
||||
technology that abstracts the
|
||||
upper-layer services from the underlying cloud
|
||||
platform. The unit of application deployment and
|
||||
migration is the PaaS. It leverages the services of
|
||||
the PaaS and only consumes the base infrastructure
|
||||
services of the cloud platform.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Using automation tools to create the required upper-layer services
|
||||
that are portable across all cloud platforms.</para>
|
||||
<para>For example, instead of using database services that
|
||||
are inherent in the cloud platforms, launch cloud
|
||||
instances and deploy the databases on those
|
||||
instances using scripts or configuration and
|
||||
application deployment tools.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="network-services">
|
||||
<title>Network services</title>
|
||||
<para>Network services functionality is a critical component of
|
||||
multiple cloud architectures. It is an important factor
|
||||
to assess when choosing a CMP and cloud provider.
|
||||
Considerations include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Functionality
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Security
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Scalability
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
High availability (HA)
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Verify and test critical cloud endpoint features.</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>After selecting the network functionality framework,
|
||||
you must confirm the functionality is compatible. This
|
||||
ensures testing and functionality persist
|
||||
during and after upgrades.</para>
|
||||
<note>
|
||||
<para>Diverse cloud platforms may de-synchronize
|
||||
over time if you do not maintain their mutual compatibility.
|
||||
This is a particular issue with APIs.</para>
|
||||
</note>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Scalability across multiple cloud providers determines
|
||||
your choice of underlying network framework. It is important to
|
||||
have the network API functions presented and to verify
|
||||
that the desired functionality persists across all
|
||||
chosen cloud endpoints.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>High availability implementations vary in
|
||||
functionality and design. Examples of some common
|
||||
methods are active-hot-standby, active-passive, and
|
||||
active-active. Develop your high availability
|
||||
implementation and a test framework to understand
|
||||
the functionality and limitations of the environment.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>It is imperative to address security considerations.
|
||||
For example, addressing how data is secured between client and
|
||||
endpoint and any traffic that traverses the multiple clouds.
|
||||
Business and regulatory requirements dictate what security
|
||||
approach to take. For more information, see the
|
||||
<link linkend="security-overview">Security
|
||||
Requirements Chapter</link>.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="data">
|
||||
<title>Data</title>
|
||||
<para>Traditionally, replication has been the best method of protecting
|
||||
object store implementations. A variety of replication methods exist
|
||||
in storage architectures, for example synchronous and asynchronous
|
||||
mirroring. Most object stores and back-end storage systems implement
|
||||
methods for replication at the storage subsystem layer.
|
||||
Object stores also tailor replication techniques
|
||||
to fit a cloud's requirements.</para>
|
||||
<para>Organizations must find the right balance between
|
||||
data integrity and data availability. Replication strategy may
|
||||
also influence disaster recovery methods.</para>
|
||||
<para>Replication across different racks, data centers, and
|
||||
geographical regions increases focus on
|
||||
determining and ensuring data locality. The ability to
|
||||
guarantee data is accessed from the nearest or fastest storage
|
||||
can be necessary for applications to perform well.</para>
|
||||
<note>
|
||||
<para>When running embedded object store methods, ensure that you do
|
||||
not instigate extra data replication as this can cause performance
|
||||
issues.</para>
|
||||
</note>
|
||||
</section>
|
||||
</section>
|
@ -1,86 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-guide-hybrid-operational-considerations">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Operational considerations</title>
|
||||
<para>Hybrid cloud deployments present complex operational
|
||||
challenges. Differences between provider clouds can cause
|
||||
incompatibilities with workloads or Cloud Management
|
||||
Platforms (CMP). Cloud providers may also offer different levels of
|
||||
integration with competing cloud offerings.</para>
|
||||
<para>Monitoring is critical to maintaining a hybrid cloud, and it is
|
||||
important to determine if a CMP supports
|
||||
monitoring of all the clouds involved, or if compatible APIs
|
||||
are available to be queried for necessary information.</para>
|
||||
|
||||
<section xml:id="agility">
|
||||
<title>Agility</title>
|
||||
<para>Hybrid clouds provide application
|
||||
availability across different cloud environments and
|
||||
technologies. This availability enables the deployment to
|
||||
survive disaster in any single cloud environment.
|
||||
Each cloud should provide the means to create instances quickly
|
||||
in response to capacity issues or failure elsewhere in the hybrid
|
||||
cloud.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="application-readiness-hybrid">
|
||||
<title>Application readiness</title>
|
||||
<para>Enterprise workloads that depend on the
|
||||
underlying infrastructure for availability are not designed to
|
||||
run on OpenStack. If the application cannot
|
||||
tolerate infrastructure failures, it is likely to require
|
||||
significant operator intervention to recover. Applications for
|
||||
hybrid clouds must be fault tolerant, with an SLA that is not tied
|
||||
to the underlying infrastructure. Ideally, cloud applications should be
|
||||
able to recover when entire racks and data centers experience an
|
||||
outage.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="upgrades">
|
||||
<title>Upgrades</title>
|
||||
<para>If a deployment includes a public cloud, predicting
|
||||
upgrades may not be possible. Carefully examine provider SLAs.</para>
|
||||
<note>
|
||||
<para>At massive scale, even when
|
||||
dealing with a cloud that offers an SLA with a high percentage
|
||||
of uptime, workloads must be able to recover quickly.</para>
|
||||
</note>
|
||||
<para>When upgrading private cloud deployments, minimize disruption by
|
||||
making incremental changes and providing a facility to either roll back
|
||||
or continue to roll forward when using a continuous delivery
|
||||
model.</para>
|
||||
<para>You may need to coordinate CMP upgrades with hybrid cloud upgrades if
|
||||
there are API changes.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="network-operation-center-noc">
|
||||
<title>Network Operation Center</title>
|
||||
<para>Consider infrastructure control
|
||||
when planning the Network Operation Center (NOC)
|
||||
for a hybrid cloud environment. If a significant
|
||||
portion of the cloud is on externally managed systems,
|
||||
prepare for situations where it may not be possible to
|
||||
make changes.
|
||||
Additionally, providers may differ on how
|
||||
infrastructure must be managed and exposed. This can lead to
|
||||
delays in root cause analysis where each insists the blame
|
||||
lies with the other provider.</para>
|
||||
<para>Ensure that the network structure connects all clouds to form
|
||||
an integrated system, keeping in mind the state of handoffs.
|
||||
These handoffs must both be as reliable as possible and
|
||||
include as little latency as possible to ensure the best
|
||||
performance of the overall system.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="maintainability">
|
||||
<title>Maintainability</title>
|
||||
<para>Hybrid clouds rely on third party systems and processes. As a
|
||||
result, it is not possible to guarantee
|
||||
proper maintenance of the overall system. Instead, be prepared to
|
||||
abandon workloads and recreate them in an improved state.</para>
|
||||
</section>
|
||||
</section>
|
@ -1,173 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="prescriptive-examples-multi-cloud">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Prescriptive examples</title>
|
||||
<para>Hybrid cloud environments are designed for
|
||||
these use cases:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Bursting workloads from private to public OpenStack
|
||||
clouds</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Bursting workloads from private to public
|
||||
non-OpenStack clouds</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>High availability across clouds (for technical
|
||||
diversity)</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>This chapter provides examples of environments
|
||||
that address each of these use cases.</para>
|
||||
<section xml:id="bursting-to-public-openstack-cloud">
|
||||
<title>Bursting to a public OpenStack cloud</title>
|
||||
<para>Company A's data center is running low on
|
||||
capacity. It is not possible to expand the data center in the
|
||||
foreseeable future. In order to accommodate
|
||||
the continuously growing need for development resources in the
|
||||
organization, Company A decides to use resources in the public
|
||||
cloud.</para>
|
||||
<para>Company A has an established data
|
||||
center with a substantial amount of hardware. Migrating the
|
||||
workloads to a public cloud is not feasible.</para>
|
||||
<para>The company has an internal cloud management platform that
|
||||
directs requests to the appropriate cloud, depending on
|
||||
the local capacity. This is a custom in-house application written for
|
||||
this specific purpose.</para>
|
||||
<para>This solution is depicted in the figure below:</para>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Multi-Cloud_Priv-Pub3.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
<para>This example shows two clouds with a Cloud Management
|
||||
Platform (CMP) connecting them. This guide does not
|
||||
discuss a specific CMP, but describes how the Orchestration and
|
||||
Telemetry services handle, manage, and control workloads.</para>
|
||||
<para>The private OpenStack cloud has at least one
|
||||
controller and at least one compute node. It includes
|
||||
metering using the Telemetry service. The Telemetry service
|
||||
captures the load increase and the CMP processes the information.
|
||||
If there is available capacity, the CMP uses the
|
||||
OpenStack API to call the Orchestration service. This creates
|
||||
instances on the private cloud in response to user requests.
|
||||
When capacity is not available on the private cloud,
|
||||
the CMP issues a request to the Orchestration service API of
|
||||
the public cloud. This creates the instance on the public
|
||||
cloud.</para>
|
||||
<para>In this example, Company A does not direct the deployments to an
|
||||
external public cloud due to concerns regarding resource control,
|
||||
security, and increased operational expense.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="bursting-to-public-nonopenstack-cloud">
|
||||
<title>Bursting to a public non-OpenStack cloud</title>
|
||||
<para>The second example examines bursting workloads from the
|
||||
private cloud into a non-OpenStack public cloud using Amazon
|
||||
Web Services (AWS) to take advantage of additional capacity
|
||||
and to scale applications.</para>
|
||||
<para>The following diagram demonstrates an OpenStack-to-AWS hybrid
|
||||
cloud:</para>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Multi-Cloud_Priv-AWS4.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
<para>Company B states that its developers are already using AWS and
|
||||
do not want to change to a different provider.</para>
|
||||
<para>If the CMP is capable of connecting to an external
|
||||
cloud provider with an appropriate API, the workflow process
|
||||
remains the same as the previous scenario. The actions the
|
||||
CMP takes, such as monitoring loads and creating new instances,
|
||||
stay the same. However, the CMP performs actions in the
|
||||
public cloud using applicable API calls.</para>
|
||||
<para>If the public cloud is AWS, the CMP would use the
|
||||
EC2 API to create a new instance and assign an Elastic IP.
|
||||
It can then add that IP to HAProxy in the private cloud.
|
||||
The CMP can also reference AWS-specific
|
||||
tools such as CloudWatch and CloudFormation.</para>
|
||||
<para>Several open source tool kits for building CMPs are
|
||||
available and can handle this kind of translation. Examples include
|
||||
ManageIQ, jClouds, and JumpGate.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="high-availability-disaster-recovery">
|
||||
<title>High availability and disaster recovery</title>
|
||||
<para>Company C requires their local data center
|
||||
to be able to recover from failure. Some of the
|
||||
workloads currently in use are running on their private
|
||||
OpenStack cloud. Protecting the data involves Block Storage,
|
||||
Object Storage, and a database. The architecture
|
||||
supports the failure of large components of the system while
|
||||
ensuring that the system continues to deliver services.
|
||||
While the services remain available to users, the failed
|
||||
components are restored in the background based on standard
|
||||
best practice data replication policies. To achieve these objectives,
|
||||
Company C replicates data to a second cloud in a geographically distant
|
||||
location. The following diagram describes this system:</para>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Multi-Cloud_failover2.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
<para>This example includes two private OpenStack clouds connected
|
||||
with a CMP. The source cloud,
|
||||
OpenStack Cloud 1, includes a controller and at least one
|
||||
instance running MySQL. It also includes at least one Block
|
||||
Storage volume and one Object Storage volume. This means that data
|
||||
is available to the users at all times. The details of the
|
||||
method for protecting each of these sources of data
|
||||
differ.</para>
|
||||
<para>Object Storage relies on the replication capabilities of
|
||||
the Object Storage provider. Company C enables OpenStack Object Storage
|
||||
so that it creates geographically separated replicas
|
||||
that take advantage of this feature. The company configures storage
|
||||
so that at least one replica exists in each cloud. In order to make
|
||||
this work, the company configures a single array spanning both clouds
|
||||
with OpenStack Identity. Using Federated Identity, the array talks
|
||||
to both clouds, communicating with OpenStack Object Storage
|
||||
through the Swift proxy.</para>
|
||||
<para>For Block Storage, the replication is a little more
|
||||
difficult, and involves tools outside of OpenStack itself. The
|
||||
OpenStack Block Storage volume is not set as the drive itself
|
||||
but as a logical object that points to a physical back end. Disaster
|
||||
recovery is configured for Block Storage for
|
||||
synchronous backup for the highest level of data protection,
|
||||
but asynchronous backup could have been set as an alternative
|
||||
that is not as latency sensitive. For asynchronous backup, the
|
||||
Block Storage API makes it possible to export the data and also the
|
||||
metadata of a particular volume, so that it can be moved and
|
||||
replicated elsewhere. More information can be found here:
|
||||
<link
|
||||
xlink:href="https://blueprints.launchpad.net/cinder/+spec/cinder-backup-volume-metadata-support">
|
||||
https://blueprints.launchpad.net/cinder/+spec/cinder-backup-volume-metadata-support</link>.
|
||||
</para>
|
||||
<para>The synchronous backups create an identical volume in both
|
||||
clouds and choose the appropriate flavor so that each cloud
|
||||
has an identical back end. This is done by creating volumes
|
||||
through the CMP. After this is configured, a solution
|
||||
involving DRBD synchronizes the physical drives.</para>
|
||||
<para>The database component is backed up using synchronous
|
||||
backups. MySQL does not support geographically diverse
|
||||
replication, so disaster recovery is provided by replicating
|
||||
the file itself. As it is not possible to use Object Storage
|
||||
as the back end of a database like MySQL, Swift replication
|
||||
is not an option. Company C decides not to store the data on
|
||||
another geo-tiered storage system, such as Ceph, as Block
|
||||
Storage. This would have given another layer of protection.
|
||||
Another option would have been to store the database on an
|
||||
OpenStack Block Storage volume and back it up like any
|
||||
other Block Storage.</para>
|
||||
</section>
|
||||
</section>
|
@ -1,196 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE section [
|
||||
<!ENTITY % openstack SYSTEM "../../common/entities/openstack.ent">
|
||||
%openstack;
|
||||
]>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="technical-considerations-hybrid">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Technical considerations</title>
|
||||
<para>A hybrid cloud environment requires inspection and
|
||||
understanding of technical issues in external data centers that may
|
||||
not be in your control. Ideally, select an architecture
|
||||
and CMP that are adaptable to changing environments.</para>
|
||||
<para>Using diverse cloud platforms increases the risk of compatibility
|
||||
issues, but clouds using the same version and distribution
|
||||
of OpenStack are less likely to experience problems.</para>
|
||||
<para>Clouds that exclusively use the same versions of OpenStack should
|
||||
have no issues, regardless of distribution. More recent distributions
|
||||
are less likely to encounter incompatibility between versions. An
|
||||
OpenStack community initiative defines core functions that need to
|
||||
remain backward compatible between supported versions. For example, the
|
||||
DefCore initiative defines basic functions that every distribution must
|
||||
support in order to use the name <productname>OpenStack</productname>.
|
||||
</para>
|
||||
<para>Vendors can add proprietary customization to their distributions. If
|
||||
an application or architecture makes use of these features, it can be
|
||||
difficult to migrate to or use other types of environments.</para>
|
||||
<para>If an environment includes non-OpenStack clouds, it may experience
|
||||
compatibility problems. CMP tools must account for the differences in
|
||||
the handling of operations and the implementation of services.</para>
|
||||
<itemizedlist>
|
||||
<title>Possible cloud incompatibilities</title>
|
||||
<listitem>
|
||||
<para>Instance deployment</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Network management</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Application management</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Services implementation</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<section xml:id="capacity-planning-hybrid">
|
||||
<title>Capacity planning</title>
|
||||
<para>One of the primary reasons many organizations use a
|
||||
hybrid cloud is to increase capacity without making large capital
|
||||
investments.</para>
|
||||
<para>Capacity and the placement of workloads are key design considerations
|
||||
for hybrid clouds. The long-term capacity plan for these
|
||||
designs must incorporate growth over time to prevent permanent
|
||||
consumption of more expensive external clouds. To avoid this scenario,
|
||||
account for future applications' capacity requirements and plan growth
|
||||
appropriately.</para>
|
||||
<para>It is difficult to predict the amount of load a particular
|
||||
application might incur if the number of users fluctuates, or the
|
||||
application experiences an unexpected increase in use. It is
|
||||
possible to define application requirements in terms of vCPU, RAM,
|
||||
bandwidth, or other resources and plan appropriately. However, other
|
||||
clouds might not use the same meter or even the same oversubscription
|
||||
rates.</para>
|
||||
<para>Oversubscription is a method to emulate more capacity than
|
||||
may physically be present. For example, a physical
|
||||
hypervisor node with 32 GB RAM may host 24
|
||||
instances, each provisioned with 2 GB RAM. As long
|
||||
as all 24 instances do not concurrently use 2 full
|
||||
gigabytes, this arrangement works well. However, some
|
||||
hosts take oversubscription to extremes and, as a result,
|
||||
performance can be inconsistent. If at all
|
||||
possible, determine what the oversubscription rates of each
|
||||
host are and plan capacity accordingly.</para>
|
||||
</section>
|
||||
<section xml:id="utilization-hybrid">
|
||||
<title>Utilization</title>
|
||||
<para>A CMP must be aware of what workloads are running, where they are
|
||||
running, and their preferred utilizations. For example, in
|
||||
most cases it is desirable to run as many workloads internally
|
||||
as possible, utilizing other resources only when necessary. On
|
||||
the other hand, situations exist in which the opposite is
|
||||
true, such as when an internal cloud is only for development and
|
||||
stressing it is undesirable. A cost model of various scenarios and
|
||||
consideration of internal priorities helps with this decision. To
|
||||
improve efficiency, automate these decisions when possible.</para>
|
||||
<para>The Telemetry service (ceilometer) provides information on the usage
|
||||
of various OpenStack components. Note the following:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
If Telemetry must retain a large amount of data, for
|
||||
example when monitoring a large or active cloud, we recommend
|
||||
using a NoSQL back end such as MongoDB.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
You must monitor connections to non-OpenStack clouds
|
||||
and report this information to the CMP.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="performance-hybrid">
|
||||
<title>Performance</title>
|
||||
<para>Performance is critical to hybrid cloud deployments, and they are
|
||||
affected by many of the same issues as multi-site deployments,
|
||||
such as network latency between sites. Also consider the time required
|
||||
to run a workload in different clouds and methods for reducing this
|
||||
time. This may require moving data closer to applications
|
||||
or applications closer to the data they process, and
|
||||
grouping functionality so that connections that
|
||||
require low latency take place over a single cloud rather than
|
||||
spanning clouds. This may also require a CMP that can determine which
|
||||
cloud can most efficiently run which types of workloads.</para>
|
||||
<para>As with utilization, native OpenStack tools help improve performance.
|
||||
For example, you can use Telemetry to measure performance and the
|
||||
Orchestration service (heat) to react to changes in demand.</para>
|
||||
<note>
|
||||
<para>Orchestration requires special client configurations to integrate
|
||||
with Amazon Web Services. For other types of clouds, use CMP
|
||||
features.
|
||||
</para>
|
||||
</note>
|
||||
</section>
|
||||
|
||||
<section xml:id="components">
|
||||
<title>Components</title>
|
||||
<para>Using more than one cloud in any design requires consideration of
|
||||
four OpenStack tools:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>OpenStack Compute (nova)</term>
|
||||
<listitem>
|
||||
<para>Regardless of deployment location, hypervisor choice has a
|
||||
direct effect on how difficult it is to integrate with
|
||||
additional clouds.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Networking (neutron)</term>
|
||||
<listitem>
|
||||
<para>Whether using OpenStack Networking (neutron) or legacy
|
||||
networking (nova-network), it is necessary to understand
|
||||
network integration capabilities in order to
|
||||
connect between clouds.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Telemetry (ceilometer)</term>
|
||||
<listitem>
|
||||
<para>Use of Telemetry depends, in large part, on what other
|
||||
parts of the cloud you are using.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Orchestration (heat)</term>
|
||||
<listitem>
|
||||
<para>Orchestration can be a valuable tool in orchestrating tasks a
|
||||
CMP decides are necessary in an OpenStack-based cloud.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="special-considerations-hybrid">
|
||||
<title>Special considerations</title>
|
||||
<para>Hybrid cloud deployments require consideration of two issues that
|
||||
are not common in other situations:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Image portability</term>
|
||||
<listitem>
|
||||
<para>As of the Kilo release, there is no common image format that is
|
||||
usable by all clouds. Conversion or recreation of images is necessary
|
||||
if migrating between clouds. To simplify deployment, use the smallest
|
||||
and simplest images feasible, install only what is necessary, and
|
||||
use a deployment manager such as Chef or Puppet. Do not use golden
|
||||
images to speed up the process unless you repeatedly deploy the same
|
||||
images on the same cloud.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>API differences</term>
|
||||
<listitem>
|
||||
<para>Avoid using a hybrid cloud deployment with more than just
|
||||
OpenStack (or with different versions of OpenStack) as API changes
|
||||
can cause compatibility issues.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
</section>
|
@ -1,258 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="user-requirements-hybrid">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>User requirements</title>
|
||||
<para>Hybrid cloud architectures are complex, especially those
|
||||
that use heterogeneous cloud platforms. Ensure that design choices
|
||||
match requirements so that the benefits outweigh the inherent additional
|
||||
complexity and risks.</para>
|
||||
<variablelist>
|
||||
<title>Business considerations when designing a hybrid
|
||||
cloud deployment</title>
|
||||
<varlistentry>
|
||||
<term>Cost</term>
|
||||
<listitem>
|
||||
<para>A hybrid cloud architecture involves multiple
|
||||
vendors and technical architectures. These
|
||||
architectures may be more expensive to deploy and
|
||||
maintain. Operational costs can be higher because of
|
||||
the need for more sophisticated orchestration and
|
||||
brokerage tools than in other architectures. In
|
||||
contrast, overall operational costs might be lower by
|
||||
virtue of using a cloud brokerage tool to deploy the
|
||||
workloads to the most cost effective platform.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Revenue opportunity</term>
|
||||
<listitem>
|
||||
<para>Revenue opportunities vary based on the intent and use case
|
||||
of the cloud. For a commercial, customer-facing product, you
|
||||
must consider whether building over multiple platforms makes
|
||||
the design more attractive to customers.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Time-to-market</term>
|
||||
<listitem>
|
||||
<para>One common reason to use cloud platforms is to improve the
|
||||
time-to-market of a new product or application. For example,
|
||||
using multiple cloud platforms is viable because there is an
|
||||
existing investment in several applications. It is faster to
|
||||
tie the investments together rather than migrate the
|
||||
components and refactor them to a single platform.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Business or technical diversity</term>
|
||||
<listitem>
|
||||
<para>Organizations leveraging cloud-based services can
|
||||
embrace business diversity and utilize a hybrid cloud
|
||||
design to spread their workloads across multiple cloud
|
||||
providers. This ensures that no single cloud provider is
|
||||
the sole host for an application.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Application momentum</term>
|
||||
<listitem>
|
||||
<para>Businesses with existing applications may find that it is
|
||||
more cost effective to integrate applications on multiple
|
||||
cloud platforms than to migrate them to a single platform.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
<section xml:id="workload-considerations">
|
||||
<title>Workload considerations</title>
|
||||
<para>A workload can be a single application or a suite of applications
|
||||
that work together. It can also be a duplicate set of applications that
|
||||
need to run on multiple cloud environments. In a hybrid cloud
|
||||
deployment, the same workload often needs to function
|
||||
equally well on radically different public and private cloud
|
||||
environments. The architecture needs to address these
|
||||
potential conflicts, complexity, and platform
|
||||
incompatibilities.</para>
|
||||
<variablelist>
|
||||
<title>Use cases for a hybrid cloud architecture</title>
|
||||
<varlistentry>
|
||||
<term>Dynamic resource expansion or bursting</term>
|
||||
<listitem>
|
||||
<para>An application that requires additional resources may suit
|
||||
a multiple cloud architecture.
|
||||
For example, a retailer needs additional resources
|
||||
during the holiday season, but does not want to add private
|
||||
cloud resources to meet the peak demand. The user can
|
||||
accommodate the increased load by bursting to
|
||||
a public cloud for these peak load
|
||||
periods. These bursts could be for long or short
|
||||
cycles ranging from hourly to yearly.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Disaster recovery and business continuity</term>
|
||||
<listitem>
|
||||
<para>Cheaper storage makes the public
|
||||
cloud suitable for maintaining backup applications.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Federated hypervisor and instance management</term>
|
||||
<listitem>
|
||||
<para>Adding self-service, charge back, and transparent delivery of
|
||||
the resources from a federated pool can be cost
|
||||
effective. In a hybrid cloud environment, this is a
|
||||
particularly important consideration. Look for a cloud
|
||||
that provides cross-platform hypervisor support and
|
||||
robust instance management tools.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Application portfolio integration</term>
|
||||
<listitem>
|
||||
<para>An enterprise cloud delivers efficient application portfolio
|
||||
management and deployments by leveraging
|
||||
self-service features and rules according to use. Integrating
|
||||
existing cloud environments is a common driver when building
|
||||
hybrid cloud architectures.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Migration scenarios</term>
|
||||
<listitem>
|
||||
<para>Hybrid cloud architecture enables the migration of
|
||||
applications between different clouds.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>High availability</term>
|
||||
<listitem>
|
||||
<para>A combination of locations and platforms enables a
|
||||
level of availability that is not
|
||||
possible with a single platform. This approach increases
|
||||
design complexity.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<para>As running a workload on multiple cloud platforms increases design
|
||||
complexity, we recommend first exploring options such as transferring
|
||||
workloads across clouds at the application, instance, cloud platform,
|
||||
hypervisor, and network levels.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="tools-considerations-hybrid">
|
||||
<title>Tools considerations</title>
|
||||
<para>Hybrid cloud designs must incorporate tools to facilitate working
|
||||
across multiple clouds.</para>
|
||||
<variablelist>
|
||||
<title>Tool functions</title>
|
||||
<varlistentry>
|
||||
<term>Broker between clouds</term>
|
||||
<listitem>
|
||||
<para>Brokering software evaluates relative costs between different
|
||||
cloud platforms. Cloud Management Platforms (CMP)
|
||||
allow the designer to determine the right location for the
|
||||
workload based on predetermined criteria.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Facilitate orchestration across the clouds</term>
|
||||
<listitem>
|
||||
<para>CMPs simplify the migration of application workloads between
|
||||
public, private, and hybrid cloud platforms. We recommend
|
||||
using cloud orchestration tools for managing a diverse
|
||||
portfolio of systems and applications across multiple cloud
|
||||
platforms.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="network-considerations-hybrid">
|
||||
<title>Network considerations</title>
|
||||
<para>It is important to consider the functionality, security, scalability,
|
||||
availability, and testability of the network when choosing a CMP and cloud
|
||||
provider.</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Decide on a network framework and
|
||||
design minimum functionality tests. This ensures
|
||||
testing and functionality persist during and after
|
||||
upgrades.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Scalability across multiple cloud providers may
|
||||
dictate which underlying network framework you
|
||||
choose in different cloud providers. It is important
|
||||
to present the network API functions and to
|
||||
verify that functionality persists across all cloud
|
||||
endpoints chosen.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>High availability implementations vary in
|
||||
functionality and design. Examples of some common
|
||||
methods are active-hot-standby, active-passive, and
|
||||
active-active. Development of high availability and test
|
||||
frameworks is necessary to ensure understanding of
|
||||
functionality and limitations.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Consider the security of data between the client and the
|
||||
endpoint, and of traffic that traverses the multiple
|
||||
clouds.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
|
||||
<section xml:id="risk-mitigation-management-hybrid">
|
||||
<title>Risk mitigation and management considerations</title>
|
||||
<para>Hybrid cloud architectures introduce additional risk because
|
||||
they are more complex than a single cloud design and may involve
|
||||
incompatible components or tools. However, they also reduce
|
||||
risk by spreading workloads over multiple providers.</para>
|
||||
<variablelist>
|
||||
<title>Hybrid cloud risks</title>
|
||||
<varlistentry>
|
||||
<term>Provider availability or implementation details</term>
|
||||
<listitem>
|
||||
<para>
|
||||
Business changes can affect provider availability. Likewise,
|
||||
changes in a provider's service can disrupt a hybrid cloud
|
||||
environment or increase costs.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Differing SLAs</term>
|
||||
<listitem>
|
||||
<para>Hybrid cloud designs must accommodate differences in SLAs
|
||||
between providers, and consider their enforceability.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Security levels</term>
|
||||
<listitem>
|
||||
<para>Securing multiple cloud
|
||||
environments is more complex than securing single
|
||||
cloud environments. We recommend addressing concerns at
|
||||
the application, network, and cloud platform levels.
|
||||
Be aware that each cloud platform approaches security
|
||||
differently, and a hybrid cloud design must address and
|
||||
compensate for these differences.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Provider API changes</term>
|
||||
<listitem>
|
||||
<para>Consumers of external clouds rarely have control over
|
||||
provider changes to APIs, and changes can break compatibility.
|
||||
Using only the most common and basic APIs can minimize
|
||||
potential conflicts.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
</section>
|
@ -1,106 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-guide-how-this-book-is-organized">
|
||||
<title>How this book is organized</title>
|
||||
<para>This book examines some of the most common uses for OpenStack
|
||||
clouds, and explains the considerations for each use case.
|
||||
Cloud architects may use this book as a comprehensive guide by
|
||||
reading all of the use cases, but it is also possible to review
|
||||
only the chapters which pertain to a specific use case.
|
||||
The use cases covered in this guide include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="generalpurpose">General purpose</link>: Uses common components that address
|
||||
80% of common use cases.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="compute_focus">Compute focused</link>: For compute intensive workloads
|
||||
such as high performance computing (HPC).
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="storage_focus">Storage focused</link>: For storage intensive workloads such as
|
||||
data analytics with parallel file systems.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="network_focus">Network focused</link>: For high performance and reliable
|
||||
networking, such as a <glossterm
|
||||
>content delivery network (CDN)</glossterm>.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="multi_site">Multi-site</link>: For applications that require multiple site
|
||||
deployments for geographical, reliability or data
|
||||
locality reasons.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="hybrid">Hybrid cloud</link>: Uses multiple disparate clouds
|
||||
connected either for failover, hybrid cloud bursting, or
|
||||
availability.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="massively_scalable">Massively
|
||||
scalable</link>: For
|
||||
cloud service providers or other large
|
||||
installations.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="specialized">Specialized cases</link>: Architectures that have not
|
||||
previously been covered in the defined use cases.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<!-- This section is currently commented out as it is irrelevant within the current
|
||||
context. However, there are plans to use this list in the future. Please do not remove.
|
||||
|
||||
<para>Each chapter in the guide is then further broken down into
|
||||
the following sections:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Introduction: Provides an overview of the
|
||||
architectural use case.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>User requirements: Defines the set of user
|
||||
considerations that typically come into play for that
|
||||
use case.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Technical considerations: Covers the technical
|
||||
issues that must be accounted for when dealing with this
|
||||
use case.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Operational considerations: Covers the ongoing
|
||||
operational tasks associated with this use case and
|
||||
architecture.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Architecture: Covers the overall architecture
|
||||
associated with the use case.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Prescriptive examples: Presents one or more
|
||||
scenarios where this architecture could be
|
||||
deployed.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
-->
|
||||
</section>
|
@ -1,95 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-guide-why-and-who-we-wrote-this-book">
|
||||
<title>Why and how we wrote this book</title>
|
||||
<para>We wrote this book to guide you through designing an OpenStack cloud
|
||||
architecture. This guide identifies design considerations
|
||||
for common cloud use cases and provides examples.</para>
|
||||
<para>The Architecture Design Guide was written in a book sprint format,
|
||||
which is a facilitated, rapid development production method for books.
|
||||
The Book Sprint was facilitated by Faith Bosworth and Adam
|
||||
Hyde of Book Sprints. For more information, see the Book Sprints website
|
||||
(www.booksprints.net).</para>
|
||||
<para>This book was written in five days during July 2014 while
|
||||
exhausting the M&amp;M, Mountain Dew and healthy options
|
||||
supply, complete with juggling entertainment during lunches at
|
||||
VMware's headquarters in Palo Alto.</para>
|
||||
<para>We would like to thank VMware for their generous
|
||||
hospitality, as well as our employers, Cisco, Cloudscaling,
|
||||
Comcast, EMC, Mirantis, Rackspace, Red Hat, Verizon, and
|
||||
VMware, for enabling us to contribute our time. We would
|
||||
especially like to thank Anne Gentle and Kenneth Hui for all
|
||||
of their shepherding and organization in making this
|
||||
happen.</para>
|
||||
<para>The author team includes:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Kenneth Hui (EMC)
|
||||
<link xlink:href="http://twitter.com/hui_kenneth"
|
||||
>@hui_kenneth</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Alexandra Settle (Rackspace)
|
||||
<link xlink:href="http://twitter.com/dewsday"
|
||||
>@dewsday</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Anthony Veiga (Comcast)
|
||||
<link xlink:href="http://twitter.com/daaelar"
|
||||
>@daaelar</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Beth Cohen (Verizon)
|
||||
<link xlink:href="http://twitter.com/bfcohen"
|
||||
>@bfcohen</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Kevin Jackson (Rackspace)
|
||||
<link xlink:href="http://twitter.com/itarchitectkev"
|
||||
>@itarchitectkev</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Maish Saidel-Keesing (Cisco)
|
||||
<link xlink:href="http://twitter.com/maishsk"
|
||||
>@maishsk</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Nick Chase (Mirantis)
|
||||
<link xlink:href="http://twitter.com/NickChase"
|
||||
>@NickChase</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Scott Lowe (VMware)
|
||||
<link xlink:href="http://twitter.com/scott_lowe"
|
||||
>@scott_lowe</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Sean Collins (Comcast)
|
||||
<link xlink:href="http://twitter.com/sc68cal"
|
||||
>@sc68cal</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Sean Winn (Cloudscaling)
|
||||
<link xlink:href="http://twitter.com/seanmwinn"
|
||||
>@seanmwinn</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Sebastian Gutierrez (Red Hat)
|
||||
<link xlink:href="http://twitter.com/gutseb"
|
||||
>@gutseb</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Stephen Gordon (Red Hat)
|
||||
<link xlink:href="http://twitter.com/xsgordon"
|
||||
>@xsgordon</link></para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Vinny Valdez (Red Hat)
|
||||
<link xlink:href="http://twitter.com/VinnyValdez"
|
||||
>@VinnyValdez</link></para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
@ -1,18 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-guide-intended-audience">
|
||||
<title>Intended audience</title>
|
||||
<para>This book has been written for architects and designers of
|
||||
OpenStack clouds. For a guide on deploying and operating
|
||||
OpenStack, please refer to the <citetitle>OpenStack Operations
|
||||
Guide</citetitle> (<link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops">http://docs.openstack.org/openstack-ops</link>).
|
||||
</para>
|
||||
<para>Before reading this book, we recommend prior knowledge of cloud architecture
|
||||
and principles, experience in enterprise system design, Linux
|
||||
and virtualization experience, and a basic understanding of
|
||||
networking principles and protocols.</para>
|
||||
</section>
|
@ -1,204 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE section [
|
||||
<!ENTITY % openstack SYSTEM "../../common/entities/openstack.ent">
|
||||
%openstack;
|
||||
]>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="methodology">
|
||||
<title>Methodology</title>
|
||||
<para>The best way to design your cloud architecture is through creating and
|
||||
testing use cases. Planning for applications that support thousands of
|
||||
sessions per second, variable workloads, and complex, changing data,
|
||||
requires you to identify the key meters. Identifying these key meters,
|
||||
such as number of concurrent transactions per second, and size of
|
||||
database, makes it possible to build a method for testing your assumptions.</para>
|
||||
<para>Use a functional user scenario to develop test cases, and to measure
|
||||
overall project trajectory.</para>
|
||||
<note>
|
||||
<para>If you do not want to use an application to develop user
|
||||
requirements automatically, you need to create requirements to build
|
||||
test harnesses and develop usable meters.</para>
|
||||
</note>
|
||||
<para>Establishing these meters allows you to respond to changes quickly without
|
||||
having to set exact requirements in advance.
|
||||
This creates ways to configure the system, rather than redesigning
|
||||
it every time there is a requirements change.</para>
|
||||
<important>
|
||||
<para>It is important to limit scope creep. Ensure you address tool limitations,
|
||||
but do not recreate the entire suite of tools. Work
|
||||
with technical product owners to establish critical features that are needed
|
||||
for a successful cloud deployment.</para>
|
||||
</important>
|
||||
|
||||
<section xml:id="application-cloud-readiness-methods">
|
||||
<title>Application cloud readiness</title>
|
||||
<para>The cloud does more than host virtual machines and their applications.
|
||||
This <emphasis>lift and shift</emphasis>
|
||||
approach works in certain situations, but there is a fundamental
|
||||
difference between clouds and traditional bare-metal-based
|
||||
environments, or even traditional virtualized environments.</para>
|
||||
<para>In traditional environments, with traditional enterprise
|
||||
applications, the applications and the servers they run on are
|
||||
<emphasis>pets</emphasis>.
|
||||
They are lovingly crafted and cared for, the servers have
|
||||
names like Gandalf or Tardis, and if they get sick someone nurses
|
||||
them back to health. All of this is designed so that the application
|
||||
does not experience an outage.</para>
|
||||
<para>In cloud environments, servers are more like
|
||||
cattle. There are thousands of them, they get names like NY-1138-Q,
|
||||
and if they get sick, they get put down and a sysadmin installs
|
||||
another one. Traditional applications that are unprepared for this
|
||||
kind of environment may suffer outages, loss of data, or
|
||||
complete failure.</para>
|
||||
<para>There are other reasons to design applications with the cloud in mind.
|
||||
Some are defensive, such as the fact that because applications cannot be
|
||||
certain of exactly where or on what hardware they will be launched,
|
||||
they need to be flexible, or at least adaptable. Others are
|
||||
proactive. For example, one of the advantages of using the cloud is
|
||||
scalability. Applications need to be designed in such a way that
|
||||
they can take advantage of these and other opportunities.</para>
|
||||
</section>
|
||||
|
||||
<section xml:id="determining-whether-an-application-is-cloud-ready">
|
||||
<title>Determining whether an application is cloud-ready</title>
|
||||
<para>There are several factors to take into consideration when looking
|
||||
at whether an application is a good fit for the cloud.</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Structure</term>
|
||||
<listitem>
|
||||
<para>
|
||||
A large, monolithic, single-tiered, legacy
|
||||
application typically is not a good fit for the
|
||||
cloud. Efficiencies are gained when load can be
|
||||
spread over several instances, so that a failure
|
||||
in one part of the system can be mitigated without
|
||||
affecting other parts of the system, or so that
|
||||
scaling can take place where the app needs
|
||||
it.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Dependencies</term>
|
||||
<listitem>
|
||||
<para>
|
||||
Applications that depend on specific
|
||||
hardware, such as a particular chip set or an
|
||||
external device such as a fingerprint
|
||||
reader, might not be a good fit for the
|
||||
cloud, unless those dependencies are specifically
|
||||
addressed. Similarly, if an application depends on
|
||||
an operating system or set of libraries that
|
||||
cannot be used in the cloud, or cannot be
|
||||
virtualized, that is a problem.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Connectivity</term>
|
||||
<listitem>
|
||||
<para>
|
||||
Self-contained applications, or those that depend
|
||||
on resources that are not reachable by the cloud
|
||||
in question, will not run. In some situations,
|
||||
you can work around these issues with custom network
|
||||
setup, but how well this works depends on the
|
||||
chosen cloud environment.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Durability and resilience</term>
|
||||
<listitem>
|
||||
<para>
|
||||
Despite the existence of SLAs, things break:
|
||||
servers go down, network connections are
|
||||
disrupted, or too many tenants on a server make a
|
||||
server unusable. An application must be sturdy
|
||||
enough to contend with these issues.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
|
||||
<section xml:id="designing-for-the-cloud">
|
||||
<title>Designing for the cloud</title>
|
||||
<para>Here are some guidelines to keep in mind when designing an
|
||||
application for the cloud:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Be a pessimist: Assume everything fails and design
|
||||
backwards.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Put your eggs in multiple baskets: Leverage multiple
|
||||
providers, geographic regions and availability zones to
|
||||
accommodate local availability issues. Design for
|
||||
portability.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Think efficiency: Inefficient designs will not scale.
|
||||
Efficient designs become cheaper as they scale. Kill off
|
||||
unneeded components or capacity.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Be paranoid: Design for defense in depth and zero
|
||||
tolerance by building in security at every level and between
|
||||
every component. Trust no one.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>But not too paranoid: Not every application needs the
|
||||
platinum solution. Architect for different SLAs, service
|
||||
tiers, and security levels.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Manage the data: Data is usually the most inflexible and
|
||||
complex area of a cloud and cloud integration architecture.
|
||||
Do not shortchange the effort in analyzing and addressing
|
||||
data needs.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Hands off: Leverage automation to increase consistency and
|
||||
quality and reduce response times.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Divide and conquer: Pursue partitioning and
|
||||
parallel layering wherever possible. Make components as small
|
||||
and portable as possible. Use load balancing between layers.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Think elasticity: Increasing resources should result in a
|
||||
proportional increase in performance and scalability.
|
||||
Decreasing resources should have the opposite effect.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Be dynamic: Enable dynamic configuration changes such as
|
||||
auto scaling, failure recovery and resource discovery to
|
||||
adapt to changing environments, faults, and workload volumes.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Stay close: Reduce latency by moving highly interactive
|
||||
components and data near each other.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Keep it loose: Loose coupling, service interfaces,
|
||||
separation of concerns, abstraction, and well-defined APIs
|
||||
deliver flexibility.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Be cost aware: Autoscaling, data transmission, virtual
|
||||
software licenses, reserved instances, and similar costs can rapidly
|
||||
increase monthly usage charges. Monitor usage closely.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</section>
|
||||
</section>
|
@ -1,102 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="operational-considerations-massive-scale">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Operational considerations</title>
|
||||
<para>In order to run efficiently at massive scale, automate
|
||||
as many of the operational processes as
|
||||
possible. Automation includes the configuration of
|
||||
provisioning, monitoring and alerting systems. Part of the
|
||||
automation process includes the capability to determine when
|
||||
human intervention is required and who should act. The
|
||||
objective is to increase the ratio of running systems to
|
||||
operational staff as much as possible in order to reduce maintenance
|
||||
costs. In a massively scaled environment, it is very difficult
|
||||
for staff to give each system individual care.</para>
|
||||
<para>Configuration management tools such as Puppet and Chef enable
|
||||
operations staff to categorize systems into groups based on
|
||||
their roles and thus create configurations and system states
|
||||
that the provisioning system enforces. Systems
|
||||
that fall out of the defined state due to errors or failures
|
||||
are quickly removed from the pool of active nodes and
|
||||
replaced.</para>
|
||||
<para>At large scale the resource cost of diagnosing failed individual
|
||||
systems is far greater than the cost of
|
||||
replacement. It is more economical to replace the failed
|
||||
system with a new system, provisioning and configuring it
|
||||
automatically and adding it to the pool of active nodes.
|
||||
By automating tasks that are labor-intensive,
|
||||
repetitive, and critical to operations, cloud operations
|
||||
teams can work more
|
||||
efficiently because fewer resources are required for these
|
||||
common tasks. Administrators are then free to tackle
|
||||
tasks that are not easy to automate and that have longer-term
|
||||
impacts on the business, for example, capacity planning.</para>
|
||||
<section xml:id="the-bleeding-edge">
|
||||
<title>The bleeding edge</title>
|
||||
<para>Running OpenStack at massive scale requires striking a
|
||||
balance between stability and features. For example, it might
|
||||
be tempting to run an older stable release branch of OpenStack
|
||||
to make deployments easier. However, when running at massive
|
||||
scale, known issues that may be of some concern or only have
|
||||
minimal impact in smaller deployments could become pain points.
|
||||
Recent releases may address well-known issues. The OpenStack
|
||||
community can help resolve reported issues by applying
|
||||
the collective expertise of the OpenStack developers.</para>
|
||||
<para>The number of organizations running at
|
||||
massive scales is a small proportion of the
|
||||
OpenStack community; therefore, it is important to share
|
||||
related issues with the community and be a vocal advocate for
|
||||
resolving them. Some issues only manifest when operating at
|
||||
large scale, and the number of organizations able to duplicate
|
||||
and validate an issue is small, so it is important to
|
||||
document and dedicate resources to their resolution.</para>
|
||||
<para>In some cases, the resolution to the problem is ultimately
|
||||
to deploy a more recent version of OpenStack. Alternatively,
|
||||
when you must resolve an issue in a production
|
||||
environment where rebuilding the entire environment is not an
|
||||
option, it is sometimes possible to deploy updates to specific
|
||||
underlying components in order to resolve issues or gain
|
||||
significant performance improvements. Although this may appear
|
||||
to expose the deployment to
|
||||
increased risk and instability, in many cases it
|
||||
could be an undiscovered issue.</para>
|
||||
<para>We recommend building a development and operations
|
||||
organization that is responsible for creating desired
|
||||
features, diagnosing and resolving issues, and building the
|
||||
infrastructure for large scale continuous integration tests
|
||||
and continuous deployment. This helps catch bugs early and
|
||||
makes deployments faster and easier. In addition to
|
||||
development resources, we also recommend the recruitment
|
||||
of experts in the fields of message queues, databases, distributed
|
||||
systems, networking, cloud, and storage.</para></section>
|
||||
<section xml:id="growth-and-capacity-planning">
|
||||
<title>Growth and capacity planning</title>
|
||||
<para>An important consideration in running at massive scale is
|
||||
projecting growth and utilization trends in order to plan capital
|
||||
expenditures for the short and long term. Gather utilization
|
||||
meters for compute, network, and storage, along with historical
|
||||
records of these meters. While securing major
|
||||
anchor tenants can lead to rapid jumps in the utilization
|
||||
rates of all resources, the steady adoption of the cloud
|
||||
inside an organization or by consumers in a public
|
||||
offering also creates a steady trend of increased
|
||||
utilization.</para></section>
|
||||
<section xml:id="skills-and-training">
|
||||
<title>Skills and training</title>
|
||||
<para>Projecting growth for storage, networking, and compute is
|
||||
only one aspect of a growth plan for running OpenStack at
|
||||
massive scale. Growing and nurturing development and
|
||||
operational staff is an additional consideration. Sending team
|
||||
members to OpenStack conferences, meetup events, and
|
||||
encouraging active participation in the mailing lists and
|
||||
committees is a very important way to maintain skills and
|
||||
forge relationships in the community. For a list of OpenStack
|
||||
training providers in the marketplace, see: <link
|
||||
xlink:href="http://www.openstack.org/marketplace/training/">http://www.openstack.org/marketplace/training/</link>.
|
||||
</para>
|
||||
</section>
|
||||
</section>
|
@ -1,131 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE section [
|
||||
<!ENTITY % openstack SYSTEM "../../common/entities/openstack.ent">
|
||||
%openstack;
|
||||
]>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="technical-considerations-massive-scale">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Technical considerations</title>
|
||||
<para>Repurposing an existing OpenStack environment to be
|
||||
massively scalable is a formidable task. When building
|
||||
a massively scalable environment from the ground up, ensure
|
||||
you build the initial deployment with the same principles
|
||||
and choices that apply as the environment grows. For example,
|
||||
a good approach is to deploy the first site as a multi-site
|
||||
environment. This enables you to use the same deployment
|
||||
and segregation methods as the environment grows to separate
|
||||
locations across dedicated links or wide area networks. In
|
||||
a hyperscale cloud, scale trumps redundancy. Modify applications
|
||||
with this in mind, relying on the scale and homogeneity of the
|
||||
environment to provide reliability rather than redundant
|
||||
infrastructure provided by non-commodity hardware
|
||||
solutions.</para>
|
||||
<section xml:id="infrastructure-segregation-massive-scale">
|
||||
<title>Infrastructure segregation</title>
|
||||
<para>OpenStack services support massive horizontal scale.
|
||||
Be aware that this is not the case for the entire supporting
|
||||
infrastructure. This is particularly a problem for the database
|
||||
management systems and message queues that OpenStack services
|
||||
use for data storage and remote procedure call communications.</para>
|
||||
<para>Traditional clustering techniques typically
|
||||
provide high availability and some additional scale for these
|
||||
environments. In the quest for massive scale, however, you must
|
||||
take additional steps to relieve the performance
|
||||
pressure on these components in order to prevent them from negatively
|
||||
impacting the overall performance of the environment. Ensure that
|
||||
all the components are in balance so that if the massively
|
||||
scalable environment fails, all the components are near maximum
|
||||
capacity and a single component is not causing the failure.</para>
|
||||
<para>Regions segregate completely independent
|
||||
installations linked only by an Identity and Dashboard
|
||||
(optional) installation. Services have separate
|
||||
API endpoints for each region, and include separate database
|
||||
and queue installations. This exposes some awareness of the
|
||||
environment's fault domains to users and gives them the
|
||||
ability to ensure some degree of application resiliency while
|
||||
also imposing the requirement to specify which region to apply
|
||||
their actions to.</para>
|
||||
<para>Environments operating at massive scale typically need their
|
||||
regions or sites subdivided further without exposing the
|
||||
requirement to specify the failure domain to the user. This
|
||||
provides the ability to further divide the installation into
|
||||
failure domains while also providing a logical unit for
|
||||
maintenance and the addition of new hardware. At hyperscale,
|
||||
instead of adding single compute nodes, administrators can add
|
||||
entire racks or even groups of racks at a time with each new
|
||||
addition of nodes exposed via one of the segregation concepts
|
||||
mentioned herein.</para>
|
||||
<para><glossterm baseform="cell">Cells</glossterm> provide the ability
|
||||
to subdivide the compute portion
|
||||
of an OpenStack installation, including regions, while still
|
||||
exposing a single endpoint. Each region has an API cell
|
||||
along with a number of compute cells where the
|
||||
workloads actually run. Each cell has its own database and
|
||||
message queue setup (ideally clustered), providing the ability
|
||||
to subdivide the load on these subsystems, improving overall
|
||||
performance.</para>
|
||||
<para>Each compute cell provides a complete compute installation,
|
||||
complete with full database and queue installations,
|
||||
scheduler, conductor, and multiple compute hosts. The cells
|
||||
scheduler handles placement of user requests from the single
|
||||
API endpoint to a specific cell from those available. The
|
||||
normal filter scheduler then handles placement within the
|
||||
cell.</para>
|
||||
<para>Unfortunately, Compute is the only OpenStack service that
|
||||
provides good support for cells. In addition, cells
|
||||
do not adequately support some standard
|
||||
OpenStack functionality such as security groups and host
|
||||
aggregates. Due to their relative newness and specialized use,
|
||||
cells receive relatively little testing in the OpenStack gate.
|
||||
Despite these issues, cells play an important role in
|
||||
well-known OpenStack installations operating at massive scale,
|
||||
such as those at CERN and Rackspace.</para></section>
|
||||
<section xml:id="host-aggregates">
|
||||
<title>Host aggregates</title>
|
||||
<para>Host aggregates enable partitioning of OpenStack Compute
|
||||
deployments into logical groups for load balancing and
|
||||
instance distribution. You can also use host aggregates to
|
||||
further partition an availability zone. Consider a cloud which
|
||||
might use host aggregates to partition an availability zone
|
||||
into groups of hosts that either share common resources, such
|
||||
as storage and network, or have a special property, such as
|
||||
trusted computing hardware. You cannot target host aggregates
|
||||
explicitly. Instead, select instance flavors that map to host
|
||||
aggregate metadata. These flavors target host aggregates
|
||||
implicitly.</para></section>
|
||||
<section xml:id="availability-zones">
|
||||
<title>Availability zones</title>
|
||||
<para>Availability zones provide another mechanism for subdividing
|
||||
an installation or region. They are, in effect, host
|
||||
aggregates exposed for (optional) explicit targeting
|
||||
by users.</para>
|
||||
<para>Unlike cells, availability zones do not have their own database
|
||||
server or queue broker but represent an arbitrary grouping of
|
||||
compute nodes. Typically, nodes are grouped into availability
|
||||
zones using a shared failure domain based on a physical
|
||||
characteristic such as a shared power source or physical network
|
||||
connections. Users can target exposed availability zones; however,
|
||||
this is not a requirement. An alternative approach is to set a default
|
||||
availability zone so that instances are scheduled to a zone other than the default availability
|
||||
zone of <literal>nova</literal>.</para></section>
|
||||
<section xml:id="segregation-example">
|
||||
<title>Segregation example</title>
|
||||
<para>In this example the cloud is divided into two regions, one
|
||||
for each site, with two availability zones in each based on
|
||||
the power layout of the data centers. A number of host
|
||||
aggregates enable targeting of
|
||||
virtual machine instances using flavors, that require special
|
||||
capabilities shared by the target hosts such as SSDs, 10 GbE
|
||||
networks, or GPU cards.</para>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Massively_Scalable_Cells_+_regions_+_azs.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject></section>
|
||||
</section>
|
@ -1,135 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="user-requirements-massive-scale-overview">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>User requirements</title>
|
||||
<para>Defining user requirements for a massively scalable OpenStack
|
||||
design architecture dictates approaching the design from two
|
||||
different, yet sometimes opposing, perspectives: the cloud
|
||||
user, and the cloud operator. The expectations and perceptions
|
||||
of the consumption and management of resources of a massively
|
||||
scalable OpenStack cloud from these two perspectives are
|
||||
distinctly different.</para>
|
||||
<para>Massively scalable OpenStack clouds have the following user
|
||||
requirements:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>The cloud user expects repeatable, dependable, and
|
||||
deterministic processes for launching and deploying
|
||||
cloud resources. You could deliver this through a
|
||||
web-based interface or publicly available API
|
||||
endpoints. All appropriate options for requesting
|
||||
cloud resources must be available through some type
|
||||
of user interface, a command-line interface (CLI), or
|
||||
API endpoints.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Cloud users expect a fully self-service and
|
||||
on-demand consumption model. When an OpenStack cloud
|
||||
reaches the "massively scalable" size, expect
|
||||
consumption "as a service" in each and
|
||||
every way.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>For a user of a massively scalable OpenStack public
|
||||
cloud, there are no expectations for control over
|
||||
security, performance, or availability. Users expect
|
||||
only SLAs related to uptime of API services, and
|
||||
very basic SLAs for services offered. It is the user's
|
||||
responsibility to address these issues on their own.
|
||||
The exception to this expectation is the rare case of
|
||||
a massively scalable cloud infrastructure built for
|
||||
a private or government organization that has
|
||||
specific requirements.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>The cloud user's requirements and expectations that determine
|
||||
the cloud design focus on the consumption model. The user
|
||||
expects to consume cloud resources in an automated and
|
||||
deterministic way, without any need for knowledge of the
|
||||
capacity, scalability, or other attributes of the cloud's
|
||||
underlying infrastructure.</para>
|
||||
<section xml:id="operator-requirements-massive-scale">
|
||||
<title>Operator requirements</title>
|
||||
<para>While the cloud user can be completely unaware of the
|
||||
underlying infrastructure of the cloud and its attributes, the
|
||||
operator must build and support the infrastructure for operating
|
||||
at scale. This presents a very demanding set of requirements
|
||||
for building such a cloud from the operator's perspective:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Everything must be capable of automation. For example,
|
||||
everything from compute hardware, storage hardware,
|
||||
networking hardware, to the installation and
|
||||
configuration of the supporting software. Manual
|
||||
processes are impractical in a massively scalable
|
||||
OpenStack design architecture.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The cloud operator requires that capital expenditure
|
||||
(CapEx) is minimized at all layers of the stack.
|
||||
Operators of massively scalable OpenStack clouds
|
||||
require the use of dependable commodity hardware and
|
||||
freely available open source software components to
|
||||
reduce deployment costs and operational expenses.
|
||||
Initiatives like OpenCompute (more information
|
||||
available at <link
|
||||
xlink:href="http://www.opencompute.org">http://www.opencompute.org</link>)
|
||||
provide additional information and pointers. To cut
|
||||
costs, many operators sacrifice redundancy. For
|
||||
example, forgoing redundant power supplies, network
|
||||
connections, and rack switches.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Companies operating a massively scalable OpenStack
|
||||
cloud also require that operational expenditures
|
||||
(OpEx) be minimized as much as possible. We
|
||||
recommend using cloud-optimized hardware when
|
||||
managing operational overhead. Some of
|
||||
the factors to consider include power,
|
||||
cooling, and the physical design of the chassis. Through
|
||||
customization, it is possible to optimize the hardware
|
||||
and systems for this type of workload because of the
|
||||
scale of these implementations.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Massively scalable OpenStack clouds require
|
||||
extensive metering and monitoring functionality to
|
||||
maximize the operational efficiency by keeping the
|
||||
operator informed about the status and state of the
|
||||
infrastructure. This includes full scale metering of
|
||||
the hardware and software status. A corresponding
|
||||
framework of logging and alerting is also required to
|
||||
store and enable operations to act on the meters
|
||||
provided by the metering and monitoring solutions.
|
||||
The cloud operator also needs a solution that uses the
|
||||
data provided by the metering and monitoring solution
|
||||
to provide capacity planning and capacity trending
|
||||
analysis.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Invariably, massively scalable OpenStack clouds extend
|
||||
over several sites. Therefore, the user-operator
|
||||
requirements for a multi-site OpenStack architecture
|
||||
design are also applicable here. This includes various
|
||||
legal requirements; other jurisdictional legal or
|
||||
compliance requirements; image
|
||||
consistency-availability; storage replication and
|
||||
availability (both block and file/object storage); and
|
||||
authentication, authorization, and auditing (AAA).
|
||||
See <xref linkend="multi_site"/>
|
||||
for more details on requirements and considerations
|
||||
for multi-site OpenStack clouds.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The design architecture of a massively scalable OpenStack
|
||||
cloud must address considerations around physical
|
||||
facilities such as space, floor weight, rack height and
|
||||
type, environmental considerations, power usage and power
|
||||
usage efficiency (PUE), and physical security.</para>
|
||||
</listitem>
|
||||
</itemizedlist></section>
|
||||
</section>
|
@ -1,123 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="arch-design-architecture-multiple-site">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Architecture</title>
|
||||
<para><xref linkend="multi-site_arch"/>
|
||||
illustrates a high level multi-site OpenStack
|
||||
architecture. Each site is an OpenStack cloud but it may be necessary
|
||||
to architect the sites on different versions. For example, if the
|
||||
second site is intended to be a replacement for the first site,
|
||||
they would be different. Another common design would be a private
|
||||
OpenStack cloud with a replicated site that would be used for high
|
||||
availability or disaster recovery. The most important design decision
|
||||
is configuring storage as a single shared pool or separate pools,
|
||||
depending on user and technical requirements.</para>
|
||||
<figure xml:id="multi-site_arch">
|
||||
<title>Multi-site OpenStack architecture</title>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="6in"
|
||||
fileref="../figures/Multi-Site_shared_keystone_horizon_swift1.png"/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
</figure>
|
||||
<section xml:id="openstack-services-architecture">
|
||||
<title>OpenStack services architecture</title>
|
||||
<para>The Identity service, which is used by all other
|
||||
OpenStack components for authorization and the catalog of
|
||||
service endpoints, supports the concept of regions. A region
|
||||
is a logical construct used to group OpenStack services in
|
||||
close proximity to one another. The concept of
|
||||
regions is flexible; it may contain OpenStack service
|
||||
endpoints located within a distinct geographic region or regions.
|
||||
It may be smaller in scope, where a region is a single rack
|
||||
within a data center, with multiple regions existing in adjacent
|
||||
racks in the same data center.</para>
|
||||
<para>The majority of OpenStack components are designed to run
|
||||
within the context of a single region. The Compute
|
||||
service is designed to manage compute resources within a region,
|
||||
with support for subdivisions of compute resources by using
|
||||
availability zones and cells. The Networking service
|
||||
can be used to manage network resources in the same broadcast
|
||||
domain or collection of switches that are linked. The OpenStack
|
||||
Block Storage service controls storage resources within a region
|
||||
with all storage resources residing on the same storage network.
|
||||
Like the OpenStack Compute service, the OpenStack Block Storage
|
||||
service also supports the availability zone construct which can
|
||||
be used to subdivide storage resources.</para>
|
||||
<para>The OpenStack dashboard, OpenStack Identity, and OpenStack
|
||||
Object Storage services are components that can each be deployed
|
||||
centrally in order to serve multiple regions.</para>
|
||||
</section>
|
||||
<section xml:id="arch-multi-storage">
|
||||
<title>Storage</title>
|
||||
<para>With multiple OpenStack regions, it is recommended to configure
|
||||
a single OpenStack Object Storage service endpoint to deliver
|
||||
shared file storage for all regions. The Object Storage service
|
||||
internally replicates files to multiple nodes which can be used
|
||||
by applications or workloads in multiple regions. This simplifies
|
||||
high availability failover and disaster recovery rollback.</para>
|
||||
<para>In order to scale the Object Storage service to meet the workload
|
||||
of multiple regions, multiple proxy workers are run and
|
||||
load-balanced, storage nodes are installed in each region, and the
|
||||
entire Object Storage Service can be fronted by an HTTP caching
|
||||
layer. This is done so client requests for objects can be served out
|
||||
of caches rather than directly from the storage modules themselves,
|
||||
reducing the actual load on the storage network. In addition to an
|
||||
HTTP caching layer, use a caching layer like Memcache to cache
|
||||
objects between the proxy and storage nodes.</para>
|
||||
<para>If the cloud is designed with a separate Object Storage
|
||||
service endpoint made available in each region, applications are
|
||||
required to handle synchronization (if desired) and other management
|
||||
operations to ensure consistency across the nodes. For some
|
||||
applications, having multiple Object Storage Service endpoints
|
||||
located in the same region as the application may be desirable due
|
||||
to reduced latency, cross region bandwidth, and ease of
|
||||
deployment.</para>
|
||||
<note>
|
||||
<para>For the Block Storage service, the most important decisions
|
||||
are the selection of the storage technology, and whether
|
||||
a dedicated network is used to carry storage traffic
|
||||
from the storage service to the compute nodes.</para>
|
||||
</note>
|
||||
</section>
|
||||
<section xml:id="arch-networking-multiple">
|
||||
<title>Networking</title>
|
||||
<para>When connecting multiple regions together, there are several design
|
||||
considerations. The overlay network technology choice determines how
|
||||
packets are transmitted between regions and how the logical network
|
||||
and addresses present to the application. If there are security or
|
||||
regulatory requirements, encryption should be implemented to secure
|
||||
the traffic between regions. For networking inside a region, the
|
||||
overlay network technology for tenant networks is equally important.
|
||||
The overlay technology and the network traffic that an application
|
||||
generates or receives can be either complementary or serve cross
|
||||
purposes. For example, using an overlay technology for an application
|
||||
that transmits a large number of small packets could add excessive
|
||||
latency or overhead to each packet if not configured
|
||||
properly.</para>
|
||||
</section>
|
||||
<section xml:id="arch-dependencies-multiple">
|
||||
<title>Dependencies</title>
|
||||
<para>The architecture for a multi-site OpenStack installation
|
||||
is dependent on a number of factors. One major dependency to
|
||||
consider is storage. When designing the storage system, the
|
||||
storage mechanism needs to be determined. Once the storage
|
||||
type is determined, how it is accessed is critical. For example,
|
||||
we recommend that storage should use a dedicated network.
|
||||
Another concern is how the storage is configured to protect
|
||||
the data. For example, the Recovery Point Objective (RPO) and
|
||||
the Recovery Time Objective (RTO). How quickly recovery from
|
||||
a fault can be completed determines how often the replication of
|
||||
data is required. Ensure that enough storage is allocated to
|
||||
support the data protection strategy.
|
||||
</para>
|
||||
<para>Networking decisions include the encapsulation mechanism that can
|
||||
be used for the tenant networks, how large the broadcast domains
|
||||
should be, and the contracted SLAs for the interconnects.</para>
|
||||
</section>
|
||||
</section>
|
@ -1,180 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="operational-considerations-multi-site">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Operational considerations</title>
|
||||
<para>Multi-site OpenStack cloud deployment using regions
|
||||
requires that the service catalog contains per-region entries
|
||||
for each service deployed other than the Identity service. Most
|
||||
off-the-shelf OpenStack deployment tools have limited support
|
||||
for defining multiple regions in this fashion.</para>
|
||||
<para>Deployers should be aware of this and provide the appropriate
|
||||
customization of the service catalog for their site either
|
||||
manually, or by customizing deployment tools in use.</para>
|
||||
<note><para>As of the Kilo release, documentation for
|
||||
implementing this feature is in progress. See this bug for
|
||||
more information:
|
||||
<link
|
||||
xlink:href="https://bugs.launchpad.net/openstack-manuals/+bug/1340509">https://bugs.launchpad.net/openstack-manuals/+bug/1340509</link>.
|
||||
</para></note>
|
||||
<section xml:id="licensing">
|
||||
<title>Licensing</title>
|
||||
<para>Multi-site OpenStack deployments present additional
|
||||
licensing considerations over and above regular OpenStack
|
||||
clouds, particularly where site licenses are in use to provide
|
||||
cost efficient access to software licenses. The licensing for
|
||||
host operating systems, guest operating systems, OpenStack
|
||||
distributions (if applicable), software-defined infrastructure
|
||||
including network controllers and storage systems, and even
|
||||
individual applications need to be evaluated.</para>
|
||||
<para>Topics to consider include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>The definition of what constitutes a site
|
||||
in the relevant licenses, as the term does not
|
||||
necessarily denote a geographic or otherwise
|
||||
physically isolated location.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Differentiations between "hot" (active) and "cold"
|
||||
(inactive) sites, where significant savings may be made
|
||||
in situations where one site is a cold standby for
|
||||
disaster recovery purposes only.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Certain locations might require local vendors to
|
||||
provide support and services for each site which may vary
|
||||
with the licensing agreement in place.</para>
|
||||
</listitem>
|
||||
</itemizedlist></section>
|
||||
<section xml:id="logging-and-monitoring-multi-site">
|
||||
<title>Logging and monitoring</title>
|
||||
<para>Logging and monitoring does not significantly differ for a
|
||||
multi-site OpenStack cloud. The tools described in the <link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops/content/logging_monitoring.html">Logging
|
||||
and monitoring chapter</link> of the <citetitle>Operations
|
||||
Guide</citetitle> remain applicable. Logging and monitoring
|
||||
can be provided on a per-site basis, and in a common
|
||||
centralized location.</para>
|
||||
<para>When attempting to deploy logging and monitoring facilities
|
||||
to a centralized location, care must be taken with the load
|
||||
placed on the inter-site networking links.</para></section>
|
||||
<section xml:id="upgrades-multi-site">
|
||||
<title>Upgrades</title>
|
||||
<para>In multi-site OpenStack clouds deployed using regions, sites
|
||||
are independent OpenStack installations which are linked
|
||||
together using shared centralized services such as OpenStack
|
||||
Identity. At a high level the recommended order of operations
|
||||
to upgrade an individual OpenStack environment is (see the <link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops/content/ops_upgrades-general-steps.html">Upgrades
|
||||
chapter</link> of the <citetitle>Operations Guide</citetitle>
|
||||
for details):</para>
|
||||
<orderedlist>
|
||||
<listitem>
|
||||
<para>Upgrade the OpenStack Identity service
|
||||
(keystone).</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade the OpenStack Image service (glance).</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade OpenStack Compute (nova), including
|
||||
networking components.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade OpenStack Block Storage (cinder).</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade the OpenStack dashboard (horizon).</para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
<para>The process for upgrading a multi-site environment is not
|
||||
significantly different:</para>
|
||||
<orderedlist>
|
||||
<listitem>
|
||||
<para>Upgrade the shared OpenStack Identity service
|
||||
(keystone) deployment.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade the OpenStack Image service (glance) at each
|
||||
site.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade OpenStack Compute (nova), including
|
||||
networking components, at each site.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade OpenStack Block Storage (cinder) at each
|
||||
site.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Upgrade the OpenStack dashboard (horizon), at each
|
||||
site or in the single central location if it is
|
||||
shared.</para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
<para>Compute upgrades within each site can also be performed in a rolling
|
||||
fashion. Compute controller services (API, Scheduler, and
|
||||
Conductor) can be upgraded prior to upgrading of individual
|
||||
compute nodes. This allows operations staff to keep a site
|
||||
operational for users of Compute services while performing an
|
||||
upgrade.</para></section>
|
||||
<section xml:id="quota-management-multi-site">
|
||||
<title>Quota management</title>
|
||||
<para>Quotas are used to set operational limits to prevent system
|
||||
capacities from being exhausted without notification. They are
|
||||
currently enforced at the tenant (or project) level rather than
|
||||
at the user level.</para>
|
||||
<para>Quotas are defined on a per-region basis. Operators can
|
||||
define identical quotas for tenants in each region of the
|
||||
cloud to provide a consistent experience, or even create a
|
||||
process for synchronizing allocated quotas across regions. It
|
||||
is important to note that only the operational limits imposed
|
||||
by the quotas will be aligned; consumption of quotas by users
|
||||
will not be reflected between regions.</para>
|
||||
<para>For example, given a cloud with two regions, if the operator
|
||||
grants a user a quota of 25 instances in each region then that
|
||||
user may launch a total of 50 instances spread across both
|
||||
regions. They may not, however, launch more than 25 instances
|
||||
in any single region.</para>
|
||||
<para>For more information on managing quotas refer to the
|
||||
<link
|
||||
xlink:href="http://docs.openstack.org/openstack-ops/content/projects_users.html">Managing
|
||||
projects and users chapter</link> of the <citetitle>OpenStack
|
||||
Operations Guide</citetitle>.</para>
|
||||
</section>
|
||||
<section xml:id="policy-management-multi-site">
|
||||
<title>Policy management</title>
|
||||
<para>OpenStack provides a default set of Role Based Access
|
||||
Control (RBAC) policies, defined in a <filename>policy.json</filename> file, for
|
||||
each service. Operators edit these files to customize the
|
||||
policies for their OpenStack installation. If the application
|
||||
of consistent RBAC policies across sites is a requirement, then
|
||||
it is necessary to ensure proper synchronization of the
|
||||
<filename>policy.json</filename> files to all installations.</para>
|
||||
<para>This must be done using system administration tools
|
||||
such as rsync as functionality for synchronizing policies
|
||||
across regions is not currently provided within OpenStack.</para></section>
|
||||
<section xml:id="documentation-multi-site">
|
||||
<title>Documentation</title>
|
||||
<para>Users must be able to leverage cloud infrastructure and
|
||||
provision new resources in the environment. It is important
|
||||
that user documentation is accessible by users to ensure they
|
||||
are given sufficient information to help them leverage the cloud.
|
||||
As an example, by default OpenStack schedules instances on a compute node
|
||||
automatically. However, when multiple regions are available,
|
||||
the end user needs to decide in which region to schedule the
|
||||
new instance. The dashboard presents the user with
|
||||
the first region in your configuration. The API and CLI tools
|
||||
do not execute commands unless a valid region is specified.
|
||||
It is therefore important to provide documentation to your
|
||||
users describing the region layout as well as calling out that
|
||||
quotas are region-specific. If a user reaches his or her quota
|
||||
in one region, OpenStack does not automatically build new
|
||||
instances in another. Documenting specific examples helps
|
||||
users understand how to operate the cloud, thereby reducing
|
||||
calls and tickets filed with the help desk.</para></section>
|
||||
</section>
|
@ -1,236 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE section [
|
||||
<!ENTITY % openstack SYSTEM "../../common/entities/openstack.ent">
|
||||
%openstack;
|
||||
]>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="prescriptive-example-multisite">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Prescriptive examples</title>
|
||||
<para>There are multiple ways to build a multi-site OpenStack
|
||||
installation, based on the needs of the intended workloads.
|
||||
Below are example architectures based on different
|
||||
requirements. These examples are meant as a reference, and not
|
||||
a hard and fast rule for deployments. Use the previous
|
||||
sections of this chapter to assist in selecting specific
|
||||
components and implementations based on specific needs.</para>
|
||||
<para>A large content provider needs to deliver content to
|
||||
customers that are geographically dispersed. The workload is
|
||||
very sensitive to latency and needs a rapid response to
|
||||
end-users. After reviewing the user, technical and operational
|
||||
considerations, it is determined beneficial to build a number
|
||||
of regions local to the customer's edge. Rather than build a
|
||||
few large, centralized data centers, the intent of the architecture
|
||||
is to provide a pair of small data centers in locations that
|
||||
are closer to the customer. In this use
|
||||
case, spreading applications out allows for different
|
||||
horizontal scaling than a traditional compute workload scale.
|
||||
The intent is to scale by creating more copies of the
|
||||
application in closer proximity to the users that need it
|
||||
most, in order to ensure faster response time to user
|
||||
requests. This provider deploys two data centers at each of
|
||||
the four chosen regions. The implications of this design are
|
||||
based around the method of placing copies of resources in each
|
||||
of the remote regions. Swift objects, Glance images, and block
|
||||
storage need to be manually replicated into each region.
|
||||
This may be beneficial for some systems, such as the case of
|
||||
content service, where only some of the content needs to exist
|
||||
in some but not all regions. A centralized Keystone is
|
||||
recommended to ensure authentication and that access to the
|
||||
API endpoints is easily manageable.</para>
|
||||
<para>It is recommended that you install an automated DNS system such
|
||||
as Designate. Application administrators need a way to
|
||||
manage the mapping of which application copy exists in each
|
||||
region and how to reach it, unless an external Dynamic DNS system
|
||||
is available. Designate assists by making the process automatic
|
||||
and by populating the records in each region's zone.</para>
|
||||
<para>Telemetry for each region is also deployed, as each region
|
||||
may grow differently or be used at a different rate.
|
||||
Ceilometer collects each region's meters from each
|
||||
of the controllers and reports them back to a central location.
|
||||
This is useful both to the end user and the administrator of
|
||||
the OpenStack environment. The end user will find this method
|
||||
useful, as it makes it possible to determine if certain
|
||||
locations are experiencing higher load than others, and take
|
||||
appropriate action. Administrators also benefit by
|
||||
possibly being able to forecast growth per region, rather than
|
||||
expanding the capacity of all regions simultaneously,
|
||||
therefore maximizing the cost-effectiveness of the multi-site
|
||||
design.</para>
|
||||
<para>One of the key decisions of running this infrastructure is
|
||||
whether or not to provide a redundancy
|
||||
model. Two types of redundancy and high availability models in
|
||||
this configuration can be implemented. The first type
|
||||
is the availability of central OpenStack
|
||||
components. Keystone can be made highly available in three
|
||||
central data centers that host the centralized OpenStack
|
||||
components. This prevents a loss of any one of the regions
|
||||
causing an outage in service. It also has the added benefit of
|
||||
being able to run a central storage repository as a primary
|
||||
cache for distributing content to each of the regions.</para>
|
||||
<para>The second redundancy type is the edge data center itself.
|
||||
A second data center in each of the edge regional
|
||||
locations houses a second region near the first region. This
|
||||
ensures that the application does not suffer degraded
|
||||
performance in terms of latency and availability.</para>
|
||||
<para><xref linkend="multi-site_customer_edge"/> depicts
|
||||
the solution designed to have both a centralized set of core
|
||||
data centers for OpenStack services and paired edge data centers:</para>
|
||||
<figure xml:id="multi-site_customer_edge">
|
||||
<title>Multi-site architecture example</title>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="6in"
|
||||
fileref="../figures/Multi-Site_Customer_Edge.png"/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
</figure>
|
||||
<section xml:id="geo-redundant-load-balancing">
|
||||
<title>Geo-redundant load balancing</title>
|
||||
<para>A large-scale web application has been designed with cloud
|
||||
principles in mind. The application is designed to provide
|
||||
service to an application store on a 24/7 basis. The company has
|
||||
a typical two-tier architecture with a web front-end servicing the
|
||||
customer requests, and a NoSQL database back end storing the
|
||||
information.</para>
|
||||
<para>As of late there have been several outages in a number of major
|
||||
public cloud providers due to applications running out of
|
||||
a single geographical location. The design therefore should
|
||||
mitigate the chance of a single site causing an outage for their
|
||||
business.</para>
|
||||
<para>The solution would consist of the following OpenStack
|
||||
components:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>A firewall, switches and load balancers on the
|
||||
public facing network connections.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Controller services running, Networking,
|
||||
dashboard, Block Storage and Compute running locally in
|
||||
each of the three regions. Identity service, Orchestration
|
||||
service, Telemetry service, Image service and
|
||||
Object Storage service can be installed centrally, with
|
||||
nodes in each of the regions providing a redundant
|
||||
OpenStack Controller plane throughout the globe.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Compute nodes running the KVM
|
||||
hypervisor.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Object Storage for serving static objects
|
||||
such as images can be used to ensure that all images
|
||||
are standardized across all the regions, and
|
||||
replicated on a regular basis.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A distributed DNS service available to all
|
||||
regions that allows for dynamic update of DNS
|
||||
records of deployed instances.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A geo-redundant load balancing service can be used
|
||||
to service the requests from the customers based on
|
||||
their origin.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>An autoscaling heat template can be used to deploy the
|
||||
application in the three regions. This template includes:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Web Servers, running Apache.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Appropriate <literal>user_data</literal> to populate the central DNS
|
||||
servers upon instance launch.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Appropriate Telemetry alarms that maintain state of
|
||||
the application and allow for handling of region or
|
||||
instance failure.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Another autoscaling Heat template can be used to deploy a
|
||||
distributed MongoDB shard over the three locations, with the
|
||||
option of storing required data on a globally available swift
|
||||
container. According to the usage and load on the database
|
||||
server, additional shards can be provisioned according to
|
||||
the thresholds defined in Telemetry.</para>
|
||||
<!-- <para>The reason that three regions were selected here was because of
|
||||
the fear of having abnormal load on a single region in the
|
||||
event of a failure. Two data center would have been sufficient
|
||||
had the requirements been met.</para>-->
|
||||
<para>Two data centers would have been sufficient had the requirements
|
||||
been met. But three regions are selected here to avoid abnormal
|
||||
load on a single region in the event of a failure.</para>
|
||||
<para>Orchestration is used because of the built-in functionality of
|
||||
autoscaling and auto healing in the event of increased load.
|
||||
Additional configuration management tools, such as Puppet or
|
||||
Chef could also have been used in this scenario, but were not
|
||||
chosen since Orchestration had the appropriate built-in
|
||||
hooks into the OpenStack cloud, whereas the other tools were
|
||||
external and not native to OpenStack. In addition, external
|
||||
tools were not needed since this deployment scenario was
|
||||
straightforward.</para>
|
||||
<para>OpenStack Object Storage is used here to serve as a back end for
|
||||
the Image service since it is the most suitable solution for a
|
||||
globally distributed storage solution with its own
|
||||
replication mechanism. Home grown solutions could also have
|
||||
been used including the handling of replication, but were not
|
||||
chosen, because Object Storage is already an integral part of the
|
||||
infrastructure and a proven solution.</para>
|
||||
<para>An external load balancing service was used and not the
|
||||
LBaaS in OpenStack because the solution in OpenStack is not
|
||||
redundant and does not have any awareness of geo location.</para>
|
||||
<figure xml:id="multi-site_geo_redundant">
|
||||
<title>Multi-site geo-redundant architecture</title>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="6in"
|
||||
fileref="../figures/Multi-site_Geo_Redundant_LB.png"/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
</figure>
|
||||
</section>
|
||||
<section xml:id="location-local-services">
|
||||
<title>Location-local service</title>
|
||||
<para>A common use for multi-site OpenStack deployment is
|
||||
creating a Content Delivery Network. An application that
|
||||
uses a location-local architecture requires low network
|
||||
latency and proximity to the user to provide an
|
||||
optimal user experience and reduce the cost of bandwidth and
|
||||
transit. The content resides on sites closer to the customer,
|
||||
instead of a centralized content store that requires utilizing
|
||||
higher cost cross-country links.</para>
|
||||
<para>This architecture includes a geo-location component
|
||||
that places user requests to the closest possible node. In
|
||||
this scenario, 100% redundancy of content across every site is
|
||||
a goal rather than a requirement, with the intent to
|
||||
maximize the amount of content available within a
|
||||
minimum number of network hops for end users. Despite
|
||||
these differences, the storage replication configuration has
|
||||
significant overlap with that of a geo-redundant load
|
||||
balancing use case.</para>
|
||||
<para>In <xref linkend="multi-site_shared_shared_keystone"/>,
|
||||
the application utilizing this multi-site OpenStack install
|
||||
that is location-aware would launch web server or content
|
||||
serving instances on the compute cluster in each site. Requests
|
||||
from clients are first sent to a global services load balancer
|
||||
that determines the location of the client, then routes the
|
||||
request to the closest OpenStack site where the application
|
||||
completes the request.</para>
|
||||
<figure xml:id="multi-site_shared_shared_keystone">
|
||||
<title>Multi-site shared keystone architecture</title>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="6in"
|
||||
fileref="../figures/Multi-Site_shared_keystone1.png"/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
</figure>
|
||||
</section>
|
||||
</section>
|
@ -1,176 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="technical-considerations-multi-site">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Technical considerations</title>
|
||||
<para>There are many technical considerations to take into account
|
||||
with regard to designing a multi-site OpenStack
|
||||
implementation. An OpenStack cloud can be designed in a
|
||||
variety of ways to handle individual application needs. A
|
||||
multi-site deployment has additional challenges compared
|
||||
to single site installations and therefore is a more
|
||||
complex solution.</para>
|
||||
<para>When determining capacity options be sure to take into
|
||||
account not just the technical issues, but also the economic
|
||||
or operational issues that might arise from specific
|
||||
decisions.</para>
|
||||
<para>Inter-site link capacity describes the capabilities of the
|
||||
connectivity between the different OpenStack sites. This
|
||||
includes parameters such as bandwidth, latency, whether or not
|
||||
a link is dedicated, and any business policies applied to the
|
||||
connection. The capability and number of the links between
|
||||
sites determine what kind of options are available for
|
||||
deployment. For example, if two sites have a pair of
|
||||
high-bandwidth links available between them, it may be wise to
|
||||
configure a separate storage replication network between the
|
||||
two sites to support a single Swift endpoint and a shared
|
||||
Object Storage capability between them. An example of this
|
||||
technique, as well as a configuration walk-through, is
|
||||
available at <link
|
||||
xlink:href="http://docs.openstack.org/developer/swift/replication_network.html#dedicated-replication-network">http://docs.openstack.org/developer/swift/replication_network.html#dedicated-replication-network</link>.
|
||||
Another option in this scenario is to build a dedicated set of
|
||||
tenant private networks across the secondary link, using
|
||||
overlay networks with a third party mapping the site overlays
|
||||
to each other.</para>
|
||||
<para>The capacity requirements of the links between sites are
|
||||
driven by application behavior. If the link latency is
|
||||
too high, certain applications that use a large number of
|
||||
small packets, for example RPC calls, may encounter issues
|
||||
communicating with each other or operating properly.
|
||||
Additionally, OpenStack may encounter similar types of issues.
|
||||
To mitigate this, Identity service call timeouts can be
|
||||
tuned to prevent issues authenticating against a central
|
||||
Identity service.</para>
|
||||
<para>Another network capacity consideration for a multi-site
|
||||
deployment is the amount and performance of overlay networks
|
||||
available for tenant networks. If using shared tenant networks
|
||||
across zones, it is imperative that an external overlay manager
|
||||
or controller be used to map these overlays together. It is
|
||||
necessary to ensure the number of possible IDs between the zones
|
||||
is identical.</para>
|
||||
<note>
|
||||
<para>As of the Kilo release, OpenStack Networking was not
|
||||
capable of managing tunnel IDs across installations. So if
|
||||
one site runs out of IDs, but another does not, that tenant's
|
||||
network is unable to reach the other site.</para>
|
||||
</note>
|
||||
<para>Capacity can take other forms as well. The ability for a
|
||||
region to grow depends on scaling out the number of available
|
||||
compute nodes. This topic is covered in greater detail in the
|
||||
section for compute-focused deployments. However, it may be
|
||||
necessary to grow cells in an individual region, depending on
|
||||
the size of your cluster and the ratio of virtual machines per
|
||||
hypervisor.</para>
|
||||
<para>A third form of capacity comes in the multi-region-capable
|
||||
components of OpenStack. Centralized Object Storage is capable
|
||||
of serving objects through a single namespace across multiple
|
||||
regions. Since this works by accessing the object store through
|
||||
swift proxy, it is possible to overload the proxies. There are
|
||||
two options available to mitigate this issue:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Deploy a large number of swift proxies. The drawback is
|
||||
that the proxies are not load-balanced and a large file
|
||||
request could continually hit the same proxy.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Add a caching HTTP proxy and load balancer in front of
|
||||
the swift proxies. Since swift objects are returned to the
|
||||
requester via HTTP, this load balancer would alleviate the
|
||||
load required on the swift proxies.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<section xml:id="utilization-multi-site"><title>Utilization</title>
|
||||
<para>While constructing a multi-site OpenStack environment is the
|
||||
goal of this guide, the real test is whether an application
|
||||
can utilize it.</para>
|
||||
<para>The Identity service is normally the first interface for
|
||||
OpenStack users and is required for almost all major operations
|
||||
within OpenStack. Therefore, it is important that you provide users
|
||||
with a single URL for Identity service authentication, and
|
||||
document the configuration of regions within the Identity service.
|
||||
Each of the sites defined in your installation is considered
|
||||
to be a region in Identity nomenclature. This is important for
|
||||
the users, as it is required to define the region name when
|
||||
providing actions to an API endpoint or in the dashboard.</para>
|
||||
<para>Load balancing is another common issue with multi-site
|
||||
installations. While it is still possible to run HAProxy
|
||||
instances with Load-Balancer-as-a-Service, these are defined
|
||||
to a specific region. Some applications can manage this using
|
||||
internal mechanisms. Other applications may require the
|
||||
implementation of an external system, including global services
|
||||
load balancers or anycast-advertised DNS.</para>
|
||||
<para>Depending on the storage model chosen during site design,
|
||||
storage replication and availability are also a concern
|
||||
for end-users. If an application can support regions, then it
|
||||
is possible to keep the object storage system separated by region.
|
||||
In this case, users who want to have an object available to
|
||||
more than one region need to perform cross-site replication.
|
||||
However, with a centralized swift proxy, the user may need to
|
||||
benchmark the replication timing of the Object Storage back end.
|
||||
Benchmarking allows the operational staff to provide users with
|
||||
an understanding of the amount of time required for a stored or
|
||||
modified object to become available to the entire environment.</para>
|
||||
</section>
|
||||
<section xml:id="performance"><title>Performance</title>
|
||||
<para>Determining the performance of a multi-site installation
|
||||
involves considerations that do not come into play in a
|
||||
single-site deployment. Being a distributed deployment,
|
||||
performance in multi-site deployments may be affected in certain
|
||||
situations.</para>
|
||||
<para>Since multi-site systems can be geographically separated,
|
||||
there may be greater latency or jitter when communicating across
|
||||
regions. This can especially impact systems like the OpenStack
|
||||
Identity service when making authentication attempts from regions
|
||||
that do not contain the centralized Identity implementation. It
|
||||
can also affect applications which rely on Remote Procedure Call (RPC)
|
||||
for normal operation. An example of this can be seen in high
|
||||
performance computing workloads.</para>
|
||||
<para>Storage availability can also be impacted by the
|
||||
architecture of a multi-site deployment. A centralized Object
|
||||
Storage service requires more time for an object to be
|
||||
available to instances locally in regions where the object was
|
||||
not created. Some applications may need to be tuned to account
|
||||
for this effect. Block Storage does not currently have a
|
||||
method for replicating data across multiple regions, so
|
||||
applications that depend on available block storage need
|
||||
to manually cope with this limitation by creating duplicate
|
||||
block storage entries in each region.</para>
|
||||
</section>
|
||||
<section xml:id="openstack-components_multi-site">
|
||||
<title>OpenStack components</title>
|
||||
<para>Most OpenStack installations require a bare minimum set of
|
||||
pieces to function. These include OpenStack Identity
|
||||
(keystone) for authentication, OpenStack Compute
|
||||
(nova) for compute, OpenStack Image service (glance) for image
|
||||
storage, OpenStack Networking (neutron) for networking, and
|
||||
potentially an object store in the form of OpenStack Object
|
||||
Storage (swift). Deploying a multi-site installation also demands extra
|
||||
components in order to coordinate between regions. A centralized
|
||||
Identity service is necessary to provide the single authentication
|
||||
point. A centralized dashboard is also recommended to provide a
|
||||
single login point and a mapping to the API and CLI
|
||||
options available. A centralized Object Storage service may also
|
||||
be used, but will require the installation of the swift proxy
|
||||
service.</para>
|
||||
<para>It may also be helpful to install a few extra options in
|
||||
order to facilitate certain use cases. For example,
|
||||
installing Designate may assist in automatically generating
|
||||
DNS domains for each region with an automatically-populated
|
||||
zone full of resource records for each instance. This
|
||||
facilitates using DNS as a mechanism for determining which
|
||||
region will be selected for certain applications.</para>
|
||||
<para>Another useful tool for managing a multi-site installation
|
||||
is Orchestration (heat). The Orchestration service allows the
|
||||
use of templates to define a set of instances to be launched
|
||||
together or for scaling existing sets. It can also be used to
|
||||
set up matching or differentiated groupings based on
|
||||
regions. For instance, if an application requires an equally
|
||||
balanced number of nodes across sites, the same heat template
|
||||
can be used to cover each site with small alterations to only
|
||||
the region name.</para>
|
||||
</section>
|
||||
</section>
|
@ -1,176 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="user-requirements-multi-site">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>User requirements</title>
|
||||
<section xml:id="workload-characteristics">
|
||||
<title>Workload characteristics</title>
|
||||
<para>An understanding of the expected workloads for a desired
|
||||
multi-site environment and use case is an important factor in
|
||||
the decision-making process. In this context, <literal>workload</literal>
|
||||
refers to the way the systems are used. A workload could be a
|
||||
single application or a suite of applications that work together.
|
||||
It could also be a duplicate set of applications that need to
|
||||
run in multiple cloud environments. Often in a multi-site deployment,
|
||||
the same workload will need to work identically in more than one
|
||||
physical location.</para>
|
||||
<para>This multi-site scenario likely includes one or more of the
|
||||
other scenarios in this book with the additional requirement
|
||||
of having the workloads in two or more locations. The
|
||||
following are some possible scenarios:</para>
|
||||
<para>For many use cases the proximity of the user to their
|
||||
workloads has a direct influence on the performance of the
|
||||
application and therefore should be taken into consideration
|
||||
in the design. Certain applications require zero to minimal
|
||||
latency that can only be achieved by deploying the cloud in
|
||||
multiple locations. These locations could be in different data
|
||||
centers, cities, countries or geographical regions, depending
|
||||
on the user requirement and location of the users.</para></section>
|
||||
<section xml:id="consistency-images-templates-across-sites">
|
||||
<title>Consistency of images and templates across different
|
||||
sites</title>
|
||||
<para>It is essential that the deployment of instances is
|
||||
consistent across the different sites and built
|
||||
into the infrastructure. If the OpenStack Object Storage is used as
|
||||
a back end for the Image service, it is possible to create repositories
|
||||
of consistent images across multiple sites. Having central
|
||||
endpoints with multiple storage nodes allows consistent centralized
|
||||
storage for every site.</para>
|
||||
<para>Not using a centralized object store increases the operational
|
||||
overhead of maintaining a consistent image library. This
|
||||
could include development of a replication mechanism to handle
|
||||
the transport of images and the changes to the images across
|
||||
multiple sites.</para></section>
|
||||
<section xml:id="high-availability-multi-site">
|
||||
<title>High availability</title>
|
||||
<para>If high availability is a requirement to provide continuous
|
||||
infrastructure operations, a basic requirement of high
|
||||
availability should be defined.</para>
|
||||
<para>The OpenStack management components need to have a basic and
|
||||
minimal level of redundancy. The simplest example is that the loss
|
||||
of any single site should have minimal impact on the
|
||||
availability of the OpenStack services.</para>
|
||||
<para>The <link
|
||||
xlink:href="http://docs.openstack.org/ha-guide/"><citetitle>OpenStack
|
||||
High Availability Guide</citetitle></link>
|
||||
contains more information on how to provide redundancy for the
|
||||
OpenStack components.</para>
|
||||
<para>Multiple network links should be deployed between sites to
|
||||
provide redundancy for all components. This includes storage
|
||||
replication, which should be isolated to a dedicated network
|
||||
or VLAN with the ability to assign QoS to control the
|
||||
replication traffic or provide priority for this traffic. Note
|
||||
that if the data store is highly changeable, the network
|
||||
requirements could have a significant effect on the
|
||||
operational cost of maintaining the sites.</para>
|
||||
<para>The ability to maintain object availability in both sites
|
||||
has significant implications on the object storage design and
|
||||
implementation. It also has a significant impact on the
|
||||
WAN network design between the sites.</para>
|
||||
<para>Connecting more than two sites increases the challenges and
|
||||
adds more complexity to the design considerations. Multi-site
|
||||
implementations require planning to address the additional
|
||||
topology used for internal and external connectivity. Some options
|
||||
include full mesh topology, hub-and-spoke, spine-leaf, and 3D torus.</para>
|
||||
<para>If applications running in a cloud are not cloud-aware, there
|
||||
should be clear measures and expectations to define what the
|
||||
infrastructure can and cannot support. An example would be
|
||||
shared storage between sites. It is possible; however, such a
|
||||
solution is not native to OpenStack and requires a third-party
|
||||
hardware vendor to fulfill such a requirement. Another example
|
||||
can be seen in applications that are able to consume resources
|
||||
in object storage directly. These applications need to be
|
||||
cloud-aware to make good use of an OpenStack Object
|
||||
Store.</para></section>
|
||||
<section xml:id="application-readiness">
|
||||
<title>Application readiness</title>
|
||||
<para>Some applications are tolerant of the lack of synchronized
|
||||
object storage, while others may need those objects to be
|
||||
replicated and available across regions. Understanding how
|
||||
the cloud implementation impacts new and existing applications
|
||||
is important for risk mitigation, and the overall success of a
|
||||
cloud project. Applications may have to be written or rewritten
|
||||
for an infrastructure with little to no redundancy, or with the
|
||||
cloud in mind.</para></section>
|
||||
<section xml:id="cost-multi-site">
|
||||
<title>Cost</title>
|
||||
<para>A greater number of sites increases cost and complexity for a
|
||||
multi-site deployment. Costs can be broken down into the following
|
||||
categories:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Compute resources</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Networking resources</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Replication</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Storage</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Management</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Operational costs</para>
|
||||
</listitem>
|
||||
</itemizedlist></section>
|
||||
<section xml:id="site-loss-and-recovery">
|
||||
<title>Site loss and recovery</title>
|
||||
<para>Outages can cause partial or full loss of site functionality.
|
||||
Strategies should be implemented to understand and plan for recovery
|
||||
scenarios.</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>The deployed applications need to continue to
|
||||
function and, more importantly, you must consider the
|
||||
impact on the performance and reliability of the application
|
||||
when a site is unavailable.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>It is important to understand what happens to the
|
||||
replication of objects and data between the sites when
|
||||
a site goes down. If this causes queues to start
|
||||
building up, consider how long these queues can
|
||||
safely exist until an error occurs.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>After an outage, ensure the method for resuming proper
|
||||
operations of a site is implemented when it comes back online.
|
||||
We recommend you architect the recovery to avoid race conditions.</para>
|
||||
</listitem>
|
||||
</itemizedlist></section>
|
||||
<section xml:id="compliance-and-geo-location-multi-site">
|
||||
<title>Compliance and geo-location</title>
|
||||
<para>An organization may have certain legal obligations and
|
||||
regulatory compliance measures which could require certain
|
||||
workloads or data to not be located in certain regions.</para></section>
|
||||
<section xml:id="auditing-multi-site">
|
||||
<title>Auditing</title>
|
||||
<para>A well-thought-out auditing strategy is important in order
|
||||
to be able to quickly track down issues. Keeping track of
|
||||
changes made to security groups and tenant changes can be
|
||||
useful in rolling back the changes if they affect production.
|
||||
For example, if all security group rules for a tenant
|
||||
disappeared, the ability to quickly track down the issue would
|
||||
be important for operational and legal reasons.</para></section>
|
||||
<section xml:id="separation-of-duties">
|
||||
<title>Separation of duties</title>
|
||||
<para>A common requirement is to define different roles for the
|
||||
different cloud administration functions. An example would be
|
||||
a requirement to segregate the duties and permissions by
|
||||
site.</para></section>
|
||||
<section xml:id="authentication-between-sites">
|
||||
<title>Authentication between sites</title>
|
||||
<para>It is recommended to have a single authentication domain
|
||||
rather than a separate implementation for each and every
|
||||
site. This requires an authentication mechanism that is highly
|
||||
available and distributed to ensure continuous operation.
|
||||
Authentication server locality might be required and should be
|
||||
planned for.</para></section>
|
||||
</section>
|
@ -1,184 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="architecture-network-focus">
|
||||
<title>Architecture</title>
|
||||
<para>Network-focused OpenStack architectures have many similarities to
|
||||
other OpenStack architecture use cases. There are several factors
|
||||
to consider when designing for a network-centric or network-heavy
|
||||
application environment.</para>
|
||||
<para>Networks exist to serve as a medium of transporting data between
|
||||
systems. It is inevitable that an OpenStack design has inter-dependencies
|
||||
with non-network portions of OpenStack as well as on external systems.
|
||||
Depending on the specific workload, there may be major interactions with
|
||||
storage systems both within and external to the OpenStack environment.
|
||||
For example, in the case of a content delivery network, there is twofold
|
||||
interaction with storage. Traffic flows to and from the storage array for
|
||||
ingesting and serving content in a north-south direction. In addition,
|
||||
there is replication traffic flowing in an east-west direction.</para>
|
||||
<para>Compute-heavy workloads may also induce interactions with the
|
||||
network. Some high performance compute applications require network-based
|
||||
memory mapping and data sharing and, as a result, induce a higher network
|
||||
load when they transfer results and data sets. Others may be highly
|
||||
transactional and issue transaction locks, perform their functions, and
|
||||
revoke transaction locks at high rates. This also has an impact on the
|
||||
network performance.</para>
|
||||
<para>Some network dependencies are external to OpenStack. While
|
||||
OpenStack Networking is capable of providing network ports, IP addresses,
|
||||
some level of routing, and overlay networks, there are some other
|
||||
functions that it cannot provide. For many of these, you may require
|
||||
external systems or equipment to fill in the functional gaps. Hardware
|
||||
load balancers are an example of equipment that may be necessary to
|
||||
distribute workloads or offload certain functions. OpenStack Networking
|
||||
provides a tunneling feature; however, it is constrained to a
|
||||
Networking-managed region. If the need arises to extend a tunnel beyond
|
||||
the OpenStack region to either another region or an external system,
|
||||
implement the tunnel itself outside OpenStack or use a tunnel management
|
||||
system to map the tunnel or overlay to an external tunnel.
|
||||
</para>
|
||||
<para>
|
||||
Depending on the selected design, Networking itself might not
|
||||
support the required <glossterm baseform="Layer-3 network">layer-3
|
||||
network</glossterm> functionality. If you choose to use the
|
||||
provider networking mode without running the layer-3 agent, you
|
||||
must install an external router to provide layer-3 connectivity
|
||||
to outside systems.
|
||||
</para>
|
||||
<para>Interaction with orchestration services is inevitable in
|
||||
larger-scale deployments. The Orchestration service is capable of
|
||||
allocating network resources defined in templates to map to tenant
|
||||
networks and for port creation, as well as allocating floating IPs.
|
||||
If there is a requirement to define and manage network resources when
|
||||
using orchestration, we recommend that the design include the
|
||||
Orchestration service to meet the demands of users.</para>
|
||||
<section xml:id="design-impacts">
|
||||
<title>Design impacts</title>
|
||||
<para>A wide variety of factors can affect a network-focused OpenStack
|
||||
architecture. While there are some considerations shared with a general
|
||||
use case, specific workloads related to network requirements influence
|
||||
network design decisions.</para>
|
||||
<para>One decision includes whether or not to use Network Address
|
||||
Translation (NAT) and where to implement it. If there is a requirement
|
||||
for floating IPs instead of public fixed addresses then you must use
|
||||
NAT. An example of this is a DHCP relay that must know the IP of the
|
||||
DHCP server. In these cases it is easier to automate the infrastructure
|
||||
to apply the target IP to a new instance rather than to reconfigure
|
||||
legacy or external systems for each new instance.</para>
|
||||
<para>NAT for floating IPs managed by Networking resides within the
|
||||
hypervisor but there are also versions of NAT that may be running
|
||||
elsewhere. If there is a shortage of IPv4 addresses there are two common
|
||||
methods to mitigate this externally to OpenStack. The first is to run a
|
||||
load balancer either within OpenStack as an instance, or use an external
|
||||
load balancing solution. In the internal scenario, Networking's
|
||||
Load-Balancer-as-a-Service (LBaaS) can manage load balancing
|
||||
software, for example HAProxy. This is specifically to manage the
|
||||
Virtual IP (VIP) while a dual-homed connection from the HAProxy instance
|
||||
connects the public network with the tenant private network that hosts
|
||||
all of the content servers. In the external scenario, a load balancer
|
||||
needs to serve the VIP and also connect to the tenant overlay
|
||||
network through external means or through private addresses.</para>
|
||||
<para>Another kind of NAT that may be useful is protocol NAT. In some
|
||||
cases it may be desirable to use only IPv6 addresses on instances and
|
||||
operate either an instance or an external service to provide a NAT-based
|
||||
transition technology such as NAT64 and DNS64. This provides the ability
|
||||
to have a globally routable IPv6 address while only consuming IPv4
|
||||
addresses as necessary or in a shared manner.</para>
|
||||
<para>Application workloads affect the design of the underlying network
|
||||
architecture. If a workload requires network-level redundancy, the
|
||||
routing and switching architecture have to accommodate this. There
|
||||
are differing methods for providing this that are dependent on the
|
||||
selected network hardware, the performance of the hardware, and which
|
||||
networking model you deploy. Examples include
|
||||
Link aggregation (LAG) and Hot Standby Router Protocol (HSRP). Also
|
||||
consider whether to deploy OpenStack Networking or
|
||||
legacy networking (nova-network), and which plug-in to select for
|
||||
OpenStack Networking. If using an external system, configure Networking
|
||||
to run <glossterm baseform="Layer-2 network">layer 2</glossterm>
|
||||
with a provider network configuration. For example, implement HSRP
|
||||
to terminate layer-3 connectivity.</para>
|
||||
<para>Depending on the workload, overlay networks may not be the best
|
||||
solution. Where application network connections are
|
||||
small, short lived, or bursty, running a dynamic overlay can generate
|
||||
as much bandwidth as the packets it carries. It also can induce enough
|
||||
latency to cause issues with certain applications. There is an impact
|
||||
to the device generating the overlay which, in most installations,
|
||||
is the hypervisor. This causes performance degradation on packet
|
||||
per second and connection per second rates.</para>
|
||||
<para>Overlays also come with a secondary option that may not be
|
||||
appropriate to a specific workload. While all of them operate in full
|
||||
mesh by default, there might be good reasons to disable this function
|
||||
because it may cause excessive overhead for some workloads. Conversely,
|
||||
other workloads operate without issue. For example, most web services
|
||||
applications do not have major issues with a full mesh overlay network,
|
||||
while some network monitoring tools or storage replication workloads
|
||||
have performance issues with throughput or excessive broadcast
|
||||
traffic.</para>
|
||||
<para>Many people overlook an important design decision: The choice of
|
||||
layer-3 protocols. While OpenStack was initially built with only IPv4
|
||||
support, Networking now supports IPv6 and dual-stacked networks.
|
||||
Some workloads are possible through the use of IPv6 and IPv6-to-IPv4
|
||||
reverse transition mechanisms such as NAT64 and DNS64 or
|
||||
<glossterm>6to4</glossterm>.
|
||||
This alters the requirements for any address plan as single-stacked and
|
||||
transitional IPv6 deployments can alleviate the need for IPv4
|
||||
addresses.</para>
|
||||
<para>OpenStack has limited support for
|
||||
dynamic routing; however, there are a number of options available by
|
||||
incorporating third party solutions to implement routing within the
|
||||
cloud including network equipment, hardware nodes, and instances. Some
|
||||
workloads perform well with nothing more than static routes and default
|
||||
gateways configured at the layer-3 termination point. In most cases this
|
||||
is sufficient; however, some cases require the addition of at least one
|
||||
type of dynamic routing protocol if not multiple protocols. Having a
|
||||
form of interior gateway protocol (IGP) available to the instances
|
||||
inside an OpenStack installation opens up the possibility of use cases
|
||||
for anycast route injection for services that need to use it as a
|
||||
geographic location or failover mechanism. Other applications may wish
|
||||
to directly participate in a routing protocol, either as a passive
|
||||
observer, as in the case of a looking glass, or as an active participant
|
||||
in the form of a route reflector. Since an instance might have a large
|
||||
amount of compute and memory resources, it is trivial to hold an entire
|
||||
unpartitioned routing table and use it to provide services such as
|
||||
network path visibility to other applications or as a monitoring
|
||||
tool.</para>
|
||||
<para>Path maximum transmission unit (MTU) failures are lesser known but
|
||||
harder to diagnose. The MTU must be large enough to handle normal
|
||||
traffic, overhead from an overlay network, and the desired layer-3
|
||||
protocol. Adding externally built tunnels reduces the MTU packet size.
|
||||
In this case, you must pay attention to the fully
|
||||
calculated MTU size because some systems ignore or
|
||||
drop path MTU discovery packets.</para>
|
||||
</section>
|
||||
<section xml:id="tunables">
|
||||
<title>Tunable networking components</title>
|
||||
<para>Consider configurable networking components related to an
|
||||
OpenStack architecture design when designing for network intensive
|
||||
workloads that include MTU and QoS. Some workloads require a larger MTU
|
||||
than normal due to the transfer of large blocks of data.
|
||||
When providing network service for applications such as video
|
||||
streaming or storage replication, we recommend that you configure
|
||||
both OpenStack hardware nodes and the supporting network equipment
|
||||
for jumbo frames where possible. This allows for better use of
|
||||
available bandwidth. Configure jumbo frames
|
||||
across the complete path the packets traverse. If one network
|
||||
component is not capable of handling jumbo frames then the entire
|
||||
path reverts to the default MTU.</para>
|
||||
<para>Quality of Service (QoS) also has a great impact on network
|
||||
intensive workloads as it provides instant service to packets which
|
||||
have a higher priority due to the impact of poor
|
||||
network performance. In applications such as Voice over IP (VoIP),
|
||||
differentiated services code points are a near requirement for proper
|
||||
operation. You can also use QoS in the opposite direction for mixed
|
||||
workloads to prevent low priority but high bandwidth applications,
|
||||
for example backup services, video conferencing, or file sharing,
|
||||
from blocking bandwidth that is needed for the proper operation of
|
||||
other workloads. It is possible to tag file storage traffic as a
|
||||
lower class, such as best effort or scavenger, to allow the higher
|
||||
priority traffic through. In cases where regions within a cloud might
|
||||
be geographically distributed it may also be necessary to plan
|
||||
accordingly to implement WAN optimization to combat latency or
|
||||
packet loss.</para>
|
||||
</section>
|
||||
</section>
|
@ -1,68 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="operational-considerations-networking-focus">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Operational considerations</title>
|
||||
<para>Network-focused OpenStack clouds have a number of operational
|
||||
considerations that influence the selected design, including:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Dynamic routing of static routes</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Service level agreements (SLAs)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Ownership of user management</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>An initial network consideration is the selection of a telecom
|
||||
company or transit provider.</para>
|
||||
<para>Make additional design decisions about monitoring and alarming.
|
||||
This can be an internal responsibility or the responsibility of the
|
||||
external provider. In the case of using an external provider, service
|
||||
level agreements (SLAs) likely apply. In addition, other operational
|
||||
considerations such as bandwidth, latency, and jitter can be part of an
|
||||
SLA.</para>
|
||||
<para>Consider the ability to upgrade the infrastructure. As demand for
|
||||
network resources increases, operators add additional IP address blocks
|
||||
and add additional bandwidth capacity. In addition, consider managing
|
||||
hardware and software life cycle events, for example upgrades,
|
||||
decommissioning, and outages, while avoiding service interruptions for
|
||||
tenants.</para>
|
||||
<para>Factor maintainability into the overall network design. This
|
||||
includes the ability to manage and maintain IP addresses as well as the
|
||||
use of overlay identifiers including VLAN tag IDs, GRE tunnel IDs, and
|
||||
MPLS tags. As an example, if you need to change all of the IP
|
||||
addresses on a network, a process known as renumbering, then the design
|
||||
must support this function.</para>
|
||||
<para>Address network-focused applications when considering certain
|
||||
operational realities. For example, consider the impending exhaustion
|
||||
of IPv4 addresses, the migration to IPv6, and the use of private
|
||||
networks to segregate different types of traffic that an application
|
||||
receives or generates. In the case of IPv4 to IPv6 migrations,
|
||||
applications should follow best practices for storing IP addresses.
|
||||
We recommend you avoid relying on IPv4 features that did not carry over
|
||||
to the IPv6 protocol or have differences in implementation.</para>
|
||||
<para>To segregate traffic, allow applications to create a private tenant
|
||||
network for database and storage network traffic. Use a public network
|
||||
for services that require direct client access from the internet. Upon
|
||||
segregating the traffic, consider quality of service (QoS) and security
|
||||
to ensure each network has the required level of service.</para>
|
||||
<para>Finally, consider the routing of network traffic.
|
||||
For some applications, develop a complex policy framework for
|
||||
routing. To create a routing policy that satisfies business requirements,
|
||||
consider the economic cost of transmitting traffic over expensive links
|
||||
versus cheaper links, in addition to bandwidth, latency, and jitter
|
||||
requirements.</para>
|
||||
<para>Additionally, consider how to respond to network events. As an
|
||||
example, how load transfers from one link to another during a
|
||||
failure scenario could be a factor in the design. If you do not plan
|
||||
network capacity correctly, failover traffic could overwhelm other ports
|
||||
or network links and create a cascading failure scenario. In this case,
|
||||
traffic that fails over to one link overwhelms that link and then moves
|
||||
to the subsequent links until all network traffic stops.</para>
|
||||
</section>
|
@ -1,209 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="prescriptive-example-large-scale-web-app">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Prescriptive examples</title>
|
||||
<para>An organization designs a large-scale web application with cloud
|
||||
principles in mind. The application scales
|
||||
horizontally in a bursting fashion and generates a high
|
||||
instance count. The application requires an SSL connection to
|
||||
secure data and must not lose connection state to individual
|
||||
servers.</para>
|
||||
<para>The figure below depicts an example design for this workload.
|
||||
In this example, a hardware load balancer provides SSL offload
|
||||
functionality and connects
|
||||
to tenant networks in order to reduce address consumption.
|
||||
This load balancer links to the routing architecture as it
|
||||
services the VIP for the application. The router and load
|
||||
balancer use the GRE tunnel ID of the
|
||||
application's tenant network and an IP address within
|
||||
the tenant subnet but outside of the address pool. This is to
|
||||
ensure that the load balancer can communicate with the
|
||||
application's HTTP servers without requiring the consumption
|
||||
of a public IP address.</para>
|
||||
<para>Because sessions persist until closed, the routing and
|
||||
switching architecture provides high availability.
|
||||
Switches mesh to each hypervisor and each other, and
|
||||
also provide an MLAG implementation to ensure that layer-2
|
||||
connectivity does not fail. Routers use VRRP
|
||||
and fully mesh with switches to ensure layer-3 connectivity.
|
||||
Since GRE provides an overlay network, Networking is present
|
||||
and uses the Open vSwitch agent in GRE tunnel
|
||||
mode. This ensures all devices can reach all other devices and
|
||||
that you can create tenant networks for private addressing
|
||||
links to the load balancer.
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Network_Web_Services1.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject></para>
|
||||
<para>A web service architecture has many options and optional
|
||||
components. Due to this, it can fit into a large number of
|
||||
other OpenStack designs. A few key components, however, need
|
||||
to be in place to handle the nature of most web-scale
|
||||
workloads. You require the following components:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>OpenStack Controller services (Image, Identity,
|
||||
Networking and supporting services such as MariaDB and
|
||||
RabbitMQ)</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Compute running KVM hypervisor</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>OpenStack Object Storage</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Orchestration service</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Telemetry service</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Beyond the normal Identity, Compute, Image service, and Object
|
||||
Storage components, we recommend the Orchestration service
|
||||
component to handle the proper scaling of workloads to adjust to
|
||||
demand. Due to the requirement for auto-scaling,
|
||||
the design includes the Telemetry service. Web services
|
||||
tend to be bursty in load, have very defined peak and valley
|
||||
usage patterns and, as a result, benefit from automatic scaling
|
||||
of instances based upon traffic. At a network level, a split
|
||||
network configuration works well with databases residing on
|
||||
private tenant networks since these do not emit a large quantity
|
||||
of broadcast traffic and may need to interconnect to some
|
||||
databases for content.
|
||||
</para>
|
||||
<section xml:id="load-balancing">
|
||||
<title>Load balancing</title>
|
||||
<para>Load balancing spreads requests across multiple instances.
|
||||
This workload scales well horizontally across large numbers of
|
||||
instances. This enables instances to run without publicly
|
||||
routed IP addresses and instead to rely on the load
|
||||
balancer to provide a globally reachable service.
|
||||
Many of these services do not require
|
||||
direct server return. This aids in address planning and
|
||||
utilization at scale since only the virtual IP (VIP) must be
|
||||
public.</para>
|
||||
</section>
|
||||
<section xml:id="overlay-networks">
|
||||
<title>Overlay networks</title>
|
||||
<para>
|
||||
The overlay functionality design includes OpenStack Networking
|
||||
in Open vSwitch GRE tunnel mode.
|
||||
In this case, the layer-3 external routers pair with
|
||||
VRRP, and switches pair with an implementation of
|
||||
MLAG to ensure that you do not lose connectivity with
|
||||
the upstream routing infrastructure.
|
||||
</para>
|
||||
</section>
|
||||
<section xml:id="performance-tuning">
|
||||
<title>Performance tuning</title>
|
||||
<para>Network level tuning for this workload is minimal.
|
||||
Quality-of-Service (QoS) applies to these workloads
|
||||
for a middle ground Class Selector depending on existing
|
||||
policies. It is higher than a best effort queue but lower
|
||||
than an Expedited Forwarding or Assured Forwarding queue.
|
||||
Since this type of application generates larger packets with
|
||||
longer-lived connections, you can optimize bandwidth utilization
|
||||
for long duration TCP. Normal bandwidth planning
|
||||
applies here with regards to benchmarking a session's usage
|
||||
multiplied by the expected number of concurrent sessions with
|
||||
overhead.</para>
|
||||
</section>
|
||||
<section xml:id="network-functions">
|
||||
<title>Network functions</title>
|
||||
<para>Network functions is a broad category but encompasses
|
||||
workloads that support the rest of a system's network. These
|
||||
workloads tend to consist of large amounts of small packets
|
||||
that are very short lived, such as DNS queries or SNMP traps.
|
||||
These messages need to arrive quickly and do not deal with
|
||||
packet loss as there can be a very large volume of them. There
|
||||
are a few extra considerations to take into account for this
|
||||
type of workload and this can change a configuration all the
|
||||
way to the hypervisor level. For an application that generates
|
||||
10 TCP sessions per user with an average bandwidth of 512
|
||||
kilobytes per second per flow and expected user count of ten
|
||||
thousand concurrent users, the expected bandwidth plan is
|
||||
approximately 4.88 gigabits per second.</para>
|
||||
<para>The supporting network for this type of configuration needs
|
||||
to have a low latency and evenly distributed availability.
|
||||
This workload benefits from having services local to the
|
||||
consumers of the service. Use a multi-site approach as
|
||||
well as deploying many copies of the application to handle
|
||||
load as close as possible to consumers. Since these
|
||||
applications function independently, they do not warrant
|
||||
running overlays to interconnect tenant networks. Overlays
|
||||
also have the drawback of performing poorly with rapid flow
|
||||
setup and may incur too much overhead with large quantities of
|
||||
small packets and therefore we do not recommend them.</para>
|
||||
<para>QoS is desirable for some workloads to ensure delivery. DNS
|
||||
has a major impact on the load times of other services and
|
||||
needs to be reliable and provide rapid responses. Configure rules
|
||||
in upstream devices to apply a higher Class
|
||||
Selector to DNS to ensure faster delivery or a better spot in
|
||||
queuing algorithms.</para>
|
||||
</section>
|
||||
<section xml:id="cloud-storage">
|
||||
<title>Cloud storage</title>
|
||||
<para>Another common use case for OpenStack environments is providing
|
||||
a cloud-based file storage and sharing service. You might
|
||||
consider this a storage-focused use case, but its network-side
|
||||
requirements make it a network-focused use case.</para>
|
||||
<para>For example, consider a cloud backup application. This workload
|
||||
has two specific behaviors that impact the network. Because this
|
||||
workload is an externally-facing service and an
|
||||
internally-replicating application, it has both <glossterm
|
||||
baseform="north-south traffic">north-south</glossterm> and
|
||||
<glossterm>east-west traffic</glossterm>
|
||||
considerations:</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>north-south traffic</term>
|
||||
<listitem>
|
||||
<para>When a user uploads and stores content, that content moves
|
||||
into the OpenStack installation. When users download this
|
||||
content, the content moves out from the OpenStack
|
||||
installation. Because this service operates primarily
|
||||
as a backup, most of the traffic moves southbound into the
|
||||
environment. In this situation, it benefits you to
|
||||
configure a network to be asymmetrically downstream
|
||||
because the traffic that enters the OpenStack installation
|
||||
is greater than the traffic that leaves the installation.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>east-west traffic</term>
|
||||
<listitem>
|
||||
<para>Likely to be fully symmetric. Because replication
|
||||
originates from any node and might target multiple other
|
||||
nodes algorithmically, it is less likely for this traffic
|
||||
to have a larger volume in any specific direction. However,
|
||||
this traffic might interfere with north-south traffic.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<mediaobject>
|
||||
<imageobject>
|
||||
<imagedata contentwidth="4in"
|
||||
fileref="../figures/Network_Cloud_Storage2.png"
|
||||
/>
|
||||
</imageobject>
|
||||
</mediaobject>
|
||||
<para>This application prioritizes the north-south traffic over
|
||||
east-west traffic: the north-south traffic involves
|
||||
customer-facing data.</para>
|
||||
<para>The network design in this case is less dependent on
|
||||
availability and more dependent on being able to handle high
|
||||
bandwidth. As a direct result, it is beneficial to forgo
|
||||
redundant links in favor of bonding those connections. This
|
||||
increases available bandwidth. It is also beneficial to
|
||||
configure all devices in the path, including OpenStack, to
|
||||
generate and pass jumbo frames.</para>
|
||||
</section>
|
||||
</section>
|
@ -1,462 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="technical-considerations-network-focus">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>Technical considerations</title>
|
||||
<para>When you design an OpenStack network architecture, you must
|
||||
consider layer-2 and layer-3 issues. Layer-2
|
||||
decisions involve those made at the data-link layer, such as
|
||||
the decision to use Ethernet versus Token Ring. Layer-3 decisions
|
||||
involve those made about the protocol layer and the point when
|
||||
IP comes into the picture. As an example, a completely
|
||||
internal OpenStack network can exist at layer 2 and ignore
|
||||
layer 3. In order for any traffic to go outside of
|
||||
that cloud, to another network, or to the Internet, however, you must
|
||||
use a layer-3 router or switch.</para>
|
||||
<para>The past few years have seen two competing trends in
|
||||
networking. One trend leans towards building data center network
|
||||
architectures based on layer-2 networking. Another trend treats
|
||||
the cloud environment essentially as a miniature version of the
|
||||
Internet. This approach is radically different from the network
|
||||
architecture approach in the staging environment:
|
||||
the Internet only uses layer-3 routing rather than
|
||||
layer-2 switching.</para>
|
||||
<para>A network designed on layer-2 protocols has advantages over one
|
||||
designed on layer-3 protocols. In spite of the difficulties of
|
||||
using a bridge to perform the network role of a router, many
|
||||
vendors, customers, and service providers choose to use Ethernet
|
||||
in as many parts of their networks as possible. The benefits of
|
||||
selecting a layer-2 design are:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Ethernet frames contain all the essentials for
|
||||
networking. These include, but are not limited to,
|
||||
globally unique source addresses, globally unique
|
||||
destination addresses, and error control.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Ethernet frames can carry any kind of packet.
|
||||
Networking at layer 2 is independent of the layer-3
|
||||
protocol.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Adding more layers to the Ethernet frame only slows
|
||||
the networking process down. This is known as 'nodal
|
||||
processing delay'.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>You can add adjunct networking features, for
|
||||
example class of service (CoS) or multicasting, to
|
||||
Ethernet as readily as IP networks.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>VLANs are an easy mechanism for isolating
|
||||
networks.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Most information starts and ends inside Ethernet frames.
|
||||
Today this applies to data, voice (for example, VoIP), and
|
||||
video (for example, web cameras). The concept is that, if you can
|
||||
perform more of the end-to-end transfer of information from
|
||||
a source to a destination in the form of Ethernet frames, the network
|
||||
benefits more from the advantages of Ethernet.
|
||||
Although it is not a substitute for IP networking, networking at
|
||||
layer 2 can be a powerful adjunct to IP networking.</para>
|
||||
<para>
|
||||
Layer-2 Ethernet usage has these advantages over layer-3 IP
|
||||
network usage:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Speed</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Reduced overhead of the IP hierarchy.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>No need to keep track of address configuration as systems
|
||||
move around. Whereas the simplicity of layer-2
|
||||
protocols might work well in a data center with hundreds
|
||||
of physical machines, cloud data centers have the
|
||||
additional burden of needing to keep track of all virtual
|
||||
machine addresses and networks. In these data centers, it
|
||||
is not uncommon for one physical node to support 30-40
|
||||
instances.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<important>
|
||||
<para>Networking at the frame level says nothing
|
||||
about the presence or absence of IP addresses at the packet
|
||||
level. Almost all ports, links, and devices on a network of
|
||||
LAN switches still have IP addresses, as do all the source and
|
||||
destination hosts. There are many reasons for the continued
|
||||
need for IP addressing. The largest one is the need to manage
|
||||
the network. A device or link without an IP address is usually
|
||||
invisible to most management applications. Utilities including
|
||||
remote access for diagnostics, file transfer of configurations
|
||||
and software, and similar applications cannot run without IP
|
||||
addresses as well as MAC addresses.</para>
|
||||
</important>
|
||||
<section xml:id="layer-2-arch-limitations">
|
||||
<title>Layer-2 architecture limitations</title>
|
||||
<para>Outside of the traditional data center the limitations of
|
||||
layer-2 network architectures become more obvious.</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Number of VLANs is limited to 4096.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>The number of MACs stored in switch tables is
|
||||
limited.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>You must accommodate the need to maintain a set of
|
||||
layer-4 devices to handle traffic control.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>MLAG, often used for switch redundancy, is a
|
||||
proprietary solution that does not scale beyond two
|
||||
devices and forces vendor lock-in.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>It can be difficult to troubleshoot a network
|
||||
without IP addresses and ICMP.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Configuring <glossterm
|
||||
baseform="Address Resolution Protocol (ARP)">ARP</glossterm>
|
||||
can be complicated on large layer-2 networks.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>All network devices need to be aware of all MACs,
|
||||
even instance MACs, so there is constant churn in MAC
|
||||
tables and network state changes as instances start and
|
||||
stop.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Migrating MACs (instance migration) to different
|
||||
physical locations is a potential problem if you do not
|
||||
set ARP table timeouts properly.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>It is important to know that layer 2 has a very limited set
|
||||
of network management tools. It is very difficult to control
|
||||
traffic, as it does not have mechanisms to manage the network
|
||||
or shape the traffic, and network troubleshooting is very
|
||||
difficult. One reason for this difficulty is network devices
|
||||
have no IP addresses. As a result, there is no reasonable way
|
||||
to check network delay in a layer-2 network.</para>
|
||||
<para>On large layer-2 networks, configuring ARP learning can also
|
||||
be complicated. The setting for the MAC address timer on
|
||||
switches is critical and, if set incorrectly, can cause
|
||||
significant performance problems. As an example, the Cisco
|
||||
default MAC address timer is extremely long. Migrating MACs to
|
||||
different physical locations to support instance migration can
|
||||
be a significant problem. In this case, the network
|
||||
information maintained in the switches could be out of sync
|
||||
with the new location of the instance.</para>
|
||||
<para>In a layer-2 network, all devices are aware of all MACs,
|
||||
even those that belong to instances. The network state
|
||||
information in the backbone changes whenever an instance starts
|
||||
or stops. As a result there is far too much churn in
|
||||
the MAC tables on the backbone switches.</para>
|
||||
</section>
|
||||
<section xml:id="layer-3-arch-advantages">
|
||||
<title>Layer-3 architecture advantages</title>
|
||||
<para>In the layer 3 case, there is no churn in the routing tables
|
||||
due to instances starting and stopping. The only time there
|
||||
would be a routing state change is in the case of a Top
|
||||
of Rack (ToR) switch failure or a link failure in the backbone
|
||||
itself. Other advantages of using a layer-3 architecture
|
||||
include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Layer-3 networks provide the same level of
|
||||
resiliency and scalability as the Internet.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Controlling traffic with routing metrics is
|
||||
straightforward.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>You can configure layer 3 to use <glossterm
|
||||
baseform="Border Gateway Protocol (BGP)">BGP</glossterm>
|
||||
confederation for scalability so core routers have state
|
||||
proportional to the number of racks, not to the number of
|
||||
servers or instances.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Routing takes instance MAC and IP addresses
|
||||
out of the network core, reducing state churn. Routing
|
||||
state changes only occur in the case of a ToR switch
|
||||
failure or backbone link failure.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>There are a variety of well tested tools, for
|
||||
example ICMP, to monitor and manage traffic.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Layer-3 architectures enable the use of Quality
|
||||
of Service (QoS) to manage network performance.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<section xml:id="layer-3-arch-limitations">
|
||||
<title>Layer-3 architecture limitations</title>
|
||||
<para>The main limitation of layer 3 is that there is no built-in
|
||||
isolation mechanism comparable to the VLANs in layer-2
|
||||
networks. Furthermore, the hierarchical nature of IP addresses
|
||||
means that an instance is on the same subnet as its
|
||||
physical host. This means that you cannot migrate it outside
|
||||
of the subnet easily. For these reasons, network
|
||||
virtualization needs to use IP <glossterm>encapsulation</glossterm>
|
||||
and software at the end hosts for isolation and the separation of
|
||||
the addressing in the virtual layer from the addressing in the
|
||||
physical layer. Other potential disadvantages of layer 3
|
||||
include the need to design an IP addressing scheme rather than
|
||||
relying on the switches to keep track of the MAC
|
||||
addresses automatically and to configure the interior gateway routing
|
||||
protocol in the switches.</para>
|
||||
</section>
|
||||
</section>
|
||||
<section xml:id="network-recommendations-overview">
|
||||
<title>Network recommendations overview</title>
|
||||
<para>OpenStack has complex networking requirements for several
|
||||
reasons. Many components interact at different levels of the
|
||||
system stack, which adds complexity. Data flows are complex.
|
||||
Data in an OpenStack cloud moves both between instances across
|
||||
the network (also known as East-West), as well as in and out
|
||||
of the system (also known as North-South). Physical server
|
||||
nodes have network requirements that are independent of instance
|
||||
network requirements, which you must isolate from the core
|
||||
network to account for scalability. We recommend
|
||||
functionally separating the networks for security purposes and
|
||||
tuning performance through traffic shaping.</para>
|
||||
<para>You must consider a number of important general technical
|
||||
and business factors when planning and
|
||||
designing an OpenStack network. They include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>A requirement for vendor independence. To avoid
|
||||
hardware or software vendor lock-in, the design should
|
||||
not rely on specific features of a vendor's router or
|
||||
switch.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A requirement to massively scale the ecosystem to
|
||||
support millions of end users.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A requirement to support indeterminate platforms and
|
||||
applications.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A requirement to design for cost efficient
|
||||
operations to take advantage of massive scale.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A requirement to ensure that there is no single
|
||||
point of failure in the cloud ecosystem.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A requirement for high availability architecture to
|
||||
meet customer SLA requirements.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A requirement to be tolerant of rack level
|
||||
failure.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>A requirement to maximize flexibility to architect
|
||||
future production environments.</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>Bearing in mind these considerations, we recommend the following:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Layer-3 designs are preferable to layer-2
|
||||
architectures.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Design a dense multi-path network core to support
|
||||
multi-directional scaling and flexibility.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Use hierarchical addressing because it is the only
|
||||
viable option to scale the network ecosystem.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Use virtual networking to isolate instance service
|
||||
network traffic from the management and internal
|
||||
network traffic.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Isolate virtual networks using encapsulation
|
||||
technologies.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Use traffic shaping for performance tuning.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Use eBGP to connect to the Internet up-link.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Use iBGP to flatten the internal traffic on the
|
||||
layer-3 mesh.</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Determine the most effective configuration for the block
|
||||
storage network.</para>
|
||||
</listitem>
|
||||
</itemizedlist></section>
|
||||
<section xml:id="additional-considerations-network-focus">
|
||||
<title>Additional considerations</title>
|
||||
<para>There are several further considerations when designing a
|
||||
network-focused OpenStack cloud.</para>
|
||||
<section xml:id="openstack-networking-versus-nova-network">
|
||||
<title>OpenStack Networking versus legacy networking (nova-network)
|
||||
considerations</title>
|
||||
<para>Selecting the type of networking technology to implement
|
||||
depends on many factors. OpenStack Networking (neutron) and
|
||||
legacy networking (nova-network) both have their advantages and
|
||||
disadvantages. They are both valid and supported options that fit
|
||||
different use cases:</para>
|
||||
<informaltable rules="all">
|
||||
<col width="40%" />
|
||||
<col width="60%" />
|
||||
<thead>
|
||||
<tr><th>Legacy networking (nova-network)</th>
|
||||
<th>OpenStack Networking</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Simple, single agent</td>
|
||||
<td>Complex, multiple agents</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>More mature, established</td>
|
||||
<td>Newer, maturing</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Flat or VLAN</td>
|
||||
<td>Flat, VLAN, Overlays, L2-L3, SDN</td></tr>
|
||||
<tr>
|
||||
<td>No plug-in support</td>
|
||||
<td>Plug-in support for 3rd parties</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Scales well</td>
|
||||
<td>Scaling requires 3rd party plug-ins</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>No multi-tier topologies</td>
|
||||
<td>Multi-tier topologies</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</informaltable>
|
||||
</section>
|
||||
<section xml:id="redundant-networking-tor-switch-ha">
|
||||
<title>Redundant networking: ToR switch high availability
|
||||
risk analysis</title>
|
||||
<para>A technical consideration of networking is the idea that
|
||||
you should install switching gear in a data center
|
||||
with backup switches in case of hardware failure.</para>
|
||||
<para>Research indicates the mean time between failures (MTBF) on switches
|
||||
is between 100,000 and 200,000 hours. This number is dependent
|
||||
on the ambient temperature of the switch in the data
|
||||
center. When properly cooled and maintained, this translates to
|
||||
between 11 and 22 years before failure. Even in the worst case
|
||||
of poor ventilation and high ambient temperatures in the data
|
||||
center, the MTBF is still 2-3 years. See <link
|
||||
xlink:href="http://www.garrettcom.com/techsupport/papers/ethernet_switch_reliability.pdf">http://www.garrettcom.com/techsupport/papers/ethernet_switch_reliability.pdf</link>
|
||||
for further information.</para>
|
||||
<para>In most cases, it is much more economical to use a
|
||||
single switch with a small pool of spare switches to replace
|
||||
failed units than it is to outfit an entire data center with
|
||||
redundant switches. Applications should tolerate rack level
|
||||
outages without affecting normal
|
||||
operations, since network and compute resources are easily
|
||||
provisioned and plentiful.</para>
|
||||
</section>
|
||||
<section xml:id="preparing-for-future-ipv6-support">
|
||||
<title>Preparing for the future: IPv6 support</title>
|
||||
<para>One of the most important networking topics today is the
|
||||
impending exhaustion of IPv4 addresses. In early 2014, ICANN
|
||||
announced that they started allocating the final IPv4 address
|
||||
blocks to the Regional Internet Registries (<link
|
||||
xlink:href="http://www.internetsociety.org/deploy360/blog/2014/05/goodbye-ipv4-iana-starts-allocating-final-address-blocks/">http://www.internetsociety.org/deploy360/blog/2014/05/goodbye-ipv4-iana-starts-allocating-final-address-blocks/</link>).
|
||||
This means the IPv4 address space is close to being fully
|
||||
allocated. As a result, it will soon become difficult to
|
||||
allocate more IPv4 addresses to an application that has
|
||||
experienced growth, or that you expect to scale out, due to the lack
|
||||
of unallocated IPv4 address blocks.</para>
|
||||
<para>For network focused applications the future is the IPv6
|
||||
protocol. IPv6 increases the address space significantly,
|
||||
fixes long standing issues in the IPv4 protocol, and will
|
||||
become essential for network focused applications in the
|
||||
future.</para>
|
||||
<para>OpenStack Networking supports IPv6 when configured to take
|
||||
advantage of it. To enable IPv6, create an IPv6 subnet in
|
||||
Networking and use IPv6 prefixes when creating security
|
||||
groups.</para></section>
|
||||
<section xml:id="asymmetric-links">
|
||||
<title>Asymmetric links</title>
|
||||
<para>When designing a network architecture, the traffic patterns
|
||||
of an application heavily influence the allocation of
|
||||
total bandwidth and the number of links that you use to send
|
||||
and receive traffic. Applications that provide file storage
|
||||
for customers allocate bandwidth and links to favor
|
||||
incoming traffic, whereas video streaming applications
|
||||
allocate bandwidth and links to favor outgoing traffic.</para>
|
||||
</section>
|
||||
<section xml:id="performance-network-focus">
|
||||
<title>Performance</title>
|
||||
<para>It is important to analyze the applications' tolerance for
|
||||
latency and jitter when designing an environment to support
|
||||
network focused applications. Certain applications, for
|
||||
example VoIP, are less tolerant of latency and jitter. Where
|
||||
latency and jitter are concerned, certain applications may
|
||||
require tuning of QoS parameters and network device queues to
|
||||
ensure that they queue for transmit immediately or
|
||||
guarantee minimum bandwidth. Since OpenStack currently does
|
||||
not support these functions, carefully consider your selected
|
||||
network plug-in.</para>
|
||||
<para>The location of a service may also impact the application or
|
||||
consumer experience. If an application serves
|
||||
differing content to different users it must properly direct
|
||||
connections to those specific locations. Where appropriate,
|
||||
use a multi-site installation for these situations.</para>
|
||||
<para>You can implement networking in two separate
|
||||
ways. Legacy networking (nova-network) provides a flat DHCP network
|
||||
with a single broadcast domain. This implementation does not
|
||||
support tenant isolation networks or advanced plug-ins, but it
|
||||
is currently the only way to implement a distributed layer-3
|
||||
agent using the multi_host configuration.
|
||||
OpenStack Networking (neutron) is the official networking implementation
|
||||
and provides a pluggable architecture that supports a large
|
||||
variety of network methods. Some of these include a layer-2
|
||||
only provider network model, external device plug-ins, or even
|
||||
OpenFlow controllers.</para>
|
||||
<para>Networking at large scales becomes a set of boundary
|
||||
questions. The determination of how large a layer-2 domain
|
||||
must be is based on the number of nodes within the domain
|
||||
and the amount of broadcast traffic that passes between
|
||||
instances. Breaking layer-2 boundaries may require the
|
||||
implementation of overlay networks and tunnels. This decision
|
||||
is a balancing act between the need for a smaller overhead or
|
||||
a need for a smaller domain.</para>
|
||||
<para>When selecting network devices, be aware that making this
|
||||
decision based on the greatest port density often comes with a
|
||||
drawback. Aggregation switches and routers have not all kept
|
||||
pace with Top of Rack switches and may induce bottlenecks on
|
||||
north-south traffic. As a result, it may be possible for
|
||||
massive amounts of downstream network utilization to impact
|
||||
upstream network devices, impacting service to the cloud.
|
||||
Since OpenStack does not currently provide a mechanism for
|
||||
traffic shaping or rate limiting, it is necessary to implement
|
||||
these features at the network hardware level.</para>
|
||||
</section>
|
||||
</section>
|
||||
</section>
|
@ -1,104 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<section xmlns="http://docbook.org/ns/docbook"
|
||||
xmlns:xi="http://www.w3.org/2001/XInclude"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
version="5.0"
|
||||
xml:id="user-requirements-network-focus">
|
||||
<?dbhtml stop-chunking?>
|
||||
<title>User requirements</title>
|
||||
<para>Network-focused architectures vary from the general-purpose
|
||||
architecture designs. Certain network-intensive applications influence
|
||||
these architectures. Some of the business requirements that influence
|
||||
the design include:</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>Network latency through slow page loads, degraded video
|
||||
streams, and low-quality VoIP sessions impacts the user
|
||||
experience. Users are often not aware of how network design and
|
||||
architecture affect their experiences. Both enterprise customers
|
||||
and end-users rely on the network for delivery of an application.
|
||||
Network performance problems can result in a negative experience
|
||||
for the end-user, as well as productivity and economic loss.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<section xml:id="high-availability-issues-network-focus">
|
||||
<title>High availability issues</title>
|
||||
<para>Depending on the application and use case, network-intensive
|
||||
OpenStack installations can have high availability requirements.
|
||||
Financial transaction systems have a much higher requirement for high
|
||||
availability than a development application. Use network availability
|
||||
technologies, for example quality of service (QoS), to improve the
|
||||
network performance of sensitive applications such as VoIP and video
|
||||
streaming.</para>
|
||||
<para>High performance systems have SLA requirements for a minimum
|
||||
QoS with regard to guaranteed uptime, latency, and bandwidth. The level
|
||||
of the SLA can have a significant impact on the network architecture and
|
||||
requirements for redundancy in the systems.</para>
|
||||
</section>
|
||||
<section xml:id="risks-network-focus">
|
||||
<title>Risks</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>Network misconfigurations</term>
|
||||
<listitem>
|
||||
<para>Configuring incorrect IP addresses, VLANs, and routers
|
||||
can cause outages to areas of the network or, in the worst-case
|
||||
scenario, the entire cloud infrastructure. Automate network
|
||||
configurations to minimize the opportunity for operator error
|
||||
as it can cause disruptive problems.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Capacity planning</term>
|
||||
<listitem>
|
||||
<para>Cloud networks require management for capacity and growth
|
||||
over time. Capacity planning includes the purchase of network
|
||||
circuits and hardware that can potentially have lead times
|
||||
measured in months or years.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Network tuning</term>
|
||||
<listitem>
|
||||
<para>Configure cloud networks to minimize link loss, packet loss,
|
||||
packet storms, broadcast storms, and loops.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Single Point Of Failure (SPOF)</term>
|
||||
<listitem>
|
||||
<para>Consider high availability at the physical and environmental
|
||||
layers. If there is a single point of failure due to only one
|
||||
upstream link, or only one power supply, an outage can become
|
||||
unavoidable.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Complexity</term>
|
||||
<listitem>
|
||||
<para>An overly complex network design can be difficult to
|
||||
maintain and troubleshoot. While device-level configuration
|
||||
can ease maintenance concerns and automated tools can handle
|
||||
overlay networks, avoid or document non-traditional interconnects
|
||||
between functions and specialized hardware to prevent
|
||||
outages.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>Non-standard features</term>
|
||||
<listitem>
|
||||
<para>There are additional risks that arise from configuring the
|
||||
cloud network to take advantage of vendor-specific features.
|
||||
One example is multi-link aggregation (MLAG) used to provide
|
||||
redundancy at the aggregator switch level of the network. MLAG
|
||||
is not a standard and, as a result, each vendor has their own
|
||||
proprietary implementation of the feature. MLAG architectures
|
||||
are not interoperable across switch vendors, which leads to
|
||||
vendor lock-in, and can cause delays or inability when upgrading
|
||||
components.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</section>
|
||||
</section>
|
@ -1,83 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<groupId>org.openstack.docs</groupId>
|
||||
<artifactId>parent-pom</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>openstack-arch-design</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>OpenStack Architecture Design Guide</name>
|
||||
<properties>
|
||||
<!-- This is set by Jenkins according to the branch. -->
|
||||
<release.path.name></release.path.name>
|
||||
<comments.enabled>0</comments.enabled>
|
||||
</properties>
|
||||
<!-- ################################################ -->
|
||||
<!-- USE "mvn clean generate-sources" to run this POM -->
|
||||
<!-- ################################################ -->
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>com.rackspace.cloud.api</groupId>
|
||||
<artifactId>clouddocs-maven-plugin</artifactId>
|
||||
<!-- version set in ../pom.xml -->
|
||||
<executions>
|
||||
<execution>
|
||||
<id>generate-webhelp</id>
|
||||
<goals>
|
||||
<goal>generate-webhelp</goal>
|
||||
</goals>
|
||||
<phase>generate-sources</phase>
|
||||
<configuration>
|
||||
<!-- These parameters only apply to webhelp -->
|
||||
<enableDisqus>0</enableDisqus>
|
||||
<disqusShortname>openstack-arch-design</disqusShortname>
|
||||
<enableGoogleAnalytics>1</enableGoogleAnalytics>
|
||||
<googleAnalyticsId>UA-17511903-1</googleAnalyticsId>
|
||||
<generateToc>
|
||||
appendix toc,title
|
||||
article/appendix nop
|
||||
article toc,title
|
||||
book toc,title,figure,table,example,equation
|
||||
chapter toc,title
|
||||
section toc
|
||||
part toc,title
|
||||
qandadiv toc
|
||||
qandaset toc
|
||||
reference toc,title
|
||||
set toc,title
|
||||
</generateToc>
|
||||
<!-- The following elements set the autonumbering of sections in output for chapter numbers but no numbered sections -->
|
||||
<sectionAutolabel>0</sectionAutolabel>
|
||||
<tocSectionDepth>1</tocSectionDepth>
|
||||
<sectionLabelIncludesComponentLabel>0</sectionLabelIncludesComponentLabel>
|
||||
<webhelpDirname>arch-design</webhelpDirname>
|
||||
<pdfFilenameBase>arch-design</pdfFilenameBase>
|
||||
<pageWidth>7.44in</pageWidth>
|
||||
<pageHeight>9.68in</pageHeight>
|
||||
<doubleSided>1</doubleSided>
|
||||
<omitCover>1</omitCover>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<!-- These parameters apply to pdf and webhelp -->
|
||||
<xincludeSupported>true</xincludeSupported>
|
||||
<sourceDirectory>.</sourceDirectory>
|
||||
<includes>
|
||||
bk-openstack-arch-design.xml
|
||||
</includes>
|
||||
<canonicalUrlBase>http://docs.openstack.org/openstack-arch-design/content</canonicalUrlBase>
|
||||
<glossaryCollection>${basedir}/../glossary/glossary-terms.xml</glossaryCollection>
|
||||
<branding>openstack</branding>
|
||||
<formalProcedures>0</formalProcedures>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|