diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 2fe19533..00000000 --- a/.coveragerc +++ /dev/null @@ -1,7 +0,0 @@ -[run] -branch = True -source = tripleo-specs -omit = tripleo-specs/tests/*,tripleo-specs/openstack/* - -[report] -ignore_errors = True diff --git a/.gitignore b/.gitignore deleted file mode 100644 index c19906a3..00000000 --- a/.gitignore +++ /dev/null @@ -1,51 +0,0 @@ -*.py[cod] - -# C extensions -*.so - -# Packages -*.egg -*.egg-info -dist -build -eggs -parts -bin -var -sdist -develop-eggs -.installed.cfg -lib -lib64 - -# Installer logs -pip-log.txt - -# Unit test / coverage reports -.coverage -.tox -nosetests.xml -.stestr/ - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject - -# Complexity -output/*.html -output/*/index.html - -# Sphinx -doc/build - -# pbr generates these -AUTHORS -ChangeLog - -# Editors -*~ -.*.swp diff --git a/.mailmap b/.mailmap deleted file mode 100644 index cc92f17b..00000000 --- a/.mailmap +++ /dev/null @@ -1,3 +0,0 @@ -# Format is: -# -# \ No newline at end of file diff --git a/.stestr.conf b/.stestr.conf deleted file mode 100644 index d959b47f..00000000 --- a/.stestr.conf +++ /dev/null @@ -1,3 +0,0 @@ -[DEFAULT] -test_path=./tests -top_dir=. diff --git a/.zuul.yaml b/.zuul.yaml deleted file mode 100644 index c11af82c..00000000 --- a/.zuul.yaml +++ /dev/null @@ -1,9 +0,0 @@ -- project: - templates: - - openstack-specs-jobs - check: - jobs: - - openstack-tox-py36 - gate: - jobs: - - openstack-tox-py36 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst deleted file mode 100644 index abc0fbc7..00000000 --- a/CONTRIBUTING.rst +++ /dev/null @@ -1,16 +0,0 @@ -If you would like to contribute to the development of OpenStack, -you must follow the steps in this page: - - http://docs.openstack.org/infra/manual/developers.html - -Once those steps have been completed, changes to OpenStack -should be submitted for review via the Gerrit tool, following -the workflow documented at: - - http://docs.openstack.org/infra/manual/developers.html#development-workflow - -Pull requests submitted through GitHub will be ignored. - -Bugs should be filed on Launchpad, not GitHub: - - https://bugs.launchpad.net/tripleo diff --git a/HACKING.rst b/HACKING.rst deleted file mode 100644 index c1787376..00000000 --- a/HACKING.rst +++ /dev/null @@ -1,4 +0,0 @@ -tripleo-specs Style Commandments -=============================================== - -Read the OpenStack Style Commandments https://docs.openstack.org/hacking/latest/ \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 67db8588..00000000 --- a/LICENSE +++ /dev/null @@ -1,175 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 90f8a7ae..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,6 +0,0 @@ -include AUTHORS -include ChangeLog -exclude .gitignore -exclude .gitreview - -global-exclude *.pyc \ No newline at end of file diff --git a/README.rst b/README.rst index 1d3a41d3..4ee2c5f1 100644 --- a/README.rst +++ b/README.rst @@ -1,24 +1,10 @@ -======================== -Team and repository tags -======================== +This project is no longer maintained. -.. image:: http://governance.openstack.org/badges/tripleo-specs.svg - :target: http://governance.openstack.org/reference/tags/index.html +The contents of this repository are still available in the Git +source code management system. To see the contents of this +repository before it reached its end of life, please check out the +previous commit with "git checkout HEAD^1". -.. 
Change things from this point on - -=============================== -tripleo-specs -=============================== - -TripleO specs repository - -* Free software: Apache license -* Documentation: https://specs.openstack.org/openstack/tripleo-specs -* Source: http://git.openstack.org/cgit/openstack/tripleo-specs -* Bugs: http://bugs.launchpad.net/tripleo - -Features --------- - -* TODO +For any further questions, please email +openstack-discuss@lists.openstack.org or join #openstack-dev on +OFTC. diff --git a/doc/source/conf.py b/doc/source/conf.py deleted file mode 100644 index fac31731..00000000 --- a/doc/source/conf.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import datetime - -# -- General configuration ---------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [ - #'sphinx.ext.intersphinx', - 'openstackdocstheme', - 'yasfb', -] - -# Feed configuration for yasfb -feed_base_url = 'https://specs.openstack.org/openstack/tripleo-specs' -feed_author = 'OpenStack TripleO Team' - -exclude_patterns = [ - '**/template.rst', - '**/policy-template.rst', -] - -# openstackdocstheme options -openstackdocs_repo_name = 'openstack/tripleo-specs' -openstackdocs_bug_project = 'tripleo' -openstackdocs_bug_tag = '' - -# autodoc generation is a bit aggressive and a nuisance when doing heavy -# text edit cycles. -# execute "export SPHINX_DEBUG=1" in your terminal to disable - -# The suffix of source filenames. -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'tripleo-specs' -copyright = 'OpenStack Foundation' - -# If true, '()' will be appended to :func: etc. cross-reference text. -add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -add_module_names = True - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'native' - -# -- Options for HTML output -------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -# html_theme_path = ["."] -# html_theme = '_theme' -# html_static_path = ['static'] - -html_theme = 'openstackdocs' - -# Output file base name for HTML help builder. -htmlhelp_basename = '%sdoc' % project - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto/manual]). -latex_documents = [ - ('index', - '%s.tex' % project, - '%s Documentation' % project, - 'OpenStack Foundation', 'manual'), -] - -# Example configuration for intersphinx: refer to the Python standard library. 
-#intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/doc/source/index.rst b/doc/source/index.rst deleted file mode 100644 index 39eba1c3..00000000 --- a/doc/source/index.rst +++ /dev/null @@ -1,159 +0,0 @@ -.. tripleo documentation master file - -============================== -Tripleo Project Specifications -============================== - -Zed Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/zed/* - -Yoga Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/yoga/* - -Xena Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/xena/* - -Wallaby Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/wallaby/* - -Victoria Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/victoria/* - -Ussuri Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/ussuri/* - -Train Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/train/* - -Stein Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/stein/* - -Rocky Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/rocky/* - -Queens Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/queens/* - -Pike Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/pike/* - -Ocata Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/ocata/* - -Newton Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/newton/* - -Mitaka Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/mitaka/* - -Liberty Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/liberty/* - -Kilo Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/kilo/* - -Juno Approved Specs: - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/juno/* - -======================== -TripleO Project Policies -======================== - -Team decisions and policies that are not limited to a specific release. - -.. toctree:: - :glob: - :maxdepth: 1 - - specs/policy/* - -================== -Indices and tables -================== - -* :ref:`search` diff --git a/doc/source/specs b/doc/source/specs deleted file mode 120000 index 87a40301..00000000 --- a/doc/source/specs +++ /dev/null @@ -1 +0,0 @@ -../../specs \ No newline at end of file diff --git a/images/split-controlplane/ceph-details.png b/images/split-controlplane/ceph-details.png deleted file mode 100644 index 0f1dc2a8..00000000 Binary files a/images/split-controlplane/ceph-details.png and /dev/null differ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f7bce985..00000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -openstackdocstheme>=2.2.1 # Apache-2.0 -sphinx>=2.0.0,!=2.1.0 # BSD -stestr>=2.0.0 # Apache-2.0 -testtools>=0.9.34 -yasfb>=0.8.0 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index ddc4f0a0..00000000 --- a/setup.cfg +++ /dev/null @@ -1,12 +0,0 @@ -[metadata] -name = tripleo-specs -summary = TripleO specs repository -description_file = - README.rst -author = OpenStack -author_email = openstack-discuss@lists.openstack.org -home_page = https://specs.openstack.org/openstack/tripleo-specs/ -classifier = - Intended Audience :: Developers - License :: OSI Approved :: Apache Software License - Operating System :: POSIX :: Linux diff --git a/setup.py b/setup.py deleted file mode 100644 index 097bada8..00000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2013 Hewlett-Packard Development Company, L.P. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# THIS FILE IS MANAGED BY THE GLOBAL REQUIREMENTS REPO - DO NOT EDIT -import setuptools - -setuptools.setup( - setup_requires=['pbr'], - py_modules=[], - pbr=True) diff --git a/specs/juno/backwards-compat-policy.rst b/specs/juno/backwards-compat-policy.rst deleted file mode 100644 index d1cfc0cd..00000000 --- a/specs/juno/backwards-compat-policy.rst +++ /dev/null @@ -1,260 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Backwards compatibility and TripleO -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-backwards-compat - -TripleO has run with good but not perfect backwards compatibility since -creation. It's time to formalise this in a documentable and testable fashion. - -TripleO will follow Semantic Versioning (aka semver_) for versioning all -releases. We will strive to avoid breaking backwards compatibility at all, and -if we have to it will be because of extenuating circumstances such as security -fixes with no other way to fix things. - -Problem Description -=================== - -TripleO has historically run with an unspoken backwards compatibility policy -but we now have too many people making changes - we need to build a single -clear policy or else our contributors will have to rework things when one -reviewer asks for backwards compat when they thought it was not needed (or vice -versa do the work to be backwards compatible when it isn't needed. - -Secondly, because we haven't marked any of our projects as 1.0.0 there is no -way for users or developers to tell when and where backwards compatibility is -needed / appropriate. - -Proposed Change -=============== - -Adopt the following high level heuristics for identifying backwards -incompatible changes: - -* Making changes that break user code that scripts or uses a public interface. - -* Becoming unable to install something we could previously. - -* Being unable to install something because someone else has altered things - - e.g. being unable to install F20 if it no longer exists on the internet - is not an incompatible change - if it were returned to the net, we'd be able - to install it again. If we remove the code to support this thing, then we're - making an incompatible change. The one exception here is unsupported - projects - e.g. unsupported releases of OpenStack, or Fedora, or Ubuntu. - Because unsupported releases are security issues, and we expect most of our - dependencies to do releases, and stop supporting things, we will not treat - cleaning up code only needed to support such an unsupported release as - backwards compatible. 
For instance, breaking the ability to deploy a previous - *still supported* OpenStack release where we had previously been able to - deploy it is a backwards incompatible change, but breaking the ability to - deploy an *unsupported* OpenStack release is not. - -Corollaries to these principles: - -* Breaking a public API (network or Python). The public API of a project is - any released API (e.g. not explicitly marked alpha/beta/rc) in a version that - is >= 1.0.0. For Python projects, a \_ prefix marks a namespace as non-public - e.g. in ``foo.\_bar.quux`` ``quux`` is not public because it's in a non-public - namespace. For our projects that accept environment variables, if the - variable is documented (in the README.md/user documentation) then the variable - is part of the public interface. Otherwise it is not. - -* Increasing the set of required parameters to Heat templates. This breaks - scripts that use TripleO to deploy. Note that adding new parameters which - need to be set when deploying *new* things is fine because the user is - doing more than just pulling in updated code. - -* Decreasing the set of accepted parameters to Heat templates. Likewise, this - breaks scripts using the Heat templates to do deploys. If the parameters are - no longer accepted because they are for no longer supported versions of - OpenStack then that is covered by the carve-out above. - -* Increasing the required metadata to use an element except when both Tuskar - and tripleo-heat-templates have been updated to use it. There is a - bi-directional dependency from t-i-e to t-h-t and back - when we change - signals in the templates we have to update t-i-e first, and when we change - parameters to elements we have to alter t-h-t first. We could choose to make - t-h-t and t-i-e completely independent, but don't believe that is a sensible - use of time - they are closely connected, even though loosely coupled. - Instead we're treating them a single unit: at any point in time t-h-t can - only guarantee to deploy images built from some minimum version of t-i-e, - and t-i-e can only guarantee to be deployed with some minimum version of - t-h-t. The public API here is t-h-t's parameters, and the link to t-i-e - is equivalent to the dependency on a helper library for a Python - library/program: requiring new minor versions of the helper library is not - generally considered to be an API break of the calling code. Upgrades will - still work with this constraint - machines will get a new image at the same - time as new metadata, with a rebuild in the middle. Downgrades / rollback - may require switching to an older template at the same time, but that was - already the case. - -* Decreasing the accepted metadata for an element if that would result in an - error or misbehaviour. - -Other sorts of changes may also be backwards incompatible, and if identified -will be treated as such - that is, this list is not comprehensive. - -We don't consider the internal structure of Heat templates to be an API, nor -any test code within the TripleO codebases (whether it may appear to be public -or not). - -TripleO's incubator is not released and has no backwards compatibility -guarantees - but a point in time incubator snapshot interacts with ongoing -releases of other components - and they will be following semver, which means -that a user wanting stability can get that as long as they don't change the -incubator. - -TripleO will promote all its component projects to 1.0 within one OpenStack -release cycle of them being created. 
Projects may not become dependencies of a -project with a 1.0 or greater version until they are at 1.0 themselves. This -restriction serves to prevent version locking (makes upgrades impossible) by -the depending version, or breakage (breaks users) if the pre 1.0 project breaks -compatibility. Adding new projects will involve creating test jobs that test -the desired interactions before the dependency is added, so that the API can -be validated before the new project has reached 1.0. - -Adopt the following rule on *when* we are willing to [deliberately] break -backwards compatibility: - -* When all known uses of the code are for no longer supported OpenStack - releases. - -* If the PTL signs off on the break. E.g. a high impact security fix for which - we cannot figure out a backwards compatible way to deliver it to our users - and distributors. - -We also need to: - -* Set a timeline for new codebases to become mature (one cycle). Existing - codebases will have the clock start when this specification is approved. - -* Set rules for allowing anyone to depend on new codebases (codebase must be - 1.0.0). - -* Document what backwards compatible means in the context of heat templates and - elements. - -* Add an explicit test job for deploying Icehouse from trunk, because that will - tell us about our ability to deploy currently supported OpenStack versions - which we could previously deploy - that failing would indicate the proposed - patch is backwards incompatible. - -* If needed either fix Icehouse, or take a consensus decision to exclude - Icehouse support from this policy. - -* Commit to preserving backwards compatibility. - -* When we need alternate codepaths to support backwards compatibility we will - mark them clearly to facilitate future cleanup:: - - # Backwards compatibility: <....> - if .. - # Trunk - ... - elif - # Icehouse - ... - else - # Havana - ... - -Alternatives ------------- - -* We could say that we don't do backwards compatibility and release like the - OpenStack API services do, but this makes working with us really difficult - and it also forces folk with stable support desires to work from separate - branches rather than being able to collaborate on a single codebase. - -* We could treat tripleo-heat-templates and tripleo-image-elements separately - to the individual components and run them under different rules - e.g. using - stable branches rather than semver. But there have been so few times that - backwards compatibility would be hard for us that this doesn't seem worth - doing. - -Security Impact ---------------- - -Keeping code around longer may have security considerations, but this is a -well known interaction. - -Other End User Impact ---------------------- - -End users will love us. - -Performance Impact ------------------- - -None anticipated. Images will be a marginally larger due to carrying backwards -compat code around. - -Other Deployer Impact ---------------------- - -Deployers will appreciate not having to rework things. Not that they have had -to, but still. - -Developer Impact ----------------- - -Developers will have clear expectations set about backwards compatibility which -will help them avoid being asked to rework things. They and reviewers will need -to look out for backward incompatible changes and special case handling of -them to deliver the compatibility we aspire to. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - lifeless - -Other contributors: - -Work Items ----------- - -* Draft this spec. 
- -* Get consensus around it. - -* Release all our non-incubator projects as 1.0.0. - -* Add Icehouse deploy test job. (Because we could install Icehouse at the start - of Juno, and if we get in fast we can keep being able to do so). - -Dependencies -============ - -None. An argument could be made for doing a quick cleanup of stuff, but the -reality is that it's not such a burden we've had to clean it up yet. - -Testing -======= - -To ensure we don't accidentally break backwards compatibility we should look -at the oslo cross-project matrix eventually - e.g. run os-refresh-config -against older releases of os-apply-config to ensure we're not breaking -compatibility. Our general policy of building releases of things and using -those goes a long way to giving us good confidence though - we can be fairly -sure of no single-step regressions (but will still have to watch out for -N-step regressions unless some mechanism is put in place). - -Documentation Impact -==================== - -The users manual and developer guides should reflect this. - -References -========== - -.. _semver: http://docs.openstack.org/developer/pbr/semver.html diff --git a/specs/juno/haproxy_configuration.rst b/specs/juno/haproxy_configuration.rst deleted file mode 100644 index a088e681..00000000 --- a/specs/juno/haproxy_configuration.rst +++ /dev/null @@ -1,229 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================================ -Haproxy ports and related services configuration -================================================ - -Blueprint: https://blueprints.launchpad.net/tripleo/+spec/tripleo-haproxy-configuration - -Current spec provides options for HA endpoints delivery via haproxy. - - -Problem Description -=================== - -Current tripleo deployment scheme binds services on 0.0.0.0:standard_port, -with stunnel configured to listen on ssl ports. - -This configuration has some drawbacks and wont work in ha, for several reasons: - -* haproxy cant bind on : - openstack services are - bound to 0.0.0.0: - -* services ports hardcoded in many places (any_service.conf, init-keystone), - so changing them and configuring from heat would be a lot of pain - -* the non-ssl endpoint is reachable from outside the local host, - which could potentially confuse users and expose them to an insecure connection - in the case where we want to run that service on SSL only. We want to offer SSL - by default but we can't really prevent it. - -Proposed Change -=============== - -We will bind haproxy, stunnel (ssl), openstack services on ports with -different ipaddress settings. - -HAProxy will be bound to VIP addresses only. - -STunnel where it is used will be bound to the controller ctlplane address. - -OpenStack services will bind to localhost for SSL only configurations, and to -the ctlplane address for non-SSL or mixed-mode configurations. They will bind -to the standard non-encrypted ports, but will never bind to 0.0.0.0 on any -port. - -We'll strive to make SSL-only the default. - -An example, using horizon in mixed mode (HTTPS and HTTP): - -vip_address = 192.0.2.21 -node_address = 192.0.2.24 - -1. haproxy - listen horizon_http - bind vip_address:80 - server node_1 node_address:80 - listen horizon_https - bind vip_address:443 - server node_1 node_address:443 - -2. stunnel - accept node_address:443 - connect node_address:80 - -3. 
horizon - bind node_address:80 - -A second example, using horizon in HTTPS only mode: - -vip_address = 192.0.2.21 -node_address = 192.0.2.24 - -1. haproxy - listen horizon_https - bind vip_address:443 - server node_1 node_address:443 - -2. stunnel - accept node_address:443 - connect 127.0.0.1:80 - -3. horizon - bind 127.0.0.1:80 - -Alternatives ------------- - -There are several alternatives which do not cover all the requirements for -security or extensibility - -Option 1: Assignment of different ports for haproxy, stunnel, openstack services on 0.0.0.0 - -* requires additional firewall configuration -* security issue with non-ssl services endpoints - -1. haproxy - bind :80 - - listen horizon - server node_1 node_address:8800 - -2. stunnel - accept :8800 - connect :8880 - -3. horizon - bind :8880 - -Option 2: Using only haproxy ssl termination is suboptimal: - -* 1.5 is still in devel phase -> potential stability issues -* we would have to get this into supported distros -* this also means that there is no SSL between haproxy and real service -* security issue with non-ssl services endpoints - -1. haproxy - bind vip_address:80 - - listen horizon - server node_1 node_address:80 - -2. horizon - bind node_address:80 - -Option 3: Add additional ssl termination before load-balancer - -* not useful in current configuration because load balancer (haproxy) - and openstack services installed on same nodes - -Security Impact ---------------- - -* Only ssl protected endpoints are publicly available if running SSL only. -* Minimal firewall configuration -* Not forwarding decrypted traffic over non-localhost connections -* compromise of a control node exposes all external traffic (future and possibly past) - to decryption and/or spoofing - -Other End User Impact ---------------------- - -Several services will listen on same port, but it will be quite easy -to understand if user (operator) will know some context. - - -Performance Impact ------------------- - -No differences between approaches. - -Other Deployer Impact ---------------------- -None - -Developer Impact ----------------- -None - -Implementation -============== - -We need to make the service configs - nova etc - know on a per service basis -where to bind. The current approach uses logic in the template to choose -between localhost and my_ip. If we move the selection into Heat this can -become a lot simpler (read a bind address, if set use it, if not don't). - -We considered extending the connect_ip concept to be on a per service basis. -Right now all services are exposed to both SSL and plain, so this would be -workable until we get a situation where only some services are plain - but we -expect that sooner rather than later. 
- -Assignee(s) ------------ - -Primary assignee: - dshulyak - - -Work Items ----------- - -tripleo-incubator: -* build overcloud-control image with haproxy element - -tripleo-image-elements: - -* openstack-ssl element refactoring - -* refactor services configs to listen on 127.0.0.1 / ctlplane address: - horizon apache configuration, glance, nova, cinder, swift, ceilometer, - neutron, heat, keystone, trove - -tripleo-heat-templates: -* add haproxy metadata to heat-templates - - -Dependencies -============ -None - - -Testing -======= -CI testing dependencies: - -* use vip endpoints in overcloud scripts - -* add haproxy element to overcloud-control image (maybe with stats enabled) before - adding haproxy related metadata to heat templates - - -Documentation Impact -==================== - -* update incubator manual - -* update elements README.md - - -References -========== - -http://haproxy.1wt.eu/download/1.4/doc/configuration.txt - -https://www.stunnel.org/howto.html diff --git a/specs/juno/network_configuration.rst b/specs/juno/network_configuration.rst deleted file mode 100644 index 11e9e599..00000000 --- a/specs/juno/network_configuration.rst +++ /dev/null @@ -1,272 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -TripleO network configuration -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/os-net-config - -We need a tool (or tools) to help configure host level networking -in TripleO. This includes things like: - - * Static IPs - - * Multiple OVS bridges - - * Bonding - - * VLANs - -Problem Description -=================== - -Today in TripleO we bootstrap nodes using DHCP so they can download -custom per node metadata from Heat. This metadata contains per instance -network information that allows us to create a customized host level network -configuration. - -Today this is accomplished via two scripts: - - * ensure-bridge: http://git.openstack.org/cgit/openstack/tripleo-image-elements/tree/elements/network-utils/bin/ensure-bridge - * init-neutron-ovs: http://git.openstack.org/cgit/openstack/tripleo-image-elements/tree/elements/neutron-openvswitch-agent/bin/init-neutron-ovs - -The problem with the existing scripts is that their feature set is extremely -prescriptive and limited. Today we only support bridging a single NIC -onto an OVS bridge, VLAN support is limited and more advanced configuration -(of even common IP address attributes like MTUs, etc) is not possible. - -Furthermore we also desire some level of control over how networking changes -are made and whether they are persistent. In this regard a provider layer -would be useful so that users can choose between using for example: - - * ifcfg/eni scripts: used where persistence is required and we want - to configure interfaces using the distro supported defaults - * iproute2: used to provide optimized/streamlined network configuration - which may or may not also include persistence - -Our capabilities are currently limited to the extent that we are unable -to fully provision our TripleO CI overclouds without making manual -changes and/or hacks to images themselves. As such we need to -expand our host level network capabilities. - -Proposed Change -=============== - -Create a new python project which encapsulates host level network configuration. 
- -This will likely consist of: - - * an internal python library to facilitate host level network configuration - - * a binary which processes a YAML (or JSON) format and makes the associated - python library calls to configure host level networking. - -By following this design the tool should work well with Heat driven -metadata and provide us the future option of moving some of the -library code into Oslo (oslo.network?) or perhaps Neutron itself. - -The tool will support a "provider" layer such that multiple implementations -can drive the host level network configuration (iproute2, ifcfg, eni). -This is important because as new network config formats are adopted -by distributions we may want to gradually start making use of them -(thinking ahead to systemd.network for example). - -The tool will also need to be extensible such that we can add new -configuration options over time. We may for example want to add -more advanced bondings options at a later point in time... and -this should be as easy as possible. - -The focus of the tool initially will be host level network configuration -for existing TripleO features (interfaces, bridges, vlans) in a much -more flexible manner. While we support these things today in a prescriptive -manner the new tool will immediately support multiple bridges, interfaces, -and vlans that can be created in an ad-hoc manner. Heat templates can be -created to drive common configurations and people can customize those -as needed for more advanced networking setups. - -The initial implementation will focus on persistent configuration formats -for ifcfg and eni, like we do today via ensure-bridge. This will help us -continue to make steps towards bringing bare metal machines back online -after a power outage (providing a static IP for the DHCP server for example). - -The primary focus of this tool should always be host level network -configuration and fine tuning that we can't easily do within Neutron itself. -Over time the scope and concept of the tool may shift as Neutron features are -added and/or subtracted. - - -Alternatives ------------- - -One alternative is to keep expanding ensure-bridge and init-neutron-ovs -which would require a significant number of new bash options and arguments to -configure all the new features (vlans, bonds, etc.). - -Many of the deployment projects within the OpenStack ecosystem are doing -similar sorts of networking today. Consider: - - * Chef/Crowbar: https://github.com/opencrowbar/core/blob/master/chef/cookbooks/network/recipes/default.rb - * Fuel: https://github.com/stackforge/fuel-library/tree/master/deployment/puppet/l23network - * VDSM (GPL): contains code to configure interfaces, both ifcfg and iproute2 abstractions (git clone http://gerrit.ovirt.org/p/vdsm.git, then look at vdsm/vdsm/network/configurators) - * Netconf: heavy handed for this perhaps but interesting (OpenDaylight, etc) - -Most of these options are undesirable because they would add a significant -number of dependencies to TripleO. - -Security Impact ---------------- - -The configuration data used by this tool is already admin-oriented in -nature and will continue to be provided by Heat. As such there should -be no user facing security concerns with regards to access to the -configuration data that aren't already present. - -This implementation will directly impact the low level network connectivity -in all layers of TripleO including the seed, undercloud, and overcloud -networks. 
Any of the host level networking that isn't already provided -by Neutron is likely affected. - -Other End User Impact ---------------------- - -This feature enables deployers to build out more advanced undercloud and -overcloud networks and as such should help improve the reliability and -performance of the fundamental host network capabilities in TripleO. - -End users should benefit from these efforts. - -Performance Impact ------------------- - -This feature will allow us to build better/more advanced networks and as -such should help improve performance. In particular the interface bonding -and VLAN support should help in this regard. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -None - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Dan Prince (dan-prince on Launchpad) - -Work Items ----------- - - * Create project on GitHub: os-net-config - - * Import project into openstack-infra, get unit tests gating, etc. - - * Build a python library to configure host level networking with - an initial focus on parity with what we already have including things - we absolutely need for our TripleO CI overcloud networks. - - The library will consist of an object model which will allow users to - create interfaces, bridges, and vlans, and bonds (optional). Each of - these types will act as a container for address objects (IPv4 and IPv6) - and routes (multiple routes may be defined). Additionally, each - object will include options to enable/disable DHCP and set the MTU. - - * Create provider layers for ifcfg/eni. The providers take an object - model and apply it ("make it so"). The ifcfg provider will write out - persistent config files in /etc/sysconfig/network-scripts/ifcfg- - and use ifup/ifdown to start and stop the interfaces when an change - has been made. The eni provider will write out configurations to - /etc/network/interfaces and likewise use ifup/ifdown to start and - stop interfaces when a change has been made. - - * Create a provider layer for iproute2. Optional, can be done at - a later time. This provider will most likely not use persistent - formats and will run various ip/vconfig/route commands to - configure host level networking for a given object model. - - * Create a binary that processes a YAML config file format and makes - the correct python library calls. The binary should be idempotent - in that running the binary once with a given configuration should - "make it so". Running it a second time with the same configuration - should do nothing (i.e. it is safe to run multiple times). An example - YAML configuration format is listed below which describes a single - OVS bridge with an attached interface, this would match what - ensure-bridge creates today: - -.. code-block:: yaml - - network_config: - - - type: ovs_bridge - name: br-ctlplane - use_dhcp: true - ovs_extra: - - br-set-external-id br-ctlplane bridge-id br-ctlplane - members: - - - type: interface - name: em1 - -.. - - The above format uses a nested approach to define an interface - attached to a bridge. - - * TripleO element to install os-net-config. Most likely using - pip (but we may use git initially until it is released). - - * Wire this up to TripleO...get it all working together using the - existing Heat metadata formats. This would include any documentation - changes to tripleo-incubator, deprecating old elements, etc. - - * TripleO heat template changes to use the new YAML/JSON formats. 
Our default - configuration would most likely do exactly what we do today (OVS bridge - with a single attached interface). We may want to create some other example - heat templates which can be used in other environments (multi-bridge - setups like we use for our CI overclouds for example). - - -Dependencies -============ - -None - -Testing -======= - -Existing TripleO CI will help ensure that as we implement this we maintain -parity with the current feature set. - -The ability to provision and make use of our Triple CI clouds without -custom modifications/hacks will also be a proving ground for much of -the work here. - -Additional manual testing may be required for some of the more advanced -modes of operation (bonding, VLANs, etc.) - -Documentation Impact -==================== - -The recommended heat metadata used for network configuration may -change as result of this feature. Older formats will be preserved for -backwards compatibility. - -References -========== - -Notes from the Atlanta summit session on this topic can be found -here (includes possible YAML config formats): - - * https://etherpad.openstack.org/p/tripleo-network-configuration diff --git a/specs/juno/oac-header.rst b/specs/juno/oac-header.rst deleted file mode 100644 index cd45fa3b..00000000 --- a/specs/juno/oac-header.rst +++ /dev/null @@ -1,162 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -===================================== -Control mechanism for os-apply-config -===================================== - -Problem Description -=================== - -We require a control mechanism in os-apply-config (oac). This could be used, -for example, to: - - * Not create an empty target - * Set permissions on the target - -Proposed Change -=============== - -The basic proposal is to parameterise oac with maps (aka dictionaries) -containing control data. These maps will be supplied as YAML in companion -control files. Each file will be named after the template it relates to, with a -".oac" suffix. For example, the file "abc/foo.sh" would be controlled by -"abc/foo.sh.oac". - -Only control files with matching templates files will be respected, IE the file -"foo" must exist for the control file "foo.oac" to have any effect. A dib-lint -check will be added to look for file control files without matching templates, -as this may indicate a template has been moved without its control file. - -Directories may also have control files. In this case, the control file must be -inside the directory and be named exactly "oac". A file either named "oac" or -with the control file suffix ".oac" will never be considered as templates. - -The YAML in the control file must evaluate to nothing or a mapping. The former -allows for the whole mapping having been commented out. The presence of -unrecognised keys in the mapping is an error. File and directory control keys -are distinct but may share names. If they do, they should also share similar -semantics. - -Example control file:: - - key1: true - key2: 0700 - # comment - key3: - - 1 - - 2 - -To make the design concrete, one file control key will be offered initially: -allow_empty. This expects a Boolean value and defaults to true. If it is true, -oac will behave as it does today. Otherwise, if after substitutions the -template body is empty, no file will be created at the target path and any -existing file there will be deleted. 
- -allow_empty will also be allowed as a directory control key. Again, it will -expect a Boolean value and default to true. Given a nested structure -"A/B/C/foo", where "foo" is an empty file with allow_empty=false: - - * C has allow_empty=false: A/B/ is created, C is not. - * B has allow_empty=false: A/B/C/ is created. - * B and C have allow_empty=false: Only A/ is created. - -It is expected that additional keys will be proposed soon after this spec is -approved. - -Alternatives ------------- - -A fenced header could be used rather than a separate control file. Although -this aids visibility of the control data, it is less consistent with control -files for directories and (should they be added later) symlinks. - -The directory control file name has been the subject of some debate. -Alternatives to control "foo/" include: - - * foo/.oac (not visible with unmodified "ls") - * foo/oac.control (longer) - * foo/control (generic) - * foo.oac (if foo/ is empty, can't be stored in git) - * foo/foo.oac (masks control file for foo/foo) - -Security Impact ---------------- - -None. The user is already in full control of the target environment. For -example, they could use the allow_empty key to delete a critical file. However -they could already simply provide a bash script to do the same. Further, the -resulting image will be running on their (or their customer's) hardware, so it -would be their own foot they'd be shooting. - -Other End User Impact ---------------------- - -None. - -Performance Impact ------------------- - -None. - -Other Deployer Impact ---------------------- - -None. - -Developer Impact ----------------- - -It will no longer be possible to create files named "oac" or with the suffix -".oac" using oac. This will not affect any elements currently within -diskimage-builder or tripleo-image-elements. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - alexisl (aka lxsli, Alexis Lee) - -Other contributors: - None - -Work Items ----------- - - * Support file control files in oac - * Support the allow_empty file control key - * Add dib-lint check for detached control files - * Support directory control files in oac - * Support the allow_empty directory control key - * Update the oac README - -Dependencies -============ - -None. - -Testing -======= - -This change is easily tested using standard unit test techniques. - -Documentation Impact -==================== - -The oac README must be updated. - -References -========== - -There has already been some significant discussion of this feature: - https://blueprints.launchpad.net/tripleo/+spec/oac-header - -There is a bug open for which an oac control mechanism would be useful: - https://bugs.launchpad.net/os-apply-config/+bug/1258351 diff --git a/specs/juno/promote-heat-env.rst b/specs/juno/promote-heat-env.rst deleted file mode 100644 index c625fe12..00000000 --- a/specs/juno/promote-heat-env.rst +++ /dev/null @@ -1,258 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================ -Promote HEAT_ENV -================ - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-promote-heat-env - -Promote values set in the Heat environment file to take precedence over -input environment variables. - -Problem Description -=================== - -Historically TripleO scripts have consulted the environment for many items of -configuration. 
This raises risks of scope leakage and the number of environment -variables required often forces users to manage their environment with scripts. -Consequently, there's a push to prefer data files like the Heat environment -file (HEAT_ENV) which may be set by passing -e to Heat. To allow this file to -provide an unambiguous source of truth, the environment must not be allowed to -override the values from this file. That is to say, precedence must be -transferred. - -A key distinction is whether the value of an environment variable is obtained -from the environment passed to it by its parent process (either directly or -through derivation). Those which are will be referred to as "input variables" -and are deprecated by this spec. Those which are not will be called "local -variables" and may be introduced freely. Variables containing values -synthesised from multiple sources must be handled on a case-by-case basis. - - -Proposed Change -=============== - -Since changes I5b7c8a27a9348d850d1a6e4ab79304cf13697828 and -I42a9d4b85edcc99d13f7525e964baf214cdb7cbf, ENV_JSON (the contents of the file -named by HEAT_ENV) is constructed in devtest_undercloud.sh like so:: - - ENV_JSON=$(jq '.parameters = { - "MysqlInnodbBufferPoolSize": 100 - } + .parameters + { - "AdminPassword": "'"${UNDERCLOUD_ADMIN_PASSWORD}"'", - "AdminToken": "'"${UNDERCLOUD_ADMIN_TOKEN}"'", - "CeilometerPassword": "'"${UNDERCLOUD_CEILOMETER_PASSWORD}"'", - "GlancePassword": "'"${UNDERCLOUD_GLANCE_PASSWORD}"'", - "HeatPassword": "'"${UNDERCLOUD_HEAT_PASSWORD}"'", - "NovaPassword": "'"${UNDERCLOUD_NOVA_PASSWORD}"'", - "NeutronPassword": "'"${UNDERCLOUD_NEUTRON_PASSWORD}"'", - "NeutronPublicInterface": "'"${NeutronPublicInterface}"'", - "undercloudImage": "'"${UNDERCLOUD_ID}"'", - "BaremetalArch": "'"${NODE_ARCH}"'", - "PowerSSHPrivateKey": "'"${POWER_KEY}"'", - "NtpServer": "'"${UNDERCLOUD_NTP_SERVER}"'" - }' <<< $ENV_JSON) - -This is broadly equivalent to "A + B + C", where values from B override those -from A and values from C override those from either. Currently section C -contains a mix of input variables and local variables. It is proposed that -current and future environment variables are allocated such that: - -* A only contains default values. -* B is the contents of the HEAT_ENV file (from either the user or a prior run). -* C only contains computed values (from local variables). - -The following are currently in section C but are not local vars:: - - NeutronPublicInterface (default 'eth0') - UNDERCLOUD_NTP_SERVER (default '') - -The input variables will be ignored and the defaults moved into section A:: - - ENV_JSON=$(jq '.parameters = { - "MysqlInnodbBufferPoolSize": 100, - "NeutronPublicInterface": "eth0", - "NtpServer": "" - } + .parameters + { - ... elided ... - }' <<< $ENV_JSON) - -devtest_overcloud.sh will be dealt with similarly. These are the variables -which need to be removed and their defaults added to section A:: - - OVERCLOUD_NAME (default '') - OVERCLOUD_HYPERVISOR_PHYSICAL_BRIDGE (default '') - OVERCLOUD_HYPERVISOR_PUBLIC_INTERFACE (default '') - OVERCLOUD_BRIDGE_MAPPINGS (default '') - OVERCLOUD_FLAT_NETWORKS (default '') - NeutronPublicInterface (default 'eth0') - OVERCLOUD_LIBVIRT_TYPE (default 'qemu') - OVERCLOUD_NTP_SERVER (default '') - -Only one out of all these input variables is used outside of these two scripts -and consequently the rest are safe to remove. - -The exception is OVERCLOUD_LIBVIRT_TYPE. This is saved by the script -'write-tripleorc'. 
As it will now be preserved in HEAT_ENV, it does not need to -also be preserved by write-tripleorc and can be removed from there. - ----- - -So that users know they need to start setting these values through HEAT_ENV -rather than input variables, it is further proposed that for an interim period -each script echo a message to STDERR if deprecated input variables are set. For -example:: - - for OLD_VAR in OVERCLOUD_NAME; do - if [ ! -z "${!OLD_VAR}" ]; then - echo "WARNING: ${OLD_VAR} is deprecated, please set this in the" \ - "HEAT_ENV file (${HEAT_ENV})" 1>&2 - fi - done - ----- - -To separate user input from generated values further, it is proposed that user -values be read from a new file - USER_HEAT_ENV. This will default to -{under,over}cloud-user-env.json. A new commandline parameter, --user-heat-env, -will be added to both scripts so that this can be changed. - -#. ENV_JSON is initialised with default values. -#. ENV_JSON is overlaid by HEAT_ENV. -#. ENV_JSON is overlaid by USER_HEAT_ENV. -#. ENV_JSON is overlaid by computed values. -#. ENV_JSON is saved to HEAT_ENV. - -See http://paste.openstack.org/show/83551/ for an example of how to accomplish -this. In short:: - - ENV_JSON=$(cat ${HEAT_ENV} ${USER_HEAT_ENV} | jq -s ' - .[0] + .[1] + {"parameters": - ({..defaults..} + .[0].parameters + {..computed..} + .[1].parameters)}') - cat > "${HEAT_ENV}" <<< ${ENV_JSON} - -Choosing to move user data into a new file, compared to moving the merged data, -makes USER_HEAT_ENV optional. If users wish, they can continue providing their -values in HEAT_ENV. The complementary solution requires users to clean -precomputed values out of HEAT_ENV, or they risk unintentionally preventing the -values from being recomputed. - -Loading computed values after user values sacrifices user control in favour of -correctness. Considering that any devtest user must be rather technical, if a -computation is incorrect they can fix or at least hack the computation -themselves. - -Alternatives ------------- - -Instead of removing the input variables entirely, an interim form could be -used:: - - ENV_JSON=$(jq '.parameters = { - "MysqlInnodbBufferPoolSize": 100, - "NeutronPublicInterface": "'"${NeutronPublicInterface}"'", - "NtpServer": "'"${UNDERCLOUD_NTP_SERVER}"'" - } + .parameters + { - ... - } - -However, the input variables would only have an effect if the keys they affect -are not present in HEAT_ENV. As HEAT_ENV is written each time devtest runs, the -keys will usually be present unless the file is deleted each time (rendering it -pointless). So this form is more likely to cause confusion than aid -transition. - ----- - -jq includes an 'alternative operator', ``//``, which is intended for providing -defaults:: - - A filter of the form a // b produces the same results as a, if a produces - results other than false and null. Otherwise, a // b produces the same - results as b. - -This has not been used in the proposal for two reasons: - -#. It only works on individual keys, not whole maps. -#. It doesn't work in jq 1.2, still included by Ubuntu 13.04 (Saucy). - -Security Impact ---------------- - -None. - -Other End User Impact ---------------------- - -An announcement will be made on the mailing list when this change merges. This -coupled with the warnings given if the deprecated variables are set should -provide sufficient notice. - -As HEAT_ENV is rewritten every time devtest executes, we can safely assume it -matches the last environment used. 
However users who use scripts to switch -their environment may be surprised. Overall the change should be a benefit to -these users, as they can use two separate HEAT_ENV files (passing --heat-env to -specify which to activate) instead of needing to maintain scripts to set up -their environment and risking settings leaking from one to the other. - -Performance Impact ------------------- - -None. - -Other Deployer Impact ---------------------- - -None. - -Developer Impact ----------------- - -None. - - -Implementation -============== - -Assignee(s) ------------ - -lxsli - -Work Items ----------- - -* Add USER_HEAT_ENV to both scripts. -* Move variables in both scripts. -* Add deprecated variables warning to both scripts. -* Remove OVERCLOUD_LIBVIRT_TYPE from write-tripleorc. - - -Dependencies -============ - -None. - - -Testing -======= - -The change will be tested in isolation from the rest of the script. - - -Documentation Impact -==================== - -* Update usage docs with env var deprecation warnings. -* Update usage docs to recommend HEAT_ENV. - - -References -========== - -#. http://stedolan.github.io/jq/manual/ - JQ manual -#. http://jqplay.herokuapp.com/ - JQ interactive demo diff --git a/specs/juno/ssl_pki.rst b/specs/juno/ssl_pki.rst deleted file mode 100644 index e4d0403b..00000000 --- a/specs/juno/ssl_pki.rst +++ /dev/null @@ -1,169 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======= -SSL PKI -======= - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-ssl-pki - -Each of our clouds require multiple ssl certificates to operate. We need to -support generating these certificates in devtest in a manner which will -closely resemble the needs of an actual deployment. We also need to support -interfacing with the PKI (Public Key Infrastructure) of existing organizations. -This spec outlines the ways we will address these needs. - -Problem Description -=================== - -We have a handful of services which require SSL certificates: - - * Keystone - * Public APIs - * Galera replication - * RabbitMQ replication - -Developers need to have these certificates generated automatically for them, -while organizations will likely want to make use of their existing PKI. We -have not made clear at what level we will manage these certificates and/or -their CA(s) and at what level the user will be responsible for them. This is -further complicated by the Public API's likely having a different CA than the -internal-only facing services. - -Proposed Change -=============== - -Each of these services will accept their SSL certificate, key, and CA via -environment JSON (heat templates for over/undercloud, config.json for seed). - -At the most granular level, a user can specify these values by editing the -over/undercloud-env.json or config.json files. If a certificate and key is -specified for a service then we will not attempt to automatically generate one -for that service. If only a certificate or key is specified it is considered -an error. - -If no certificate and key is specified for a service, we will attempt to -generate a certificate and key, and sign the certificate with a self-signed -CA we generate. Both the undercloud and seed will share a self-signed CA in -this scenario, and each overcloud will have a separate self-signed CA. We will -also add this self-signed CA to the chain of trust for hosts which use services -of the cloud being created. 
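-
-As a rough illustration only, the automatic generation described above is
-broadly equivalent to the following ``openssl`` commands (file names and
-subject values are placeholders rather than the actual os-cloud-config
-implementation)::
-
-    # Create the shared self-signed CA for the seed and undercloud
-    openssl genrsa -out ca.key 2048
-    openssl req -x509 -new -key ca.key -days 365 \
-        -subj "/CN=tripleo-selfsigned-ca" -out ca.crt
-
-    # Generate a key for one service and sign its certificate with that CA
-    openssl genrsa -out keystone.key 2048
-    openssl req -new -key keystone.key \
-        -subj "/CN=keystone.undercloud.example" -out keystone.csr
-    openssl x509 -req -in keystone.csr -CA ca.crt -CAkey ca.key \
-        -CAcreateserial -days 365 -out keystone.crt
-    chmod 0600 ca.key keystone.key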
- -The use of a custom CA for signing the automatically generated certificates -will be solved in a future iteration. - -Alternatives ------------- - -None presented thus far. - -Security Impact ---------------- - -This change has high security impact as it affects our PKI. We currently do not -have any SSL support, and implementing this should therefore improve our -security. We should ensure all key files we create in this change have file -permissions of 0600 and that the directories they reside in have permissions -of 0700. - -There are many security implications for SSL key generation (including entropy -availability) and we defer to the OpenStack Security Guide[1] for this. - -Other End User Impact ---------------------- - -Users can interact with this feature by editing the under/overcloud-env.json -files and the seed config.json file. Additionally, the current properties which -are used for specifying the keystone CA and certificate will be changed to -support a more general naming scheme. - -Performance Impact ------------------- - -We will be performing key generation which can require a reasonable amount of -resources, including entropy sources. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -More SSL keys will be generated for developers. Debugging via monitoring -network traffic can also be more difficult once SSL is adopted. Production -environments will also require SSL unwrapping to debug network traffic, so this -will allow us to closer emulate production (developers can now spot missing SSL -wrapping). - -Implementation -============== - -The code behind generate-keystone-pki in os-cloud-config will be generalized -to support creation of a CA and certificates separately, and support creation -of multiple certificates using a single CA. A new script will be created -named 'generate-ssl-cert' which accepts a heat environment JSON file and a -service name. This will add ssl.certificate and ssl.certificate_key properties -under the servicename property (an example is below). If no ssl.ca_certificate -and ssl.ca_certificate_key properties are defined then this script will perform -generation of the self-signed certificate. - -Example heat environment output:: - - { - "ssl": { - "ca_certificate": "", - "ca_key": "" - }, - "horizon" { - "ssl": { - "ca_certificate": "", - "ca_certificate_key": "" - }, - ... - }, - ... - } - -Assignee(s) ------------ - -Primary assignee: - greghaynes - -Work Items ----------- - - * Generalize CA/certificate creation in os-cloud-config. - * Add detection logic for certificate key pairs in -env.json files to devtest - * Make devtest scripts call CA/cert creation scripts if no cert is found - for a service - -Dependencies -============ - -The services listed above are not all set up to use SSL certificates yet. This -is required before we can add detection logic for user specified certificates -for all services. - -Testing -======= - -Tests for new functionality will be made to os-cloud-config. The default -behavior for devtest is designed to closely mimic a production setup, allowing -us to best make use of our CI. - -Documentation Impact -==================== - -We will need to document the new interfaces described in 'Other End User -Impact'. - -References -========== - -1. 
Openstack Security Guide: http://docs.openstack.org/security-guide/content/ diff --git a/specs/juno/tripleo-juno-ci-improvements.rst b/specs/juno/tripleo-juno-ci-improvements.rst deleted file mode 100644 index 380566b9..00000000 --- a/specs/juno/tripleo-juno-ci-improvements.rst +++ /dev/null @@ -1,269 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================== -Triple CI improvements -====================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-ci-improvements - -Tripleo CI is painful at the moment, we have problems with both reliability -and consistency of running job times, this spec is intended to address a -number of the problems we have been facing. - -Problem Description -=================== - -Developers should be able to depend on CI to produce reliable test results with -a minimum number of false negatives reported in a timely fashion, this -currently isn't the case. To date the reliability of tripleo ci has been -heavily effected by network glitches, availability of network resources and -reliability of the CI clouds. This spec is intended to deal with the problems -we have been seeing. - -**Problem :** Reliability of hp1 (hp1_reliability_) - Intermittent failures on jobs running on the hp1 cloud have been causing a - large number of job failures and sometimes taking this region down - altogether. Current thinking is that the root of most of these issues is - problems with a mellanox driver. - -**Problem :** Unreliable access to network resources (net_reliability_) - Gaining reliable access to various network resources has been inconsistent - causing a CI outage when any one network resource is unavailable. Also - inconsistent speeds downloading these resources can make it difficult to - gauge overall speed improvements made to tripleo. - -**Problem :** (system_health_) The health of the overall CI system isn't - immediately obvious, problems often persist for hours (or occasionally days) - before we react to them. - -**Problem :** (ci_run_times_) The tripleo devtest story takes time to run, - this uses up CI resources and developer's time, where possible we should - reduce the time required to run devtest. - -**Problem :** (inefficient_usage_) Hardware on which to run tripleo is a finite - resource, there is a spec in place to run devtest on an openstack - deployment[1], this is the best way forward in order to use the resources we - have in the most efficient way possible. We also have a number of options to - explore that would help minimise resource wastage. - -**Problem :** (system_feedback_) Our CI provides no feedback about trends. - A good CI system should be more than a system that reports pass or fail, we - should be getting feedback on metrics allowing us to observe degradations, - where possible we should make use of services already provided by infra. - This will allow us to proactively intervene as CI begins to degrade? - -**Problem :** (bug_frequency_) We currently have no indication of which CI - bugs are occurring most often. This frustrates efforts to make CI more - reliable. - -**Problem :** (test_coverage_) Currently CI only tests a subset of what it - should. - - -Proposed Change -=============== - -There are a number of changes required in order to address the problems we have -been seeing, each listed here (in order of priority). - -.. 
_hp1_reliability: - -**Solution :** - -* Temporarily scale back on CI by removing one of the overcloud jobs (so rh1 has - the capacity to run CI Solo). -* Remove hp1 from the configuration. -* Run burn-in tests on each hp1 host, removing(or repairing) failing hosts. - Burn-in tests should consist of running CI on a newly deployed cloud matching - the load expected to run on the region. Any failure rate should not exceed - that of currently deployed regions. -* Redeploy testing infrastructure on hp1 and test with tempest, this redeploy - should be done with our tripleo scripts so it can be repeated and we - are sure of parity between ci-overcloud deployments. -* Place hp1 back into CI and monitor situation. -* Add back any removed CI jobs. -* Ensure burn-in / tempest tests are followed on future regions being deployed. -* Attempts should be made to deal with problems that develop on already - deployed clouds, if it becomes obvious they can't be quickly dealt with after - 48 hours they should be temporarily removed from the CI infrastructure and will - need to pass the burn-in tests before being added back into production. - -.. _net_reliability: - -**Solution :** - -* Deploy a mirror of pypi.openstack.org on each Region. -* Deploy a mirror of the Fedora and Ubuntu package repositories on each region. -* Deploy squid in each region and cache http traffic through it, mirroring - where possible should be considered our preference but having squid in place - should cache any resources not mirrored. -* Mirror other resources (e.g. github.com, percona tarballs etc..). -* Any new requirements added to devtest should be cachable with caches in - place before the requirement is added. - -.. _system_health: - -**Solution :** - -* Monitor our CI clouds and testenvs with Icinga, monitoring should include - ping, starting (and connecting to) new instances, disk usage etc.... -* Monitor CI test results and trigger an alert if "X" number of jobs of the - same type fail in succession. An example of using logstash to monitor CI - results can be found here[5]. - -Once consistency is no longer a problem we will investigate speed improvements -we can make on the speed of CI jobs. - -.. _ci_run_times: - -**Solution :** - -* Investigate if unsafe disk caching strategies will speed up disk image - creation, if an improvement is found implement it in production CI by one of - - * run "unsafe" disk caching strategy on ci cloud VM's (would involve exposing - this libvirt option via the nova api). - * use "eatmydata" to noop disk sync system calls, not currently - packaged for F20 but we could try and restart that process[2]. - - -.. _inefficient_usage: - -**Solution :** - -* Abandon on failure : adding a feature to zuul (or turning it on if it already - exists) to abandon all jobs in a queue for a particular commit as soon as a - voting commit fails. This would minimize usage of resources running long - running jobs that we already know will have to be rechecked. - -* Adding the collectl element to compute nodes and testenv hosts will allow us - to find bottle necks and also identify places where it is safe to overcommit - (e.g. we may find that overcommitting CPU a lot on testenv hosts is viable). - -.. _system_feedback: - -**Solution :** - -* Using a combination of logstash and graphite - - * Output graphs of occurrences of false negative test results. - * Output graphs of CI run times over time in order to identify trends. - * Output graphs of CI job peak memory usage over time. 
- * Output graphs of CI image sizes over time. - -.. _bug_frequency: - -**Solution :** - -* In order to be able to track false negatives that are hurting us most we - should agree not to use "recheck no bug", instead recheck with the - relevant bug number. Adding signatures to Elastic recheck for known CI - issues should help uptake of this. - -.. _test_coverage: - -**Solution :** - -* Run tempest against the deployed overcloud. -* Test our upgrade story by upgrading to a new images. Initially to avoid - having to build new images we can edit something on the overcloud qcow images - in place in order to get a set of images to upgrade too[3]. - - -Alternatives ------------- - -* As an alternative to deploying our own distro mirrors we could simply point - directly at a mirror known to be reliable. This is undesirable as a long - term solution as we still can't control outages. - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -* No longer using recheck no bug places a burden on developers to - investigate why a job failed. - -* Adding coverage to our tests will increase the overall time to run a job. - -Performance Impact ------------------- - -Performance of CI should improve overall. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - derekh - -Other contributors: - looking for volunteers... - - -Work Items ----------- - -* hp1 upgrade to trusty. -* Potential pypi mirror. -* Fedora Mirrors. -* Ubuntu Mirrors. -* Mirroring other non distro resources. -* Per region caching proxy. -* Document CI. -* Running an unsafe disk caching strategy in the overcloud nodes. -* ZUUL abandon on failure. -* Include collectl on compute and testenv Hosts and analyse output. -* Mechanism to monitor CI run times. -* Mechanism to monitor nodepool connection failures to instances. -* Remove ability to recheck no bug or at the very least discourage its use. -* Monitoring cloud/testenv health. -* Expand ci to include tempest. -* Expand ci to include upgrades. - - -Dependencies -============ - -None - -Testing -======= - -CI failure rate and timings will be tracked to confirm improvements. - -Documentation Impact -==================== - -The tripleo-ci repository needs additional documentation in order to describe -the current layout and should then be updated as changes are made. - -References -========== - -* [1] spec to run devtest on openstack https://review.openstack.org/#/c/92642/ -* [2] eatmydata for Fedora https://bugzilla.redhat.com/show_bug.cgi?id=1007619 -* [3] CI upgrades https://review.openstack.org/#/c/87758/ -* [4] summit session https://etherpad.openstack.org/p/juno-summit-tripleo-ci -* [5] http://jogo.github.io/gate/tripleo.html diff --git a/specs/juno/tripleo-juno-configurable-mnt-state.rst b/specs/juno/tripleo-juno-configurable-mnt-state.rst deleted file mode 100644 index c3b97d4f..00000000 --- a/specs/juno/tripleo-juno-configurable-mnt-state.rst +++ /dev/null @@ -1,238 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================================= -Configurable directory for persistent and stateful data -======================================================= - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-configurable-mnt-state - -Make the hardcoded /mnt/state path for stateful data be configurable. - -Problem Description -=================== - -1. A hard coded directory of /mnt/state for persistent data is incompatible -with Red Hat based distros available mechanism for a stateful data path. Red -Hat based distros, such as Fedora, RHEL, and CentOS, have a feature that uses -bind mounts for mounting paths onto a stateful data partition and does not -require manually reconfiguring software to use /mnt/state. - -2. Distros that use SELinux have pre-existing policy that allows access to -specific paths. Reconfiguring these paths to be under /mnt/state, results -in SELinux denials for existing services, requiring additional policy to be -written and maintained. - -3. Some operators and administrators find the reconfiguring of many services to -not use well known default values for filesystem paths to be disruptive and -inconsistent. They do not expect these changes when using a distro that they've -come to learn and anticipate certain configurations. These types of changes -also require documentation changes to existing documents and processes. - - -Proposed Change -=============== -Deployers will be able to choose a configurable path instead of the hardcoded -value of /mnt/state for the stateful path. - -A new element, stateful-path will be added that defines the value for the -stateful path. The default will be /mnt/state. - -There are 3 areas that need to respect the configurable path: - -os-apply-config template generation - The stateful-path element will set the stateful path value by installing a - JSON file to a well known location for os-collect-config to use as a local - data source. This will require a new local data source collector to be added - to os-collect-config (See `Dependencies`_). - - The JSON file's contents will be based on $STATEFUL_PATH, e.g.: - - {'stateful-path': '/mnt/state'} - - File templates (files under os-apply-config in an element) will then be - updated to replace the hard coded /mnt/state with {{stateful-path}}. - - Currently, there is a mix of root locations of the os-apply-config templates. - Most are written under /, although some are written under /mnt/state. The - /mnt/state is hard coded in the directory tree under os-apply-config in these - elements, so this will be removed to have the templates just written under /. - Symlinks could instead be used in these elements to setup the correct paths. - Support can also be added to os-apply-config's control file mechanism to - indicate these files should be written under the stateful path. An example - patch that does this is at: https://review.openstack.org/#/c/113651/ - -os-refresh-config scripts run at boot time - In order to make the stateful path configurable, all of the hard coded - references to /mnt/state in os-refresh-config scripts will be replaced with an - environment variable, $STATEFUL_PATH. 
- - The stateful-path element will provide an environment.d script for - os-refresh-config that reads the value from os-apply-config: - - export STATEFUL_PATH=$(os-apply-config --key stateful-path --type raw) - -Hook scripts run at image build time - The stateful-path element will provide an environment.d script for use at - image build time: - - export STATEFUL_PATH=${STATEFUL_PATH:-"/mnt/state"} - -The use-ephemeral element will depend on the stateful-path element, effectively -making the default stateful path remain /mnt/state. - -The stateful path can be reconfigured by defining $STATEFUL_PATH either A) in -the environment before an image build; or B) in an element with an -environment.d script which runs earlier than the stateful-path environment.d -script. - - -Alternatives ------------- -None come to mind, the point of this spec is to enable an alternative to what's -already existing. There may be additional alternatives out there other folks -may wish to add support for. - -Security Impact ---------------- -None - -Other End User Impact ---------------------- -End users using elements that change the stateful path location from /mnt/state -to something else will see this change reflected in configuration files and in -the directories used for persistent and stateful data. They will have to know -how the stateful path is configured and accessed. - -Different TripleO installs would appear different if used with elements that -configured the stateful path differently. - -This also adds some complexity when reading TripleO code, because instead of -there being an explicit path, there would instead be a reference to a -configurable value. - -Performance Impact ------------------- -There will be additional logic in os-refresh-config to determine and set the -stateful path, and an additional local collector that os-collect-config would -use. However, these are negligible in terms of negatively impacting -performance. - - -Other Deployer Impact ---------------------- -Deployers will be able to choose different elements that may reconfigure the -stateful path or change the value for $STATEFUL_PATH. The default will remain -unchanged however. - -Deployers would have to know what the stateful path is, and if it's different -across their environment, this could be confusing. However, this seems unlikely -as deployers are likely to be standardizing on one set of common elements, -distro, etc. - -In the future, if TripleO CI and CD clouds that are based on Red Hat distros -make use of this feature to enable Red Hat read only root support, then these -clouds would be configured differently from clouds that are configured to use -/mnt/state. As a team, the tripleo-cd-admins will have to know which -configuration has been used. - -Developer Impact ----------------- -1. Developers need to use the $STATEFUL_PATH and {{stateful-path}} -substitutions when they intend to refer to the stateful path. - -2. Code that needs to know the stateful path will need access to the variable -defining the path, it won't be able to assume the path is /mnt/state. A call to -os-apply-config to query the key defining the path could be done to get -the value, as long as os-collect-config has already run at least once. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - james-slagle - -Work Items ----------- - -tripleo-incubator -^^^^^^^^^^^^^^^^^ -* Update troubleshooting docs to mention that /mnt/state is a configurable - path, and could be different in local environments. 
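-  (the effective path on a deployed node can be queried with, for example,
-  ``os-apply-config --key stateful-path --type raw``)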
- -tripleo-image-elements -^^^^^^^^^^^^^^^^^^^^^^ -* Add a new stateful-path element that configures stateful-path and $STATEFUL_PATH - to /mnt/state -* Update os-apply-config templates to replace /mnt/state with {{stateful-path}} -* Update os-refresh-config scripts to replace /mnt/state with $STATEFUL_PATH -* Update all elements that have os-apply-config template files under /mnt/state - to just be under /. - - * update os-apply-config element to call os-apply-config with a --root - $STATEFUL_PATH option - * update elements that have paths to os-apply-config generated files (such - as /etc/nova/nova.conf) to refer to those paths as - $STATEFUL_PATH/path/to/file. - -* make use-ephemeral element depend on stateful-path element - -Dependencies -============ -1. os-collect-config will need a new feature to read from a local data source - directory that elements can install JSON files into, such as a source.d. There - will be a new spec filed on this feature. - https://review.openstack.org/#/c/100965/ - -2. os-apply-config will need an option in its control file to support - generating templates under the configurable stateful path. There is a patch - here: https://review.openstack.org/#/c/113651/ - - -Testing -======= - -There is currently no testing that all stateful and persistent data is actually -written to a stateful partition. - -We should add tempest tests that directly exercise the preserve_ephemeral -option, and have tests that check that all stateful data has been preserved -across a "nova rebuild". Tempest seems like a reasonable place to add these -tests since preserve_ephemeral is a Nova OpenStack feature. Plus, once TripleO -CI is running tempest against the deployed OverCloud, we will be testing this -feature. - -We should also test in TripleO CI that state is preserved across a rebuild by -adding stateful data before a rebuild and verifying it is still present after a -rebuild. - -Documentation Impact -==================== - -We will document the new stateful-path element. - -TripleO documentation will need to mention the potential difference in -configuration files and the location of persistent data if a value other than -/mnt/state is used. - - -References -========== - -os-collect-config local datasource collector spec: - -* https://review.openstack.org/100965 - -Red Hat style stateful partition support this will enable: - -* https://git.fedorahosted.org/cgit/initscripts.git/tree/systemd/fedora-readonly -* https://git.fedorahosted.org/cgit/initscripts.git/tree/sysconfig/readonly-root -* https://git.fedorahosted.org/cgit/initscripts.git/tree/statetab -* https://git.fedorahosted.org/cgit/initscripts.git/tree/rwtab diff --git a/specs/juno/tripleo-juno-deploy-cloud-hypervisor-type.rst b/specs/juno/tripleo-juno-deploy-cloud-hypervisor-type.rst deleted file mode 100644 index 7ac58648..00000000 --- a/specs/juno/tripleo-juno-deploy-cloud-hypervisor-type.rst +++ /dev/null @@ -1,258 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -==================================== -TripleO Deploy Cloud Hypervisor Type -==================================== - -# TODO: file the actual blueprint... -https://blueprints.launchpad.net/tripleo/+spec/tripleo-deploy-cloud-hypervisor-type - -The goal of this spec is to detail how the TripleO deploy cloud type could be -varied from just baremetal to baremetal plus other hypervisors to deploy -Overcloud services. 
- -Linux kernel containers make this approach attractive due to the lightweight -nature that services and process can be virtualized and isolated, so it seems -likely that libvirt+lxc and Docker would be likely targets. However we should -aim to make this approach as agnostic as possible for those deployers who may -wish to use any Nova driver, such as libvirt+kvm. - -Problem Description -=================== - -The overcloud control plane is generally lightly loaded and allocation of -entire baremetal machines to it is wasteful. Also, when the Overcloud services -are running entirely on baremetal they take longer to upgrade and rollback. - -Proposed Change -=============== - -We should support any Nova virtualization type as a target for Overcloud -services, as opposed to using baremetal nodes to deploy overcloud images. -Containers are particularly attractive because they are lightweight, easy to -upgrade/rollback and offer similar isolation and security as full VM's. For the -purpose of this spec, the alternate Nova virtualization target for the -Overcloud will be referred to as alt-hypervisor. alt-hypervisor could be -substituted with libvirt+lxc, Docker, libvirt+kvm, etc. - -At a minimum, we should support running each Overcloud service in isolation in -its own alt-hypervisor instance in order to be as flexible as possible to deployer -needs. We should also support combining services. - -In order to make other alt-hypervisors available as deployment targets for the -Overcloud, we need additional Nova Compute nodes/services configured to use -alt-hypervisors registered with the undercloud Nova. - -Additionally, the undercloud must still be running a Nova compute with the -ironic driver in order to allow for scaling itself out to add additional -undercloud compute nodes. - -To accomplish this, we can run 2 Nova compute processes on each undercloud -node. One configured with Nova+Ironic and one configured with -Nova+alt-hypervisor. For the straight baremetal deployment, where an alternate -hypervisor is not desired, the additional Nova compute process would not be -included. This would be accomplished via the standard inclusion/exclusion of -elements during a diskimage-builder tripleo image build. - -It will also be possible to build and deploy just an alt-hypervisor compute -node that is registered with the Undercloud as an additional compute node. - -To minimize the changes needed to the elements, we will aim to run a full init -stack in each alt-hypervisor instance, such as systemd. This will allow all the -services that we need to also be running in the instance (cloud-init, -os-collect-config, etc). It will also make troubleshooting similar to the -baremetal process in that you'd be able to ssh to individual instances, read -logs, restart services, turn on debug mode, etc. - -To handle Neutron network configuration for the Overcloud, the Overcloud -neutron L2 agent will have to be on a provider network that is shared between -the hypervisors. VLAN provider networks will have to be modeled in Neutron and -connected to alt-hypervisor instances. - -Overcloud compute nodes themselves would be deployed to baremetal nodes. These -images would be made up of: -* libvirt+kvm (assuming this is the hypervisor choice for the Overcloud) -* nova-compute + libvirt+kvm driver (registered to overcloud control). -* neutron-l2-agent (registered to overcloud control) -An image with those contents is deployed to a baremetal node via nova+ironic -from the undercloud. 
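-
-As an illustrative sketch only (the element names below are indicative and
-may differ from the elements that are eventually used), such an overcloud
-compute image could be assembled with diskimage-builder along these lines::
-
-    disk-image-create -a amd64 -o overcloud-compute \
-        fedora hosts baremetal nova-compute nova-kvm neutron-openvswitch-agent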
- -Alternatives ------------- - -Deployment from the seed -^^^^^^^^^^^^^^^^^^^^^^^^ -An alternative to having the undercloud deploy additional alt-hypervisor -compute nodes would be to register additional baremetal nodes with the seed vm, -and then describe an undercloud stack in a template that is the undercloud -controller and its set of alt-hypervisor compute nodes. When the undercloud -is deployed via the seed, all of the nodes are set up initially. - -The drawback with that approach is that the seed is meant to be short-lived in -the long term. So, it then becomes difficult to scale out the undercloud if -needed. We could offer a hybrid of the 2 models: launch all nodes initially -from the seed, but still have the functionality in the undercloud to deploy -more alt-hypervisor compute nodes if needed. - -The init process -^^^^^^^^^^^^^^^^ -If running systemd in a container turns out to be problematic, it should be -possible to run a single process in the container that starts just the -OpenStack service that we care about. However that process would also need to -do things like read Heat metadata. It's possible this process could be -os-collect-config. This change would require more changes to the elements -themselves however since they are so dependent on an init process currently in -how they enable/restart services etc. It may be possible to replace os-svc-* -with other tools that don't use systemd or upstart when you're building images -for containers. - -Security Impact ---------------- -* We should aim for equivalent security when deploying to alt-hypervisor - instances as we do when deploying to baremetal. To the best of our ability, it - should not be possible to compromise the instance if an individual service is - compromised. - -* Since Overcloud services and Undercloud services would be co-located on the - same baremetal machine, compromising the hypervisor and gaining access to the - host is a risk to both the Undercloud and Overcloud. We should mitigate this - risk to the best of our ability via things like SELinux, and removing all - unecessary software/processes from the alt-hypervisor instances. - -* Certain hypervisors are inherently more secure than others. libvirt+kvm uses - virtualization and is much more secure then container based hypervisors such as - libvirt+lxc and Docker which use namespacing. - -Other End User Impact ---------------------- -None. The impact of this change is limited to Deployers. End users should have -no visibility into the actual infrastructure of the Overcloud. - -Performance Impact ------------------- -Ideally, deploying an overcloud to containers should result in a faster -deployment than deploying to baremetal. Upgrading and downgrading the Overcloud -should also be faster. - -More images will have to be built via diskimage-builder however, which will -take more time. - -Other Deployer Impact ---------------------- -The main impact to deployers will be the ability to use alt-hypervisors -instances, such as containers if they wish. They also must understand how to -use nova-baremetal/ironic on the undercloud to scale out the undercloud and add -additional alt-hypervisor compute nodes if needed. - -Additional space in the configured glance backend would also likely be needed -to store additional images. - -Developer Impact ----------------- -* Developers working on TripleO will have the option of deploying to - alt-hypervisor instances. 
This should make testing and developing on some - aspects of TripleO easier due to the need for less vm's. - -* More images will have to be built due to the greater potential variety with - alt-hypervisor instances housing Overcloud services. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - james-slagle - -Work Items ----------- - -tripleo-incubator -^^^^^^^^^^^^^^^^^ -* document how to use an alternate hypervisor for the overcloud deployment - ** eventually, this could possibly be the default -* document how to troubleshoot this type of deployment -* need a user option or json property to describe if the devtest - environment being set up should use an alternate hypervisor for the overcloud - deployment or not. Consider using HEAT_ENV where appropriate. -* load-image should be updated to add an additional optional argument that sets - the hypervisor_type property on the loaded images in glance. The argument is - optional and wouldn't need to be specified for some images, such as regular - dib images that can run under KVM. -* Document commands to setup-neutron for modeling provider VLAN networks. - -tripleo-image-elements -^^^^^^^^^^^^^^^^^^^^^^ -* add new element for nova docker driver -* add new element for docker registry (currently required by nova docker - driver) -* more hypervisor specific configuration files for the different nova compute - driver elements - ** /etc/nova/compute/nova-kvm.conf - ** /etc/nova/compute/nova-baremetal.conf - ** /etc/nova/compute/nova-ironic.conf - ** /etc/nova/compute/nova-docker.conf -* Separate configuration options per compute process for: - ** host (undercloud-kvm, undercloud-baremetal, etc). - ** state_path (/var/lib/nova-kvm, /var/lib/nova-baremetal, etc). -* Maintain backwards compatibility in the elements by consulting both old and - new heat metadata key namespaces. - -tripleo-heat-templates -^^^^^^^^^^^^^^^^^^^^^^ -* Split out heat metadata into separate namespaces for each compute process - configuration. -* For the vlan case, update templates for any network modeling for - alt-hypervisor instances so that those instances have correct interfaces - attached to the vlan network. - -diskimage-builder -^^^^^^^^^^^^^^^^^ -* add ability where needed to build new image types for alt-hypervisor - ** Docker - ** libvirt+lxc -* Document how to build images for the new types - -Dependencies -============ -For Docker support, this effort depends on continued development on the nova -Docker driver. We would need to drive any missing features or bug fixes that -were needed in that project. - -For other drivers that may not be as well supported as libvirt+kvm, we will -also have to drive missing features there as well if we want to support them, -such as libvirt+lxc, openvz, etc. - -This effort also depends on the provider resource templates spec (unwritten) -that will be done for the template backend for Tuskar. That work should be done -in such a way that the provider resource templates are reusable for this effort -as well in that you will be able to create templates to match the images that -you intend to create for your Overcloud deployment. - -Testing -======= -We would need a separate set of CI jobs that were configured to deploy an -Overcloud to each alternate hypervisor that TripleO intended to support well. - -For Docker support specifically, CI jobs could be considered non-voting since -they'd rely on a stackforge project which isn't officially part of OpenStack. 
-We could potentially make this job voting if TripleO CI was enabled on the -stackforge/nova-docker repo so that changes there are less likely to break -TripleO deployments. - -Documentation Impact -==================== -We should update the TripleO specific docs in tripleo-incubator to document how -to use an alternate hypervisor for an Overcloud deployment. - -References -========== -Juno Design Summit etherpad: https://etherpad.openstack.org/p/juno-summit-tripleo-and-docker -nova-docker driver: https://git.openstack.org/cgit/stackforge/nova-docker -Docker: https://www.docker.io/ -Docker github: https://github.com/dotcloud/docker diff --git a/specs/juno/tripleo-juno-dracut-ramdisks.rst b/specs/juno/tripleo-juno-dracut-ramdisks.rst deleted file mode 100644 index 7c321d55..00000000 --- a/specs/juno/tripleo-juno-dracut-ramdisks.rst +++ /dev/null @@ -1,176 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================== -Dracut Deploy Ramdisks -====================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-dracut-ramdisks - -Our current deploy ramdisks include functionality that is duplicated from -existing tools such as Dracut, and do not include some features that those -tools do. Reimplementing our deploy ramdisks to use Dracut would shrink -our maintenance burden for that code and allow us to take advantage of those -additional features. - -Problem Description -=================== - -Currently our deploy ramdisks are implemented as a bash script that runs -as init during the deploy process. This means that we are responsible for -correctly configuring things such as udev and networking which would normally -be handled by distribution tools. While this isn't an immediate problem -because the implementation has already been done, it is an unnecessary -duplication and additional maintenance debt for the future as we need to add -or change such low-level functionality. - -In addition, because our ramdisk is a one-off, users will not be able to make -use of any ramdisk troubleshooting methods that they might currently know. -This is an unnecessary burden when there are tools to build ramdisks that are -standardized and well-understood by the people using our software. - -Proposed Change -=============== - -The issues discussed above can be dealt with by using a standard tool such as -Dracut to build our deploy ramdisks. This will actually result in a reduction -in code that we have to maintain and should be compatible with all of our -current ramdisks because we can continue to use the same method of building -the init script - it will just run as a user script instead of process 0, -allowing Dracut to do low-level configuration for us. - -Initially this will be implemented alongside the existing ramdisk element to -provide a fallback option if there are any use cases not covered by the -initial version of the Dracut ramdisk. - -Alternatives ------------- - -For consistency with the rest of Red Hat/Fedora's ramdisks I would prefer to -implement this using Dracut, but if there is a desire to also make use of -another method of building ramdisks, that could probably be implemented -alongside Dracut. The current purely script-based implementation could even -be kept in parallel with a Dracut version. 
However, I believe Dracut is -available on all of our supported platforms so I don't see an immediate need -for alternatives. - -Additionally, there is the option to replace our dynamically built init -script with Dracut modules for each deploy element. This is probably -unnecessary as it is perfectly fine to use the current method with Dracut, -and using modules would tightly couple our deploy ramdisks to Dracut, making -it difficult to use any alternatives in the future. - -Security Impact ---------------- - -The same security considerations that apply to the current deploy ramdisk -would continue to apply to Dracut-built ones. - -Other End User Impact ---------------------- - -This change would enable end users to make use of any Dracut knowledge they -might already have, including the ability to dynamically enable tracing -of the commands used to do the deployment (essentially set -x in bash). - -Performance Impact ------------------- - -Because Dracut supports more hardware and software configurations, it is -possible there will be some additional overhead during the boot process. -However, I would expect this to be negligible in comparison to the time it -takes to copy the image to the target system, so I see it as a reasonable -tradeoff. - -Other Deployer Impact ---------------------- - -As noted before, Dracut supports a wide range of hardware configurations, -so deployment methods that currently wouldn't work with our script-based -ramdisk would become available. For example, Dracut supports using network -disks as the root partition, so running a diskless node with separate -storage should be possible. - -Developer Impact ----------------- - -There would be some small changes to how developers would add a new dependency -to the ramdisk images. Instead of executables and their required libraries -being copied to the ramdisk manually, the executable can simply be added to -the list of things Dracut will include in the ramdisk. - -Developers would also gain the dynamic tracing ability mentioned above in -the end user impact. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - bnemec - -Work Items ----------- - -* Convert the ramdisk element to use Dracut (see WIP change in References). - -* Verify that DHCP booting of ramdisks still works. - -* Verify that nova-baremetal ramdisks can be built successfully with Dracut. - -* Verify that Ironic ramdisks can be built successfully with Dracut. - -* Verify that Dracut can build Ironic-IPA ramdisks. - -* Verify the Dracut debug shell provides equivalent functionality to the - existing one. - -* Provide ability for other elements to install additional files to the - ramdisk. - -* Provide ability for other elements to include additional drivers. - -* Find a way to address potential 32-bit binaries being downloaded and run in - the ramdisk for firmware deployments. - -Dependencies -============ - -This would add a dependency on Dracut for building ramdisks. - -Testing -======= - -Since building deploy ramdisks is already part of CI, this should be covered -automatically. If it is implemented in parallel with another method, then -the CI jobs would need to be configured to exercise the different methods -available. - -Documentation Impact -==================== - -We would want to document the additional features available in Dracut. -Otherwise this should function in essentially the same way as the current -ramdisks, so any existing documentation will still be valid. 
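-
-For example, the dynamic tracing and debug shell mentioned earlier are
-switched on through Dracut's standard kernel command line options, which the
-documentation could point at (exact option names may vary between Dracut
-versions)::
-
-    rd.debug rd.shell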
- -Some minor developer documentation changes may be needed to address the -different ways Dracut handles adding extra kernel modules and files. - -References -========== - -* Dracut: https://dracut.wiki.kernel.org/index.php/Main_Page - -* PoC of building ramdisks with Dracut: - https://review.openstack.org/#/c/105275/ - -* openstack-dev discussion: - http://lists.openstack.org/pipermail/openstack-dev/2014-July/039356.html diff --git a/specs/juno/tripleo-juno-occ-localdatasource.rst b/specs/juno/tripleo-juno-occ-localdatasource.rst deleted file mode 100644 index efbeed35..00000000 --- a/specs/juno/tripleo-juno-occ-localdatasource.rst +++ /dev/null @@ -1,168 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=================================== -os-collect-config local data source -=================================== - - -https://blueprints.launchpad.net/tripleo-juno-occ-local-datasource - -os-collect-config needs a local data source collector for configuration data. -This will allow individual elements to drop files into a well-known location to -set the initial configuration data of an instance. - -There is already a heat_local collector, but that uses a single hard coded path -of /var/lib/heat-cfntools/cfn-init-data. - -Problem Description -=================== - -* Individual elements can not currently influence the configuration available - to os-apply-config for an instance without overwriting each other. -* Elements that rely on configuration values that must be set the same at both - image build time and instance run time currently have no way of propagating the - value used at build time to a run time value. -* Elements have no way to specify default values for configuration they may - need at runtime (outside of configuration file templates). - - -Proposed Change -=============== - -A new collector class will be added to os-collect-config that collects -configuration data from JSON files in a configurable list of directories with a -well known default of /var/lib/os-collect-config/local-data. - -The collector will return a list of pairs of JSON files and their content, -sorted by the JSON filename in traditional C collation. For example, if -/var/lib/os-collect-config/local-data contains bar.json and foo.json - - [ ('bar.json', bar_content), - ('foo.json', foo_content) ] - -This new collector will be configured first in DEFAULT_COLLECTORS in -os-collect-config. This means all later configured collectors will override any -shared configuration keys from the local datasource collector. - -Elements making use of this feature can install a json file into the -/var/lib/os-collect-config/local-data directory. The os-collect-config element -will be responsible for creating the /var/lib/os-collect-config/local-data -directory at build time and will create it with 0755 permissions. - -Alternatives ------------- - -OS_CONFIG_FILES -^^^^^^^^^^^^^^^ -There is already a mechanism in os-apply-config to specify arbitrary files to -look at for configuration data via setting the OS_CONFIG_FILES environment -variable. However, this is not ideal because each call to os-apply-config would -have to be prefaced with setting OS_CONFIG_FILES, or it would need to be set -globally in the environment (via an environment.d script for instance). As an -element developer, this is not clear. Having a robust and clear documented -location to drop in configuration data will be simpler. 
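-
-For comparison, with OS_CONFIG_FILES every invocation would need to be
-prefaced roughly as follows (the paths shown are examples only), whereas the
-proposed collector lets an element simply install a JSON file under
-/var/lib/os-collect-config/local-data at image build time::
-
-    OS_CONFIG_FILES=/var/lib/heat-cfntools/cfn-init-data:/etc/my-element/extra.json \
-        os-apply-config --key database_host --type raw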
- -heat_local collector -^^^^^^^^^^^^^^^^^^^^ -There is already a collector that reads from local data, but it must be -configured to read explicit file paths. This does not scale well if several -elements want to each provide local configuration data, in that you'd have to -reconfigure os-collect-config itself. We could modify the heat_local collector -to read from directories instead, while maintaining backwards compatibility as -well, instead of writing a whole new collector. However, given that collectors -are pretty simple implementations, I'm proposing just writing a new one, so -that they remain generally single purpose with clear goals. - -Security Impact ---------------- - -* Harmful elements could drop bad configuration data into the well known - location. This is mitigated somewhat that as a deployer, you should know and - validate what elements you're using that may inject local configuration. - -* We should verify that the local data source files are not world writable and - are in a directory that is root owned. Checks to dib-lint could be added to - verify this at image build time. Checks could be added to os-collect-config - for instance run time. - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -An additional collector will be running as part of os-collect-config, but its -execution time should be minimal. - -Other Deployer Impact ---------------------- - -* There will be an additional configuration option in os-collect-config to - configure the list of directories to look at for configuration data. This - will have a reasonable default and will not usually need to be changed. -* Deployers will have to consider what local data source configuration may be - influencing their current applied configuration. - -Developer Impact ----------------- - -We will need to make clear in documentation when to use this feature versus -what to expose in a template or specify via passthrough configuration. -Configuration needed at image build time where you need access to those values -at instance run time as well are good candidates for using this feature. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - james-slagle - -Work Items ----------- - -* write new collector for os-collect-config -* unit tests for new collector -* document new collector -* add checks to dib-lint to verify JSON files installed to the local data - source directory are not world writable -* add checks to os-collect-config to verify JSON files read by the local data - collector are not world writable and that their directory is root owned. - -Dependencies -============ - -* The configurable /mnt/state spec at: - https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-configurable-mnt-state - depends on this spec. - -Testing -======= - -Unit tests will be written for the new collector. The new collector will also -eventually be tested in CI because there will be an existing element that will -configure the persistent data directory to /mnt/state that will make use of -this implementation. - - -Documentation Impact -==================== - -The ability of elements to drop configuration data into a well known location -should be documented in tripleo-image-elements itself so folks can be made -better aware of the functionality. 
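-
-Such documentation could include a small worked example; for instance, an
-element might install a file like
-/var/lib/os-collect-config/local-data/my-element.json (the file name and key
-below are purely illustrative) containing::
-
-    {
-        "stateful-path": "/mnt/state"
-    }
-
-after which the value is available to os-apply-config templates like any
-other collected metadata, subject to being overridden by later collectors.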
- -References -========== - -* https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-configurable-mnt-state -* https://review.openstack.org/#/c/94876 diff --git a/specs/juno/tripleo-juno-tuskar-rest-api.rst b/specs/juno/tripleo-juno-tuskar-rest-api.rst deleted file mode 100644 index 046d23f9..00000000 --- a/specs/juno/tripleo-juno-tuskar-rest-api.rst +++ /dev/null @@ -1,611 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================== -Tuskar Plan REST API Specification -================================== - -Blueprint: -https://blueprints.launchpad.net/tuskar/+spec/tripleo-juno-tuskar-plan-rest-api - -In Juno, the Tuskar API is moving towards a model of being a large scale -application planning service. Its initial usage will be to deploy OpenStack -on OpenStack by leveraging TripleO Heat Templates and fitting into the -greater TripleO workflow. - -As compared to Icehouse, Tuskar will no longer make calls to Heat for creating -and updating a stack. Instead, it will serve to define and manipulate the Heat -templates for describing a cloud. Tuskar will be the source for the cloud -planning while Heat is the source for the state of the live cloud. - -Tuskar employs the following concepts: - -* *Deployment Plan* - The description of an application (for example, - the overcloud) being planned by Tuskar. The deployment plan keeps track of - what roles will be present in the deployment and their configuration values. - In TripleO terms, each overcloud will have its own deployment plan that - describes what services will run and the configuration of those services - for that particular overcloud. For brevity, this is simply referred to as - the "plan" elsewhere in this spec. -* *Role* - A unit of functionality that can be added to a plan. A role - is the definition of what will run on a single server in the deployed Heat - stack. For example, an "all-in-one" role may contain all of the services - necessary to run an overcloud, while a "compute" role may provide only the - nova-compute service. - -Put another way, Tuskar is responsible for assembling -the user-selected roles and their configuration into a Heat environment and -making the built Heat templates and files available to the caller (the -Tuskar UI in TripleO but, more generally, any consumer of the REST API) to send -to Heat. - -Tuskar will ship with the TripleO Heat Templates installed to serve as its -roles (dependent on the conversions taking place this release [4]_). -For now it is assumed those templates are installed as part of the TripleO's -installation of Tuskar. A different spec will cover the API calls necessary -for users to upload and manipulate their own custom roles. - -This specification describes the REST API clients will interact with in -Tuskar, including the URLs, HTTP methods, request, and response data, for the -following workflow: - -* Create an empty plan in Tuskar. -* View the list of available roles. -* Add roles to the plan. -* Request, from Tuskar, the description of all of the configuration values - necessary for the entire plan. -* Save user-entered configuration values with the plan in Tuskar. -* Request, from Tuskar, the Heat templates for the plan, which includes - all of the files necessary to deploy the configured application in Heat. - -The list roles call is essential to this workflow and is therefore described -in this specification. 
Otherwise, this specification does not cover the API -calls around creating, updating, or deleting roles. It is assumed that the -installation process for Tuskar in TripleO will take the necessary steps to -install the TripleO Heat Templates into Tuskar. A specification will be filed -in the future to cover the role-related API calls. - - -Problem Description -=================== - -The REST API in Tuskar seeks to fulfill the following needs: - -* Flexible selection of an overcloud's functionality and deployment strategy. -* Repository for discovering what roles can be added to a cloud. -* Help the user to avoid having to manually manipulate Heat templates to - create the desired cloud setup. -* Storage of a cloud's configuration without making the changes immediately - live (future needs in this area may include offering a more structured - review and promotion lifecycle for changes). - - -Proposed Change -=============== - -**Overall Concepts** - -* These API calls will be added under the ``/v2/`` path, however the v1 API - will not be maintained (the model is being changed to not contact Heat and - the existing database is being removed [3]_). -* All calls have the potential to raise a 500 if something goes horribly wrong - in the server, but for brevity this is omitted from the list of possible - response codes in each call. -* All calls have the potential to raise a 401 in the event of a failed user - authentication and have been similarly omitted from each call's - documentation. - ----- - -.. _retrieve-single-plan: - -**Retrieve a Single Plan** - -URL: ``/plans//`` - -Method: ``GET`` - -Description: Returns the details of a specific plan, including its -list of assigned roles and configuration information. - -Notes: - -* The configuration values are read from Tuskar's stored files rather than - Heat itself. Heat is the source for the live stack, while Tuskar is the - source for the plan. - -Request Data: None - -Response Codes: - -* 200 - if the plan is found -* 404 - if there is no plan with the given UUID - -Response Data: - -JSON document containing the following: - -* Tuskar UUID for the given plan. -* Name of the plan that was created. -* Description of the plan that was created. -* The timestamp of the last time a change was made. -* List of the roles (identified by name and version) assigned to the plan. - For this sprint, there will be no pre-fetching of any more role information - beyond name and version, but can be added in the future while maintaining - backward compatibility. -* List of parameters that can be configured for the plan, including the - parameter name, label, description, hidden flag, and current value if - set. - -Response Example: - -.. 
code-block:: json - - { - "uuid" : "dd4ef003-c855-40ba-b5a6-3fe4176a069e", - "name" : "dev-cloud", - "description" : "Development testing cloud", - "last_modified" : "2014-05-28T21:11:09Z", - "roles" : [ - { - "uuid" : "55713e6a-79f5-42e1-aa32-f871b3a0cb64", - "name" : "compute", - "version" : "1", - "links" : { - "href" : "http://server/v2/roles/55713e6a-79f5-42e1-aa32-f871b3a0cb64/", - "rel" : "bookmark" - } - }, - { - "uuid" : "2ca53130-b9a4-4fa5-86b8-0177e8507803", - "name" : "controller", - "version" : "1", - "links" : { - "href" : "http://server/v2/roles/2ca53130-b9a4-4fa5-86b8-0177e8507803/", - "rel" : "bookmark" - } - } - ], - "parameters" : [ - {"name" : "database_host", - "label" : "Database Host", - "description" : "Hostname of the database server", - "hidden" : "false", - "value" : "10.11.12.13" - } - ], - "links" : [ - { - "href" : "http://server/v2/plans/dd4ef003-c855-40ba-b5a6-3fe4176a069e/", - "rel" : "self" - } - ] - } - ----- - -.. _retrieve-plan-template: - -**Retrieve a Plan's Template Files** - -URL: ``/plans//templates/`` - -Method: ``GET`` - -Description: Returns the set of files to send to Heat to create or update -the planned application. - -Notes: - -* The Tuskar service will build up the entire environment into a single - file suitable for sending to Heat. The contents of this file are returned - from this call. - -Request Data: None - -Response Codes: - -* 200 - if the plan's templates are found -* 404 - if no plan exists with the given ID - -Response Data: - ----- - -.. _list-plans: - -**List Plans** - -URL: ``/plans/`` - -Method: ``GET`` - -Description: Returns a list of all plans stored in Tuskar. In the future when -multi-tenancy is added, this will be scoped to a particular tenant. - -Notes: - -* The detailed information about a plan, including its roles and configuration - values, are not returned in this call. A follow up call is needed on the - specific plan. It may be necessary in the future to add a flag to pre-fetch - this information during this call. - -Request Data: None (future enhancement will require the tenant ID and -potentially support a pre-fetch flag for more detailed data) - -Response Codes: - -* 200 - if the list can be retrieved, even if the list is empty - -Response Data: - -JSON document containing a list of limited information about each plan. -An empty list is returned when no plans are present. - -Response Example: - -.. code-block:: json - - [ - { - "uuid" : "3e61b4b2-259b-4b91-8344-49d7d6d292b6", - "name" : "dev-cloud", - "description" : "Development testing cloud", - "links" : { - "href" : "http://server/v2/plans/3e61b4b2-259b-4b91-8344-49d7d6d292b6/", - "rel" : "bookmark" - } - }, - { - "uuid" : "135c7391-6c64-4f66-8fba-aa634a86a941", - "name" : "qe-cloud", - "description" : "QE testing cloud", - "links" : { - "href" : "http://server/v2/plans/135c7391-6c64-4f66-8fba-aa634a86a941/", - "rel" : "bookmark" - } - } - ] - - ----- - -.. _create-new-plan: - -**Create a New Plan** - -URL: ``/plans/`` - -Method: ``POST`` - -Description: Creates an entry in Tuskar's storage for the plan. The details -are outside of the scope of this spec, but the idea is that all of the -necessary Heat environment infrastructure files and directories will be -created and stored in Tuskar's storage solution [3]_. - -Notes: - -* Unlike in Icehouse, Tuskar will not make any calls into Heat during this - call. 
This call is to create a new (empty) plan in Tuskar that - can be manipulated, configured, saved, and retrieved in a format suitable - for sending to Heat. -* This is a synchronous call that completes when Tuskar has created the - necessary files for the newly created plan. -* As of this time, this call does not support a larger batch operation that - will add roles or set configuration values in a single call. From a REST - perspective, this is acceptable, but from a usability standpoint we may want - to add this support in the future. - -Request Data: - -JSON document containing the following: - -* Name - Name of the plan being created. Must be unique across all plans - in the same tenant. -* Description - Description of the plan to create. - -Request Example: - -.. code-block:: json - - { - "name" : "dev-cloud", - "description" : "Development testing cloud" - } - -Response Codes: - -* 201 - if the create is successful -* 409 - if there is an existing plan with the given name (for a particular - tenant when multi-tenancy is taken into account) - -Response Data: - -JSON document describing the created plan. -The details are the same as for the GET operation on an individual plan -(see :ref:`Retrieve a Single Plan `). - - ----- - -.. _delete-plan: - -**Delete an Existing Plan** - -URL: ``/plans//`` - -Method: ``DELETE`` - -Description: Deletes the plan's Heat templates and configuration values from -Tuskar's storage. - -Request Data: None - -Response Codes: - -* 200 - if deleting the plan entries from Tuskar's storage was successful -* 404 - if there is no plan with the given UUID - -Response Data: None - - ----- - -.. _add-plan-role: - -**Adding a Role to a Plan** - -URL: ``/plans//roles/`` - -Method: ``POST`` - -Description: Adds the specified role to the given plan. - -Notes: - -* This will cause the parameter consolidation to occur and entries added to - the plan's configuration parameters for the new role. -* This call will update the ``last_modified`` timestamp to indicate a change - has been made that will require an update to Heat to be made live. - -Request Data: - -JSON document containing the uuid of the role to add. - -Request Example: - -.. code-block:: json - - { - "uuid" : "role_uuid" - } - -Response Codes: - -* 201 - if the addition is successful -* 404 - if there is no plan with the given UUID -* 409 - if the plan already has the specified role - -Response Data: - -The same document describing the plan as from -:ref:`Retrieve a Single Plan `. The newly added -configuration parameters will be present in the result. - - ----- - -.. _remove-cloud-plan: - -**Removing a Role from a Plan** - -URL: ``/plans//roles//`` - -Method: ``DELETE`` - -Description: Removes a role identified by role_uuid from the given plan. - -Notes: - -* This will cause the parameter consolidation to occur and entries to be - removed from the plan's configuration parameters. -* This call will update the ``last_modified`` timestamp to indicate a change - has been made that will require an update to Heat to be made live. - -Request Data: None - -Response Codes: - -* 200 - if the removal is successful -* 404 - if there is no plan with the given UUID or it does not have the - specified role and version combination - -Response Data: - -The same document describing the cloud as from -:ref:`Retrieve a Single Plan `. The configuration -parameters will be updated to reflect the removed role. - - ----- - -.. 
_changing-plan-configuration: - -**Changing a Plan's Configuration Values** - -URL: ``/plans//`` - -Method: ``PATCH`` - -Description: Sets the values for one or more configuration parameters. - -Notes: - -* This call will update the ``last_modified`` timestamp to indicate a change - has been made that will require an update to Heat to be made live. - -Request Data: JSON document containing the parameter keys and values to set -for the plan. - -Request Example: - -.. code-block:: json - - [ - { - "name" : "database_host", - "value" : "10.11.12.13" - }, - { - "name" : "database_password", - "value" : "secret" - } - ] - -Response Codes: - -* 200 - if the update was successful -* 400 - if one or more of the new values fails validation -* 404 - if there is no plan with the given UUID - -Response Data: - -The same document describing the plan as from -:ref:`Retrieve a Single Plan `. - - ----- - -.. _list-roles: - -**Retrieving Possible Roles** - -URL: ``/roles/`` - -Method: ``GET`` - -Description: Returns a list of all roles available in Tuskar. - -Notes: - -* There will be a separate entry for each version of a particular role. - -Request Data: None - -Response Codes: - -* 200 - containing the available roles - -Response Data: A list of roles, where each role contains: - -* Name -* Version -* Description - -Response Example: - -.. code-block:: json - - [ - { - "uuid" : "3d46e510-6a63-4ed1-abd0-9306a451f8b4", - "name" : "compute", - "version" : "1", - "description" : "Nova Compute" - }, - { - "uuid" : "71d6c754-c89c-4293-9d7b-c4dcc57229f0", - "name" : "compute", - "version" : "2", - "description" : "Nova Compute" - }, - { - "uuid" : "651c26f6-63e2-4e76-9b60-614b51249677", - "name" : "controller", - "version" : "1", - "description" : "Controller Services" - } - ] - - -Alternatives ------------- - -There are currently no alternate schemas proposed for the REST APIs. - -Security Impact ---------------- - -These changes should have no additional security impact. - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -The potential performance issues revolve around Tuskar's solution for storing -the cloud files [3]_. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -After being merged, there will be a period where the Tuskar CLI is out of date -with the new calls. The Tuskar UI will also need to be updated for the changes -in flow. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - jdob - -Work Items ----------- - -* Implement plan CRUD APIs -* Implement role retrieval API -* Write REST API documentation - - -Dependencies -============ - -These API changes are dependent on the rest of the Tuskar backend being -implemented, including the changes to storage and the template consolidation. - -Additionally, the assembly of roles (provider resources) into a Heat -environment is contingent on the conversion of the TripleO Heat templates [4]_. - - -Testing -======= - -Tempest testing should be added as part of the API creation. - - -Documentation Impact -==================== - -The REST API documentation will need to be updated accordingly. - - -References -========== - -.. [3] https://review.openstack.org/#/c/97553/ -.. 
[4] https://review.openstack.org/#/c/97939/ diff --git a/specs/juno/tripleo-juno-tuskar-template-storage.rst b/specs/juno/tripleo-juno-tuskar-template-storage.rst deleted file mode 100644 index d7086873..00000000 --- a/specs/juno/tripleo-juno-tuskar-template-storage.rst +++ /dev/null @@ -1,552 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================================ -TripleO Template and Deployment Plan Storage -============================================ - -This design specification describes a storage solution for a deployment plan. -Deployment plans consist of a set of roles, which in turn define a master Heat -template that can be used by Heat to create a stack representing the deployment -plan; and an environment file that defines the parameters needed by the master -template. - -This specification is principally intended to be used by Tuskar. - -https://blueprints.launchpad.net/tuskar/+spec/tripleo-juno-tuskar-template-storage - -.. _tripleo_juno_tuskar_template_storage_problem: - -Problem Description -=================== - -.. note:: The terminology used in this specification is defined in the `Tuskar - REST API`_ specification. - -.. _Tuskar REST API: https://blueprints.launchpad.net/tuskar/+spec/tripleo-juno-tuskar-plan-rest-api - -In order to accomplish the goal of this specification, we need to first define -storage domain models for roles, deployment plans, and associated concepts. -These associated concepts include Heat templates and environment files. The -models must account for requirements such as versioning and the appropriate -relationships between objects. - -We also need to create a storage mechanism for these models. The storage -mechanism should be distinct from the domain model, allowing the latter to be -stable while the former retains enough flexibility to use a variety of backends -as need and availability dictates. Storage requirements for particular models -include items such as versioning and secure storage. - - -Proposed Change -=============== - -**Change Summary** - -The following proposed change is split into three sections: - -- Storage Domain Models: Defines the domain models for templates, environment - files, roles, and deployment plans. -- Storage API Interface: Defines Python APIs that relate the models to - the underlying storage drivers; is responsible for translating stored content - into a model object and vice versa. Each model requires its own storage - interface. -- Storage Drivers: Defines the API that storage backends need to implement in - order to be usable by the Python API Interface. Plans for initial and future - driver support are discussed here. - -It should be noted that each storage interface will be specified by the user as -part of the Tuskar setup. Thus, the domain model can assume that the appropriate -storage interfaces - a template store, an environment store, etc - are defined -globally and accessible for use. - - -**Storage Domain Models** - -The storage API requires the following domain models: - -- Template -- Environment File -- Role -- Deployment Plan - -The first two map directly to Heat concepts; the latter two are Tuskar concepts. - -Note that each model will also contain a save method. The save method will call -create on the store if the uuid isn't set, and will call update on the store -if the instance has a uuid. - - -**Template Model** - -The template model represents a Heat template. 
- -.. code-block:: python - - class Template: - uuid = UUID string - name = string - version = integer - description = string - content = string - created_at = datetime - - # This is derived from the content from within the template store. - parameters = dict of parameter names with their types and defaults - - -**Environment File Model** - -The environment file defines the parameters and resource registry for a Heat -stack. - -.. code-block:: python - - class EnvironmentFile: - uuid = UUID string - content = string - created_at = datetime - updated_at = datetime - - # These are derived from the content from within the environment file store. - resource_registry = list of provider resource template names - parameters = dict of parameter names and their values - - def add_provider_resource(self, template): - # Adds the specified template object to the environment file as a - # provider resource. This updates the parameters and resource registry - # in the content. The provider resource type will be derived from the - # template file name. - - def remove_provider_resource(self, template): - # Removes the provider resource that matches the template from the - # environment file. This updates the parameters and resource registry - # in the content. - - def set_parameters(self, params_dict): - # The key/value pairs in params_dict correspond to parameter names/ - # desired values. This method updates the parameters section in the - # content to the values specified in params_dict. - - -**Role Model** - -A role is a scalable unit of a cloud. A deployment plan specifies one or more -roles. Each role must specify a primary role template. It must also specify -the dependencies of that template. - -.. code-block:: python - - class Role: - uuid = UUID string - name = string - version = integer - description = string - role_template_uuid = Template UUID string - dependent_template_uuids = list of Template UUID strings - created_at = datetime - - def retrieve_role_template(self): - # Retrieves the Template with uuid matching role_template_uuid - - def retrieve_dependent_templates(self): - # Retrieves the list of Templates with uuids matching - # dependent_template_uuids - - -**Deployment Plan Model** - -The deployment plan defines the application to be deployed. It does so by -specifying a list of roles. Those roles are used to construct an environment -file that contains the parameters that are needed by the roles' templates and -the resource registry that register each role's primary template as a provider -resource. A master template is also constructed so that the plan can be -deployed as a single Heat stack. - -.. code-block:: python - - class DeploymentPlan: - uuid = UUID string - name = string - description = string - role_uuids = list of Role UUID strings - master_template_uuid = Template UUID string - environment_file_uuid = EnvironmentFile UUID string - created_at = datetime - updated_at = datetime - - def retrieve_roles(self): - # Retrieves the list of Roles with uuids matching role_uuids - - def retrieve_master_template(self): - # Retrieves the Template with uuid matching master_template_uuid - - def retrieve_environment_file(self): - # Retrieves the EnvironmentFile with uuid matching environment_file_uuid - - def add_role(self, role): - # Adds a Role to the plan. This operation will modify the master - # template and environment file through template munging operations - # specified in a separate spec. - - def remove_role(self, role): - # Removes a Role from the plan. 
This operation will modify the master - # template and environment file through template munging operations - # specified in a separate spec. - - def get_dependent_templates(self): - # Returns a list of dependent templates. This consists of the - # associated role templates. - - -**Storage API Interface** - -Each of the models defined above has their own Python storage interface. These -are manager classes that query and perform CRUD operations against the storage -drivers and return instances of the models for use (with the exception of delete -which returns ``None``). The storage interfaces bind the models to the driver -being used; this allows us to store each model in a different location. - -Note that each store also contains a serialize method and a deserialize method. -The serialize method takes the relevant object and returns a dictionary -containing all value attributes; the deserialize method does the reverse. - -The drivers are discussed in -:ref:`the next section`. - - -**Template API** - -.. code-block:: python - - class TemplateStore: - - def create(self, name, content, description=None): - # Creates a Template. If no template exists with a matching name, - # the template version is set to 0; otherwise it is set to the - # greatest existing version plus one. - - def retrieve(self, uuid): - # Retrieves the Template with the specified uuid. Queries a Heat - # template parser for template parameters and dependent template names. - - def retrieve_by_name(self, name, version=None): - # Retrieves the Template with the specified name and version. If no - # version is specified, retrieves the latest version of the Template. - - def delete(self, uuid): - # Deletes the Template with the specified uuid. - - def list(self, only_latest=False): - # Returns a list of all Templates. If only_latest is True, filters - # the list to the latest version of each Template name. - - -**Environment File API** - -The environment file requires secure storage to protect parameter values. - -.. code-block:: python - - class EnvironmentFileStore: - - def create(self): - # Creates an empty EnvironmentFile. - - def retrieve(self, uuid): - # Retrieves the EnvironmentFile with the specified uuid. - - def update(self, model): - # Updates an EnvironmentFile. - - def delete(self, uuid): - # Deletes the EnvironmentFile with the specified uuid. - - def list(self): - # Returns a list of all EnvironmentFiles. - - -**Role API** - -.. code-block:: python - - class RoleStore: - - def create(self, name, role_template, description=None): - version=None, template_uuid=None): - # Creates a Role. If no role exists with a matching name, the - # template version is set to 0; otherwise it is set to the greatest - # existing version plus one. - # - # Dependent templates are derived from the role_template. The - # create method will take all dependent template names from - # role_template, retrieve the latest version of each from the - # TemplateStore, and use those as the dependent template list. - # - # If a dependent template is missing from the TemplateStore, then - # an exception is raised. - - def retrieve(self, uuid): - # Retrieves the Role with the specified uuid. - - def retrieve_by_name(self, name, version=None): - # Retrieves the Role with the specified name and version. If no - # version is specified, retrieves the latest version of the Role. - - def update(self, model): - # Updates a Role. - - def delete(self, uuid): - # Deletes the Role with the specified uuid. 
- - def list(self, only_latest=False): - # Returns a list of all Roles. If only_latest is True, filters - # the list to the latest version of each Role. - - -**Deployment Plan API** - -.. code-block:: python - - class DeploymentPlanStore: - - def create(self, name, description=None): - # Creates a DeploymentPlan. Also creates an associated empty master - # Template and EnvironmentFile; these will be modified as Roles are - - def retrieve(self, uuid): - # Retrieves the DeploymentPlan with the specified uuid. - - def update(self, model): - # Updates a DeploymentPlan. - - def delete(self, uuid): - # Deletes the DeploymentPlan with the specified uuid. - - def list(self): - # Retrieves a list of all DeploymentPlans. - - -.. _tripleo_juno_tuskar_template_storage_drivers: - -**Storage Drivers** - -Storage drivers operate by storing object dictionaries. For storage solutions -such as Glance these dictionaries are stored as flat files. For a storage -solution such as a database, the dictionary is translated into a table row. It -is the responsibility of the driver to understand how it is storing the object -dictionaries. - -Each storage driver must provide the following methods. - -.. code-block:: python - - class Driver: - - def create(self, filename, object_dict): - # Stores the specified content under filename and returns the resulting - # uuid. - - def retrieve(self, uuid): - # Returns the object_dict matching the uuid. - - def update(self, uuid, object_dict): - # Updates the object_dict specified by the uuid. - - def delete(self, uuid): - # Deletes the content specified by the uuid. - - def list(self): - # Return a list of all content. - - -For Juno, we will aim to use a combination of a relational database and Heat. -Heat will be used for the secure storage of sensitive environment parameters. -Database tables will be used for everything else. The usage of Heat for secure -stores relies on `PATCH support`_ to be added the Heat API. This bug is -targeted for completion by Juno-2. - -.. _PATCH support: https://bugs.launchpad.net/heat/+bug/1224828 - -This is merely a short-term solution, as it is understood that there is some -reluctance in introducing an unneeded database dependency. In the long-term we -would like to replace the database with Glance once it is updated from an image -store to a more general artifact repository. However, this feature is currently -in development and cannot be relied on for use in the Juno cycle. The -architecture described in this specification should allow reasonable ease in -switching from one to the other. - - -.. _tripleo_juno_tuskar_template_storage_alternatives: - -Alternatives ------------- - -**Modeling Relationships within Heat Templates** - -The specification proposes modeling relationships such as a plan's associated -roles or a role's dependent templates as direct attributes of the object. -However, this information would appear to be available as part of a plan's -environment file or by traversing the role template's dependency graph. Why -not simply derive the relationships in that way? - -A role is a Tuskar abstraction. Within Heat, it corresponds to a template used -as a provider resource; however, a role has added requirements, such as the -versioning of itself and its dependent templates, or the ability to list out -available roles for selection within a plan. These are not requirements that -Heat intends to fulfill, and fulfilling them entirely within Heat feels like an -abuse of mechanics. 
- -From a practical point of view, modeling relationships within Heat templates -requires the in-place modification of Heat templates by Tuskar to deal with -versioning. For example, if version 1 of the compute role specifies -{{compute.yaml: 1}, {compute-config.yaml: 1}}, and version 2 of the role -specifies {{compute.yaml: 1}, {compute-config.yaml: 2}}, the only way to -allow both versions of the role to be used is to allow programmatic -modification of compute.yaml to point at the correct version of -compute-config.yaml. - - -**Swift as a Storage Backend** - -Swift was considered as an option to replace the relational database but was -ultimately discounted for two key reasons: - -- The versioning system in Swift doesn't provide a static reference to the - current version of an object. Rather it has the version "latest" and this is - dynamic and changes when a new version is added, therefore there is no way to - stick a deployment to a version. -- We need to create a relationship between the provider resources within a Role - and swift doesn't support relationships between stored objects. - -Having said that, after seeking guidance from the Swift team, it has been -suggested that a naming convention or work with different containers may -provide us with enough control to mimic a versioning system that meets our -requirements. These suggestions have made Swift more favourable as an option. - - -**File System as a Storage Backend** - -The filesystem was briefly considered and may be included to provide a simpler -developer setup. However, to create a production ready system with versioning, -and relationships this would require re-implementing much of what other -databases and services provide for us. Therefore, this option is reserved only -for a development option which will be missing key features. - - -**Secure Driver Alternatives** - -Barbican, the OpenStack secure storage service, provides us with an alternative -if PATCH support isn't added to Heat in time. - -Currently the only alternative other than Barbican is to implement our own -cryptography with one of the other options listed above. This isn't a -favourable choice as it adds a technical complexity and risk that should be -beyond the scope of this proposal. - -The other option with regards to sensitive data is to not store any. This would -require the REST API caller to provide the sensitive information each time a -Heat create (and potentially update) is called. - - -Security Impact ---------------- - -Some of the configuration values, such as service passwords, will be sensitive. -For this reason, Heat or Barbican will be used to store all configuration -values. - -While access will be controlled by the Tuskar API large files could be provided -in the place of provider resource files or configuration files. These should be -verified against a reasonable limit. - - -Other End User Impact ---------------------- - -The template storage will be primarily used by the Tuskar API, but as it may be -used directly in the future it will need to be documented. - - -Performance Impact ------------------- - -Storing the templates in Glance and Barbican will lead to API calls over the -local network rather than direct database access. These are likely to have -higher overhead. However, the read and writing used in Tuskar is expected to be -infrequent and will only trigger simple reads and writes when manipulating a -deployment plan. 
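For reference, the driver contract described earlier in this spec is small
enough that an in-memory sketch such as the following satisfies it; this is
purely illustrative and is not one of the proposed drivers:

.. code-block:: python

    # Illustrative in-memory driver; real drivers persist to a database,
    # Glance or Heat as described above.
    import copy
    import uuid as uuidlib


    class InMemoryDriver(object):

        def __init__(self):
            self._objects = {}

        def create(self, filename, object_dict):
            # Stores the object dictionary and returns the resulting uuid.
            new_uuid = str(uuidlib.uuid4())
            self._objects[new_uuid] = (filename, copy.deepcopy(object_dict))
            return new_uuid

        def retrieve(self, uuid):
            return copy.deepcopy(self._objects[uuid][1])

        def update(self, uuid, object_dict):
            filename = self._objects[uuid][0]
            self._objects[uuid] = (filename, copy.deepcopy(object_dict))

        def delete(self, uuid):
            del self._objects[uuid]

        def list(self):
            return [copy.deepcopy(obj) for _, obj in self._objects.values()]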
- - -Other Deployer Impact ---------------------- - -None - - -Developer Impact ----------------- - -TripleO will have access to sensitive and insensitive storage through the -storage API. - - -Implementation -============== - - -Assignee(s) ------------ - -Primary assignee: - d0ugal - -Other contributors: - tzumainn - - -Work Items ----------- - -- Implement storage API -- Create Glance and Barbican based storage driver -- Create database storage driver - - -Dependencies -============ - -- Glance -- Barbican - - -Testing -======= - -- The API logic will be verified with a suite of unit tests that mock the - external services. -- Tempest will be used for integration testing. - - -Documentation Impact -==================== - -The code should be documented with docstrings and comments. If it is used -outside of Tuskar further user documentation should be developed. - - -References -========== - -- https://blueprints.launchpad.net/glance/+spec/artifact-repository-api -- https://blueprints.launchpad.net/glance/+spec/metadata-artifact-repository -- https://bugs.launchpad.net/heat/+bug/1224828 -- https://docs.google.com/document/d/1tOTsIytVWtXGUaT2Ia4V5PWq4CiTfZPDn6rpRm5In7U -- https://etherpad.openstack.org/p/juno-hot-artifacts-repository-finalize-design -- https://etherpad.openstack.org/p/juno-summit-tripleo-tuskar-planning -- https://wiki.openstack.org/wiki/Barbican -- https://wiki.openstack.org/wiki/TripleO/TuskarJunoPlanning -- https://wiki.openstack.org/wiki/TripleO/TuskarJunoPlanning/TemplateBackend diff --git a/specs/juno/tripleo-on-openstack.rst b/specs/juno/tripleo-on-openstack.rst deleted file mode 100644 index b715efd7..00000000 --- a/specs/juno/tripleo-on-openstack.rst +++ /dev/null @@ -1,246 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -QuintupleO - TripleO on OpenStack -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-on-openstack - -This is intended as a new way to do a TripleO deployment in a virtualized -environment. Rather than provisioning the target virtual machines directly -via virsh, we would be able to use the standard OpenStack apis to create and -manage the instances. This should make virtual TripleO environments more -scalable and easier to manage. - -Ultimately the goal would be to make it possible to do virtual TripleO -deployments on any OpenStack cloud, except where necessary features have -explicitly been disabled. We would like to have the needed features -available on the public clouds used for OpenStack CI, so existing providers -are invited to review this specification. - -Problem Description -=================== - -TripleO development and testing requires a lot of hardware resources, and -this is only going to increase as things like HA are enabled by default. -In addition, we are going to want to be able to test larger deployments than -will fit on a single physical machine. While it would be possible to set -this up manually, OpenStack already provides services capable of managing -a large number of physical hosts and virtual machines, so it doesn't make -sense to reinvent the wheel. - -Proposed Change -=============== - -* Write a virtual power driver for OpenStack instances. I already have a - rough version for nova-baremetal, but it needs a fair amount of cleaning up - before it could be merged into the main codebase. 
We will also need to - work with the Ironic team to enable this functionality there. - -* Determine whether changes are needed in Neutron to allow us to run our own - DHCP server, and if so work with the Neutron team to make those changes. - This will probably require allowing an instance to be booted without any - ip assigned. If not, booting an instance without an IP would be a good - future enhancement to avoid wasting IP quota. - -* Likewise, determine how to use virtual ips with keepalived/corosync+pacemaker - in Neutron, and if changes to Neutron are needed work with their team to - enable that functionality. - -* Enable PXE booting in Nova. There is already a bug open to track this - feature request, but it seems to have been abandoned. See the link in the - References section of this document. Ideally this should be enabled on a - per-instance basis so it doesn't require a specialized compute node, which - would not allow us to run on a standard public cloud. - -* For performance and feature parity with the current virtual devtest - environment, we will want to be allow the use of unsafe caching for the - virtual baremetal instances. - -* Once all of the OpenStack services support this use case we will want to - convert our CI environment to a standard OpenStack KVM cloud, as well as - deprecate the existing method of running TripleO virtually and enable - devtest to install and configure a local OpenStack installation (possibly - using devstack) on which to run. - -* Depending on the state of our container support at that time, we may want - to run the devtest OpenStack using containers to avoid taking over the host - system the way devstack normally does. This may call for its own spec when - we reach that point. - -Alternatives ------------- - -* There's no real alternative to writing a virtual power driver. We have to - be able to manage OpenStack instances as baremetal nodes for this to work. - -* Creating a flat Neutron network connected to a local bridge can address the - issues with Neutron not allowing DHCP traffic, but that only works if you - have access to create the local bridge and configure the new network. This - may not be true in many (all?) public cloud providers. - -* I have not done any work with virtual IP addresses in Neutron yet, so it's - unclear to me whether any alternatives exist for that. - -* As noted earlier, using an iPXE image can allow PXE booting of Nova - instances. However, because that image is overwritten during the deploy, - it is not possible to PXE boot the instance afterward. Making the TripleO - images bootable on their own might be an option, but it would diverge from - how a real baremetal environment would work and thus is probably not - desirable. - -Deploy overcloud without PXE boot -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Since a number of the complications around doing TripleO development on an -OpenStack cloud relate to PXE booting the instances, one option that could -be useful in some situations is the ability to deploy images directly. Since -we're using Heat for deployments, it should be possible to build the TripleO -images with the ``vm`` element and deploy them as regular instances instead of -fake baremetal ones. - -This has the drawback of not exercising as much of the TripleO baremetal -functionality as a full virtual PXE boot process, but it should be easier to -implement, and for some development work not related to the deploy process -would be sufficient for verifying that a feature works as intended. 
It might -serve as a good intermediate step while we work to enable full PXE boot -functionality in OpenStack clouds. - -It would also prevent exercising HA functionality because we would likely not -be able to use virtual IP addresses if we can't use DHCP/PXE to manage our -own networking environment. - -Security Impact ---------------- - -* The virtual power driver is going to need access to OpenStack - credentials so it can control the instances. - -* The Neutron changes to allow private networks to behave as flat networks - may have security impacts, though I'm not exactly sure what they would be. - The same applies to virtual IP support. - -* PXE booting instances could in theory allow an attacker to override the - DHCP server and boot arbitrary images, but in order to do that they would - already need to have access to the private network being used, so I don't - consider this a significant new threat. - -Other End User Impact ---------------------- - -End users doing proof of concepts using a virtual deployment environment -would need to be switched to this new method, but that should be largely -taken care of by the necessary changes to devtest since that's what would -be used for such a deployment. - -Performance Impact ------------------- - -In my testing, my OpenStack virtual power driver was significantly slower -than the existing virsh-based one, but I believe with a better implementation -that could be easily solved. - -When running TripleO on a public cloud, a developer would be subject to the -usual limitations of shared hardware - a given resource may be oversubscribed -and cause performance issues for the processing or disk-heavy operations done -by a TripleO deployment. - -Other Deployer Impact ---------------------- - -This is not intended to be visible to regular deployers, but it should -make our CI environment more flexible by allowing more dynamic allocation -of resources. - -Developer Impact ----------------- - -If this becomes the primary method of doing TripleO development, devtest would -need to be altered to either point at an existing OpenStack environment or -to configure a local one itself. This will have an impact on how developers -debug problems with their environment, but since they would be debugging -OpenStack in that case it should be beneficial in the long run. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - bnemec - -Other contributors: - jang - -Work Items ----------- -* Implement an Ironic OpenStack virtual power driver. - -* Implement a nova-baremetal OpenStack virtual power driver, probably out - of tree based on the feedback we're getting from Nova and Ironic. - -* Enable PXE booting of Nova instances. - -* Enable unsafe caching to be enabled on Nova instances. - -* Allow DHCP/PXE traffic on private networks in Neutron. - -* If not already covered by the previous point, allow booting of instances - without IP addresses. - -* Migrate CI to use an OpenStack cloud for its virtual baremetal instances. - -* Migrate devtest to install and configure an OpenStack cloud instead of - managing instances and networking manually. - -* To simplify the VM provisioning process, we should make it possible to - provision but not boot a Nova VM. - - -Dependencies -============ - -The Ironic, Neutron, and Nova changes in the Work Items section will all have -to be done before TripleO can fully adopt this feature. 
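As a rough illustration of the virtual power driver work items, a driver
backed by Nova could look something like the sketch below. The class shape
and the injected, pre-authenticated novaclient handle are assumptions for
illustration, not the eventual Ironic or nova-baremetal driver interface:

.. code-block:: python

    # Illustrative sketch; assumes the caller supplies an authenticated
    # python-novaclient Client instance.


    class OpenStackPowerDriver(object):
        """Drive a Nova instance as if it were a baremetal node's power."""

        def __init__(self, nova_client, instance_uuid):
            self.nova = nova_client
            self.instance_uuid = instance_uuid

        def power_on(self):
            server = self.nova.servers.get(self.instance_uuid)
            if server.status != 'ACTIVE':
                self.nova.servers.start(server)

        def power_off(self):
            server = self.nova.servers.get(self.instance_uuid)
            if server.status != 'SHUTOFF':
                self.nova.servers.stop(server)

        def reboot(self):
            server = self.nova.servers.get(self.instance_uuid)
            self.nova.servers.reboot(server, reboot_type='HARD')

        def power_state(self):
            return self.nova.servers.get(self.instance_uuid).status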
- - -Testing -======= - -* All changes in the other projects will be unit and functional tested as - would any other new feature. - -* We cannot test this functionality by running devstack to provision an - OpenStack cloud in a gate VM, such as would be done for Tempest, because - the performance of the nested qemu virtual machines would make the process - prohibitively slow. We will need to have a baremetal OpenStack deployment - that can be targeted by the tests. A similar problem exists today with - virsh instances, however, and it can probably be solved in a similar - fashion with dedicated CI environments. - -* We will need to have Tempest tests gating on all the projects we use to - exercise the functionality we depend on. This should be largely covered - by the functional tests for the first point, but it's possible we will find - TripleO-specific scenarios that need to be added as well. - - -Documentation Impact -==================== - -devtest will need to be updated to reflect the new setup steps needed to run -it against an OpenStack-based environment. - - -References -========== - -This is largely based on the discussion Devtest on OpenStack in -https://etherpad.openstack.org/p/devtest-env-reqs - -Nova bug requesting PXE booting support: -https://bugs.launchpad.net/nova/+bug/1183885 diff --git a/specs/juno/unit-testing.rst b/specs/juno/unit-testing.rst deleted file mode 100644 index f3022de2..00000000 --- a/specs/juno/unit-testing.rst +++ /dev/null @@ -1,187 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Unit Testing TripleO Projects -========================================== - -https://blueprints.launchpad.net/tripleo/unit-testing - -We should enable more unit testing in TripleO projects to allow better test -coverage of code paths not included in CI, make it easier for reviewers -to verify that a code change does what it is supposed to, and avoid wasting -reviewer and developer time resolving style issues. - -Problem Description -=================== - -Right now there is very little unit testing of the code in most of the TripleO -projects. This has a few negative effects: - -- We have no test coverage of any code that isn't included in our CI runs. - -- For the code that is included in CI runs, we don't actually know how much - of that code is being tested. There may be many code branches that are not - used during a CI run. - -- We have no way to test code changes in isolation, which makes it slower to - iterate on them. - -- Changes not covered by CI are either not tested at all or must be manually - tested by reviewers, which is tedious and error-prone. - -- Major refactorings frequently break less commonly used interfaces to tools - because those interfaces are not tested. - -Additionally, because there are few/no hacking-style checks in the TripleO -projects, many patches get -1'd for style issues that could be caught by -an automated tool. This causes unnecessary delay in merging changes. - -Proposed Change -=============== - -I would like to build out a unit testing framework that simplifies the -process of unit testing in TripleO. Once that is done, we should start -requiring unit tests for new and changed features like the other OpenStack -projects do. At that point we can also begin adding test coverage for -existing code. 
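As a sketch of the kind of test such a framework should make easy to write
(the element script path and the expected behaviour here are invented for
illustration):

.. code-block:: python

    # Illustrative only; the script under test is a hypothetical element.
    import subprocess
    import unittest


    class TestExampleElementScript(unittest.TestCase):

        SCRIPT = 'elements/example-element/bin/example-script'  # hypothetical

        def test_fails_cleanly_without_arguments(self):
            proc = subprocess.Popen(
                ['bash', self.SCRIPT],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            _, stderr = proc.communicate()
            # The script is expected to print usage information and exit
            # with a non-zero status when called without arguments.
            self.assertNotEqual(0, proc.returncode)
            self.assertIn(b'usage', stderr.lower())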
- -The current plan is to make use of Python unit testing libraries to be as -consistent as possible with the rest of OpenStack and make use of the test -infrastructure that already exists. This will reduce the amount of new code -required and make it easier for developers to begin writing unit tests. - -For style checking, the dib-lint tool has already been created to catch -common errors in image elements. More rules should be added to it as we -find problems that can be automatically found. It should also be applied -to the tripleo-image-elements project. - -The bashate project also provides some general style checks that would be -useful in TripleO, so we should begin making use of it as well. We should -also contribute additional checks when possible and provide feedback on any -checks we disagree with. - -Any unit tests added should be able to run in parallel. This both speeds up -testing and helps find race bugs. - -Alternatives ------------- - -Shell unit testing -^^^^^^^^^^^^^^^^^^ -Because of the quantity of bash code used in TripleO, we may want to -investigate using a shell unit test framework in addition to Python. I -think this can be revisited once we are further along in the process and -have a better understanding of how difficult it will be to unit test our -scripts with Python. I still think we should start with Python for the -reasons above and only add other options if we find something that Python -unit tests can't satisfy. - -One possible benefit of a shell-specific unit testing framework is that it -could provide test coverage stats so we know exactly what code is and isn't -being tested. - -If we determine that a shell unit test framework is needed, we should try -to choose a widely-used one with well-understood workflows to ease adoption. - -Sandboxing -^^^^^^^^^^ -I have done some initial experimentation with using fakeroot/fakechroot to -sandbox scripts that expect to have access to the root filesystem. I was -able to run a script that writes to root-owned files as a regular user, making -it think it was writing to the real files, but I haven't gotten this working -with tox for running unit tests that way. - -Another option would be to use real chroots. This would provide isolation -and is probably more common than fakeroots. The drawback would be that -chrooting requires root access on the host machine, so running the unit tests -would as well. - -Security Impact ---------------- - -Many scripts in elements assume they will be running as root. We obviously -don't want to do that in unit tests, so we need a way to sandbox those scripts -to allow them to run but not affect the test system's root filesystem. - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -Adding more tests will increase the amount of time Jenkins gate jobs take. -This should have minimal real impact though, because unit tests should run -in significantly less time than the integration tests. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Developers will need to implement unit tests for their code changes, which -will require learning the unit testing tools we adopt. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - bnemec - -goneri has begun some work to enable dib-lint in tripleo-image-elements - -Work Items ----------- - -* Provide and document a good Python framework for testing the behavior of - bash scripts. 
Use existing functionality in upstream projects where - possible, and contribute new features when necessary. - -* Gate tripleo-image-elements on dib-lint, which will require fixing any - lint failures currently in tripleo-image-elements. - -* Enable bashate in the projects with a lot of bash scripts. - -* Add unit-testing to tripleo-incubator to enable verification of things - like ``devtest.sh --build-only``. - -* Add a template validation test job to triple-heat-templates. - -Dependencies -============ - -* bashate will be a new test dependency. - -Testing -======= - -These changes should leverage the existing test infrastructure as much as -possible, so the only thing needed to enable the new tests would be changes -to the infra config for the affected projects. - -Documentation Impact -==================== - -None of this work should be user-visible, but we may need developer -documentation to help with writing unit tests. - - -References -========== - -bashate: http://git.openstack.org/cgit/openstack-dev/bashate/ - -There are some notes related to this spec at the bottom of the Summit -etherpad: https://etherpad.openstack.org/p/juno-summit-tripleo-ci diff --git a/specs/juno/virtual-public-ips.rst b/specs/juno/virtual-public-ips.rst deleted file mode 100644 index 84021175..00000000 --- a/specs/juno/virtual-public-ips.rst +++ /dev/null @@ -1,159 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================ -Virtual IPs for public addresses -================================ - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+specs/tripleo-juno-virtual-public-ips - -The current public IP feature is intended to specify the endpoint that a cloud -can be reached at. This is typically something where HA is highly desirable. - -Making the public IP be a virtual IP instead of locally bound to a single -machine should increase the availability of the clustered service, once we -increase the control plane scale to more than one machine. - -Problem Description -=================== - -Today, we run all OpenStack services with listening ports on one virtual IP. - -This means that we're exposing RabbitMQ, MySQL and possibly other cluster-only -services to the world, when really what we want is public services exposed to -the world and cluster only servers not exposed to the world. Deployers are -(rightfully) not exposing our all-services VIP to the world, which leads to -them having to choose between a) no support for externally visible endpoints, -b) all services attackable or c) manually tracking the involved ports and -playing a catch-up game as we evolve things. - -Proposed Change -=============== - -Create a second virtual IP from a user supplied network. Bind additional copies -of API endpoints that should be publically accessible to that virtual IP. We -need to keep presenting them internally as well (still via haproxy and the -control virtual IP) so that servers without any public connectivity such as -hypervisors can still use the APIs (though they may need to override the IP to -use in their hosts files - we have facilities for that already). - -The second virtual IP could in principle be on a dedicated ethernet card, or -on a VLAN on a shared card. For now, lets require the admin to specify the -interface on which keepalived should be provisioning the shared IP - be that -``br-ctlplane``, ``vlan25`` or ``eth2``. 
Because the network topology may be -independent, the keepalive quorum checks need to take place on the specified -interface even though this costs external IP addresses. - -The user must be able to specify the same undercloud network as they do today -so that small installs are not made impossible - requiring two distinct -networks is likely hard for small organisations. Using the same network would -not imply using the same IP address - a dedicated IP address will still be -useful to permit better testing confidence and also allows for simple exterior -firewalling of the cluster. - -Alternatives ------------- - -We could not do HA for the public endpoints - not really an option. - -We could not do public endpoints and instead document how to provide border -gateway firewalling and NAT through to the endpoints. This just shifts the -problem onto infrastructure we are not deploying, making it harder to deploy. - -Security Impact ---------------- - -Our security story improves by making this change, as we can potentially -start firewalling the intra-cluster virtual IP to only allow known nodes to -connect. Short of that, our security story has improved since we started -binding to specific ips only, as that made opening a new IP address not -actually expose core services (other than ssh) on it. - -Other End User Impact ---------------------- - -End users will need to be able to find out about the new virtual IP. That -should be straight forward via our existing mechanisms. - -Performance Impact ------------------- - -None anticipated. - -Other Deployer Impact ---------------------- - -Deployers will require an additional IP address either on their undercloud -ctlplane network (small installs) or on their public network (larger/production -installs). - -Developer Impact ----------------- - -None expected. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - lifeless (hahahaha) - -Other contributors: - None. - -Work Items ----------- - -* Generalise keepalived.conf to support multiple VRRP interfaces. - -* Add support for binding multiple IPs to the haproxy configuration. - -* Add logic to incubator and/or heat templates to request a second virtual IP. - -* Change heat templates to bind public services to the public virtual IP. - -* Possibly tweak setup-endpoints to cooperate, though the prior support - should be sufficient. - -These are out of scope for this, but necessary to use it - I intend to put -them in the discussion in Dan's network overhaul spec. - -* Add optional support to our heat templates to boot the machines with two - nics, not just one - so that we have an IP address for the public interface - when its a physical interface. We may find there are ordering / enumeration - issues in Nova/Ironic/Neutron to solve here. - -* Add optional support to our heat templates for statically allocating a port - from neutron and passing it into the control plane for when we're using - VLANs. - -Dependencies -============ - -None. - -Testing -======= - -This will be on by default, so our default CI path will exercise it. - -Additionally we'll be using it in the up coming VLAN test job which will -give us confidence it works when the networks are partitoned. - -Documentation Impact -==================== - -Add to the manual is the main thing. - -References -========== - -None diff --git a/specs/kilo/cinder_ha.rst b/specs/kilo/cinder_ha.rst deleted file mode 100644 index 71d33611..00000000 --- a/specs/kilo/cinder_ha.rst +++ /dev/null @@ -1,183 +0,0 @@ -.. 
- This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========= -Cinder HA -========= - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-kilo-cinder-ha - -Ensure Cinder volumes remain available if one or multiple nodes running -Cinder services or hosting volumes go down. - - -Problem Description -=================== - -TripleO currently deploys Cinder without a shared storage, balancing requests -amongst the nodes. Should one of the nodes running `cinder-volume` fail, -requests for volumes hosted by that node will fail as well. In addition to that, -without a shared storage, should a disk of any of the `cinder-volume` nodes -fail, volumes hosted by that node would be lost forever. - - -Proposed Change -=============== - -Overview --------- - -We aim at introducing support for the configuration of Cinder's Ceph backend -driver and for the deployment of a Ceph storage for use with Cinder. - -Such a scenario will install `ceph-osd` on an arbitrary number of Ceph storage -nodes and `cinder-api`, `cinder-scheduler`, `cinder-volume` and `ceph-mon` on -the controller nodes, allowing users to scale out the Ceph storage nodes -independently from the controller nodes. - -To ensure HA of the volumes, these will be then hosted on the Ceph storage and -to achieve HA for the `cinder-volume` service, all Cinder nodes will use a -shared string as their `host` config setting so that will be able to operate -on the entire (and shared) set of volumes. - -Support for configuration of more drivers could be added later. - -Alternatives ------------- - -An alternative approach could be to deploy the `cinder-volume` services in an -active/standby configuration. This would allow us to support scenarios where the -storage is not shared amongst the Cinder nodes, one of which is for example -LVM over a shared Fiber Channel LUNs. Such a scenario would suffer from -downsides though, it won't permit to scale out and balance traffic over the -storage nodes as easily and may be prone to issues related to the iSCSI session -management on failover. - -A different scenario, based instead on the usage of LVM and DRBD combined, could -be imagined too. Yet this would suffer from downsides as well. The deployment -program would be put in charge of managing the replicas and probably required to -have some understanding of the replicas status as well. These are easily covered -by Ceph itself which takes care of more related problems indeed, like data -rebalancing, or replicas recreation. - -Security Impact ---------------- - -By introducing support for the deployment of the Ceph's tools, we will have to -secure the Ceph services. - -We will allow access to the data hosted by Ceph only to authorized hosts via -usage of `cephx` for authentication, distributing the `cephx` keyrings on the -relevant nodes. Controller nodes will be provisioned with the `ceph.mon` -keyring, with the `client.admin` keyring and the `client.cinder` keyring, -Compute nodes will be provisioned with the `client.cinder` secret in libvirt and -lastly the Ceph storage nodes will be provisioned with the `client.admin` -keyring. - -It is to be said that monitors should not be reachable from the public -network, despite being hosted on the Controllers. 
Also Cinder won't need -to get access to the monitors' keyring nor the `client.admin` keyring but -those will be hosted on same host as Controllers also run the Ceph monitor -service; Cinder config will not provide any knowledge about those though. - -Other End User Impact ---------------------- - -Cinder volumes as well as Cinder services will remain available despite failure -of one (or more depending on scaling setting) of the Controller nodes or Ceph -storage nodes. - -Performance Impact ------------------- - -The `cinder-api` services will remain balanced and the Controller nodes unloaded -of the LVM-file overhead and the iSCSI traffic so this topology should, as an -additional benefit, improve performances. - -Other Deployer Impact ---------------------- - -* Automated setup of Cinder HA will require the deployment of Ceph. - -* To take advantage of a pre-existing Ceph installation instead of deploying it - via TripleO, deployers will have to provide the input data needed to configure - Cinder's backend driver appropriately - -* It will be possible to scale the number of Ceph storage nodes at any time, as - well as the number of Controllers (running `cinder-volume`) but changing the - backend driver won't be supported as there are no plans to support volumes - migration. - -* Not all Cinder drivers support the scenario where multiple instances of the - `cinder-volume` service use a shared `host` string, notably the default LVM - driver does not. We will use this setting only when appropriate config params - are found in the Heat template, as it happens today with the param called - `include_nfs_backend`. - -* Ceph storage nodes, running the `ceph-osd` service, use the network to - maintain replicas' consistency and as such may transfer some large amount of - data over the network. Ceph allows for the OSD service to differentiate - between a public network and a cluster network for this purpose. This spec - is not going to introduce support for usage of a dedicated cluster network - but we want to have a follow-up spec to implement support for that later. - -Developer Impact ----------------- - -Cinder will continue to be configured with the LVM backend driver by default. - -Developers interested in testing Cinder with the Ceph shared storage will have -to use an appropriate scaling setting for the Ceph storage nodes. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - gfidente - -Other contributors: - jprovazn - -Work Items ----------- - -* add support for deployment of Cinder's Ceph backend driver - -* add support for deployment of the Ceph services - -* add support for external configuration of Cinder's Ceph backend driver - - -Dependencies -============ - -None. - - -Testing -======= - -Will be testable in CI when support for the deployment of the shared Ceph -storage nodes becomes available in TripleO itself. - - -Documentation Impact -==================== - -We will need to provide documentation on how users can deploy Cinder together -with the Ceph storage nodes and also on how users can use instead some -pre-existing Ceph deployment. - - -References -========== - -juno mid-cycle meetup -kilo design session, https://etherpad.openstack.org/p/tripleo-kilo-l3-and-cinder-ha diff --git a/specs/kilo/remove-mergepy.rst b/specs/kilo/remove-mergepy.rst deleted file mode 100644 index 9313fe5a..00000000 --- a/specs/kilo/remove-mergepy.rst +++ /dev/null @@ -1,486 +0,0 @@ -.. 
- This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================================== -Remove merge.py from TripleO Heat Templates -=========================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-juno-remove-mergepy - -``merge.py`` is where we've historically accumulated the technical debt for our -Heat templates [0]_ with the intention of migrating away from it when Heat meets -our templating needs. - -Its main functionality includes combining smaller template snippets into a -single template describing the full TripleO deployment, merging certain -resources together to reduce duplication while keeping the snippets themselves -functional as standalone templates and a support for manual scaling of Heat -resources. - -This spec describes the changes necessary to move towards templates -that do not depend on ``merge.py``. We will use native Heat features -where we can and document the rest, possibly driving new additions to -the Heat template format. - -It is largely based on the April 2014 discussion in openstack-dev [1]_. - - -Problem Description -=================== - -Because of the mostly undocumented nature of ``merge.py`` our templates are -difficult to understand or modify by newcomers (even those already familiar with -Heat). - -It has always been considered a short-term measure and Heat can now provide most -of what we need in our templates. - - -Proposed Change -=============== - -We will start with making small correctness-preserving changes to our -templates and ``merge.py`` that move us onto using more Heat native -features. Where we cannot make the change for some reason, we will -file a bug with Heat and work with them to unblock the process. - -Once we get to a point where we have to do large changes to the -structure of our templates, we will split them off to new files and -enable them in our CI as parallel implementations. - -Once we are confident that the new templates fulfill the same -requirements as the original ones, we will deprecate the old ones, -deprecate ``merge.py`` and switch to the new ones as the default. - -The list of action items necessary for the full transition is -below. - -**1. Remove the custom resource types** - -TripleO Heat templates and ``merge.py`` carry two custom types that (after the -move to software config [8]_, [9]_) are no longer used for anything: - -* OpenStack::ImageBuilder::Elements -* OpenStack::Role - -We will drop them from the templates and deprecate in the merge tool. - - -**2. Remove combining whitelisted resource types** - -If we have two ``AWS::AutoScaling::LaunchConfiguration`` resources with the same -name, ``merge.py`` will combine their ``Properties`` and ``Metadata``. Our -templates are no longer using this after the software-config update. - - -**3. Port TripleO Heat templates to HOT** - -With most of the non-Heat syntax out of the way, porting our CFN/YAML templates -to pure HOT format [2]_ should be straightforward. - -We will have to update ``merge.py`` as well. We should be able to support both -the old format and HOT. - -We should be able to differentiate between the two by looking for the -``heat_template_version`` top-level section which is mandatory in the HOT -syntax. - -Most of the changes to ``merge.py`` should be around spelling (``Parameters`` -> -``parameters``, ``Resources`` -> ``resources``) and different names for -intrinsic functions, etc. 
(``Fn::GetAtt`` -> ``get_attr``). - -This task will require syntactic changes to all of our templates and -unfortunately, it isn't something different people can update bit by bit. We -should be able to update the undercloud and overcloud portions separately, but -we can't e.g. just update a part of the overcloud. We are still putting -templates together with ``merge.py`` at this point and we would end up with a -template that has both CFN and HOT bits. - - -**4. Move to Provider resources** - -Heat allows passing-in multiple templates when deploying a stack. These -templates can map to custom resource types. Each template would represent a role -(compute server, controller, block storage, etc.) and its ``parameters`` and -``outputs`` would map to the custom resource's ``properties`` and -``attributes``. - -These roles will be referenced from a master template (``overcloud.yaml``, -``undercloud.yaml``) and eventually wrapped in a scaling resource -(``OS::Heat::ResourceGroup`` [5]_) or whatever scaling mechanism we adopt. - -.. note:: Provider resources represent fully functional standalone templates. - Any provider resource template can be passed to Heat and turned into a - stack or treated as a custom resource in a larger deployment. - -Here's a hypothetical outline of ``compute.yaml``:: - - parameters: - flavor: - type: string - image: - type: string - amqp_host: - type: string - nova_compute_driver: - type: string - - resources: - compute_instance: - type: OS::Nova::Server - properties: - flavor: {get_param: flavor} - image: {get_param: image} - - compute_deployment: - type: OS::Heat::StructuredDeployment - properties: - server: {ref: compute_instance} - config: {ref: compute_config} - input_values: - amqp_host: {get_param: amqp_host} - nova_compute_driver: {get_param: nova_compute_driver} - - compute_config: - type: OS::Heat::StructuredConfig - properties: - group: os-apply-config - config: - amqp: - host: {get_input: amqp_host} - nova: - compute_driver: {get_input: nova_compute_driver} - ... - -We will use a similar structure for all the other roles (``controller.yaml``, -``block-storage.yaml``, ``swift-storage.yaml``, etc.). That is, each role will -contain the ``OS::Nova::Server``, the associated deployments and any other -resources required (random string generators, security groups, ports, floating -IPs, etc.). - -We can map the roles to custom types using Heat environments [4]_. - -``role_map.yaml``: :: - - resource_registry: - OS::TripleO::Compute: compute.yaml - OS::TripleO::Controller: controller.yaml - OS::TripleO::BlockStorage: block-storage.yaml - OS::TripleO::SwiftStorage: swift-storage.yaml - - -Lastly, we'll have a master template that puts it all together. - -``overcloud.yaml``:: - - parameters: - compute_flavor: - type: string - compute_image: - type: string - compute_amqp_host: - type: string - compute_driver: - type: string - ... - - resources: - compute0: - # defined in controller.yaml, type mapping in role_map.yaml - type: OS::TripleO::Compute - parameters: - flavor: {get_param: compute_flavor} - image: {get_param: compute_image} - amqp_host: {get_param: compute_amqp_host} - nova_compute_driver: {get_param: compute_driver} - - controller0: - # defined in controller.yaml, type mapping in role_map.yaml - type: OS::TripleO::Controller - parameters: - flavor: {get_param: controller_flavor} - image: {get_param: controller_image} - ... 
- - outputs: - keystone_url: - description: URL for the Overcloud Keystone service - # `keystone_url` is an output defined in the `controller.yaml` template. - # We're referencing it here to expose it to the Heat user. - value: { get_attr: [controller_0, keystone_url] } - -and similarly for ``undercloud.yaml``. - -.. note:: The individual roles (``compute.yaml``, ``controller.yaml``) are - structured in such a way that they can be launched as standalone - stacks (i.e. in order to test the compute instance, one can type - ``heat stack-create -f compute.yaml -P ...``). Indeed, Heat treats - provider resources as nested stacks internally. - - -**5. Remove FileInclude from ``merge.py``** - -The goal of ``FileInclude`` was to keep individual Roles (to borrow a -loaded term from TripleO UI) viable as templates that can be launched -standalone. The canonical example is ``nova-compute-instance.yaml`` [3]_. - -With the migration to provider resources, ``FileInclude`` is not necessary. - - -**6. Move the templates to Heat-native scaling** - -Scaling of resources is currently handled by ``merge.py``. The ``--scale`` -command line argument takes a resource name and duplicates it as needed (it's -a bit more complicated than that, but that's beside the point). - -Heat has a native scaling ``OS::Heat::ResourceGroup`` [5]_ resource that does -essentially the same thing:: - - scaled_compute: - type: OS::Heat::ResourceGroup - properties: - count: 42 - resource_def: - type: OS::TripleO::Compute - parameters: - flavor: baremetal - image: compute-image-rhel7 - ... - -This will create 42 instances of compute hosts. - - -**7. Replace Merge::Map with scaling groups' inner attributes** - -We are using the custom ``Merge::Map`` helper function for getting values out of -scaled-out servers: - -* `Building a comma-separated list of RabbitMQ nodes`__ - -__ https://github.com/openstack/tripleo-heat-templates/blob/a7f2a2c928e9c78a18defb68feb40da8c7eb95d6/overcloud-source.yaml#L642 - -* `Getting the name of the first controller node`__ - -__ https://github.com/openstack/tripleo-heat-templates/blob/a7f2a2c928e9c78a18defb68feb40da8c7eb95d6/overcloud-source.yaml#L405 - -* `List of IP addresses of all controllers`__ - -__ https://github.com/openstack/tripleo-heat-templates/blob/a7f2a2c928e9c78a18defb68feb40da8c7eb95d6/overcloud-source.yaml#L405 - -* `Building the /etc/hosts file`__ - -__ https://github.com/openstack/tripleo-heat-templates/blob/a7f2a2c928e9c78a18defb68feb40da8c7eb95d6/overcloud-source.yaml#L585 - - -The ``ResourceGroup`` resource supports selecting an attribute of an inner -resource as well as getting the same attribute from all resources and returning -them as a list. - -Example of getting an IP address of the controller node: :: - - {get_attr: [controller_group, resource.0.networks, ctlplane, 0]} - -(`controller_group` is the `ResourceGroup` of our controller nodes, `ctlplane` -is the name of our control plane network) - -Example of getting the list of names of all of the controller nodes: :: - - {get_attr: [controller_group, name]} - -The more complex uses of ``Merge::Map`` involve formatting the returned data in -some way, for example building a list of ``{ip: ..., name: ...}`` dictionaries -for haproxy or generating the ``/etc/hosts`` file. - -Since our ResourceGroups will not be using Nova servers directly, but rather the -custom role types using provider resources and environments, we can put this -data formatting into the role's ``outputs`` section and then use the same -mechanism as above. 
- -Example of building out the haproxy node entries:: - - # overcloud.yaml: - resources: - controller_group: - type: OS::Heat::ResourceGroup - properties: - count: {get_param: controller_scale} - resource_def: - type: OS::TripleO::Controller - properties: - ... - - controllerConfig: - type: OS::Heat::StructuredConfig - properties: - ... - haproxy: - nodes: {get_attr: [controller_group, haproxy_node_entry]} - - - - # controller.yaml: - resources: - ... - controller: - type: OS::Nova::Server - properties: - ... - - outputs: - haproxy_node_entry: - description: A {ip: ..., name: ...} dictionary for configuring the - haproxy node - value: - ip: {get_attr: [controller, networks, ctlplane, 0]} - name: {get_attr: [controller, name]} - - - -Alternatives ------------- - -This proposal is very t-h-t and Heat specific. One alternative is to do nothing -and keep using and evolving ``merge.py``. That was never the intent, and most -members of the core team do not consider this a viable long-term option. - - -Security Impact ---------------- - -This proposal does not affect the overall functionality of TripleO in any way. -It just changes the way TripleO Heat templates are stored and written. - -If anything, this will move us towards more standard and thus more easily -auditable templates. - - -Other End User Impact ---------------------- - -There should be no impact for the users of vanilla TripleO. - -More advanced users may want to customise the existing Heat templates or write -their own. That will be made easier when we rely on standard Heat features only. - - -Performance Impact ------------------- - -This moves some of the template-assembling burden from ``merge.py`` to Heat. It -will likely also end up producing more resources and nested stacks on the -background. - -As far as we're aware, no one has tested these features at the scale we are -inevitably going to hit. - -Before we land changes that can affect this (provider config and scaling) we -need to have scale tests in Tempest running TripleO to make sure Heat can cope. - -These tests can be modeled after the `large_ops`_ scenario: a Heat template that -creates and destroys a stack of 50 Nova server resources with associated -software configs. - -We should have two tests to asses the before and after performance: - -1. A single HOT template with 50 copies of the same server resource and software - config/deployment. -2. A template with a single server and its software config/deploys, an - environment file with a custom type mapping and an overall template that - wraps the new type in a ResourceGroup with the count of 50. - -.. _large_ops: https://github.com/openstack/tempest/blob/master/tempest/scenario/test_large_ops.py - - -Other Deployer Impact ---------------------- - -Deployers can keep using ``merge.py`` and the existing Heat templates as before --- existing scripts ought not break. - -With the new templates, Heat will be called directly and will need the resource -registry (in a Heat environment file). This will mean a change in the deployment -process. - - - -Developer Impact ----------------- - -This should not affect non-Heat and non-TripleO OpenStack developers. - -There will likely be a slight learning curve for the TripleO developers who want -to write and understand our Heat templates. Chances are, we will also encounter -bugs or unforeseen complications while swapping ``merge.py`` for Heat features. - -The impact on Heat developers would involve processing the bugs and feature -requests we uncover. 
This will hopefully not be an avalanche. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Tomas Sedovic - - -Work Items ----------- - -1. Remove the custom resource types -2. Remove combining whitelisted resource types -3. Port TripleO Heat templates to HOT -4. Move to Provider resources -5. Remove FileInclude from ``merge.py`` -6. Move the templates to Heat-native scaling -7. Replace Merge::Map with scaling groups' inner attributes - - -Dependencies -============ - -* The Juno release of Heat -* Being able to kill specific nodes in Heat (for scaling down or because they're - misbehaving) - - Relevant Heat blueprint: `autoscaling-parameters`_ - -.. _autoscaling-parameters: https://blueprints.launchpad.net/heat/+spec/autoscaling-parameters - - -Testing -======= - -All of these changes will be made to the tripleo-heat-templates repository and -should be testable by our CI just as any other t-h-t change. - -In addition, we will need to add Tempest scenarios for scale to ensure Heat can -handle the load. - - -Documentation Impact -==================== - -We will need to update the `devtest`_, `Deploying TripleO`_ and `Using TripleO`_ -documentation and create a guide for writing TripleO templates. - -.. _devtest: http://docs.openstack.org/developer/tripleo-incubator/devtest.html -.. _Deploying TripleO: http://docs.openstack.org/developer/tripleo-incubator/deploying.html -.. _Using TripleO: http://docs.openstack.org/developer/tripleo-incubator/userguide.html - - -References -========== - -.. [0] https://github.com/openstack/tripleo-heat-templates -.. [1] http://lists.openstack.org/pipermail/openstack-dev/2014-April/031915.html -.. [2] http://docs.openstack.org/developer/heat/template_guide/hot_guide.html -.. [3] https://github.com/openstack/tripleo-heat-templates/blob/master/nova-compute-instance.yaml -.. [4] http://docs.openstack.org/developer/heat/template_guide/environment.html -.. [5] http://docs.openstack.org/developer/heat/template_guide/openstack.html#OS::Heat::ResourceGroup -.. [8] https://review.openstack.org/#/c/81666/ -.. [9] https://review.openstack.org/#/c/93319/ diff --git a/specs/kilo/tripleo-enable-dvr.rst b/specs/kilo/tripleo-enable-dvr.rst deleted file mode 100644 index 7d5a92c8..00000000 --- a/specs/kilo/tripleo-enable-dvr.rst +++ /dev/null @@ -1,169 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Enable Neutron DVR on overcloud in TripleO -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/support-neutron-dvr - -Neutron distributed virtual routing should be able to be configured in TripleO. - - -Problem Description -=================== - -To be able to enable distributed virtual routing in Neutron there needs to be -several changes to the current TripleO overcloud deployment. The overcloud -compute node(s) are constructed with the ``neutron-openvswitch-agent`` image -element, which provides the ``neutron-openvswitch-agent`` on the compute node. -In order to support distributed virtual routing, the compute node(s) must also -have the ``neutron-metadata-agent`` and ``neutron-l3-agent`` installed. The -installation of the ``neutron-l3-agent`` and ``neutron-dhcp-agent`` will need -also to be decoupled. 
- -Additionally, for distributed virtual routing to be enabled, the -``neutron.conf``, ``l3_agent.ini`` and ``ml2_conf.ini`` all need to have -additional settings. - -Proposed Change -=============== - -Overview --------- - -In the tripleo-image-elements, move the current ``neutron-network-node`` element -to an element named ``neutron-router``, which will be responsible for doing the -installation and configuration work required to install the ``neutron-l3-agent`` -and the ``neutron-metadata-agent``. This ``neutron-router`` element will list -the ``neutron-openvswitch-agent`` in its element-deps. The ``neutron-network --node`` element will then become simply a 'wrapper' whose sole purpose is to list -the dependencies required for a network node (neutron, ``neutron-dhcp-agent``, -``neutron-router``, os-refresh-config). - -Additionally, in the tripleo-image-elements/neutron element, the -``neutron.conf``, ``l3_agent.ini`` and ``plugins/ml2/ml2_conf.ini`` will be -modified to add the configuration variables required in each to support -distributed virtual routing (the required configuration variables are listed at -https://wiki.openstack.org/wiki/Neutron/DVR/HowTo#Configuration). - -In the tripleo-heat-templates, the ``nova-compute-config.yaml`` -``nova-compute-instance.yaml`` and ``overcloud-source.yaml`` files will be -modified to provide the correct settings for the new distributed virtual routing -variables. The enablement of distributed virtual routing will be determined by -a 'NeutronDVR' variable which will be 'False' by default (distributed virtual -routing not enabled) for backward compatibility, but can be set to 'True' if -distributed virtual routing is desired. - -Lastly, the tripleo-incubator script ``devtest_overcloud.sh`` will be modified -to: a) build the overcloud-compute disk-image with ``neutron-router`` rather -than with ``neutron-openvswitch-agent``, and b) configure the appropriate -parameter values to be passed in to the heat stack create for the overcloud so -that distributed routing is either enabled or disabled. - -Alternatives ------------- - -We could choose to make no change to the ``neutron-router`` image-element and -it can be included as well in the list of elements arguments to the disk image -build for compute nodes. This has the undesired effect of also -including/configuring and starting the ``neutron-dhcp-agent`` on each compute -node. Alternatively, it is possible to keep the ``neutron-network-node`` -element as it is and create a ``neutron-router`` element which is a copy of -most of the element contents of the ``neutron-network-node`` element but without -the dependency on the ``neutron-dhcp-agent`` element. This approach would -introduce a significant amount of code duplication. - -Security Impact ---------------- - -Although TripleO installation does not use FWaaS, enablement of DVR currently -is known to break FWaaS. -See https://blueprints.launchpad.net/neutron/+spec/neutron-dvr-fwaas - -Other End User Impact ---------------------- - -The user will have the ability to set an environment variable during install -which will determine whether distributed virtual routing is enabled or not. - -Performance Impact ------------------- - -None identified - -Other Deployer Impact ---------------------- - -The option to enable or disable distributed virtual routing at install time will -be added. By default distributed virtual routing will be disabled. 
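For illustration only, opting in at deployment time could look roughly like the
following Heat environment snippet; the ``NeutronDVR`` parameter name comes from
the proposal above, but the final template wiring may differ, so treat this as a
hedged sketch rather than the implemented interface::

    # Hypothetical environment file enabling the proposed opt-in flag.
    # 'NeutronDVR' defaults to 'False' (distributed routing disabled).
    parameters:
      NeutronDVR: 'True'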
- -Developer Impact ----------------- - -None identified - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Erik Colnick (erikcolnick on Launchpad) -Other contributors: - None - -Work Items ----------- - - * Create ``neutron-router`` element in tripleo-image-elements and move related - contents from ``neutron-network-node`` element. Remove the - ``neutron-dhcp-agent`` dependency from the element-deps of the - ``neutron-router`` element. - - * Add the ``neutron-router`` element as a dependency in the - ``neutron-network-node`` ``element-deps`` file. The ``element-deps`` - file becomes the only content in the ``neutron-network-node`` element. - - * Add the configuration values indicated in - https://wiki.openstack.org/wiki/Neutron/DVR/HowTo#Configuration to the - ``neutron.conf``, ``l3_agent.ini`` and ``ml2_conf.ini`` files in the - ``neutron`` image element. - - * Add the necessary reference variables to the ``nova-compute-config.yaml`` and - ``nova-compute-instance.yaml`` tripleo-heat-templates files in order to be - able to set the new variables in the config files (from above item). Add - definitions and default values in ``overcloud-source.yaml``. - - * Modify tripleo-incubator ``devtest_overcloud.sh`` script to set the - appropriate environment variables which will drive the configuration of - neutron on the overcloud to either enable distributed virtual routers or - disable distributed virtual routers (with disable as the default). - -Dependencies -============ - -None - -Testing -======= - -Existing TripleO CI will help ensure that as this is implemented, the current -feature set is not impacted and that the default behavior of disabled -distributed virtual routers is maintained. - -Additional CI tests which test the installation with distributed virtual -routers should be added as this implementation is completed. - -Documentation Impact -==================== - -Documentation of the new configuration option will be needed. - -References -========== - diff --git a/specs/kilo/tripleo-review-standards.rst b/specs/kilo/tripleo-review-standards.rst deleted file mode 100644 index 23e9b864..00000000 --- a/specs/kilo/tripleo-review-standards.rst +++ /dev/null @@ -1,144 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================== -TripleO Review Standards -======================== - -No launchpad blueprint because this isn't a spec to be implemented in code. - -Like many OpenStack projects, TripleO generally has more changes incoming to -the projects than it has core reviewers to review and approve those changes. -Because of this, optimizing reviewer bandwidth is important. This spec will -propose some changes to our review process discussed at the Paris OpenStack -Summit and intended to make the best possible use of core reviewer time. - -There are essentially two major areas that a reviewer looks at when reviewing -a given change: design and implementation. The design part of the review -covers things like whether the change fits with the overall direction of the -project and whether new code is organized in a reasonable fashion. The -implementation part of a review will get into smaller details, such as -whether language functionality is being used properly and whether the general -sections of the code identified in the design part of the review do what is -intended. 
- -Generally design is considered first, and then the reviewer will drill down to -the implementation details of the chosen design. - -Problem Description -=================== -Many times an overall design for a given change will be agreed upon early in -the change's lifecycle. The implementation for the design may then be -tweaked multiple times (due to rebases, or specific issues pointed out by -reviewers) without any changes to the overall design. Many times these -implementation details are small changes that shouldn't require much -review effort, but because of our current standard of 2 +2's on the current -patch set before a change can be approved, reviewers often must unnecessarily -revisit a change even when it is clear that everyone involved in the review -is in favor of it. - -Proposed Change -=============== - -Overview --------- - -When appropriate, allow a core reviewer to approve a change even if the -latest patch set does not have 2 +2's. Specifically, this should be used -under the following circumstances: - -* A change that has had multiple +2's on past patch sets, indicating an - agreement from the other reviewers that the overall design of the change - is good. -* Any further alterations to the change since the patch set(s) with +2's should - be implementation details only - trivial rebases, minor syntax changes, or - comment/documentation changes. Any more significant changes invalidate this - option. - -As always, core reviewers should use their judgment. When in doubt, waiting -for 2 +2's to approve a change is always acceptable, but this new policy is -intended to make it socially acceptable to single approve a change under the -circumstances described above. - -When approving a change in this manner, it is preferable to leave a comment -explaining why the change is being approved without 2 +2's. - -Alternatives ------------- - -Allowing a single +2 on "trivial" changes was also discussed, but there were -concerns from a number of people present that such a policy might cause more -trouble than it was worth, particularly since "trivial" changes by nature do -not require much review and therefore don't take up much reviewer time. - -Security Impact ---------------- - -Should be minimal to none. If a change between patch sets is significant -enough to have a security impact then this policy does not apply. - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Core reviewers will spend less time revisiting patches they have already -voted in favor of, and contributors should find it easier to get their -patches merged because they won't have to wait as long after rebases and -minor changes. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - bnemec - -Other contributors: - All cores should review and implement this spec in their reviewing - -Work Items ----------- - -Publish the agreed-upon guidelines somewhere more permanent than a spec. - - -Dependencies -============ - -None - -Testing -======= - -None - -Documentation Impact -==================== - -A new document will need to be created for core reviewers to reference. 
- - -References -========== - -https://etherpad.openstack.org/p/kilo-tripleo-summit-reviews diff --git a/specs/liberty/release-branch.rst b/specs/liberty/release-branch.rst deleted file mode 100644 index 59e3a051..00000000 --- a/specs/liberty/release-branch.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Release Branch proposal for TripleO -========================================== - -To date, the majority of folks consuming TripleO have been doing so via the -master branches of the various repos required to allow TripleO to deploy -an OpenStack cloud. This proposes an alternative "release branch" methodology -which should enable those consuming stable OpenStack releases to deploy -more easily using TripleO. - - -Problem Description -=================== - -Historically strong guarantees about deploying the current stable OpenStack -release have not been made, and it's not something we've been testing in -upstream CI. This is fine from a developer perspective, but it's a major -impediment to those wishing to deploy production clouds based on the stable -OpenStack releases/branches. - -Proposed Change -=============== - -I propose we consider supporting additional "release" branches, for selected -TripleO repos where release-specific changes are required. - -The model will be based on the stable branch model[1] used by many/most -OpenStack projects, but with one difference, "feature" backports will be -permitted provided they are 100% compatible with the currently released -OpenStack services. - -Overview --------- - -The justification for allowing features is that many/most TripleO features are -actually enabling access to features of OpenStack services which will exist in -the stable branches of the services being deployed. Thus, the target audience -of this branch will likely want to consume such "features" to better access -features and configurations which are appropriate to the OpenStack release they -are consuming. - -The other aspect of justification is that projects are adding features -constantly, thus it's unlikely TripleO will be capable of aligning with every -possible new feature for, say Liberty, on day 1 of the release being made. The -recognition that we'll be playing "catch up", and adopting a suitable branch -policy should mean there is scope to continue that alignment after the services -themselves have been released, which will be of benefit to our users. - -Changes landing on the master branch can be considered as valid candidates for -backport, unless: - -* The patch requires new features of an OpenStack service (that do not exist - on the stable branches) to operate. E.g if a tripleo-heat-templates change - needs new-for-liberty Heat features it would *not* be allowed for release/kilo. - -* The patch enables Overcloud features of an OpenStack service that do not - exist on the stable branches of the supported Overcloud version (e.g for - release/kilo we only support kilo overcloud features). - -* User visible interfaces are modified, renamed or removed - removal of - deprecated interfaces may be allowed on the master branch (after a suitable - deprecation period), but these changes would *not* be valid for backport as - they could impact existing users without warning. 
Adding new interfaces - such as provider resources or parameters would be permitted provided the - default behavior does not impact existing users of the release branch. - -* The patch introduces new dependencies or changes the current requirements.txt. - -To make it easier to identify not-valid-for-backport changes, it's proposed -that a review process be adopted whereby a developer proposing a patch to -master would tag a commit if it doesn't meet the criteria above, or there is -some other reason why the patch would be unsuitable for backport. - -e.g.: - - No-Backport: This patch requires new-for-Mitaka Heat features - - -Alternatives ------------- - -The main alternative to this is to leave upstream TripleO as something which -primarily targets developer/trunk-chasing users, and leave maintaining a -stable branch of the various components to downstream consumers of TripleO, -rdo-manager for example. - -The disadvantage of this approach is it's an impediment to adoption and -participation in the upstream project, so I feel it'd be better to do this work -upstream, and improve the experience for those wishing to deploy via TripleO -using only the upstream tools and releases. - - -Security Impact ---------------- - -We'd need to ensure security related patches landing in master got -appropriately applied to the release branches (same as stable branches for all -other projects). - -Other End User Impact ---------------------- - -This should make it much easier for end users to stand up a TripleO deployed -cloud using the stable released versions of OpenStack services. - -Other Deployer Impact ---------------------- - -This may reduce duplication of effort when multiple downstream consumers of -TripleO exist. - -Developer Impact ----------------- - -The proposal of valid backports will ideally be made by the developer -proposing a patch to the master branch, but to avoid creating an undue barrier to -entry for new contributors this will not be mandatory; it will be recommended -and encouraged via code review comments. - -Standard stable-maint processes[1] will be observed when proposing backports. - -We need to consider if we want a separate stable-maint core (as is common on -most other projects), or if all tripleo-core members can approve backports. -Initially it is anticipated to allow all tripleo-core, potentially with the -addition of others with a specific interest in branch maintenance (e.g. -downstream package maintainers). - -Implementation -============== - -Initially the following repos will gain release branches: - -* openstack/tripleo-common -* openstack/tripleo-docs -* openstack/tripleo-heat-templates -* openstack/tripleo-puppet-elements -* openstack/python-tripleoclient -* openstack/instack-undercloud - -These will all have a new branch created, ideally near the time of the upcoming -liberty release, and to avoid undue modification to existing infra tooling, -e.g. zuul, they will use the standard stable branch naming, e.g.: - -* stable/liberty - -If any additional repos require stable branches, we can add those later when -required. - -It is expected that any repos which don't have a stable/release branch must -maintain compatibility such that they don't break deploying the stable released -OpenStack version (if this proves impractical in any case, we'll create -branches when required).
- -Also, when the release branches have been created, we will explicitly *not* -require the master branch for those repos to observe backwards compatibility, -with respect to consuming new OpenStack features. For example, new-for-mitaka -Heat features may be consumed on the master branch of tripleo-heat-templates -after we have a stable/liberty branch for that repo. - -Assignee(s) ------------ - -Primary assignee: - shardy - -Other contributors: - TBC - -Work Items ----------- - -1. Identify the repos which require release branches -2. Create the branches -3. Communicate need to backport to developers, consider options for automating -4. CI jobs to ensure the release branch stays working -5. Documentation to show how users may consume the release branch - -Testing -======= - -We'll need CI jobs configured to use the TripleO release branches, deploying -the stable branches of other OpenStack projects. Hopefully we can make use of -e.g RDO packages for most of the project stable branch content, then build -delorean packages for the tripleo release branch content. - -Ideally in future we'd also test upgrade from one release branch to another -(e.g current release from the previous, and/or from the release branch to -master). - -As a starting point derekh has suggested we create a single centos job, which -only tests HA, and that we'll avoid having a tripleo-ci release branch, -ideally using the under development[2] tripleo.sh developer script to abstract -any differences between deployment steps for branches. - -Documentation Impact -==================== - -We'll need to update the docs to show: - -1. How to deploy an undercloud node from the release branches using stable -OpenStack service versions -2. How to build images containing content from the release branches -3. How to deploy an overcloud using only the release branch versions - -References -========== - -We started discussing this idea in this thread: - -http://lists.openstack.org/pipermail/openstack-dev/2015-August/072217.html - -[1] https://wiki.openstack.org/wiki/StableBranch -[2] https://review.openstack.org/#/c/225096/ diff --git a/specs/mitaka/external-load-balancer.rst b/specs/mitaka/external-load-balancer.rst deleted file mode 100644 index 5b86dbb4..00000000 --- a/specs/mitaka/external-load-balancer.rst +++ /dev/null @@ -1,169 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================== -External Load Balancer -====================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-mitaka-external-load-balancer - -Make it possible to use (optionally) an external load balancer as frontend for -the Overcloud. 
- - -Problem Description -=================== - -To use an external load balancer the Overcloud templates and manifests will be -updated to accomplish the following three changes: - -* accept a list of virtual IPs as a parameter to be used instead of the virtual - IPs which are normally created as Neutron ports and hosted by the controllers - -* make the deployment and configuration of HAProxy on the controllers optional - -* allow for the assignment of a predefined list of IPs to the controller nodes - so that these can be used for the external load balancer configuration - - -Proposed Change -=============== - -Overview --------- - -The VipMap structure, governed by the ``OS::TripleO::Network::Ports::NetIpMap`` -resource type, will be switched to ``OS::TripleO::Network::Ports::NetVipMap``, -a more specific resource type so that it can be pointed to a custom YAML allowing -for the VIPs to be provided by the user at deployment time. Any reference to the -VIPs in the templates will be updated to gather the VIP details from such a -structure. The existing VIP resources will also be switched from the non -specialized type ``OS::TripleO::Controller::Ports::InternalApiPort`` into a -more specific type ``OS::TripleO::Network::Ports::InternalApiVipPort`` so that -it will be possible to noop the VIPs or add support for more parameters as -required and independently from the controller ports resource. - -The deployment and configuration of HAProxy on the controller nodes will become -optional and driven by a new template parameter visible only to the controllers. - -It will be possible to provide via template parameters a predefined list of IPs -to be assigned to the controller nodes, on each network, so that these can be -configured as target IPs in the external load balancer, before the deployment -of the Overcloud is initiated. A new port YAML will be provided for the purpose; -when using an external load balancer this will be used for resources like -``OS::TripleO::Controller::Ports::InternalApiPort``. - -As a requirement for the deployment process to succeed, the external load -balancer must be configured in advance with the appropriate balancing rules and -target IPs. This is because the deployment process itself uses a number of -infrastructure services (database/messaging) as well as core OpenStack services -(Keystone) during the configuration steps. A validation script will be provided -so that connectivity to the VIPs can be tested in advance and hopefully avoid -false negatives during the deployment. - -Alternatives ------------- - -None. - -Security Impact ---------------- - -By filtering the incoming connections for the controller nodes, an external load -balancer might help the Overcloud survive network flood attacks or issues due -to purposely malformed API requests. - -Other End User Impact ---------------------- - -The deployer wishing to deploy with an external load balancer will have to -provide at deployment time a few more parameters, amongst which: - -* the VIPs configured on the balancer to be used by the Overcloud services - -* the IPs to be configured on the controllers, for each network - -Performance Impact ------------------- - -Given there won't be any instance of HAProxy running on the controllers, when -using an external load balancer these might benefit from lower stress on the -TCP stack. - -Other Deployer Impact ---------------------- - -None expected unless deploying with an external load balancer.
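For deployers who do opt in, the inputs described above might be collected in an
environment file shaped roughly like the sketch below; every parameter name and
path here is illustrative only, and the sample environment file mentioned next is
the authoritative reference::

    # Hedged sketch -- names and values are placeholders, not final parameters.
    parameter_defaults:
      # VIPs already configured on the external balancer, one per network
      InternalApiVirtualIP: 172.16.2.250
      PublicVirtualIP: 10.0.0.250
      # Predefined controller IPs, per network, used as balancer targets
      ControllerIPs:
        internal_api: [172.16.2.10, 172.16.2.11, 172.16.2.12]
      # Skip deploying the managed HAProxy on the controllers
      EnableLoadBalancer: false
    resource_registry:
      # Point the VipMap at the user-provided variant described above
      OS::TripleO::Network::Ports::NetVipMap: /path/to/net_vip_map_external.yaml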
A sample -environment file will be provided to provide some guidance over the parameters -to be passed when deploying with an external load balancer. - -Developer Impact ----------------- - -In those scenarios where the deployer was using only a subset of the isolated -networks, the customization templates will need to be updated so that the new -VIPs resource type is nooped. This can be achieved with something like: - -.. code:: - - resource_registry: - OS::TripleO::Network::Ports::InternalApiVipPort: /path/to/network/ports/noop.yaml - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - gfidente - -Other contributors: - dprince - -Work Items ----------- - -* accept user provided collection of VIPs as parameter - -* make the deployment of the managed HAProxy optional - -* allow for the assignment of a predefined list of IPs to the controller nodes - -* add a validation script to test connectivity against the external VIPs - - -Dependencies -============ - -None. - - -Testing -======= - -The feature seems untestable in CI at the moment but it will be possible to test -at least the assignment of a predefined list of IPs to the controller nodes by -providing only the predefined list of IPs as parameter. - - -Documentation Impact -==================== - -In addition to documenting the specific template parameters needed when -deploying with an external load balancer, it will also be necessary to provide -some guidance for the configuration of the load balancer configuration so that -it will behave as expected in the event of a failure. Unfortunately the -configuration settings are strictly dependent on the balancer in use; we should -publish a copy of a managed HAProxy instance config to use as reference so that -a deployer could configure his external appliance similarily. - - -References -========== - -None. diff --git a/specs/mitaka/puppet-modules-deployment-via-swift.rst b/specs/mitaka/puppet-modules-deployment-via-swift.rst deleted file mode 100644 index ea7bd7fe..00000000 --- a/specs/mitaka/puppet-modules-deployment-via-swift.rst +++ /dev/null @@ -1,202 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================== -Puppet Module Deployment via Swift -================================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/puppet-modules-deployment-via-swift - -The ability to deploy a local directory of puppet modules to an overcloud -using the OpenStack swift object service. - -Problem Description -=================== - -When deploying puppet modules to the overcloud there are currently three - options: - - * pre-install the puppet modules into a "golden" image. You can pre-install - modules via git sources or by using a distro package. - - * use a "firstboot" script to rsync the modules from the undercloud (or - some other rsync server that is available). - - * post-install the puppet modules via a package upgrade onto a running - Overcloud server by using a (RPM, Deb, etc.) - -None of the above mechanisms provides an easy workflow when making -minor (ad-hoc) changes to the puppet modules and only distro packages can be -used to provide updated puppet modules to an already deployed overcloud. 
-While we do have a way to rsync over updated modules on "firstboot" via -rsync, this isn't a useful mechanism for an operator who may wish to -use heat stack-update to deploy puppet changes without having to build -a new RPM/Deb package for each revision. - -Proposed Change -=============== - -Overview --------- - -Create an optional (opt-in) workflow that, if enabled, will allow an operator -to create and deploy a local artifact (tarball, distro package, etc.) of -puppet modules to a new or existing overcloud via heat stack-create and -stack-update. The mechanism would use the OpenStack object store service -(rather than rsync) which we already have available on the undercloud. -The new workflow would work like this: - - * A puppet modules artifact (tarball, distro package, etc.) would be uploaded - into a swift container. - - * The container would be configured so that a Swift Temp URL can be generated - - * A Swift Temp URL would be generated for the puppet modules URL that is - stored in swift - - * A heat environment would be generated which sets a DeployArtifactURLs - parameter to this swift URL. (the parameter could be a list so that - multiple URLs could also be downloaded.) - - * The TripleO Heat Templates would be modified so that they include a new - 'script' step which, if it detects a custom DeployArtifactURLs parameter, - would automatically download the artifact from the provided URL, and - deploy it locally on each overcloud role during the deployment workflow. - By "deploy locally" we mean a tarball would be extracted, an RPM would - be installed, etc. The actual deployment mechanism will be pluggable - such that both tarballs and distro packages will be supported and future - additions might be added as well so long as they also fit into the generic - DeployArtifactURLs abstraction. - - * The Operator could then use the generated heat environment to deploy - a new set of puppet modules via heat stack-create or heat stack-update. - - * TripleO client could be modified so that it automatically loads - generated heat environments in a convenient location. This (optional) - extra step would make enabling the above workflow transparent and - only require the operator to run an 'upload-puppet-modules' tool to - upload and configure new puppet modules for deployment via Swift. - -Alternatives ------------- - -There are many alternatives we could use to obtain a similar workflow that -allows the operator to more easily deploy puppet modules from a local directory: - - * Setting up a puppet master would allow a similar workflow. The downside - of this approach is that it would require a bit of overhead, and it - is puppet specific (the deployment mechanism would need to be re-worked - if we ever had other types of on-disk files to update). - - * Rsync. We already support rsync for firstboot scripts. The downside of - rsync is it requires extra setup, and doesn't have an API like - OpenStack swift does allowing for local or remote management and updates - to the puppet modules. - -Security Impact ---------------- - -The new deployment would use a Swift Temp URL over HTTP/HTTPS. The duration -of the Swift Temp URLs can be controlled when they are signed via -swift-temp-url if extra security is desired. By using a Swift Temp URL we -avoid the need to pass the administrator's credentials onto each overcloud -node for swiftclient and instead can simply use curl (or wget) to download -the updated puppet modules.
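To make that concrete, the generated environment handed to Heat might look
roughly like the following; ``DeployArtifactURLs`` is the parameter proposed
above, while the container name, object name and signature shown here are
purely illustrative::

    # Hedged sketch of the auto-generated environment file. The Temp URL is
    # signed and time-limited, so no OpenStack credentials reach the nodes.
    parameter_defaults:
      DeployArtifactURLs:
        - "http://192.0.2.1:8080/v1/AUTH_demo/artifacts/puppet-modules.tar.gz?temp_url_sig=<sig>&temp_url_expires=<ts>"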
Given we already deploy images over http/https -using an undercloud the use of Swift in this manner should pose minimal extra -security risks. - -Other End User Impact ---------------------- - -The ability to deploy puppet modules via Swift will be opt-in so the -impact on end users would be minimal. The heat templates will contain -a new script deployment that may take a few extra seconds to deploy on -each node (even if the feature is not enabled). We could avoid the extra -deployment time perhaps by noop'ing out the heat resource for the new -swift puppet module deployment. - -Performance Impact ------------------- - -Developers and Operators would likely be able to deploy puppet module changes -more quickly (without having to create a distro package). The actual deployment -of puppet modules via swift (downloading and extracting the tarball) would -likely be just as fast as a tarball. - -Other Deployer Impact ---------------------- - -None. - - -Developer Impact ----------------- - -Being able to more easily deploy updated puppet modules to an overcloud would -likely speed up the development update and testing cycle of puppet modules. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - dan-prince - -Work Items ----------- - - * Create an upload-puppet-modules script in tripleo-common. Initially this - may be a bash script which we ultimately refine into a Python version if - it proves useful. - - * Modify tripleo-heat-templates so that it supports a DeployArtifactURLs - parameter (if the parameter is set) attempt to deploy the list of - files from this parameter. The actual contents of the file might be - a tarball or a distribution package (RPM). - - * Modify tripleoclient so that the workflow around using upload-puppet-modules - can be "transparent". Simply running upload-puppet-modules would not only - upload the puppet modules it would also generate a Heat environment that - would then automatically configure heat stack-update/create commands - to use the new URL via a custom heat environment. - - * Update our CI scripts in tripleo-ci and/or tripleo-common so that we - make use of the new Puppet modules deployment mechanism. - - * Update tripleo-docs to make note of the new feature. - -Dependencies -============ - -None. - -Testing -======= - -We would likely want to switch to use this feature in our CI because -it allows us to avoid git cloning the same puppet modules for both -the undercloud and overcloud nodes. Simply calling the extra -upload-puppet-modules script on the undercloud as part of our -deployment workflow would enable the feature and allow it to be tested. - -Documentation Impact -==================== - -We would need to document the additional (optional) workflow associated -with deploying puppet modules via Swift. - - -References -========== - - * https://review.openstack.org/#/c/245314/ (Add support for DeployArtifactURLs) - * https://review.openstack.org/#/c/245310/ (Add scripts/upload-swift-artifacts) - * https://review.openstack.org/#/c/245172/ (tripleoclient --environment) diff --git a/specs/mitaka/refactor-puppet-manifests.rst b/specs/mitaka/refactor-puppet-manifests.rst deleted file mode 100644 index c43d42db..00000000 --- a/specs/mitaka/refactor-puppet-manifests.rst +++ /dev/null @@ -1,129 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Refactor top level puppet manifests -========================================== - -Launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/refactor-puppet-manifests - -The current overcloud controller puppet manifests duplicate a large amount -of code between the pacemaker (HA) and non-ha version. We can reduce the -effort required to add new features by refactoring this code, and since -there is already a puppet-tripleo module this is the logical destination. - -Problem Description -=================== - -Large amounts of puppet/manifests/overcloud\_controller.pp are shared with -puppet/manifests/overcloud\_controller\_pacemaker.pp. When adding a feature -or fixing a mistake in the former, it is frequently also an issue in the -latter. It is a violation of the common programming principle of DRY, which -while not an inviolable rule, is usually considered good practice. - -In addition, moving this code into separate classes in another module will -make it simpler to enable/disable components, as it will be a matter of -merely controlling which classes (profiles) are included. - -Finally, it allows easier experimentation with modifying the 'ha strategy'. -Currently this is done using 'step', but could in theory be done using a -service registry. By refactoring into ha+non-ha classes this would be quite -simple to swap in/out. - -Proposed Change -=============== - -Overview --------- - -While there are significant differences in ha and non-ha deployments, in almost -all cases the ha code will be a superset of the non-ha. A simple example of -this is at the top of both files, where the load balancer is handled. The non -ha version simply includes the loadbalancing class, while the HA version -instantiates the exact same class but with some parameters changed. Across -the board the same classes are included for the openstack services, but with -manage service set to false in the HA case. - -I propose first breaking up the non-ha version into profiles which can reside -in puppet-tripleo/manifests/profile/nonha, then adding ha versions which -use those classes under puppet-tripleo-manifests/profile/pacemaker. Pacemaker -could be described as an 'ha strategy' which in theory should be replaceable. -For this reason we use a pacemaker subfolder since one day perhaps we'll have -an alternative. - -Alternatives ------------- - -We could leave things as they are, which works and isn't the end of the world, -but it's probably not optimal. - -We could use kolla or something that removes the need for puppet entirely, but -this discussion is outside the scope of this spec. - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -It will make downstreams happy since they can sub in/out classes more easily. - -Performance Impact ------------------- - -Adding wrapper classes isn't going to impact puppet compile times very much. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Changes in t-h-t and puppet-tripleo will often be coupled, as t-h-t -defines the data on which puppet-tripleo depends on. 
- -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - michaeltchapman - -Work Items ----------- - -Move overcloud controller to profile classes -Move overcloud controller pacemaker to profile classes -Move any other classes from the smaller manifests in t-h-t - -Dependencies -============ - -None - -Testing -======= - -No new features so current tests apply in their entirety. -Additional testing can be added for each profile class - -Documentation Impact -==================== - -None - -References -========== - -None diff --git a/specs/mitaka/tripleo-mistral-deployment-library.rst b/specs/mitaka/tripleo-mistral-deployment-library.rst deleted file mode 100644 index 34b9a2ee..00000000 --- a/specs/mitaka/tripleo-mistral-deployment-library.rst +++ /dev/null @@ -1,274 +0,0 @@ -============================================================ -Library support for TripleO Overcloud Deployment Via Mistral -============================================================ - -We need a TripleO library that supports the overcloud deployment workflow. - -Problem Description -=================== - -TripleO has an overcloud deployment workflow that uses Heat templates and uses -the following steps: - -* The user edits the templates and environment file. These can be stored - anywhere. -* Templates may be validated by Heat. -* Templates and environment are sent to Heat for overcloud deployment. - -This workflow is already supported by the CLI. - -However from a GUI perspective, although the workflow is straightforward, it is -not simple. Here are some of the complications that arise: - -* Some of the business logic in this workflow is contained in the CLI itself, - making it difficult for other UIs to use. -* If the TripleO overcloud deployment workflow changes, it is easy for the CLI - and GUI approach to end up on divergent paths - a dangerous situation. -* The CLI approach allows open-ended flexibility (the CLI doesn't care where - the templates come from) that is detrimental for a GUI (the GUI user doesn't - care where the templates are stored, but consistency in approach is desirable - to prevent divergence among GUIs and CLIs). - -There is a need to create common code that accommodates the flexibility of the -CLI with the ease-of-use needs of GUI consumers. - -Proposed Change -=============== - -In order to solve this problem, we propose to create a Mistral-integrated -deployment with the following: - -* Encapsulate the business logic involved in the overcloud deployment workflow - within the tripleo-common library utilizing Mistral actions and workflows. -* Provide a simplified workflow to hide unneeded complexity from GUI consumers -* Update the CLI to use this code where appropriate to prevent divergence with - GUIs. - -The first three points deserve further explanation. First, let us lay out the -proposed GUI workflow. - -1. A user pushes the Heat deployment templates into swift. -2. The user defines values for the template resource types given by Heat - template capabilities which are stored in an environment[1]. Note that this - spec will be completed by mitaka at the earliest. A workaround is discussed - below. -3. Now that the template resource types are specified, the user can configure - deployment parameters given by Heat. Edited parameters are updated and are - stored in an environment. 'Roles' can still be derived from available Heat - parameters[2]. -4. Steps 2 and 3 can be repeated. -5. 
With configuration complete, the user triggers the deployment of the - overcloud. The templates and environment file are taken from Swift - and sent to Heat. -6. Once overcloud deployment is complete, any needed post-deploy config is - performed. - -The CLI and GUI will both use the Swift workflow and store the templates into -Swift. This would facilitate the potential to switch to the UI from a CLI based -deployment and vice-versa. - -Mistral Workflows are composed of Tasks, which group together one or more -Actions to be executed with a Workflow Execution. The Action is implemented as -a class with an initialization method and a run method. The run method provides -a single execution point for Python code. Any persistence of state required for -Actions or Workflows will be stored in a Mistral Environment object. - -In some cases, an OpenStack Service may be missing a feature needed for TripleO -or it might only be accessible through its associated Python client. To -mitigate this issue in the short term, some of the Actions will need to be -executed directly with an Action Execution [3] which calls the Action directly and -returns instantly, but also doesn't have access to the same context as a -Workflow Execution. In theory, every action execution should be replaced by an -OpenStack service API call. - -Below is a summary of the intended Workflows and Actions to be executed from the -CLI or the GUI using the python-mistralclient or Mistral API. There may be -additional actions or library code necessary to enable these operations that -will not be intended to be consumed directly. - -Workflows: - - * Node Registration - * Node Introspection - * Plan Creation - * Plan Deletion - * Deploy - * Validation Operations - -Actions: - - * Plan List - * Get Capabilites - * Update Capabilities - * Get Parameters - * Update Parameters - * Roles List - -For Flavors and Image management, the Nova and Glance APIs will be used -respectively. - -The registration and introspection of nodes will be implemented within a -Mistral Workflow. The logic is currently in tripleoclient and will be ported, -as certain node configurations are specified as part of the logic (ramdisk, -kernel names, etc.) so the user does not have to specify those. Tagging, -listing and deleting nodes will happen via the Ironic/Inspectors APIs as -appropriate. - -A deployment plan consists of a collection of heat templates in a Swift -container, combined with data stored in a Mistral Environment. When the plan is -first created, the capabilities map data will be parsed and stored in the -associated Mistral Environment. The templates will need to be uploaded to a -Swift container with the same name as the stack to be created. While any user -could use a raw POST request to accomplish this, the GUI and CLI will provide -convenience functions improve the user experience. The convenience functions -will be implemented in an Action that can be used directly or included in a -Workflow. - -The deletion of a plan will be implemented in a Workflow to ensure there isn't -an associated stack before deleting the templates, container and Mistral -Environment. Listing the plans will be accomplished by calling -'mistral environment-list'. - -To get a list of the available Heat environment files with descriptions and -constraints, the library will have an Action that returns the information about -capabilities added during plan creation and identifies which Heat environment -files have already been selected. 
There will also be an action that accepts a -list of user selected Heat environment files and stores the information in the -Mistral Environment. It would be inconvenient to use a Workflow for these -actions as they just read or update the Mistral Environment and do not require -additional logic. - -The identification of Roles will be implemented in a Workflow that calls out to -Heat. - -To obtain the deployment parameters, Actions will be created that will call out -to heat with the required template information to obtain the parameters and set -the parameter values to the Environment. - -To perform TripleO validations, Workflows and associated Actions will be created -to support list, start, stop, and results operations. See the spec [4] for more -information on how the validations will be implemented with Mistral. - -Alternatives ------------- - -One alternative is to force non-CLI UIs to re-implement the business logic -currently contained within the CLI. This is not a good alternative. Another -possible alternative would be to create a REST API [5] to abstract TripleO -deployment logic, but it would require considerably more effort to create and -maintain and has been discussed at length on the mailing list. [6][7] - -Security Impact ---------------- - -Other End User Impact ---------------------- - -The --templates workflow will end up being modified to use the updated -tripleo-common library. - -Integrating with Mistral is a straightforward process and this may result in -increased usage. - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Rather than write workflow code in python-tripleoclient directly developers will -now create Mistral Actions and Workflows that help implement the requirements. - -Right now, changing the overcloud deployment workflow results in stress due to -the need to individually update both the CLI and GUI code. Converging the two -makes this a far easier proposition. However developers will need to have this -architecture in mind and ensure that changes to the --templates or --plan -workflow are maintained in the tripleo-common library (when appropriate) to -avoid unneeded divergences. - -Implementation -============== - -Assignee(s) ------------ -Primary assignees: - -* rbrady -* jtomasek -* dprince - -Work Items ----------- -The work items required are: - -* Develop the tripleo-common Mistral actions that provide all of the - functionality required for our deployment workflows. -* This involves moving much of the code out of python-tripleoclient and into - generic, narrowly focused, Mistral actions that can be consumed via the - Mistral API. -* Create new Mistral workflows to help with high level things like deployment, - introspection, node registration, etc. -* tripleo-common is more of an internal library, and its logic is meant to be - consumed (almost) solely by using Mistral - actions. Projects should not attempt to circumvent the API by using - tripleo-common as a library as much as possible. - There may be some exceptions to this for common polling functions, etc. but in - general all core workflow logic should be API driven. -* Update the CLI to consume these Mistral actions directly via - python-mistralclient. - -All patches that implement these changes must pass CI and add additional tests -as needed. - -Dependencies -============ - -None - - -Testing -======= - -The TripleO CI should be updated to test the updated tripleo-common library. 
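As a point of reference for what these tests will exercise, below is a minimal
sketch of the Action shape described earlier (a class with an initialization
method and a ``run`` method). The base class import and the
``GetParametersAction`` name are assumptions made purely for illustration and
are not the final tripleo-common API::

    # Illustrative sketch only -- the base class and naming are assumptions.
    from mistral.actions import base


    class GetParametersAction(base.Action):
        """Return the deployment parameters stored for a plan."""

        def __init__(self, container):
            # The plan name doubles as the Swift container name.
            self.container = container

        def run(self):
            # A real implementation would fetch the plan templates from
            # Swift, ask Heat to validate them and return the resulting
            # parameter schema. This method is the single execution point
            # that a Workflow task or a direct Action Execution invokes.
            return {'container': self.container, 'parameters': {}}

A Workflow task would reference such an action by name, and the CLI or a CI
job would then trigger the surrounding Workflow Execution through
python-mistralclient or the Mistral API as described above.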
- -Our intent is to make tripleoclient consume Mistral actions as we write them. -Because all of the existing upstream Tripleo CI release on tripleoclient taking -this approach ensures that our all of our workflow actions always work. This -should get us coverage on 90% of the Mistral actions and workflows and allow us -to proceed with the implementation iteratively/quickly. Once the UI is installed -and part of our upstream CI we can also rely on coverage there to ensure we -don't have breakages. - -Documentation Impact -==================== - -Mistral Actions and Workflows are sort of self-documenting and can be easily -introspected by running 'mistral workflow-list' or 'mistral action-list' on the -command line. The updated library however will have to be well-documented and -meet OpenStack standards. Documentation will be needed in both the -tripleo-common and tripleo-docs repositories. - -References -========== - -[1] https://specs.openstack.org/openstack/heat-specs/specs/mitaka/resource-capabilities.html - -[2] https://specs.openstack.org/openstack/heat-specs/specs/liberty/nested-validation.html - -[3] http://docs.openstack.org/developer/mistral/terminology/executions.html - -[4] https://review.openstack.org/#/c/255792/ - -[5] http://specs.openstack.org/openstack/tripleo-specs/specs/mitaka/tripleo-overcloud-deployment-library.html - -[6] http://lists.openstack.org/pipermail/openstack-dev/2016-January/083943.html - -[7] http://lists.openstack.org/pipermail/openstack-dev/2016-January/083757.html - diff --git a/specs/mitaka/tripleo-overcloud-deployment-library.rst b/specs/mitaka/tripleo-overcloud-deployment-library.rst deleted file mode 100644 index ff2482f0..00000000 --- a/specs/mitaka/tripleo-overcloud-deployment-library.rst +++ /dev/null @@ -1,244 +0,0 @@ -================================================ -Library support for TripleO Overcloud Deployment -================================================ - -We need a TripleO library that supports the overcloud deployment workflow. - -Problem Description -=================== - -With Tuskar insufficient for complex overcloud deployments, TripleO has moved to -an overcloud deployment workflow that bypasses Tuskar. This workflow can be -summarized as follows: - - * The user edits the templates and environment file. These can be stored - anywhere. - * Templates may be validated by Heat. - * Templates and environment are sent to Heat for overcloud deployment. - * Post-deploy, overcloud endpoints are configured. - -This workflow is already supported by the CLI. - -However from a GUI perspective, although the workflow is straightforward, it is -not simple. Here are some of the complications that arise: - - * Some of the business logic in this workflow is contained in the CLI itself, - making it difficult for other UIs to use. - * If the TripleO overcloud deployment workflow changes, it is easy for the CLI - and GUI approach to end up on divergent paths - a dangerous situation. - * The CLI approach allows open-ended flexibility (the CLI doesn't care where the - templates come from) that is detrimental for a GUI (the GUI user doesn't care - where the templates are stored, but consistency in approach is desirable to - prevent divergence among GUIs). - -There is a need to create common code that accommodates the flexibility of the -CLI with the ease-of-use needs of Python-based GUI consumers. Note that an API -will eventually be needed in order to accommodate non-Python GUIs. The work -there will be detailed in a separate spec. 
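To make the shape of that common code concrete before diving into the details,
here is a rough, non-authoritative sketch of two of the plan functions listed
under the Proposed Change below. The signatures follow that list; everything
else (client handling, container naming) is an assumption for illustration
only::

    # Hedged sketch; not the actual tripleo-common implementation.
    # 'swift' is assumed to be an authenticated swiftclient Connection and
    # 'heat' an authenticated heatclient client.


    def create_plan(swift, plan_name, plan_files):
        """Store a plan's templates in a versioned Swift container."""
        # Keep old object versions in a side container so that plan
        # updates are versioned, as described below.
        swift.put_container(plan_name + '-versions')
        swift.put_container(
            plan_name,
            headers={'X-Versions-Location': plan_name + '-versions'})
        for path, contents in plan_files.items():
            swift.put_object(plan_name, path, contents)


    def deploy_plan(heat, swift, plan_name):
        """Pull the plan from Swift and create the overcloud stack."""
        # A real implementation would assemble the template, environment
        # and nested files via swift.get_object() and pass them to
        # heat.stacks.create(), returning the stack id for monitoring.
        raise NotImplementedError

A GUI and the CLI would call these same functions with their own authenticated
Swift and Heat clients, which is exactly the shared behaviour the library is
meant to capture.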
- -Proposed Change -=============== - -In order to solve this problem, we propose the following: - - * Encapsulate the business logic involved in the overcloud deployment workflow - within the tripleo-common library. - * Provide a simplified workflow to hide unneeded complexity from GUI consumers - - for example, template storage. - * Update the CLI to use this code where appropriate to prevent divergence with - GUIs. - -The first two points deserve further explanation. First, let us lay out the -proposed GUI workflow. We will refer to the Heat files the user desires to use -for the overcloud deployment as a 'plan'. - -1. A user creates a plan by pushing a copy of the Heat deployment templates into - a data store. -2. The user defines values for the template resource types given by Heat - template capabilities. This results in an updated resource registry in an - environment file saved to the data store. - (https://review.openstack.org/#/c/196656/7/specs/liberty/resource-capabilities.rst) - Note that this spec will be completed by mitaka at the earliest. A - workaround is discussed below. -3. Now that the template resource types are specified, the user can configure - deployment parameters given by Heat. Edited parameters are updated and an - updated environment file is saved to the data store. 'Roles' no longer exist - in Tuskar, but can still be derived from available Heat parameters. - (https://review.openstack.org/#/c/197199/5/specs/liberty/nested-validation.rst) -4. Steps 2 and 3 can be repeated. -5. With configuration complete, the user triggers the deployment of the - overcloud. The templates and environment file are taken from the data store - and sent to Heat. -6. Once overcloud deployment is complete, any needed post-deploy config is - performed. - -In order to fulfill this workflow, we propose to initially promote the use of -Swift as the template data store. This usage will be abstracted away behind -the tripleo-common library, and later updates may allow the use of other data -stores. - -Note that the Swift-workflow is intended to be an alternative to the current CLI -'--templates' workflow. Both would end up being options under the CLI; a user -could choose '--templates' or '--plan'. However they would both be backed by -common tripleo-common library code, with the '--plan' option simply calling -additional functions to pull the plan information from Swift. And GUIs that -expect a Swift-backed deployment would lose functionality if the deployment -is deployed using the '--templates' CLI workflow. - -The tripleo-common library functions needed are: - - * **Plan CRUD** - - * **create_plan(plan_name, plan_files)**: Creates a plan by creating a Swift - container matching plan_name, and placing all files needed for that plan - into that container (for Heat that would be the 'parent' templates, nested - stack templates, environment file, etc). The Swift container will be - created with object versioning active to allow for versioned updates. - * **get_plan(plan_name)**: Retrieves the Heat templates and environment file - from the Swift container matching plan_name. - * **update_plan(plan_name, plan_files)**: Updates a plan by updating the - plan files in the Swift container matching plan_name. This may necessitate - an update to the environment file to add and/or remove parameters. Although - updates are versioned, retrieval of past versions will not be implemented - until the future. 
- * **delete_plan(plan_name)**: Deletes a plan by deleting the Swift container - matching plan_name, but only if there is no deployed overcloud that was - deployed with the plan. - - * **Deployment Options** - - * **get_deployment_plan_resource_types(plan_name)**: Determine available - template resource types by retrieving plan_name's templates from Swift and - using the proposed Heat resource-capabilities API - (https://review.openstack.org/#/c/196656/7/specs/liberty/resource-capabilities.rst). - If that API is not ready in the required timeframe, then we will implement - a temporary workaround - a manually created map between templates and - provider resources. We would work closely with the spec developers to try - and ensure that the output of this method matches their proposed output, so - that once their API is ready, replacement is easy. - * **update_deployment_plan_resource_types(plan_name, resource_types)**: - Retrieve plan_name's environment file from Swift and update the - resource_registry tree according to the values passed in by resource_types. - Then update the environment file in Swift. - - * **Deployment Configuration** - - * **get_deployment_parameters(plan_name)**: Determine available deployment - parameters by retrieving plan_name's templates from Swift and using the - proposed Heat nested-validation API call - (https://review.openstack.org/#/c/197199/5/specs/liberty/nested-validation.rst). - * **update_deployment_parameters(plan_name, deployment_parameters)**: - Retrieve plan_name's environment file from Swift and update the parameters - according to the values passed in by deployment_parameters. Then update the - environment file in Swift. - * **get_deployment_roles(plan_name)**: Determine available deployment roles. - This can be done by retrieving plan_name's deployment parameters and - deriving available roles from parameter names; or by looking at the top- - level ResourceGroup types. - - * **Deployment** - - * **validate_plan(plan_name)**: Retrieve plan_name's templates and environment - file from Swift and use them in a Heat API validation call. - * **deploy_plan(plan_name)**: Retrieve plan_name's templates and environment - file from Swift and use them in a Heat API call to create the overcloud - stack. Perform any needed pre-processing of the templates, such as the - template file dictionary needed by Heat. This function will return a Heat - stack ID that can be used to monitor the status of the deployment. - - * **Post-Deploy** - - * **postdeploy_plan(plan_name)**: Initialize the API endpoints of the - overcloud corresponding to plan_name. - -Alternatives ------------- - -The alternative is to force non-CLI UIs to re-implement the business logic -currently contained within the CLI. This is not a good alternative. - -Security Impact ---------------- - -Other End User Impact ---------------------- - -The --templates workflow will end up being modified to use the updated -tripleo-common library. - -Python-based code would find it far easier to adapt the TripleO method of -deployment. This may result in increased usage. - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Right now, changing the overcloud deployment workflow results in stress due to -the need to individually update both the CLI and GUI code. Converging the two -makes this a far easier proposition. 
However developers will need to have this -architecture in mind and ensure that changes to the --templates or --plan -workflow are maintained in the tripleo-common library (when appropriate) to -avoid unneeded divergences. - -Another important item to note is that we will need to keep the TripleO CI -updated with changes, and will be responsible for fixing the CI as needed. - - -Implementation -============== - -Assignee(s) ------------ -Primary assignees: - -* tzumainn -* akrivoka -* jtomasek -* dmatthews - -Work Items ----------- - -The work items required are: - - * Develop the tripleo-common library to provide the functionality described - above. This also involves moving code from the CLI to tripleo-common. - * Update the CLI to use the tripleo-common library. - -All patches that implement these changes must pass CI and add additional tests as -needed. - - -Dependencies -============ - -We are dependent upon two HEAT specs: - - * Heat resource-capabilities API - (https://review.openstack.org/#/c/196656/7/specs/liberty/resource-capabilities.rst) - * Heat nested-validation API - (https://review.openstack.org/#/c/197199/5/specs/liberty/nested-validation.rst) - -Testing -======= - -The TripleO CI should be updated to test the updated tripleo-common library. - -Documentation Impact -==================== - -The updated library with its Swift-backed workflow will have to be well- -documented and meet OpenStack standards. Documentation will be needed in both -the tripleo-common and tripleo-docs repositories. - -References -========== diff --git a/specs/mitaka/tripleo-quickstart.rst b/specs/mitaka/tripleo-quickstart.rst deleted file mode 100644 index 65c01bf6..00000000 --- a/specs/mitaka/tripleo-quickstart.rst +++ /dev/null @@ -1,140 +0,0 @@ -================== -TripleO Quickstart -================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-quickstart - -We need a common way for developers/CI systems to quickly stand up a virtual -environment. - -Problem Description -=================== - -The tool we currently document for this use case is instack-virt-setup. -However this tool has two major issues, and some missing features: - -* There is no upstream CI using it. This means we have no way to test changes - other than manually. This is a huge barrier to adding the missing features. - -* It relies on a maze of bash scripts in the incubator repository[1] in order - to work. This is a barrier to new users, as it can take quite a bit of time - to find and then navigate that maze. - -* It has no way to use a pre-built undercloud image instead of starting from - scratch and redoing the same work that CI and every other tripleo developer - is doing on every run. Starting from a pre-built undercloud with overcloud - images prebaked can be a significant time savings for both CI systems as well - as developer test environments. - -* It has no way to create this undercloud image either. - -* There are other smaller missing features like automatically tagging the fake - baremetals with profile capability tags via instackenv.json. These would not - be too painful to implement, but without CI even small changes carry some - amount of pain. - -Proposed Change -=============== - -Overview --------- - -* Import the tripleo-quickstart[2] tool that RDO is using for this purpose. - This project is a set of ansible roles that can be used to build an - undercloud.qcow2, or alternatively to consume it. 
It was patterned after - instack-virt-setup, and anything configurable via instack-virt-setup is - configurable in tripleo-quickstart. - -* Use third-party CI for self-gating this new project. In order to setup an - environment similar to how developers and users can use this tool, we need - a baremetal host. The CI that currently self gates this project is setup on - ci.centos.org[3], and setting this up as third party CI would not be hard. - -Alternatives ------------- - -* One alternative is to keep using instack-virt-setup for this use case. - However, we would still need to add CI for instack-virt-setup. This would - still need to be outside of tripleoci, since it requires a baremetal host. - Unless someone is volunteering to set that up, this is not really a viable - alternative. - -* Similarly, we could use some other method for creating virtual environments. - However, this alternative is similarly constrained by needing third-party CI - for validation. - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -Using a pre-built undercloud.qcow2 drastically symplifies the virt-setup -instructions, and therefore is less error prone. This should lead to a better -new user experience of TripleO. - -Performance Impact ------------------- - -Using a pre-built undercloud.qcow2 will shave 30+ minutes from the CI -gate jobs. - -Other Deployer Impact ---------------------- - -There is no reason this same undercloud.qcow2 could not be used to deploy -real baremetal environments. There have been many production deployments of -TripleO that have used a VM undercloud. - -Developer Impact ----------------- - -The undercloud.qcow2 approach makes it much easier and faster to reproduce -exactly what is run in CI. This leads to a much better developer experience. - -Implementation -============== - -Assignee(s) ------------ -Primary assignees: - -* trown - -Work Items ----------- - -* Import the existing work from the RDO community to the openstack namespace - under the TripleO umbrella. - -* Setup third-party CI running in ci.centos.org to self-gate this new project. - (We can just update the current CI[3] to point at the new upstream location) - -* Documentation will need to be updated for the virtual environment setup. - -Dependencies -============ - -Currently, the only undercloud.qcow2 available is built in RDO. We would -either need to build one in tripleo-ci, or use the one built in RDO. - -Testing -======= - -We need a way to CI the virtual environment setup. This is not feasible within -tripleoci, since it requires a baremetal host machine. We will need to rely on -third party CI for this. - -Documentation Impact -==================== - -Overall this will be a major simplification of the documentation. - -References -========== - -[1] https://github.com/openstack/tripleo-incubator/tree/master/scripts -[2] https://github.com/redhat-openstack/tripleo-quickstart -[3] https://ci.centos.org/view/rdo/job/tripleo-quickstart-gate-mitaka-delorean-minimal/ diff --git a/specs/mitaka/tripleo-ui.rst b/specs/mitaka/tripleo-ui.rst deleted file mode 100644 index 5f726297..00000000 --- a/specs/mitaka/tripleo-ui.rst +++ /dev/null @@ -1,175 +0,0 @@ -========== -TripleO UI -========== - -We need a graphical user interface that will support deploying OpenStack using -TripleO. - -Problem Description -=================== - -Tuskar-UI, the only currently existing GUI capable of TripleO deployments, has -several significant issues. 
- -Firstly, its back-end relies on an obsolete version of the Tuskar API, which is -insufficient for complex overcloud deployments. - -Secondly, it is implemented as a Horizon plugin and placed under the Horizon -umbrella, which has proven to be suboptimal, for several reasons: - - * The placement under the Horizon program. In order to be able to develop the - Tuskar-UI, one needs deep familiarity with both Horizon and TripleO projects. - Furthermore, in order to be able to approve patches, one needs to be a - Horizon core reviewer. This restriction reduces the number of people who can - contribute drastically, as well as makes it hard for Tuskar-UI developers to - actually land code. - - * The complexity of the Horizon Django application. Horizon is a very complex - heavyweight application comprised of many OpenStack services. It has become - very large, inflexible and consists of several unnecessary middle layers. As - a result of this, we have been witnessing the emergence of several new GUIs - implemented as independent (usually fully client-side JavaScript) applications, - rather than as Horizon plugins. Ironic webclient[1] is one such example. This - downside of Horizon has been recognized and an attempt to address it is - described in the next point. - - * The move to Angular JS (version 1). In an attempt to address the issues listed - above, the Horizon community decided to rewrite it in Angular JS. However, - instead of doing a total rewrite, they opted for a more gradual approach, - resulting in even more middle layers (the original Django layer turned into an - API for Angular based front end). Although the intention is to eventually - get rid of the unwanted layers, the move is happening very slowly. In - addition, this rewrite of Horizon is to AngularJS version 1, which may soon - become obsolete, with version 2 just around the corner. This probably means - another complete rewrite in not too distant future. - - * Packaging issues. The move to AngularJS brought along a new set of issues - related to the poor state of packaging of nodejs based tooling in all major - Linux distributions. - -Proposed Change -=============== - -Overview --------- - -In order to address the need for a TripleO based GUI, while avoiding the issues -listed above, we propose introducing a new GUI project, *TripleO UI*, under the -TripleO program. - -As it is a TripleO specific UI, TripleO GUI will be placed under the TripleO -program, which will bring it to attention of TripleO reviewers and allow -TripleO core reviewers to approve patches. This should facilitate the code -contribution process. - -TripleO UI will be a web UI designed for overcloud deployment and -management. It will be a lightweight, independent client-side application, -designed for flexibility, adaptability and reusability. - -TripleO UI will be a fully client-side JavaScript application. It will be -stateless and contain no business logic. It will consume the TripleO REST API[2], -which will expose the overcloud deployment workflow business logic implemented -in the tripleo-common library[3]. As opposed to the previous architecture which -included many unwanted middle layers, this one will be very simple, consisting -only of the REST API serving JSON, and the client-side JavaScript application -consuming it. - -The development stack will consist of ReactJS[4] and Flux[5]. We will use ReactJS -to implement the web UI components, and Flux for architecture design. 
- -Due to the packaging problems described above, we will not provide any packages -for the application for now. We will simply make the code available for use. - -Alternatives ------------- - -The alternative is to keep developing Tuskar-UI under the Horizon umbrella. In -addition to all the problems outlined above, this approach would also mean a -complete re-write of Tuskar-UI back-end to make it use the new tripleo-common -library. - -Security Impact ---------------- - -This proposal introduces a brand new application; all the standard security -concerns which come with building a client-side web application apply. - -Other End User Impact ---------------------- - -We plan to build a standalone web UI which will be capable of deploying -OpenStack with TripleO. Since as of now no such GUIs exist, this can be a huge -boost for adoption of TripleO. - -Performance Impact ------------------- - -The proposed technology stack, ReactJS and Flux, have excellent performance -characteristics. TripleO UI should be a lightweight, fast, flexible application. - - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Right now, development on Tuskar-UI is uncomfortable for the reasons -detailed above. This proposal should result in more comfortable development -as it logically places TripleO UI under the TripleO program, which brings -it under the direct attention of TripleO developers and core reviewers. - -Implementation -============== - -Assignee(s) ------------ -Primary assignees: - -* jtomasek -* flfuchs -* jrist -* - -Work Items ----------- - -This is a general proposal regarding the adoption of a new graphical user -interface under the TripleO program. The implementation of specific features -will be covered in subsequent proposals. - -Dependencies -============ - -We are dependent upon the creation of the TripleO REST API[2], which in turn -depends on the tripleo-common[3] library containing all the functionality -necessary for advanced overcloud deployment. - -Alternatively, using Mistral to provide a REST API, instead of building a new -API, is currently being investigated as another option. - -Testing -======= - -TripleO UI should be thoroughly tested, including unit tests and integration -tests. Every new feature and bug fix should be accompanied by appropriate tests. - -The TripleO CI should be updated to test the TripleO UI. - -Documentation Impact -==================== - -TripleO UI will have to be well-documented and meet OpenStack standards. -We will need both developer and deployment documentation. Documentation will -live in the tripleo-docs repository. - -References -========== - -[1] https://github.com/openstack/ironic-webclient -[2] https://review.openstack.org/#/c/230432 -[3] http://specs.openstack.org/openstack/tripleo-specs/specs/mitaka/tripleo-overcloud-deployment-library.html -[4] https://facebook.github.io/react/ -[5] https://facebook.github.io/flux/ diff --git a/specs/newton/metal-to-tenant.rst b/specs/newton/metal-to-tenant.rst deleted file mode 100644 index e5bbff5d..00000000 --- a/specs/newton/metal-to-tenant.rst +++ /dev/null @@ -1,220 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -==================================== -Metal to Tenant: Ironic in Overcloud -==================================== - -https://blueprints.launchpad.net/tripleo/+spec/ironic-integration - -This blueprint adds support for providing bare metal machines to tenants by -integrating Ironic to the overcloud. - - -Problem Description -=================== - -There is an increasing interest in providing bare metal machines to tenants in -the overcloud in addition to or instead of virtual instances. One example is -Sahara: users hope to achieve better performance by removing the hypervisor -abstraction layer in order to eliminate the noisy neighbor effect. For that -purpose, the OpenStack Bare metal service (Ironic) provides an API and a Nova -driver to serve bare metal instances behind the same Nova and Neutron API's. -Currently however TripleO does not support installing and configuring Ironic -and Nova to serve bare metal instances to the tenant. - - -Proposed Change -=============== - -Composable Services -------------------- - -In the bare metal deployment case, the nova-compute service is only a thin -abstraction layer around the Ironic API. The actual compute instances in -this case are the bare metal nodes. Thus a TripleO deployment with support for -only bare metal nodes will not need dedicated compute nodes in the overcloud. -The overcloud nova-compute service will therefore be placed on controller nodes. - -New TripleO composable services will be created and optionally deployed on the -controller nodes: - -* ``OS::TripleO::Services::IronicApi`` will deploy the bare metal API. - -* ``OS::TripleO::Services::IronicNovaCompute`` will deploy nova compute - with Ironic as a back end. It will also configure the nova compute to use - `ClusteredComputeManager - `_ - provide by Ironic to work around inability to have several nova compute - instances configured with Ironic. - -* ``OS::TripleO::Services::IronicConductor`` will deploy a TFTP server, - an HTTP server (for an optional iPXE environment) and an ironic-conductor - instance. The ironic-conductor instance will not be managed by pacemaker - in the HA scenario, as Ironic has its own Active/Active HA model, - which spreads load on all active conductors using a hash ring. - - There is no public data on how many bare metal nodes each conductor - can handle, but the Ironic team expects an order of hundreds of nodes - per conductor. - -Since this feature is not a requirement in all deployments, this will be -opt-in by having a separate environment file. - -Hybrid Deployments ------------------- - -For hybrid deployments with both virtual and bare metal instances, we will use -Nova host aggregates: one for all bare metal hosts, the other for all virtual -compute nodes. This will prevent virtual instances being deployed on baremetal -nodes. Note that every bare metal machine is presented as a separate -Nova compute host. These host aggregates will always be created, even for -purely bare metal deployments, as users might want to add virtual computes -later. - -Networking ----------- - -As of Mitaka, Ironic only supports flat networking for all tenants and for -provisioning. The **recommended** deployment layout will consist of two networks: - -* The ``provisioning`` / ``tenant`` network. It must have access to the - overcloud Neutron service for DHCP, and to overcloud baremetal-conductors - for provisioning. - - .. 
note:: While this network can technically be the same as the undercloud - provisioning network, it's not recommended to do so due to - potential conflicts between various DHCP servers provided by - Neutron (and in the future by ironic-inspector). - -* The ``management`` network. It will contain the BMCs of bare metal nodes, - and it only needs access to baremetal-conductors. No tenant access will be - provided to this network. - - .. note:: Splitting away this network is not really required if tenants are - trusted (which is assumed in this spec) and BMC access is - reasonably restricted. - -Limitations ------------ - -To limit the scope of this spec the following definitely useful features are -explicitly left out for now: - -* ``provision`` <-> ``tenant`` network separation (not yet implemented by - ironic) - -* in-band inspection (requires ironic-inspector, which is not yet HA-ready) - -* untrusted tenants (requires configuring secure boot and checking firmwares, - which is vendor-dependent) - -* node autodiscovery (depends on ironic-inspector) - -Alternatives ------------- - -Alternatively, we could leave configuring a metal-to-tenant environment up to -the operator. - -We could also have it enabled by default, but most likely it won't be required -in most deployments. - -Security Impact ---------------- - -Most of the security implications have to be handled within Ironic. Eg. wiping -the hard disk, checking firmwares, etc. Ironic needs to be configured to be -able to run these jobs by enabling automatic cleaning during node lifecycle. -It is also worth mentioning that we will assume trusted tenants for these bare -metal machines. - -Other End User Impact ---------------------- - -The ability to deploy Ironic in the overcloud will be optional. - -Performance Impact ------------------- - -If enabled, TripleO will deploy additional services to the overcloud: - -* ironic-conductor - -* a TFTP server - -* an HTTP server - -None of these should have heavy performance requirements. - -Other Deployer Impact ---------------------- - -None. - -Developer Impact ----------------- - -None. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - ifarkas - -Other contributors: - dtantsur, lucasagomes, mgould, mkovacik - -Work Items ----------- - -when the environment file is included, make sure: - -* ironic is deployed on baremetal-conductor nodes - -* nova compute is deployed and correctly configured, including: - - * configuring Ironic as a virt driver - - * configuring ClusteredComputeManager - - * setting ram_allocation_ratio to 1.0 - -* host aggregates are created - -* update documentation - - -Dependencies -============ - -None. - - -Testing -======= - -This is testable in the CI with nested virtualization and tests will be added -to the tripleo-ci jobs. - - -Documentation Impact -==================== - -* Quick start documentation and a sample environment file will be provided. - -* Document how to enroll new nodes in overcloud ironic (including host - aggregates) - - -References -========== - -* `Host aggregates `_ diff --git a/specs/newton/os-net-config-teaming.rst b/specs/newton/os-net-config-teaming.rst deleted file mode 100644 index fbc40712..00000000 --- a/specs/newton/os-net-config-teaming.rst +++ /dev/null @@ -1,197 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -==================================== -Add Adapter Teaming to os-net-config -==================================== - -https://blueprints.launchpad.net/os-net-config/+spec/os-net-config-teaming - -This spec describes adding features to os-net-config to support adapter teaming -as an option for bonded interfaces. Adapter teaming allows additional features -over regular bonding, due to the use of the teaming agent. - -Problem Description -=================== - -os-net-config supports both OVS bonding and Linux kernel bonding, but some -users want to use adapter teaming instead of bonding. Adapter teaming provides -additional options that bonds don't support, and do support almost all of the -options that are supported by bonds. - -Proposed Change -=============== - -Overview --------- - -Add a new class similar to the existing bond classes that allows for the -configuration of the teamd daemon through teamdctl. The syntax for the -configuration of the teams should be functionally similar to configuring -bonds. - -Alternatives ------------- - -We already have two bonding methods in use, the Linux bonding kernel module, -and Open vSwitch. However, adapter teaming is becoming a best practice, and -this change will open up that possibility. - -Security Impact ---------------- - -The end result of using teaming instead of other modes of bonding should be -the same from a security standpoint. Adapter teaming does not interfere with -iptables or selinux. - - -Other End User Impact ---------------------- - -Operators who are troubleshooting a deployment where teaming is used may need -to familiarize themselves with the teamdctl utility. - -Performance Impact ------------------- - -Using teaming rather than bonding will have a mostly positive impact on -performance. Teaming is very lightweight, and may use less CPU than other -bonding modes, especially OVS. Teaming has the following impacts: - -* Fine-grained control over load balancing hashing algorithms. - -* Port-priorities and stickyness - -* Per-port monitoring. - -Other Deployer Impact ---------------------- - -In TripleO, os-net-config has existing sample templates for OVS-mode -bonds and Linux bonds. There has been some discussion with Dan Prince -about unifying the bonding templates in the future. - -The type of bond could be set as a parameter in the NIC config -templates. To this end, it probably makes sense to make the teaming -configuration as similar to the bonding configurations as possible. - -Developer Impact ----------------- - -If possible, the configuration should be as similar to the bonding -configuration as possible. In fact, it might be treated as a different -form of bond, as long as the required metadata for teaming can be -provided in the options. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Dan Sneddon - -Work Items ----------- - -* Add teaming object and unit tests. - -* Configure sample templates to demonstrate usage of teaming. - -* Test TripleO with new version of os-net-config and adapter teaming configured. 
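As a rough illustration of the first work item, the sketch below shows one way
the ifcfg rendering described under Implementation Details further down could
be expressed. The helper names and exact field set are assumptions for
discussion, not the actual os-net-config code::

    # Hedged sketch of ifcfg rendering for a linux_team object.
    import json


    def render_team_ifcfg(name, team_config, ip_subnet):
        """Render the master ifcfg file for a team device."""
        address, prefix = ip_subnet.split('/')
        return '\n'.join([
            'DEVICE=%s' % name,
            'DEVICETYPE=Team',
            # e.g. the runner/link_watch JSON from bonding_options
            "TEAM_CONFIG='%s'" % team_config,
            'BOOTPROTO=none',
            'IPADDR=%s' % address,
            'PREFIX=%s' % prefix,
            'ONBOOT=yes',
        ]) + '\n'


    def render_team_port_ifcfg(name, team, primary=False):
        """Render a slave (port) ifcfg file; the primary port gets prio."""
        lines = [
            'DEVICE=%s' % name,
            'DEVICETYPE=TeamPort',
            'TEAM_MASTER=%s' % team,
            'ONBOOT=yes',
        ]
        if primary:
            lines.append("TEAM_PORT_CONFIG='%s'" % json.dumps({'prio': 100}))
        return '\n'.join(lines) + '\n'

Writing these files and then restarting the affected interfaces would follow
the same ifcfg-based flow os-net-config already uses for Linux bonds.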
- -Configuration Example ---------------------- - -The following is an example of a teaming configuration that os-net-config -should be able to implement:: - - - - type: linux_team - name: team0 - bonding_options: "{"runner": {"name": "activebackup"}, "link_watch": {"name": "ethtool"}}" - addresses: - - - ip_subnet: 192.168.0.10/24 - members: - - - type: interface - name: eno2 - primary: true - - - type: interface - name: eno3 - -The only difference between a Linux bond configuration and an adapter team -configuration in the above example is the type (linux_team), and the content -of the bonding_options (bonding has a different format for options). - -Implementation Details ----------------------- - -os-net-config will have to configure the ifcfg files for the team. The ifcfg -format for team interfaces is documented here [1]. - -If an interface is marked as primary, then the ifcfg file for that interface -should list it at a higher than default (0) priority:: - - TEAM_PORT_CONFIG='{"prio": 100}' - -The mode is set in the runner: statement, as well as any settings that -apply to that teaming mode. - -We have the option of using strictly ifcfg files or using the ip utility -to influence the settings of the adapter team. It appears from the teaming -documentation that either approach will work. - -The proposed implementation [2] of adapter teaming for os-net-config uses -only ifcfg files to set the team settings, slave interfaces, and to -set the primary interface. The potential downside of this path is that -the interface must be shut down and restarted when config changes are -made, but that is consistent with the other device types in os-net-config. -This is probably acceptable, since network changes are made rarely and -are assumed to be disruptive to the host being reconfigured. - -Dependencies -============ - -* teamd daemon and teamdctl command-line utility must be installed. teamd is - not installed by default on RHEL/CENTOS, however, teamd is currently - included in the RDO overcloud-full image. This should be added ot the list - of os-net-config RPM dependencies. - -* For LACP bonds using 802.3ad, switch support will need to be configured and - at least two ports must be configured for LACP bonding. - - -Testing -======= - -In order to test this in CI, we would need to have an environment where we -have multiple physical NICs. Adapter teaming supports modes other than LACP, -so we could possibly get away with multiple links without any special -configuration. - - -Documentation Impact -==================== - -The deployment documentation will need to be updated to cover the use of -teaming. The os-net-config sample configurations will demonstrate the use -in os-net-config. TripleO Heat template examples should also help with -deployments using teaming. - - -References -========== - -* [1] - Documentation: Creating a Network Team Using ifcfg Files - https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/7/html/Networking_Guide/sec-Configure_a_Network_Team_Using-the_Command_Line.html#sec-Creating_a_Network_Team_Using_ifcfg_Files - -* [2] - Review: Add adapter teaming support using teamd for ifcfg-systems - https://review.openstack.org/#/c/339854/ \ No newline at end of file diff --git a/specs/newton/pacemaker-next-generation-architecture.rst b/specs/newton/pacemaker-next-generation-architecture.rst deleted file mode 100644 index 4cfa252e..00000000 --- a/specs/newton/pacemaker-next-generation-architecture.rst +++ /dev/null @@ -1,229 +0,0 @@ -.. 
- This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================================== -Pacemaker Next Generation Architecture -====================================== - -https://blueprints.launchpad.net/tripleo/+spec/ha-lightweight-architecture - -Change the existing HA manifests and templates to deploy a minimal pacemaker -architecture, where all the openstack services are started and monitored by -systemd with the exception of: VIPs/Haproxy, rabbitmq, redis and galera. - -Problem Description -=================== - -The pacemaker architecture deployed currently via -`puppet/manifests/overcloud_controller_pacemaker.pp` manages most -service on the controllers via pacemaker. This approach, while having the -advantage of having a single entity managing and monitoring all services, does -bring a certain complexity to it and assumes that the operators are quite -familiar with pacemaker and its management of resources. The aim is to -propose a new architecture, replacing the existing one, where pacemaker -controls the following resources: - -* Virtual IPs + HAProxy -* RabbitMQ -* Galera -* Redis -* openstack-cinder-volume (as the service is not A/A yet) -* Any future Active/Passive service - -Basically every service that is managed today by a specific resource agent -and not systemd, will be still running under pacemaker. The same goes -for any service (like openstack-cinder-volume) that need to be active/passive. - -Proposed Change -=============== - -Overview --------- - -Initially the plan was to create a brand new template implementing this -new HA architecture. After a few rounds of discussions within the TripleO -community, it has been decided to actually have a single HA architecture. -The main reasons for moving to a single next generation HA architecture are due to -the amount work needed to maintain two separate architectures and to the -fact that the previous HA architecture does not bring substantial advantages -over this next generation one. - -The new architecture will enable most services via systemd and will remove most -pacemaker resource definitions with their corresponding constraints. -In terms of ordering constraints we will go from a graph like this one: -http://acksyn.org/files/tripleo/wsgi-openstack-core.pdf (mitaka) - -to a graph like this one: -http://acksyn.org/files/tripleo/light-cib-nomongo.pdf (next-generation-mitaka) - -Once this new architecture is in place and we have tested it extensively, we -can work on the upgrade path from the previous fully-fledged pacemaker HA -architecture to this new one. Since the impact of pacemaker in the new -architecture is quite small, it is possible to consider dropping the non-ha -template in the future for every deployment and every CI job. The decision -on this can be taken in a later step, even post-newton. - -Another side-benefit is that with this newer architecture the -whole upgrade/update topic is much easier to manage with TripleO, -because there is less coordination needed between pacemaker, the update -of openstack services, puppet and the update process itself. - -Note that once composable service land, this next generation architecture will -merely consist of a single environment file setting some services to be -started via systemd, some via pacemaker and a bunch of environment variables -needed for the services to reconnect even when galera and rabbitmq are down. 
-All services that need to be started via systemd will be done via the default -state: -https://github.com/openstack/tripleo-heat-templates/blob/40ad2899106bc5e5c0cf34c40c9f391e19122a49/overcloud-resource-registry-puppet.yaml#L124 - -The services running via pacemaker will be explicitely listed in an -environment file, like here: -https://github.com/openstack/tripleo-heat-templates/blob/40ad2899106bc5e5c0cf34c40c9f391e19122a49/environments/puppet-pacemaker.yaml#L12 - -Alternatives ------------- - -There are many alternative designs for the HA architecture. The decision -to use pacemaker only for a certain set of "core" services and all the -Active/Passive services comes from a careful balance between complexity -of the architecture and its management and being able to recover resources -in a known broken state. There is a main assumption here about native -openstack services: - -They *must* be able to start when the broker and the database are down and keep -retrying. - -The reason for using only pacemaker for the core services and not, for -example keepalived for the Virtual IPs, is to keep the stack simple and -not introduce multiple distributed resource managers. Also, if we used -only keepalived, we'd have no way of recovering from a failure beyond -trying to relocate the VIP. - -The reason for keeping haproxy under pacemaker's management is that -we can guarantee that a VIP will always run where haproxy is running, -should an haproxy service fail. - - -Security Impact ---------------- - -No changes regarding security aspects compared to the existing status quo. - -Other End User Impact ---------------------- - -The operators working with a cloud are impacted in the following ways: - -* The services (galera, redis, openstack-cinder-volume, VIPs, - haproxy) will be managed as usual via `pcs`. Pacemaker will monitor these - services and provide their status via `pcs status`. - -* All other services will be managed via `systemctl` and systemd will be - configured to automatically restart a failed service. Note, that this is - already done in RDO with (Restart={always,on-failure}) in the service files. - It is a noop when pacemaker manages the service as an override file is - created by pacemaker: - - https://github.com/ClusterLabs/pacemaker/blob/master/lib/services/systemd.c#L547 - - With the new architecture, restarting a native openstack service across - all controllers will require restarting it via `systemctl` on each node (as opposed - to a single `pcs` command as it is done today) - -* All services will be configured to retry indefinitely to connect to - the database or to the messaging broker. In case of a controller failure, - the failover scenario will be the same as with the current HA architecture, - with the difference that the services will just retry to re-connect indefinitely. - -* Previously with the HA template every service would be monitored and managed by - pacemaker. With the split between openstack services being managed by systemd and - "core" services managed by pacemaker, the operator needs to know which service - to monitor with which command. - -Performance Impact ------------------- - -No changes compared to the existing architecture. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -In the future we might see if the removal of the non-HA template is feasible, -thereby simplifying our CI jobs and have single more-maintained template. 
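To make the operator-facing difference noted above concrete (one ``pcs``
command before versus one ``systemctl`` call per node now), a trivial helper
along these lines could restart a systemd-managed service on every controller.
The host names and password-less SSH access are assumptions about the
operator's environment::

    # Hedged operational sketch; assumes SSH access to the controllers.
    import subprocess

    CONTROLLERS = ['overcloud-controller-0',
                   'overcloud-controller-1',
                   'overcloud-controller-2']


    def restart_everywhere(service, hosts=CONTROLLERS):
        """Restart a systemd-managed service on each controller in turn."""
        for host in hosts:
            # Replaces the single pcs command that covered this in the
            # previous, fully pacemaker-managed architecture.
            subprocess.check_call(
                ['ssh', host, 'sudo', 'systemctl', 'restart', service])


    restart_everywhere('openstack-nova-api')

Pacemaker-managed resources (the VIPs, haproxy, galera, rabbitmq, redis and
openstack-cinder-volume) continue to be restarted and monitored via ``pcs`` as
today.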
- -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - michele - -Other contributors: - ... - - -Work Items ----------- - -* Prepare the roles that deploy the next generation architecture. Initially, - keep it as close as possible to the existing HA template and make it simpler - in a second iteration (remove unnecessary steps, etc.) Template currently - lives here and deploys successfully: - - https://review.openstack.org/#/c/314208/ - -* Test failure scenarios and recovery scenario, open bugs against services that - misbehave in the face of database and/or broker being down. - - -Dependencies -============ - -None - -Testing -======= - -Initial smoke-testing has been completed successfully. Another set of tests -focusing on the behaviour of openstack services when galera and rabbitmq are -down is in the process of being run. - -Particular focus will be on failover scenarios and recovery times and making -sure that there are no regressions compared to the current HA architecture. - - -Documentation Impact -==================== - -Currently we do not describe the architectures as deployed by TripleO itself, -so no changes needed. A short page in the docs describing the architecture -would be a nice thing to have in the future. - -References -========== - -This design came mostly out from a meeting in Brno with the following attendees: - -* Andrew Beekhof -* Chris Feist -* Eoghan Glynn -* Fabio Di Nitto -* Graeme Gillies -* Hugh Brock -* Javier Peña -* Jiri Stransky -* Lars Kellogg-Steadman -* Mark Mcloughlin -* Michele Baldessari -* Raoul Scarazzini -* Rob Young diff --git a/specs/newton/tripleo-lldp-validation.rst b/specs/newton/tripleo-lldp-validation.rst deleted file mode 100644 index b01198cd..00000000 --- a/specs/newton/tripleo-lldp-validation.rst +++ /dev/null @@ -1,229 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -TripleO LLDP Validation -========================================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-lldp-validation - -The Link Layer Discovery Protocol (LLDP) is a vendor-neutral link layer -protocol in the Internet Protocol Suite used by network devices for -advertising their identity, capabilities, and neighbors on an -IEEE 802 local area network, principally wired Ethernet. [1] - -The Link Layer Discover Protocol (LLDP) helps identify layer 1/2 -connections between hosts and switches. The switch port, chassis ID, -VLANs trunked, and other info is available, for planning or -troubleshooting a deployment. For instance, a deployer may validate -that the proper VLANs are supplied on a link, or that all hosts -are connected to the Provisioning network. - -Problem Description -=================== - -A detailed description of the problem: - -* Deployment networking is one of the most difficult parts of any - OpenStack deployment. A single misconfigured port or loose cable - can derail an entire multi-rack deployment. - -* Given the first point, we should work to automate validation and - troubleshooting where possible. - -* Work is underway to collect LLDP data in ironic-python-agent, - and we have an opportunity to make that data useful [2]. 
- - -Proposed Change -=============== - -Overview --------- - -The goal is to expose LLDP data that is collected during -introspection, and provide this data in a format that is useful for the -deployer. This work depends on the LLDP collection work being done -in ironic-python-agent [3]. - -There is work being done to implement LLDP data collection for Ironic/ -Neutron integration. Although this work is primarily focused on features -for bare-metal Ironic instances, there will be some overlap with the -way TripleO uses Ironic to provision overcloud servers. - -Alternatives ------------- - -There are many network management utilities that use CDP or LLDP data to -validate the physical networking. Some of these are open source, but none -are integrated with OpenStack. - -Alternative approaches that do not use LLDP are typically vendor-specific -and require specific hardware support. Cumulus has a solution which works -with multiple vendors' hardware, but that solution requires running their -custom OS on the Ethernet switches. - -Another approach which is common is to perform collection of the switch -configurations to a central location, where port configurations can be -viewed, or in some cases even altered and remotely pushed. The problem -with this approach is that the switch configurations are hardware and -vendor-specific, and typically a network engineer is required to read -and interpret the configuration. A unified approach that works for all -common switch vendors is preferred, along with a unified reporting format. - -Security Impact ---------------- - -The physical network report provides a roadmap to the underlying network -structure. This could prove handy to an attacker who was unaware of the -existing topology. On the other hand, the information about physical -network topology is less valuable than information about logical topology -to an attacker. LLDP contains some information about both physical and -logical topology, but the logical topology is limited to VLAN IDs. - -The network topology report should be considered sensitive but not -critical. No credentials or shared secrets are revealed in the data -collected by ironic-inspector. - -Other End User Impact ---------------------- - -This report will hopefully reduce the troubleshooting time for nodes -with failed network deployments. - -Performance Impact ------------------- - -If this report is produced as part of the ironic-inspector workflow, -then it will increase the time taken to introspect each node by a -negligible amount, perhaps a few seconds. - -If this report is called by the operator on demand, it will have -no performance impact on other components. - -Other Deployer Impact ---------------------- - -Deployers may want additional information than the per-node LLDP report. -There may be some use in providing aggregate reports, such as the number -of nodes with a specific configuration of interfaces and trunked VLANs. -This would help to highlight outliers or misconfigured nodes. - -There have been discussions about adding automated switch configuration -in TripleO. This would be a mechanism whereby deployers could produce the -Ethernet switch configuration with a script based on a configuration -template. The deployer would provide specifics like the number of nodes -and the configuration per node, and the script would generate the switch -configuration to match. In that case, the LLDP collection and analysis -would function as a validator for the automatically generated switch -port configurations. 
-
-Developer Impact
-----------------
-
-The initial work will be to fill in fixed fields such as Chassis ID
-and switch port. An LLDP packet can contain additional data on a
-per-vendor basis, however.
-
-The long-term plan is to store the entire LLDP packet in the
-metadata. This will have to be parsed out. We may have to work with
-switch vendors to figure out how to interpret some of the data if
-we want to make full use of it.
-
-Implementation
-==============
-
-Some notes about implementation:
-
-* This Python tool will access the introspection data and produce
-  reports on various information such as VLANs per port, host-to-port
-  mapping, and MACs per host.
-
-* The introspection data can be retrieved with the Ironic API [4] [5].
-
-* The data will initially be a set of fixed fields which are retrievable
-  in the JSON in the Ironic introspection data. Later, the entire
-  LLDP packet will be stored, and will need to be parsed outside of the
-  Ironic API.
-
-* Although the initial implementation can return a human-readable report,
-  other outputs should be available for automation, such as YAML.
-
-* The tool that produces the LLDP report should be able to return data
-  on a single host, or return all of the data.
-
-* Some basic support for searching would be a nice feature to have.
-
-* This data will eventually be consumed by the GUI and displayed as a
-  validation step in the deployment workflow.
-
-Assignee(s)
------------
-
-Primary assignee:
-  dsneddon
-
-Other contributors:
-  bfournie
-
-Work Items
-----------
-
-* Create the Python script to grab introspection data from Swift using
-  the API.
-
-* Create the Python code to extract the relevant LLDP data from the
-  data JSON.
-
-* Implement per-node reports
-
-* Implement aggregate reports
-
-* Interface with UI developers to give them the data in a form that can
-  be consumed and presented by the TripleO UI.
-
-* In the future, when the entire LLDP packet is stored, refactor logic
-  to take this into account.
-
-Testing
-=======
-
-Since this is a report that is supposed to benefit the operator, perhaps
-the best way to include it in CI is to make sure that the report gets
-logged by the Undercloud. Then the report can be reviewed in the log
-output from the CI run.
-
-In fact, this might benefit the TripleO CI process, since hardware issues
-on the network would be easier to troubleshoot without having access to
-the bare metal console.
-
-
-Documentation Impact
-====================
-
-Documentation will need to be written to cover making use of the new
-LLDP reporting tool. This should cover running the tool by hand and
-interpreting the data.
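To make the expected output more concrete, a per-node YAML report might look something like the sketch below. The field names are hypothetical and only indicate the kind of information (chassis ID, switch port, trunked VLANs) the tool would expose::

    # Hypothetical per-node LLDP report; all field names are illustrative only.
    overcloud-controller-0:
      interfaces:
        - name: eth0
          switch_chassis_id: "aa:bb:cc:dd:ee:ff"
          switch_port_id: "Ethernet1/10"
          switch_system_name: "tor-switch-01"
          vlans: [10, 20, 30]
        - name: eth1
          switch_chassis_id: "aa:bb:cc:dd:ee:ff"
          switch_port_id: "Ethernet1/11"
          vlans: [40, 50]

An aggregate report could then be derived from the same data, for example by grouping nodes that share an identical interface/VLAN layout and flagging the outliers.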
-
-
-References
-==========
-* [1] - Wikipedia entry on LLDP:
-  https://en.wikipedia.org/wiki/Link_Layer_Discovery_Protocol
-
-* [2] - Blueprint for Ironic/Neutron integration:
-  https://blueprints.launchpad.net/ironic/+spec/ironic-ml2-integration
-
-* [3] - Review: Support LLDP data as part of interfaces in inventory
-  https://review.openstack.org/#/c/320584/
-
-* [4] - Accessing Ironic Introspection Data
-  http://tripleo.org/advanced_deployment/introspection_data.html
-
-* [5] - Ironic API - Get Introspection Data
-  http://docs.openstack.org/developer/ironic-inspector/http-api.html#get-introspection-data
\ No newline at end of file
diff --git a/specs/newton/tripleo-opstools-availability-monitoring.rst b/specs/newton/tripleo-opstools-availability-monitoring.rst
deleted file mode 100644
index d9924b4d..00000000
--- a/specs/newton/tripleo-opstools-availability-monitoring.rst
+++ /dev/null
@@ -1,186 +0,0 @@
-..
-  This work is licensed under a Creative Commons Attribution 3.0 Unported
-  License.
-
-  http://creativecommons.org/licenses/by/3.0/legalcode
-
-============================================
-Enable deployment of availability monitoring
-============================================
-
-https://blueprints.launchpad.net/tripleo/+spec/tripleo-opstools-availability-monitoring
-
-TripleO should deploy an out-of-the-box availability monitoring solution
-to serve the overcloud.
-
-Problem Description
-===================
-
-Currently no such feature is implemented, except for the possibility to deploy
-the sensu-server, sensu-api and uchiwa (Sensu dashboard) services in the undercloud
-stack. Without sensu-client services deployed on overcloud nodes this piece
-of code is useless. Due to the potential for high resource consumption it is also
-reasonable to remove the current undercloud code to avoid possible problems
-when a high number of overcloud nodes is being deployed.
-
-Instead, sensu-server, sensu-api and uchiwa should be deployed on separate
-node(s), whether at the undercloud level or at the overcloud level.
-Sensu-client deployment support should therefore be flexible enough to enable
-connection to an external monitoring infrastructure or to a Sensu stack deployed
-on a dedicated overcloud node.
-
-Summary of use cases:
-
-1. sensu-server, sensu-api and uchiwa deployed in external infrastructure;
-sensu-client deployed on each overcloud node
-2. sensu-server, sensu-api and uchiwa deployed as a separate Heat stack in
-the overcloud stack; sensu-client deployed on each overcloud node
-
-Proposed Change
-===============
-
-Overview
---------
-
-The sensu-client service will be deployed as a composable service on
-the overcloud stack when it is explicitly enabled via an environment file.
-Sensu checks will have to be configured as subscription checks (see [0]
-for details). Each composable service will have its own subscription string,
-which will ensure that checks defined on the Sensu server node (wherever it lives)
-are run on the correct overcloud nodes.
-
-It will also be possible to deploy the sensu-server, sensu-api
-and uchiwa services on a standalone node deployed by the undercloud.
-This standalone node will have a dedicated purpose for monitoring
-(not only for availability monitoring services, but in the future also for
-centralized logging services or performance monitoring services).
-
-The monitoring node will be deployed as a separate Heat stack from the overcloud
-stack using Puppet and composable roles for required services.
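To illustrate how an operator would opt in, a deployment environment file could look roughly like the sketch below. The parameter names are the ones proposed later in this spec; the composable service template path is an assumption::

    # Sketch of an environment file enabling sensu-client on overcloud nodes.
    # The resource_registry path is illustrative only.
    resource_registry:
      OS::TripleO::Services::SensuClient: ../puppet/services/monitoring/sensu-client.yaml
    parameter_defaults:
      MonitoringRabbitHost: 192.0.2.10
      MonitoringRabbitPort: 5672
      MonitoringRabbitUseSSL: false
      MonitoringRabbitUserName: sensu
      MonitoringRabbitPassword: example-password
      MonitoringRabbitVhost: /sensu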
- -Alternatives ------------- - -None - -Security Impact ---------------- - -Additional service (sensu-client) will be installed on all overcloud nodes. -These services will have open connection to RabbitMQ instance running -on monitoring node and are used to execute commands (checks) on the overcloud -nodes. Check definition will live on the monitoring node. - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -We might consider deploying separate RabbitMQ and Redis for monitoring purposes -if we want to avoid influencing OpenStack deployment in the overcloud. - -Other Deployer Impact ---------------------- - -* Sensu clients will be deployed by default on all overcloud nodes except the monitoring node. -* New Sensu common parameters: - - * MonitoringRabbitHost - - * RabbitMQ host Sensu has to connect to - - * MonitoringRabbitPort - - * RabbitMQ port Sensu has to connect to - - * MonitoringRabbitUseSSL - - * Whether Sensu should connect to RabbitMQ using SSL - - * MonitoringRabbitPassword - - * RabbitMQ password used for Sensu to connect - - * MonitoringRabbitUserName - - * RabbitMQ username used for Sensu to connect - - * MonitoringRabbitVhost - - * RabbitMQ vhost used for monitoring purposes. - -* New Sensu server/API parameters - - * MonitoringRedisHost - - * Redis host Sensu has to connect to - - * MonitoringRedisPassword - - * Redis password used for Sensu to connect - - * MonitoringChecks: - - * Full definition (for all subscriptions) of checks performed by Sensu - -* New parameters for subscription strings for each composable service: - - * For example for service nova-compute MonitoringSubscriptionNovaCompute, which will default to 'overcloud-nova-compute' - - -Developer Impact ----------------- - -Support for new node type should be implemented for tripleo-quickstart. - -Implementation -============== - -Assignee(s) ------------ - -Martin Mágr - -Work Items ----------- - -* puppet-tripleo profile for Sensu services -* puppet-tripleo profile for Uchiwa service -* tripleo-heat-templates composable service for sensu-client deployment -* tripleo-heat-templates composable service for sensu-server deployment -* tripleo-heat-templates composable service for sensu-api deployment -* tripleo-heat-templates composable service for uchiwa deployment -* Support for monitoring node in tripleo-quickstart -* Revert patch(es) implementing Sensu support in instack-undercloud - -Dependencies -============ - -* Puppet module for Sensu services: sensu-puppet [1] -* Puppet module for Uchiwa: puppet-uchiwa [2] -* CentOS Opstools SIG repo [3] - -Testing -======= - -Sensu client deployment will be tested by current TripleO CI as soon as -the patch is merged, as it will be deployed by default. - -We should consider creating CI job for deploying overcloud with monitoring -node to test the rest of the monitoring components. - -Documentation Impact -==================== - -Process of creating new node type and new options will have to be documented. - -References -========== - -[0] https://sensuapp.org/docs/latest/reference/checks.html#subscription-checks -[1] https://github.com/sensu/sensu-puppet -[2] https://github.com/Yelp/puppet-uchiwa -[3] https://wiki.centos.org/SpecialInterestGroup/OpsTools diff --git a/specs/newton/tripleo-opstools-centralized-logging.rst b/specs/newton/tripleo-opstools-centralized-logging.rst deleted file mode 100644 index 673b5436..00000000 --- a/specs/newton/tripleo-opstools-centralized-logging.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. 
- This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================== -Enable deployment of centralized logging -======================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-opstools-centralized-logging - -TripleO should be deploying with an out-of-the-box centralized logging -solution to serve the overcloud. - -Problem Description -=================== - -With a complex distributed system like OpenStack, identifying and -diagnosing a problem may require tracking a transaction across many -different systems and many different logfiles. In the absence of a -centralized logging solution, this process is frustrating to both new -and experienced operators and can make even simple problems hard to -diagnose. - -Proposed Change -=============== - -We will deploy the Fluentd_ service in log collecting mode as a -composable service on all nodes in the overcloud stack when configured -to do so by the environment. Each composable service will have its -own fluentd source configuration. - -.. _fluentd: http://www.fluentd.org/ - -To receive these messages, we will deploy a centralized logging system -running Kibana_, Elasticsearch_ and Fluentd on dedicated nodes to -provide log aggregation and analysis. This will be deployed in a -dedicated Heat stack that is separate from the overcloud stack using -composable roles. - -.. _kibana: https://www.elastic.co/products/kibana -.. _elasticsearch: https://www.elastic.co/ - -We will also support sending messages to an external Fluentd -instance not deployed by tripleo. - -Summary of use cases --------------------- - -1. Elasticsearch, Kibana and Fluentd log relay/transformer deployed as - a separate Heat stack in the overcloud stack; Fluentd log - collector deployed on each overcloud node - -2. ElasticSearch, Kibana and Fluentd log relay/transformer deployed in - external infrastructure; Fluentd log collector deployed on each - overcloud node - -Alternatives ------------- - -None - -Security Impact ---------------- - -Data collected from the logs of OpenStack services can contain -sensitive information: - -- Communication between the - fluentd agent and the log aggregator should be protected with SSL. - -- Access to the Kibana UI must have at least basic HTTP - authentication, and client access should be via SSL. - -- ElasticSearch should only allow collections over ``localhost``. - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -Additional resources will be required for running Fluentd on overcloud -nodes. Log traffic from the overcloud nodes to the log aggregator -will consume some bandwidth. - -Other Deployer Impact ---------------------- - -- Fluentd will be deployed on all overcloud nodes. -- New parameters for configuring Fluentd collector. -- New parameters for configuring log collector (Fluentd, - ElasticSearch, and Kibana) - -Developer Impact ----------------- - -Support for the new node type should be implemented for tripleo-quickstart. 
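For illustration, pointing the collector at an external aggregator would likely amount to a small environment file such as the sketch below. The parameter names are placeholders only, since this spec does not fix them, and the template path is an assumption::

    # Hypothetical environment file; parameter names are placeholders only.
    resource_registry:
      OS::TripleO::Services::FluentdClient: ../puppet/services/logging/fluentd-client.yaml
    parameter_defaults:
      LoggingServers:
        - host: fluentd.example.com
          port: 24224
      LoggingUsesSSL: true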
- -Implementation -============== - -Assignee(s) ------------ - -Martin Mágr -Lars Kellogg-Stedman - -Work Items ----------- - -- puppet-tripleo profile for fluentd service -- tripleo-heat-templates composable role for FluentD collector deployment -- tripleo-heat-templates composable role for FluentD aggregator deployment -- tripleo-heat-templates composable role for ElasticSearch deployment -- tripleo-heat-templates composable role for Kibana deployment -- Support for logging node in tripleo-quickstart - -Dependencies -============ - -- Puppet module for Fluentd: `konstantin-fluentd` [1] -- Puppet module for ElasticSearch `elasticsearch-elasticsearch` [2] -- Puppet module for Kibana (tbd) -- CentOS Opstools SIG package repository - -Testing -======= - -Fluentd client deployment will be tested by current TripleO CI as soon as -the patch is merged. Because the centralized logging features will not -be enabled by default we may need to introduce specific tests for -these features. - -Documentation Impact -==================== - -Process of creating new node type and new options will have to be documented. - -References -========== - -[1] https://forge.puppet.com/srf/fluentd -[2] https://forge.puppet.com/elasticsearch/elasticsearch diff --git a/specs/newton/tripleo-ovs-dpdk.rst b/specs/newton/tripleo-ovs-dpdk.rst deleted file mode 100644 index 24a7fa4a..00000000 --- a/specs/newton/tripleo-ovs-dpdk.rst +++ /dev/null @@ -1,232 +0,0 @@ - -This work is licensed under a Creative Commons Attribution 3.0 Unported -License. - -http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Adding OVS-DPDK to Tripleo -========================================== - -Blueprint URL - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-ovs-dpdk - -DPDK is a set of libraries and drivers for fast packet processing and gets as -close to wire-line speed as possible for virtual machines. - - * It is a complete framework for fast packet processing in data plane - applications. - - * Directly polls the data from the NIC. - - * Does not use interrupts - to prevent performance overheads. - - * Uses the hugepages to preallocate large regions of memory, which allows the - applications to DMA data directly into these pages. - - * DPDK also has its own buffer and ring management systems for handling - sk_buffs efficiently. - -DPDK provides data plane libraries and NIC drivers for - - - * Queue management with lockless queues. - - * Buffer manager with pre-allocated fixed size buffers. - - * PMD (poll mode drivers) to work without asynchronous notifications. - - * Packet framework (set of libraries) to help data plane packet processing. - - * Memory manager - allocates pools of memory, uses a ring to store free - objects. - -Problem Description -=================== - -* Today the installation and configuration of OVS+DPDK in openstack is done - manually after overcloud deployment. This can be very challenging for the - operator and tedious to do over a large number of compute nodes. - The installation of OVS+DPDK needs be automated in tripleo. - -* Identification of the hardware capabilities for DPDK were all done manually - today and the same shall be automated during introspection. This hardware - detection also provides the operator with the data needed for configuring - Heat templates. - -* As of today its not possible to have the co-existence of compute nodes with - DPDK enabled hardware and without DPDK enabled hardware. 
-
-
-Proposed Change
-===============
-
-* Ironic Python Agent shall discover the below hardware details and store them
-  in a swift blob
-
-  * CPU flags for hugepages support
-    If pse exists then 2MB hugepages are supported
-    If pdpe1gb exists then 1GB hugepages are supported
-
-  * CPU flags for IOMMU
-    If VT-d/svm exists, then IOMMU is supported, provided IOMMU support is
-    enabled in BIOS.
-
-  * Compatible NICs
-    Shall compare them with the list of NICs whitelisted for DPDK. The DPDK
-    supported NICs are available at http://dpdk.org/doc/nics
-
-  The nodes without any of the above mentioned capabilities can't be used for
-  the COMPUTE role with DPDK.
-
-* The operator shall have a provision to enable DPDK on compute nodes
-
-* The overcloud image for the nodes identified to be COMPUTE capable and having
-  DPDK NICs shall have the OVS+DPDK package instead of OVS. It shall also have
-  the dpdk and driverctl packages.
-
-* The device names of the DPDK capable NICs shall be obtained from T-H-T.
-  The PCI address of the DPDK NIC needs to be identified from the device name.
-  It is required for whitelisting the DPDK NICs during the PCI probe.
-
-* Hugepages need to be enabled in the Compute nodes with DPDK.
-  Bug: https://bugs.launchpad.net/tripleo/+bug/1589929 needs to be implemented
-
-* CPU isolation needs to be done so that the CPU cores reserved for DPDK Poll
-  Mode Drivers (PMD) are not used by the general kernel balancing,
-  interrupt handling and scheduling algorithms.
-  Bug: https://bugs.launchpad.net/tripleo/+bug/1589930 needs to be implemented.
-
-* On each COMPUTE node with a DPDK enabled NIC, puppet shall configure the
-  DPDK_OPTIONS for whitelisted NICs, CPU mask and number of memory channels
-  for the DPDK PMD. The DPDK_OPTIONS needs to be set in /etc/sysconfig/openvswitch
-
-* Os-net-config shall
-
-  * Associate the given interfaces with the dpdk drivers (default as the vfio-pci
-    driver) by identifying the pci address of the given interface. driverctl
-    shall be used to bind the driver persistently
-
-  * Understand the ovs_user_bridge and ovs_dpdk_port types and configure the
-    ifcfg scripts accordingly.
-
-  * The "TYPE" ovs_user_bridge shall translate to OVS type OVSUserBridge and
-    based on this OVS will configure the datapath type to 'netdev'.
-
-  * The "TYPE" ovs_dpdk_port shall translate to OVS type OVSDPDKPort and based on
-    this OVS adds the port to the bridge with interface type as 'dpdk'
-
-  * Understand ovs_dpdk_bond and configure the ifcfg scripts accordingly.
-
-* On each COMPUTE node with a DPDK enabled NIC, puppet shall
-
-  * Enable OVS+DPDK in /etc/neutron/plugins/ml2/openvswitch_agent.ini
-    [OVS]
-    datapath_type=netdev
-    vhostuser_socket_dir=/var/run/openvswitch
-
-  * Configure vhostuser ports in /var/run/openvswitch to be owned by qemu.
-
-* On each controller node, puppet shall
-
-  * Add NUMATopologyFilter to scheduler_default_filters in nova.conf.
-
-Alternatives
-------------
-
-* The boot parameters could be configured via puppet (during overcloud
-  deployment) as well as virt-customize (after image building or downloading).
-  The choice of boot parameter configuration is moved out of the scope of this
-  blueprint and will be tracked via
-  https://bugs.launchpad.net/tripleo/+bug/1589930.
-
-Security impact
----------------
-
-* We have no firewall drivers which support ovs-dpdk at present. Security group
-  support with conntrack is a possible option, and this work is in progress.
-  Security groups will not be supported.
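For reference, an os-net-config entry using the new types described above might look roughly as follows; the bridge and NIC names are only examples::

    # Sketch of an os-net-config DPDK datapath; names are examples only.
    network_config:
      - type: ovs_user_bridge
        name: br-link
        members:
          - type: ovs_dpdk_port
            name: dpdk0
            members:
              - type: interface
                name: nic3

os-net-config would resolve nic3 to its PCI address, bind it persistently to the DPDK driver via driverctl, and generate the corresponding OVSUserBridge/OVSDPDKPort ifcfg files as described above.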
-
-
-Other End User Impact
----------------------
-
-None
-
-Performance Impact
-------------------
-
-* OVS-DPDK can improve dataplane performance by up to 3x.
-  Refer http://goo.gl/Du1EX2
-
-Other Deployer Impact
----------------------
-
-* The operator shall ensure that the VT-d/IOMMU virtualization technology is
-  enabled in the BIOS of the compute nodes.
-
-* Post deployment, the operator shall modify the VM flavors to use hugepages and
-  CPU pinning
-  Ex: nova flavor-key m1.small set "hw:mem_page_size=large"
-
-
-Developer Impact
-----------------
-
-None
-
-Implementation
-==============
-
-
-Assignee(s)
------------
-
-Primary assignees:
-
-* karthiks
-* sanjayu
-
-Work Items
-----------
-
-* The proposed changes discussed earlier will be the work items
-
-Dependencies
-============
-
-* We are dependent on composable roles, as this is something we would
-  require only on specific compute nodes and not generally on all the nodes.
-
-* To enable Hugepages, bug: https://bugs.launchpad.net/tripleo/+bug/1589929
-  needs to be implemented
-
-* To address boot parameter changes for CPU isolation,
-  bug: https://bugs.launchpad.net/tripleo/+bug/1589930 needs to be implemented
-
-Testing
-=======
-
-* Since DPDK needs specific hardware support, this feature can't be tested under
-  CI. We will need third party CI for validating it.
-
-Documentation Impact
-====================
-
-* Manual steps that need to be performed by the operator shall be documented.
-  Ex: configuring BIOS for VT-d, adding boot parameters for CPU isolation and
-  hugepages, post-deployment configurations.
-
-References
-==========
-
-* Manual steps to setup DPDK in RedHat Openstack Platform 8 -
-  https://goo.gl/6ymmJI
-
-* Setup procedure for CPU pinning and NUMA topology - http://goo.gl/TXxuhv
-
-* DPDK supported NICs - http://dpdk.org/doc/nics
-
-
-
diff --git a/specs/newton/tripleo-sriov.rst b/specs/newton/tripleo-sriov.rst
deleted file mode 100644
index 78644a5a..00000000
--- a/specs/newton/tripleo-sriov.rst
+++ /dev/null
@@ -1,250 +0,0 @@
-This work is licensed under a Creative Commons Attribution 3.0 Unported
-License.
-
-http://creativecommons.org/licenses/by/3.0/legalcode
-
-==========================================
-Adding SR-IOV to Tripleo
-==========================================
-
-Blueprint URL:
-  https://blueprints.launchpad.net/tripleo/+spec/tripleo-sriov
-
-SR-IOV is a specification that extends the PCI Express specification and allows
-a PCIe device to appear to be multiple separate physical PCIe devices.
-
-SR-IOV provides one or more Virtual Functions (VFs) and a Physical Function (PF):
-
-  * Virtual Functions (VFs) are ‘lightweight’ PCIe functions that contain the
-    resources necessary for data movement but have a carefully minimized set
-    of configuration resources.
-
-  * Physical Functions (PFs) are full PCIe functions that include the SR-IOV
-    Extended Capability. This capability is used to configure and manage
-    the SR-IOV functionality.
-
-VFs can be attached to VMs like a dedicated PCIe device, and the use of
-SR-IOV NICs thereby boosts the networking performance considerably.
-
-
-Problem Description
-===================
-
-* Today the installation and configuration of the SR-IOV feature is done manually
-  after overcloud deployment. It shall be automated via tripleo.
-
-* Identification of the hardware capabilities for SR-IOV is all done manually
-  today, and the same shall be automated during introspection. The hardware
-  detection also provides the operator with the data needed for configuring Heat
-  templates.
-
-Proposed Change
-===============
-
-Overview
---------
-
-* Ironic Python Agent will discover the below hardware details and store them in
-  a swift blob
-
-  * SR-IOV capable NICs:
-    Shall read /sys/bus/pci/devices/.../sriov_totalvfs and check if it is
-    non-zero, in order to identify if the NIC is SR-IOV capable
-
-  * VT-d or IOMMU support in BIOS:
-    The CPU flags shall be read to identify the support.
-
-* DIB shall include the package by default - openstack-neutron-sriov-nic-agent.
-
-* The nodes without any of the above mentioned capabilities can't be used for
-  the COMPUTE role with SR-IOV
-
-* SR-IOV drivers shall be loaded during bootup via persistent module loading
-  scripts. These persistent module loading scripts shall be created by the
-  puppet manifests.
-
-* T-H-T shall provide the below details
-
-  * supported_pci_vendor_devs - configure the vendor-id/product-id couples in
-    the nodes running neutron-server
-
-  * max number of VFs - persistent across reboots
-
-  * physical device mappings - add the physical device mappings to the
-    ml2_conf_sriov.ini file on the compute node
-
-* On the nodes running the Neutron server, puppet shall
-
-  * enable sriovnicswitch in the /etc/neutron/plugin.ini file
-    mechanism_drivers = openvswitch,sriovnicswitch
-    This configuration enables the SR-IOV mechanism driver alongside
-    OpenvSwitch.
-
-  * Set the VLAN range for SR-IOV in the file /etc/neutron/plugin.ini, present
-    in the network node
-    network_vlan_ranges = <physical_network>:<vlan_min>:<vlan_max>
-    Ex : network_vlan_ranges = fabric0:15:20
-
-  * Configure the vendor-id/product-id couples if it differs from
-    “15b3:1004,8086:10ca” in /etc/neutron/plugins/ml2/ml2_conf_sriov.ini
-    supported_pci_vendor_devs = 15b3:1004,8086:10ca,<vendor_id>:<product_id>
-
-  * Configure neutron-server.service to use the ml2_conf_sriov.ini file
-    [Service] Type=notify User=neutron ExecStart=/usr/bin/neutron-server
-    --config-file /usr/share/neutron/neutron-dist.conf --config-file
-    /etc/neutron/neutron.conf --config-file /etc/neutron/plugin.ini
-    --config-file /etc/neutron/plugins/ml2/ml2_conf_sriov.ini --log-file
-    /var/log/neutron/server.log
-
-* On the nodes running the nova scheduler, puppet shall
-
-  * add the PciPassthroughFilter filter to the list of scheduler_default_filters.
-    This needs to be done to allow proper scheduling of SR-IOV devices
-
-* On each COMPUTE+SRIOV node, puppet shall configure /etc/nova/nova.conf
-
-  * Associate the available VFs with each physical network
-    Ex: pci_passthrough_whitelist={"devname": "enp5s0f1",
-    "physical_network":"fabric0"}
-
-    PCI passthrough whitelist entries use the following syntax: ["device_id":
-    "<id>",] ["product_id": "<id>",] ["address":
-    "[[[[<domain>]:]<bus>]:][<slot>][.[<function>]]" | "devname": "Ethernet
-    Interface Name",] "physical_network":"Network label string"
-
-    VFs that need to be excluded from the agent configuration shall be added to
-    [sriov_nic]/exclude_devices. T-H-T shall configure this.
-
-    Multiple whitelist entries per host are supported.
-
-* Puppet shall
-
-  * Set up the max number of VFs to be configured by the operator
-    echo required_max_vfs > /sys/bus/pci/devices/.../sriov_numvfs
-    Puppet will also validate the required_max_vfs, so that it does not go
-    beyond the supported max on the device.
-
-  * Enable NoopFirewallDriver in the
-    '/etc/neutron/plugins/ml2/sriov_agent.ini' file.
-
-    [securitygroup]
-    firewall_driver = neutron.agent.firewall.NoopFirewallDriver
-
-  * Add mappings to the /etc/neutron/plugins/ml2/sriov_agent.ini file.
-    Ex: physical_device_mappings = fabric0:enp4s0f1
-    In this example, fabric0 is the physical network, and enp4s0f1 is the
-    physical function.
-
-* Puppet shall start the SR-IOV agent on Compute
-
-  * systemctl enable neutron-sriov-nic-agent.service
-
-  * systemctl start neutron-sriov-nic-agent.service
-
-
-Alternatives
-------------
-
-None
-
-Security impact
----------------
-
-* We have no firewall drivers which support SR-IOV at present.
-  Security groups will be disabled only for SR-IOV ports in compute hosts.
-
-
-Other End User Impact
----------------------
-
-None
-
-Performance Impact
-------------------
-
-* SR-IOV provides near native I/O performance for each virtual machine on a
-  physical server. Refer - http://goo.gl/HxZvXX
-
-
-Other Deployer Impact
----------------------
-
-* The operator shall ensure that the BIOS supports VT-d/IOMMU virtualization
-  technology on the compute nodes.
-
-* IOMMU needs to be enabled in the Compute+SR-IOV nodes. Boot parameters
-  (intel_iommu=on or amd_iommu=pt) shall be added in the grub.conf, using the
-  first boot scripts (THT).
-
-* Post deployment, the operator shall
-
-  * Create neutron ports prior to creating VMs (nova boot)
-    neutron port-create fabric0_0 --name sr-iov --binding:vnic-type direct
-
-  * Create the VM with the required flavor and SR-IOV port id
-    Ex: nova boot --flavor m1.small --image <image> --nic port-id=<port-id> vnf0
-
-Developer Impact
-----------------
-
-None
-
-Implementation
-==============
-
-
-Assignee(s)
------------
-
-Primary assignees:
-
-* karthiks
-* sanjayu
-
-
-Work Items
-----------
-
-* Documented above in the Proposed changes
-
-
-Dependencies
-============
-
-* We are dependent on composable roles, as SR-IOV specific changes are something
-  we would require on specific compute nodes and not generally on all the
-  nodes. Blueprint -
-
-  https://blueprints.launchpad.net/tripleo/+spec/composable-services-within-roles
-
-Testing
-=======
-
-* Since SR-IOV needs specific hardware support, this feature can't be tested
-  under CI. We will need third party CI for validating it.
-
-Documentation Impact
-====================
-
-* Manual steps that need to be performed by the operator shall be documented.
-  Ex: configuring BIOS for VT-d, IOMMU, post-deployment configurations.
-
-References
-==========
-
-* SR-IOV support for virtual networking - https://goo.gl/eKP1oO
-
-* Enable SR-IOV functionality available in OpenStack -
-  http://docs.openstack.org/liberty/networking-guide/adv_config_sriov.html
-
-* Introduction to SR-IOV - http://goo.gl/m7jP3
-
-* Setup procedure for CPU pinning and NUMA topology - http://goo.gl/TXxuhv
-
-* /sys/bus/pci/devices/.../sriov_totalvfs - This file appears when a physical
-  PCIe device supports SR-IOV.
-  https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-bus-pci
-
diff --git a/specs/newton/undercloud-upgrade.rst b/specs/newton/undercloud-upgrade.rst
deleted file mode 100644
index d6866b8c..00000000
--- a/specs/newton/undercloud-upgrade.rst
+++ /dev/null
@@ -1,272 +0,0 @@
-..
-  This work is licensed under a Creative Commons Attribution 3.0 Unported
-  License.
-
-  http://creativecommons.org/licenses/by/3.0/legalcode
-
-==================
-Undercloud Upgrade
-==================
-
-https://blueprints.launchpad.net/tripleo/+spec/undercloud-upgrade
-
-Our currently documented upgrade path for the undercloud is very problematic.
-In fact, it doesn't work.
A number of different patches are attempting to -address this problem (see the `References`_ section), but they all take slightly -different approaches that are not necessarily compatible with each other. - -Problem Description -=================== - -The undercloud upgrade must be carefully orchestrated. A few of the problems -that can be encountered during an undercloud upgrade if things are not done -or not done in the proper order: - -#. Services may fail and get stuck in a restart loop - -#. Service databases may not be properly upgraded - -#. Services may fail to stop and prevent the upgraded version from starting - -Currently there is not agreement over who should be responsible for running -the various steps of the undercloud upgrade. Getting everyone on the same -page regarding this is the ultimate goal of this spec. - -Also of note is the MariaDB major version update flow from -`Upgrade documentation (under and overcloud)`_. This will need to be -addressed as part of whatever upgrade solution we decide to pursue. - -Proposed Change -=============== - -I'm going to present my proposed solution here, but will try to give a fair -overview of the other proposals in the `Alternatives`_ section. Others -should feel free to push modifications or follow-ups if I miss anything -important, however. - -Overview --------- - -Services must be stopped before their respective package update is run. -This is because the RPM specs for the services include a mandatory restart to -ensure that the new code is running after the package is updated. On a major -version upgrade, this can and does result in broken services because the config -files are not always forward compatible, so until Puppet is run again to -configure them appropriately the service cannot start. The broken services -can cause other problems as well, such as the yum update taking an excessively -long time because it times out waiting for the service to restart. It's worth -noting that this problem does not exist on an HA overcloud because Pacemaker -stubs out the service restarts in the systemd services so the package update -restart becomes a noop. - -Because the undercloud is not required to have extremely high uptime, I am in -favor of just stopping all of the services, updating all the packages, then -re-running the undercloud install to apply the new configs and start the -services again. This ensures that the services are not restarted by the -package update - which only happens if the service was running at the time of -the update - and that there is no chance of an old version of a service being -left running and interfering with the new version, as can happen when moving -a service from a standalone API process to httpd. - -instack-undercloud will be responsible for implementing the process described -above. However, to avoid complications with instack-undercloud trying to -update itself, tripleoclient will be responsible for updating -instack-undercloud and its dependencies first. This two-step approach -should allow us to sanely use an older tripleoclient to run the upgrade -because the code in the client will be minimal and should not change from -release to release. Upgrade-related backports to stable clients should not -be needed in any foreseeable case. Any potential version-specific logic can -live in instack-undercloud. The one exception being that we may need to -initially backport this new process to the previous stable branch so we can -start using it without waiting an entire cycle. 
Since the current upgrade -process does not work correctly there, I think this would be a valid bug fix -backport. - -A potential drawback of this approach is that it will not automatically -trigger the Puppet service db-syncs because Puppet is not aware that the -version has changed if we update the packages separately. However, I feel -this is a case we need to handle sanely anyway in case a package is updated -outside Puppet either intentionally or accidentally. To that end, we've -already merged a patch to always run db-syncs on the undercloud since they're -idempotent anyway. See `Stop all services before upgrading`_ for a link to -the patch. - -MariaDB -------- - -Regarding the MariaDB issue mentioned above, I believe that regardless of the -approach we take, we should automate the dump and restore of the database as -much as possible. Either solution should be able to look at the version of -mariadb before yum update and the version after, and decide whether the db -needs to be dumped. If a user updates the package manually outside the -undercloud upgrade flow then they will be responsible for the db upgrade -themselves. I think this is the best we can do, short of writing some sort -of heuristic that can figure out whether the existing db files are for an -older version of MariaDB and doing the dump/restore based on that. - -Updates vs. Upgrades --------------------- - -I am also proposing that we not differentiate between minor updates and major -upgrades on the undercloud. Because we don't need to be as concerned with -uptime there, any additional time required to treat all upgrades as a -potential major version upgrade should be negligible, and it avoids us -having to maintain and test multiple paths. - -Additionally, the difference between a major and minor upgrade becomes very -fuzzy for anyone upgrading between versions of master. There may be db -or rpc changes that require the major upgrade flow anyway. Also, the whole -argument assumes we can even come up with a sane, yet less-invasive update -strategy for the undercloud anyway, and I think our time is better spent -elsewhere. - -Alternatives ------------- - -As shown in `Don't update whole system on undercloud upgrade`_, another -option is to limit the manual yum update to just instack-undercloud and make -Puppet responsible for updating everything else. This would allow Puppet -to handle all of the upgrade logic internally. As of this writing, there is -at least one significant problem with the patch as proposed because it does -not update the Puppet modules installed on the undercloud, which leaves us -in a chicken and egg situation with a newer instack-undercloud calling older -Puppet modules to run the update. I believe this could be solved by also -updating the Puppet modules along with instack-undercloud. - -Drawbacks of this approach would be that each service needs to be orchestrated -correctly in Puppet (this could also be a feature, from a Puppet CI -perspective), and it does not automatically handle things like services moving -from standalone to httpd. This could be mitigated by the undercloud upgrade -CI job catching most such problems before they merge. - -I still personally feel this is more complicated than the proposal above, but -I believe it could work, and as noted could have benefits for CI'ing upgrades -in Puppet modules. 
- -There is one other concern with this that is less a functional issue, which is -that it significantly alters our previous upgrade methods, and might be -problematic to backport as older versions of instack-undercloud were assuming -an external package update. It's probably not an insurmountable obstacle, but -I do feel it's worth noting. Either approach is going to require some amount -of backporting, but this may require backporting in non-tripleo Puppet modules -which may be more difficult to do. - -Security Impact ---------------- - -No significant security impact one way or another. - -Other End User Impact ---------------------- - -This will likely have an impact on how a user runs undercloud upgrades, -especially compared to our existing documented upgrade method. -Ideally all of the implementation will happen behind the ``openstack undercloud -upgrade`` command regardless of which approach is taken, but even that is a -change from before. - -Performance Impact ------------------- - -The method I am suggesting can do an undercloud upgrade in 20-25 -minutes end-to-end in a scripted CI job. - -The performance impact of the Puppet approach is unknown to me. - -The performance of the existing method where service packages are updated with -the service still running is terrible - upwards of two hours for a full -upgrade in some cases, assuming the upgrade completes at all. This is largely -due to the aforementioned problem with services restarting before their config -files have been updated. - -Other Deployer Impact ---------------------- - -Same as the end user impact. In this case I believe they're the same person. - -Developer Impact ----------------- - -Discussed somewhat in the proposals, but I believe my approach is a little -simpler from the developer perspective. They don't have to worry about the -orchestration of the upgrade, they only have to provide a valid configuration -for a given version of OpenStack. The one drawback is that if we add any new -services on the undercloud, their db-sync must be wired into the "always run -db-syncs" list. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignees: - -* bnemec -* EmilienM - -Other contributors (I'm essentially listing everyone who has been involved in -upgrade work so far): - -* lbezdick -* bandini -* marios -* jistr - -Work Items ----------- - -* Implement an undercloud upgrade CI job to test upgrades. -* Implement the selected approach in the undercloud upgrade command. - - -Dependencies -============ - -None - -Testing -======= - -A CI job is already underway. See `Undercloud Upgrade CI Job`_. This should -provide reasonable coverage on a per-patch basis. We may also want to test -undercloud upgrades in periodic jobs to ensure that it is possible to deploy -an overcloud with an upgraded undercloud. This probably takes too long to be -done in the regular CI jobs, however. - -There has also been discussion of running Tempest API tests on the upgraded -undercloud, but I'm unsure of the status of that work. It would be good to -have in the standalone undercloud upgrade job though. - - -Documentation Impact -==================== - -The docs will need to be updated to reflect the new upgrade method. Hopefully -this will be as simple as "Run openstack undercloud upgrade", but that remains -to be seen. 
- - -References -========== - -Stop all services before upgrading ----------------------------------- -Code: https://review.openstack.org/331804 - -Docs: https://review.openstack.org/315683 - -Always db-sync: https://review.openstack.org/#/c/346138/ - -Don't update whole system on undercloud upgrade ------------------------------------------------ -https://review.openstack.org/327176 - -Upgrade documentation (under and overcloud) -------------------------------------------- -https://review.openstack.org/308985 - -Undercloud Upgrade CI Job -------------------------- -https://review.openstack.org/346995 diff --git a/specs/newton/validations.rst b/specs/newton/validations.rst deleted file mode 100644 index 4055c15c..00000000 --- a/specs/newton/validations.rst +++ /dev/null @@ -1,159 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================== -TripleO Deployment Validations -============================== - -We need ways in TripleO for performing validations at various stages of the -deployment. - -Problem Description -=================== - -TripleO deployments, and more generally all OpenStack deployments, are complex, -error prone, and highly dependent on the environment. An appropriate set of -tools can help engineers to identify potential problems as early as possible -and fix them before going further with the deployment. - -People have already developed such tools [1], however they appear more like -a random collection of scripts than a well integrated solution within TripleO. -We need to expose the validation checks from a library so they can be consumed -from the GUI or CLI without distinction and integrate flawlessly within TripleO -deployment workflow. - -Proposed Change -=============== - -We propose to extend the TripleO Overcloud Deployment Mistral workflow [2] to -include Actions for validation checks. - -These actions will need at least to: - -* List validations -* Run and stop validations -* Get validation status -* Persist and retrieve validation results -* Permit grouping validations by 'deployment stage' and execute group operations - -Running validations will be implemented in a workflow to ensure the nodes meet -certain expectations. For example, a baremetal validation may require the node -to boot on a ramdisk first. - -Mistral workflow execution can be started with the `mistral execution-create` -command and can be stopped with the `mistral execution-update` command by -setting the workflow status to either SUCCESS or ERROR. - -Every run of the workflow (workflow execution) is stored in Mistral's DB and -can be retrieved for later use. The workflow execution object contains all -information about the workflow and its execution, including all output data and -statuses for all the tasks composing the workflow. - -By introducing a reasonable validation workflows naming, we are able to use -workflow names to identify stage at which the validations should run and -trigger all validations of given stage (e.g. -tripleo.validation.hardware.undercloudRootPartitionDiskSizeCheck) - -Using the naming conventions, the user is also able to register a new -validation workflow and add it to the existing ones. - -Alternatives ------------- - -One alternative is to ship a collection of scripts within TripleO to be run by -engineers at different stages of the deployment. 
This solution is not optimal -because it requires a lot of manual work and does not integrate with the UI. - -Another alternative is to build our own API, but it would require significantly -more effort to create and maintain. This topic has been discussed at length on -the mailing list. - -Security Impact ---------------- - -The whole point behind the validations framework is to permit running scripts -on the nodes, thus providing access from the control node to the deployed nodes -at different stages of the deployment. Special care needs to be taken to grant -access to the target nodes using secure methods and ensure only trusted scripts -can be executed from the library. - -Other End User Impact ---------------------- - -We expect reduced deployment time thanks to early issue detection. - -Performance Impact ------------------- - -None. - -Other Deployer Impact ---------------------- - -None. - -Developer Impact ----------------- - -Developers will need to keep the TripleO CI updated with changes, and will be -responsible for fixing the CI as needed. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignees: - -* shadower -* mandre - -Work Items ----------- - -The work items required are: - -* Develop the tripleo-common Mistral actions that provide all of the - functionality required for the validation workflow. -* Write an initial set of validation checks based on real deployment - experience, starting by porting existing validations [1] to work with the - implemented Mistral actions. - -All patches that implement these changes must pass CI and add additional tests as -needed. - - -Dependencies -============ - -We are dependent upon the tripleo-mistral-deployment-library [2] work. - - -Testing -======= - -The TripleO CI should be updated to test the updated tripleo-common library. - - -Documentation Impact -==================== - -Mistral Actions and Workflows are sort of self-documenting and can be easily -introspected by running 'mistral workflow-list' or 'mistral action-list' on the -command line. The updated library however will have to be well-documented and -meet OpenStack standards. Documentation will be needed in both the -tripleo-common and tripleo-docs repositories. - - -References -========== - -* [1] Set of tools to help detect issues during TripleO deployments: - https://github.com/rthallisey/clapper -* [2] Library support for TripleO Overcloud Deployment Via Mistral: - https://specs.openstack.org/openstack/tripleo-specs/specs/mitaka/tripleo-mistral-deployment-library.html diff --git a/specs/newton/workflow-simplification.rst b/specs/newton/workflow-simplification.rst deleted file mode 100644 index 2aef17ab..00000000 --- a/specs/newton/workflow-simplification.rst +++ /dev/null @@ -1,212 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Workflow Simplification -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/workflow-simplification - -The TripleO workflow is still too complex for many (most?) users to follow -successfully. There are some fairly simple steps we can take to improve -that situation. - -Problem Description -=================== - -The current TripleO workflow grew somewhat haphazardly out of a collection -of bash scripts that originally made up instack-undercloud. 
These scripts -started out life as primarily a proof of concept exercise to demonstrate -that the idea was viable, and while the steps still work fine when followed -correctly, it seems "when followed correctly" is too difficult today, at least -based on the feedback I'm hearing from users. - -Proposed Change -=============== - -Overview --------- - -There seem to be a number of low-hanging fruit candidates for cleanup. In the -order in which they appear in the docs, these would be: - -#. **Node registration** Why is this two steps? Is there ever a case where we - would want to register a node but not configure it to be able to boot? - If there is, is it a significant enough use case to justify the added - step every time a user registers nodes? - - I propose that we configure boot on newly registered nodes automatically. - Note that this will probably require us to also update the boot - configuration when updating images, but again this is a good workflow - improvement. Users are likely to forget to reconfigure their nodes' boot - images after updating them in Glance. - - .. note:: This would not remove the ``openstack baremetal configure boot`` - command for independently updating the boot configuration of - Ironic nodes. In essence, it would just always call the - configure boot command immediately after registering nodes so - it wouldn't be a mandatory step. - - This also means that the deploy ramdisk would have to be built - and loaded into Glance before registering nodes, but our - documented process already satisfies that requirement, and we - could provide a --no-configure-boot param to import for cases - where someone wanted to register nodes without configuring them. - -#. **Flavor creation** Nowhere in our documentation do we recommend or - provide guidance on customizing the flavors that will be used for - deployment. While it is possible to deploy solely based on flavor - hardware values (ram, disk, cpu), in practice it is often simpler - to just assign profiles to Ironic nodes and have scheduling done solely - on that basis. This is also the method we document at this time. - - I propose that we simply create all of the recommended flavors at - undercloud install time and assign them the appropriate localboot and - profile properties at that time. These flavors would be created with the - minimum supported cpu, ram, and disk values so they would work for any - valid hardware configuration. This would also reduce the possibility of - typos in the flavor creation commands causing avoidable deployment - failures. - - These default flavors can always be customized if a user desires, so there - is no loss of functionality from making this change. - -#. **Node profile assignment** This is not currently part of the standard - workflow, but in practice it is something we need to be doing for most - real-world deployments with heterogeneous hardware for controllers, - computes, cephs, etc. Right now the documentation requires running an - ironic node-update command specifying all of the necessary capabilities - (in the manual case anyway, this section does not apply to the AHC - workflow). - - os-cloud-config does have support for specifying the node profile in - the imported JSON file, but to my knowledge we don't mention that anywhere - in the documentation. This would be the lowest of low-hanging - fruit since it's simply a question of documenting something we already - have. 
- - We could even give the generic baremetal flavor a profile and have our - default instackenv.json template include that[1], with a note that it can - be overridden to a more specific profile if desired. If users want to - change a profile assignment after registration, the node update command - for ironic will still be available. - - 1. For backwards compatibility, we might want to instead create a new flavor - named something like 'default' and use that, leaving the old baremetal - flavor as an unprofiled thing for users with existing unprofiled nodes. - -Alternatives ------------- - -tripleo.sh -~~~~~~~~~~ -tripleo.sh addresses the problem to some extent for developers, but it is -not a viable option for real world deployments (nor should it be IMHO). -However, it may be valuable to look at tripleo.sh for guidance on a simpler -flow that can be more easily followed, as that is largely the purpose of the -script. A similar flow codified into the client/API would be a good result -of these proposed changes. - -Node Registration -~~~~~~~~~~~~~~~~~ -One option Dmitry has suggested is to make the node registration operation -idempotent, so that it can be re-run any number of times and will simply -update the details of any already registered nodes. He also suggested -moving the bulk import functionality out of os-cloud-config and (hopefully) -into Ironic itself. - -I'm totally in favor of both these options, but I suspect that they will -represent a significantly larger amount of work than the other items in this -spec, so I think I'd like that to be addressed as an independent spec since -this one is already quite large. - -Security Impact ---------------- - -Minimal, if any. This is simply combining existing deployment steps. If we -were to add a new API for node profile assignment that would have some slight -security impact as it would increase our attack surface, but I feel even that -would be negligible. - -Other End User Impact ---------------------- - -Simpler deployments. This is all about the end user. - -Performance Impact ------------------- - -Some individual steps may take longer, but only because they will be -performing actions that were previously in separate steps. In aggregate -the process should take about the same time. - -Other Deployer Impact ---------------------- - -If all of these suggested improvements are implemented, it will make the -standard deployment process somewhat less flexible. However, in the -Proposed Change section I attempted to address any such new limitations, -and I feel they are limited to the edgiest of edge cases that in most cases -can still be implemented through some extra manual steps (which likely would -have been necessary anyway - they are edge cases after all). - -Developer Impact ----------------- - -There will be some changes in the basic workflow, but as noted above the same -basic steps will be getting run. Developers will see some impact from the -proposed changes, but as they will still likely be using tripleo.sh for an -already simplified workflow it should be minimal. - -Implementation -============== - -Assignee(s) ------------ - -bnemec - -Work Items ----------- - -* Configure boot on newly registered nodes automatically. -* Reconfigure boot on nodes after deploy images are updated. -* Remove explicit step for configure boot from the docs, but leave the actual - function itself in the client so it can still be used when needed. 
-* Create flavors at undercloud install time and move documentation on creating - them manually to the advanced section of the docs. -* Add a 'default' flavor to the undercloud. -* Update the sample instackenv.json to include setting a profile (by default, - the 'default' profile associated with the flavor from the previous step). - - - -Dependencies -============ - -Nothing that I'm aware of. - - -Testing -======= - -As these changes are implemented, we would need to update tripleo.sh to match -the new flow, which will result in the changes being covered in CI. - - -Documentation Impact -==================== - -This should reduce the number of steps in the basic deployment flow in the -documentation. It is intended to simplify the documentation. - - -References -========== - -Proposed change to create flavors at undercloud install time: -https://review.openstack.org/250059 -https://review.openstack.org/251555 diff --git a/specs/ocata/capture-environment-status-and-logs.rst b/specs/ocata/capture-environment-status-and-logs.rst deleted file mode 100644 index 3c7f836b..00000000 --- a/specs/ocata/capture-environment-status-and-logs.rst +++ /dev/null @@ -1,133 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================================== -Tool to Capture Environment Status and Logs -=========================================== - -https://blueprints.launchpad.net/tripleo/+spec/capture-environment-status-and-logs - -To aid in troubleshooting, debugging, and reproducing issues we should create -or integrate with a tool that allows an operator or developer to collect and -generate a single bundle that provides the state and history of a deployed -environment. - -Problem Description -=================== - -Currently there is no single command that can be run via either the -tripleoclient or via the UI that will generate a single artifact to be used -to report issues when failures occur. - -* tripleo-quickstart_, tripleo-ci_ and operators collect the logs for bug - reports in different ways. - -* When a failure occurs, many different pieces of information must be collected - to be able to understand where the failure occurred. If the logs required are - not asked for, an operator may not know what to provide for - troubleshooting. - - -Proposed Change -=============== - -Overview -------- - -TripleO should provide a unified method for collecting status and logs from the -undercloud and overcloud nodes. The tripleoclient should support executing a -workflow to run status and log collection processes via sosreport_. The output -of the sosreport_ should be collected and bundled together in a single location. - -Alternatives ------------ - -Currently, various shell scripts and Ansible tasks are used by the CI processes -to perform log collection. These scripts are not maintained together with the -core TripleO code and may require additional artifacts that are not installed by -default with a TripleO environment. - -tripleo-quickstart_ uses ansible-role-tripleo-collect-logs_ to collect logs. - -tripleo-ci_ uses bash scripts to collect the logs. - -Fuel uses timmy_. - -Security Impact --------------- - -The logs and status information may be considered sensitive information. The -process to trigger status and log collection should require authentication. Additionally -we should provide a basic password protection mechanism for the bundle of logs -that is created by this process.
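As a rough illustration of the kind of collection the workflow could drive on each node, the sketch below uses sosreport plus a symmetric GPG passphrase for the basic password protection mentioned above; the node name, paths and tooling choices are assumptions made for the sketch, not decisions taken by this spec::

    # Illustrative only: collect a report on one node and password-protect
    # the resulting bundle.
    sudo sosreport --batch --name overcloud-controller-0 --tmp-dir /var/tmp/tripleo-sos
    tar -czf tripleo-status-logs.tar.gz -C /var/tmp tripleo-sos
    gpg --symmetric --cipher-algo AES256 tripleo-status-logs.tar.gz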
- -Other End User Impact ---------------------- - -None. - -Performance Impact ------------------- - -None. - -Other Deployer Impact ---------------------- - -None. - -Developer Impact ----------------- - -None. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - alex-schultz - - -Work Items ----------- - -* Ensure OpenStack `sosreport plugins`_ are current. -* Write a TripleO sosreport plugin. -* Write a `Mistral workflow`_ to execute sosreport and collect artifacts. -* Write python-tripleoclient_ integration to execute Mistral workflows. -* Update documentation and CI scripts to leverage new collection method. - - -Dependencies -============ - -None. - -Testing -======= - -As part of CI testing, the new tool should be used to collect environment logs. - -Documentation Impact -==================== - -Documentation should be updated to reflect the standard ways to collect the logs -using the tripleo client. - -References -========== - -.. _ansible-role-tripleo-collect-logs: https://github.com/redhat-openstack/ansible-role-tripleo-collect-logs -.. _Mistral workflow: http://docs.openstack.org/developer/mistral/terminology/workflows.html -.. _python-tripleoclient: https://github.com/openstack/python-tripleoclient -.. _tripleo-ci: https://github.com/openstack-infra/tripleo-ci -.. _tripleo-quickstart: https://github.com/openstack/tripleo-quickstart -.. _sosreport: https://github.com/sosreport/sos -.. _sosreport plugins: https://github.com/sosreport/sos/tree/master/sos/plugins -.. _timmy: https://github.com/openstack/timmy diff --git a/specs/ocata/composable-ha-architecture.rst b/specs/ocata/composable-ha-architecture.rst deleted file mode 100644 index 8efb59f6..00000000 --- a/specs/ocata/composable-ha-architecture.rst +++ /dev/null @@ -1,201 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================== -Composable HA architecture -========================== - -https://blueprints.launchpad.net/tripleo/+spec/composable-ha - -Since Newton, we have the following services managed by pacemaker: - -* Cloned and master/slave resources: - galera, redis, haproxy, rabbitmq - -* Active/Passive resources: - VIPs, cinder-volume, cinder-backup, manila-share - -It is currently not possible to compose the above service in the same -way like we do today via composable roles for the non-pacemaker services -This spec aims to address this limitation and let the operator be more flexible -in the composition of the control plane. - -Problem Description -=================== - -Currently tripleo has implemented no logic whatsoever to assign specific pacemaker -managed services to roles/nodes. - -* Since we do not have a lot in terms of hard performance data, we typically support - three controller nodes. This is perceived as a scalability limiting factor and there is - a general desire to be able to assign specific nodes to specific pacemaker-managed - services (e.g. three nodes only for galera, five nodes only for rabbitmq) - -* Right now if the operator deploys on N controllers he will get N cloned instances - of the non-A/P pacemaker services on the same N nodes. We want to be able to - be much more flexible. E.g. deploy galera on the first 3 nodes, rabbitmq on the - remaining 5 nodes, etc. - -* It is also desirable for the operator to be able to choose on which nodes the A/P - resources will run. 
- -* We also currently have a scalability limit of 16 nodes for the pacemaker cluster. - -Proposed Change -=============== - -Overview -------- - -The proposal here is to keep the existing cluster in its current form, but to extend -it in two ways: -A) Allow the operator to include a specific service in a custom node and have pacemaker -run that resource only on that node. For example, the operator can define the following custom nodes: - -* Node A - pacemaker - galera - -* Node B - pacemaker - rabbitmq - -* Node C - pacemaker - VIPs, cinder-volume, cinder-backup, manila-share, redis, haproxy - -With the above definition the operator can instantiate any number of A, B or C nodes -and scale up to a total of 16 nodes. Pacemaker will place the resources only on -the appropriate nodes. - -B) Allow the operator to extend the cluster beyond 16 nodes via pacemaker remote. -For example an operator could define the following: - -* Node A - pacemaker - galera - rabbitmq - -* Node B - pacemaker-remote - redis - -* Node C - pacemaker-remote - VIPs, cinder-volume, cinder-backup, manila-share, redis, haproxy - -This second scenario would allow an operator to extend beyond the 16-node limit. -The only difference from scenario 1) is the fact that the quorum of the cluster is -obtained only from the Node A nodes. - -The way this would work is that the placement on nodes would be controlled by location -rules based on node property matching (a sketch of such a rule is shown below, after -the Performance Impact section). - -Alternatives ------------ - -A number of alternative designs were discussed and evaluated: -A) A cluster per service: - -One possible architecture would be to create a separate pacemaker cluster for -each HA service. This has been ruled out mainly for the following reasons: - -* It cannot be done outside of containers -* It would create a lot of network traffic - -* It would increase the management/monitoring of the pacemaker resources and clusters - exponentially - -* Each service would still be limited to 16 nodes -* A new container fencing agent would have to be written - -B) A single cluster where only the clone-max property is set for the non-A/P services - -This would still be a single cluster, but unlike today, where the cloned and -master/slave resources run on every controller, we would introduce variables to -control the maximum number of nodes a resource could run on. E.g. -GaleraResourceCount would set clone-max to a value different from the number of -controllers. Example: 10 controllers, galera has clone-max set to 3, rabbit to -5 and redis to 3. -While this would be rather simple to implement and would change very little in the -current semantics, this design was ruled out: - -* We'd still have the 16-node limit -* It would not provide fine-grained control over which services live on which nodes - -Security Impact -------------- - -No changes regarding security aspects compared to the existing status quo. - -Other End User Impact -------------------- - -No particular impact except added flexibility in placing pacemaker-managed resources. - -Performance Impact ----------------- - -With the added scalability it will be possible for -an operator to dedicate specific nodes to certain pacemaker-managed services. -There are no changes in terms of code, only a more flexible and scalable way to deploy -services on the control plane.
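A sketch of the kind of node property and location rule described in the Overview is shown below; the property, node and resource names are illustrative assumptions (the real names would be generated by puppet-tripleo)::

    # Illustrative only: tag a node with a property, then restrict a
    # pacemaker resource to nodes carrying that property.
    pcs property set --node overcloud-galera-0 galera-role=true
    pcs constraint location galera-master rule resource-discovery=exclusive \
        score=0 galera-role eq true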
- -Other Deployer Impact ---------------------- - -This proposal aims to use the same method that the custom roles introduced in Newton -use to tailor the services running on a node. With the very same method it will be possible -to do that for the HA services managed by pacemaker today. - -Developer Impact ----------------- - -No impact - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - michele - -Other contributors: - cmsj, abeekhof - -Work Items ----------- - -We need to work on the following: - -1. Add location rule constraints support in puppet -2. Make puppet-tripleo set node properties on the nodes where a service profile -3. Create corresponding location rules -4. Add a puppet-tripleo pacemaker-remote profile - -Dependencies -============ - -No additional dependencies are required. - -Testing -======= - -We will need to test the flexible placement of the pacemaker-managed services -within the CI. This can be done within today's CI limitations (i.e. in the three -controller HA job we can make sure that the placement is customized and working) - -Documentation Impact -==================== - -No impact - -References -========== - -Mostly internal discussions within the HA team at Red Hat diff --git a/specs/ocata/containerize-tripleo-overcloud.rst b/specs/ocata/containerize-tripleo-overcloud.rst deleted file mode 100644 index 8590f840..00000000 --- a/specs/ocata/containerize-tripleo-overcloud.rst +++ /dev/null @@ -1,212 +0,0 @@ -=============================== -Deploying TripleO in Containers -=============================== - -https://blueprints.launchpad.net/tripleo/+spec/containerize-tripleo - -Ability to deploy TripleO in Containers. - -Problem Description -=================== - -Linux containers are changing how the industry deploys applications by offering -a lightweight, portable and upgradeable alternative to deployments on a physical -host or virtual machine. - -Since TripleO already manages OpenStack infrastructure by using OpenStack -itself, containers could be a new approach to deploy OpenStack services. It -would change the deployment workflow but could extend upgrade capabilities, -orchestration, and security management. - -Benefits of containerizing the openstack services include: - - * Upgrades can be performed by swapping out containers. - * Since the entire software stack is held within the container, - interdependencies do not affect deployments of services. - * Containers define explicit state and data requirements. Ultimately if we - moved to kubernetes all volumes would be off the host making the host - stateless. - * Easy rollback to working containers if upgrading fails. - * Software shipped in each container has been proven to work for this service. - * Mix and match versions of services on the same host. - * Immutable containers provide a consistent environment upon startup. - -Proposed Change -=============== - -Overview --------- - -The intention of this blueprint is to introduce containers as a method of -delivering an OpenStack installation. We currently have a fully functioning -containerized version of the compute node but we would like to extend this to -all services. In addition it should work with the new composable roles work that -has been recently added. - -The idea is to have an interface within the heat templates that adds information -for each service to be started as a container. This container format should -closely resemble the Kubernetes definition so we can possibly transition to -Kubernetes in the future. 
This work has already been started here: - - https://review.openstack.org/#/c/330659/ - -There are some technology choices that have been made to keep things usable and -practical. These include: - - * Using Kolla containers. Kolla containers are built using the most popular - operating system choices including CentOS, RHEL, Ubuntu, etc. and are a - good fit for our use case. - * We are using a heat hook to start these containers directly via docker. - This minimizes the software required on the node and maps directly to the - current baremetal implementation. Also maintaining the heat interface - keeps the GUI functional and allows heat to drive upgrades and changes to - containers. - * Changing the format of container deployment to match Kubernetes for - potential future use of this technology. - * Using CentOS in full (not CentOS Atomic) on the nodes to allow users to - have a usable system for debugging. - * Puppet driven configuration that is mounted into the container at startup. - This allows us to retain our puppet configuration system and operate in - parallel with existing baremetal deployment. - -Bootstrapping -------------- - -Once the node is up and running, there is a systemd service script that runs -which starts the docker agents container. This container has all of the -components needed to bootstrap the system. This includes: - - * heat agents including os-collect-config, os-apply-config etc. - * puppet-agent and modules needed for the configuration of the deployment. - * docker client that connects to host docker daemon. - * environment for configuring networking on the host. - -This containers acts as a self-installing container. Once started, this -container will use os-collect-config to connect back to heat. The heat agents -then perform the following tasks: - - * Set up an etc directory and runs puppet configuration scripts. This - generates all the config files needed by the services in the same manner - it would if run without containers. These are copied into a directory - accessible on the host and by all containerized services. - * Begin starting containerized services and other steps as defined in the - heat template. - -Currently all containers are implemented using net=host to allow the services to -listen directly on the host network(s). This maintains functionality in terms of -network isolation and IPv6. - -Security Impact ---------------- - -There shouldn't be major security impacts from this change. The deployment -shouldn't be affected negatively by this change from a security standpoint but -unknown issues might be found. SELinux support is implemented in Docker. - -End User Impact ---------------- - -* Debugging of containerized services will be different as it will require - knowledge about docker (kubernetes in the future) and other tools to access - the information from the containers. -* Possibly provide more options for upgrades and new versions of services. -* It'll allow for service isolation and better dependency management - -Performance Impact ------------------- - -Very little impact: - - * Runtime performance should remain the same. - * We are noticing a slightly longer bootstrapping time with containers but that - should be fixable with a few easy optimizations. - -Deployer Impact ---------------- - -From a deployment perspective very little changes: - * Deployment workflow remains the same. 
- * There may be more options for versions of different services since we do - not need to worry about interdependency issues with the software stack. - -Upgrade Impact ------------- - -This work aims to allow for resilient, transparent upgrades from baremetal -overcloud deployments to container-based ones. - -Initially we need to transition to containers: - * Would require node reboots. - * Automated upgrades should be possible as services are the same, just - containerized. - * Some state may be moved off nodes to centralized storage. Containers very - clearly define required data and state storage requirements. - -Upgrades could be made easier: - * Individual services can be upgraded because of reduced interdependencies. - * It is easier to roll back to a previous version of a service. - * Explicit storage of data and state for containers makes it very clear what - needs to be preserved. Ultimately state information and data will likely - not exist on individual nodes. - -Developer Impact ---------------- - -The developer workflow changes slightly. Instead of interacting with the service -via systemd and log files, you will interact with the service via docker. - -Inside the compute node: - * sudo docker ps -a - * sudo docker logs - * sudo docker exec -it /bin/bash - -Implementation -============== - -Assignee(s) - rhallisey - imain - flaper87 - mandre - -Other contributors: - dprince - emilienm - -Work Items ---------- - -* Heat Docker hook that starts containers (DONE) -* Containerized Compute (DONE) -* TripleO CI job (INCOMPLETE - https://review.openstack.org/#/c/288915/) -* Containerized Controller -* Automatically build containers for OpenStack services -* Containerized Undercloud - -Dependencies -============ - -* Composable roles. -* Heat template interface which allows extensions to support containerized - service definitions. - -Testing -======= -TripleO CI would need a new Jenkins job that will deploy an overcloud in -containers by using the selected solution. - -Documentation Impact -==================== -https://github.com/openstack/tripleo-heat-templates/blob/master/docker/README-containers.md - -* Deploying TripleO in containers -* Debugging TripleO containers - -References -========== -* https://docs.docker.com/misc/ -* https://etherpad.openstack.org/p/tripleo-docker-puppet -* https://docs.docker.com/articles/security/ -* http://docs.openstack.org/developer/kolla/ -* https://review.openstack.org/#/c/209505/ -* https://review.openstack.org/#/c/227295/ diff --git a/specs/ocata/gui-deployment-configuration.rst b/specs/ocata/gui-deployment-configuration.rst deleted file mode 100644 index 3dc79ca1..00000000 --- a/specs/ocata/gui-deployment-configuration.rst +++ /dev/null @@ -1,236 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -GUI Deployment configuration improvements -========================================== - -TripleO UI deployment configuration is based on enabling environments provided by -the deployment plan (tripleo-heat-templates) and letting the user set parameter values. - -This spec proposes improvements to this approach.
- -Blueprint: https://blueprints.launchpad.net/tripleo/+spec/deployment-configuration-improvements - -Problem Description -=================== - -The general goal of TripleO UI is to guide user through the deployment -process and provide relevant information along the way, so user does not -have to search for a context in documentation or by analyzing TripleO templates. - -There is a set of problems identified with a current deployment configuration -solution. Resolving those problems should lead to improved user experience when -making deployment design decisions. - -The important information about the usage of environment and relevant parameters -is usually included as a comment in environment file itself. This is not consumable by GUI. -We currently use capabilities-map.yaml to define environment meta data to work -around this. - -* As the number of environments is growing it is hard to keep capabilities-map.yaml - up to date. When certain environment is added, capabilities-map.yaml is usually - not updated by the same developer, which leads to inaccuracy in environment - description when added later. - -* The environments depend on each other and potentially collide when used together - -* There are no means to list and let user set parameters relevant to certain - environment. These are currently listed as comments in environments - not - consumable by GUI (example: [1]) - -* There are not enough means to organize parameters coming as a result of - heat validate - -* Not all parameters defined in tripleo-heat-templates have correct type set - and don't include all relevant information that Hot Spec provides. - (constraints...) - -* Same parameters are defined in multiple templates in tripleo-heat-templates - but their definition differs - -* List of parameters which are supposed to get auto-generated when value is not - provided by user are hard-coded in deployment workflow - -Proposed Change -=============== - -Overview --------- - -* Propose environment metadata to track additional information about environment - directly as part of the file in Heat (partially in progress [2]). Similar concept is - already present in heat resources [3]. - In the meantime update tripleo-common environment listing feature to read - environments and include environment metadata. - - Each TripleO environment file should define: - - .. code:: - - metadata: - label: - description: - - resource_registry: - ... - - parameter_defaults: - ... - - -* With the environment metadata in place, capabilities-map.yaml purpose would - simplify to defining grouping and dependencies among environments. - -* Implement environment parameter listing in TripleO UI - -* To organize parameters we should use ParameterGroups. - (related discussion: [4]) - -* Make sure that same parameters are defined the same way across tripleo-heat-templates - There may be exceptions but in those cases it must be sure that two templates which - define same parameter differently won't be used at the same time. - -* Update parameter definitions in TripleO templates, so the type actually matches - expected parameter value (e.g. 'string' vs 'boolean') This will result in correct - input type being used in GUI - -* Define a custom constraint for parameters which are supposed to be auto-generated. - -Alternatives ------------- - -Potential alternatives to listing environment related parameters are: - -* Use Parameter Groups to match template parameters to an environment. 
This - solution ties the template with an environment and clutters the template. - - -* As the introduction of environment metadata depends on having this feature accepted - and implemented in Heat, alternative solution is to keep title and description in - capabilities map as we do now - -Security Impact ---------------- - -No significant security impact - -Other End User Impact ---------------------- - -Resolving mentioned problems greatly improves the TripleO UI workflow and -makes deployment configuration much more streamlined. - -Performance Impact ------------------- - -Described approach allows to introduce caching of Heat validation which is -currently the most expensive operation. Cache gets invalid only in case -when a deployment plan is updated or switched. - -Other Deployer Impact ---------------------- - -Same as End User Impact - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - jtomasek - -Other contributors: - rbrady - -Work Items ----------- - -* tripleo-heat-templates: update environments to include metadata (label, - description), update parameter_defaults to include all parameters relevant - to the environment - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/update-environment-files-with-related-parameters - -* tripleo-heat-templates: update capabilities-map.yaml to map environment - grouping and dependencies - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/update-capabilities-map-to-map-environment-dependencies - -* tripleo-heat-templates: create parameter groups for deprecated and internal - parameters - -* tripleo-heat-templates: make sure that same parameters have the same definition - - bug: https://bugs.launchpad.net/tripleo/+bug/1640243 - -* tripleo-heat-templates: make sure type is properly set for all parameters - - bug: https://bugs.launchpad.net/tripleo/+bug/1640248 - -* tripleo-heat-templates: create custom constraint for autogenerated parameters - - bug: https://bugs.launchpad.net/tripleo/+bug/1636987 - -* tripleo-common: update environments listing to combine capabilities map with - environment metadata - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/update-capabilities-map-to-map-environment-dependencies - -* tripleo-ui: Environment parameters listing - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/get-environment-parameters - -* tripleo-common: autogenerate values for parameters with custom constraint - - bug: https://bugs.launchpad.net/tripleo/+bug/1636987 - -* tripleo-ui: update environment configuration to reflect API changes, provide means to display and configure environment parameters - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/tripleo-ui-deployment-configuration - -* tripleo-ui: add client-side parameter validations based on parameter type - and constraints - - bugs: https://bugs.launchpad.net/tripleo/+bug/1638523, https://bugs.launchpad.net/tripleo/+bug/1640463 - -* tripleo-ui: don't show parameters included in deprecated and internal groups - -Dependencies -============ - -* Heat Environment metadata discussion [2] - -* Heat Parameter Groups discussion [3] - -Testing -======= - -The changes should be covered by unit tests in tripleo-common and GUI - -Documentation Impact -==================== - -Part of this effort should be proper documentation of how TripleO environments -as well as capabilities-map.yaml should be defined - -References -========== - -[1] 
https://github.com/openstack/tripleo-heat-templates/blob/b6a4bdc3e4db97785b930065260c713f6e70a4da/environments/storage-environment.yaml - -[2] http://lists.openstack.org/pipermail/openstack-dev/2016-June/097178.html - -[3] http://docs.openstack.org/developer/heat/template_guide/hot_spec.html#resources-section. - -[4] http://lists.openstack.org/pipermail/openstack-dev/2016-August/102297.html diff --git a/specs/ocata/gui-plan-import-export.rst b/specs/ocata/gui-plan-import-export.rst deleted file mode 100644 index c1f14807..00000000 --- a/specs/ocata/gui-plan-import-export.rst +++ /dev/null @@ -1,154 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================== -GUI: Import/Export Deployment Plan -================================== - -Add two features to TripleO UI: - -* Import a deployment plan with a Mistral environment -* Export a deployment plan with a Mistral environment - -Blueprint: https://blueprints.launchpad.net/tripleo/+spec/gui-plan-import-export - - -Problem Description -=================== - -Right now, the UI only supports simple plan creation. The user needs to upload -the plan files, make the environment selection and set the parameters. We want -to add a plan import feature which would allow the user to import the plan -together with a complete Mistral environment. This way the selection of the -environment and parameters would be stored and automatically imported, without -any need for manual configuration. - -Conversely, we want to allow the user to export a plan together with a Mistral -environment, using the UI. - - -Proposed Change -=============== - -Overview --------- - -In order to identify the Mistral environment when importing a plan, I propose -we use a JSON formatted file and name it 'plan-environment.json'. This file -should be uploaded to the Swift container together with the rest of the -deployment plan files. The convention of calling the file with a fixed name is -enough for it to be detected. Once this file is detected by the tripleo-common -workflow handling the plan import, the workflow then creates (or updates) the -Mistral environment using the file's contents. In order to avoid possible future -unintentional overwriting of environment, the workflow should delete this file -once it has created (or updated) the Mistral environment with its contents. - -Exporting the plan should consist of downloading all the plan files from the -swift container, adding the plan-environment.json, and packing it all up in -a tarball. - -Alternatives ------------- - -One alternative is what we have now, i.e. making the user perform all the -environment configuration settings and parameter settings manually each time. -This is obviously very tedious and the user experience suffers greatly as a -result. - -The alternative to deleting the plan-environment.json file upon its -processing is to leave in the swift container and keep it in sync with all -the updates that might happen thereafter. This can get very complicated and is -entirely unnecessary, so deleting the file instead is a better choice. - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -The import and export features will only be triggered on demand (user clicks -on button, or similar), so they will have no performance impact on the rest -of the application. 
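To make the Overview above a bit more tangible, a minimal sketch of adding such a file to an existing plan container follows; the key names inside plan-environment.json are assumptions made for illustration, since this spec does not fix its exact schema, and the container name is just the default plan name::

    # Illustrative only: drop a plan-environment.json into the plan's Swift
    # container so the import workflow can pick it up.
    cat > plan-environment.json <<'EOF'
    {
      "environments": [{"path": "environments/network-isolation.yaml"}],
      "parameter_defaults": {"NtpServer": "pool.ntp.org"}
    }
    EOF
    openstack object create overcloud plan-environment.json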
- -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -None - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - akrivoka - -Other contributors: - jtomasek - d0ugal - -Work Items ----------- - -* tripleo-common: Enhance plan creation/update to consume plan-environment.json - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/enhance-plan-creation-with-plan-environment-json - -* tripleo-common: Add plan export workflow - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/plan-export-workflow - -* python-tripleoclient: Add plan export command - - blueprint: https://blueprints.launchpad.net/tripleo/+spec/plan-export-command - -* tripleo-ui: Integrate plan export into UI - - bluerpint: https://blueprints.launchpad.net/tripleo/+spec/plan-export-gui - -Note: We don't need any additional UI (neither GUI nor CLI) for plan import - the -existing GUI elements and CLI command for plan creation can be used for import -as well. - - -Dependencies -============ - -None - - -Testing -======= - -The changes should be covered by unit tests in tripleo-ui, tripleo-common and -python-tripleoclient. - - -Documentation Impact -==================== - -User documentation should be enhanced by adding instructions on how these two -features are to be used. - - -References -========== - -None \ No newline at end of file diff --git a/specs/ocata/om-dual-backends.rst b/specs/ocata/om-dual-backends.rst deleted file mode 100644 index c8d110a6..00000000 --- a/specs/ocata/om-dual-backends.rst +++ /dev/null @@ -1,190 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================================================ -Enable deployment of alternative backends for oslo.messaging -============================================================ - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/om-dual-backends - -This spec describes adding two functional capabilities to the messaging -services of an overcloud deployment. The first capability is to enable -the selection and configuration of separate messaging backends for -oslo.messaging RPC and Notification communications. The second -capability is to introduce support for a brokerless messaging backend -for oslo.messaging RPC communications via the AMQP 1.0 Apache -qpid-dispatch-router. - - -Problem Description -=================== - -The oslo.messaging library supports the deployment of dual messaging system -backends. This enables alternative backends to be deployed for RPC and -Notification messaging communications. Users have identified the -constraints of using a store and forward (broker based) messaging system for RPC -communications and are seeking direct messaging (brokerless) -approaches to optimize the RPC messaging pattern. In addition to -operational challenges, emerging distributed cloud architectures -define requirements around peer-to-peer relationships and geo-locality -that can be addressed through intelligent messaging transport routing -capabilities such as is provided by the AMQP 1.0 qpid-dispatch-router. - - -Proposed Change -=============== - -Overview --------- - -Provide the capability to select and configure alternative -transport_url's for oslo.messaging RPCs and Notifications across -overcloud OpenStack services. 
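A hedged sketch of what this could look like for a single service follows; nova and crudini are used only for illustration, and the hosts and credentials are placeholders::

    # Illustrative only: point RPC at an AMQP 1.0 dispatch-router while
    # keeping notifications on the existing RabbitMQ backend.
    crudini --set /etc/nova/nova.conf DEFAULT transport_url \
        amqp://nova:secret@messaging-node-0:5672/
    crudini --set /etc/nova/nova.conf oslo_messaging_notifications transport_url \
        rabbit://nova:secret@rabbit-node-0:5672/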
- -Retain the current default behavior to deploy the rabbitMQ server as -the messaging backend for both RPC and Notification communications. - -Introduce an alternative deployment of the qpid-dispatch-router as the -messaging backend for RPC communications. - -Utilize the oslo.messaging AMQP 1.0 driver for delivering RPC services -via the dispatch-router messaging backend. - -Alternatives ------------- - -The configuration of dual backends for oslo.messaging could be -performed post overcloud deployment. - -Security Impact ---------------- - -The end result of using the AMQP 1.0 dispatch-router as an alternative -messaging backend for oslo.messaging RPC communications should be the -same from a security standpoint. The driver/router solution provides -SSL and SASL support in parity to the current rabbitMQ server deployment. - -Other End User Impact ---------------------- - -The configuration of the dual backends for RPC and Notification -messaging communications should be transparent to the operation of the OpenStack -services. - -Performance Impact ------------------- - -Using a dispatch-router mesh topology rather than broker clustering -for messaging communications will have a positive impact on -performance and scalability by: - -* Directly expanding connection capacity - -* Providing parallel communication flows across the mesh - -* Increasing aggregate message transfer capacity - -* Improving resource utilization of messaging infrastructure - -Other Deployer Impact ---------------------- - -The deployment of the dispatch-router, however, will be new to -OpenStack operators. Operators will need to learn the -architectural differences as compared to a broker cluster -deployment. This will include capacity planning, monitoring, -troubleshooting and maintenance best practices. - -Developer Impact ----------------- - -Support for alternative oslo.messaging backends and deployment of -qpid-dispatch-router in addition to rabbitMQ should be implemented for -tripleo-quickstart. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - -* John Eckersberg - -* Andy Smith - - -Work Items ----------- - -* Update overcloud templates for dual backends and dispatch-router service - -* Add dispatch-router packages to overcloud image elements - -* Add services template for dispatch-router - -* Update OpenStack services base templates to select and configure - transport_urls for RPC and Notification - -* Deploy dispatch-router for controller and compute for topology - -* Test failure and recovery scenarios for dispatch-router - -Transport Configuration ------------------------ - -The oslo.messaging configuration options define a default and -additional notification transport_url. If the notification -transport_url is not specified, oslo.messaging will use the default -transport_url for both RPC and Notification messaging communications. - -The transport_url parameter is of the form:: - - transport://user:pass@host1:port[,hostN:porN]/virtual_host - -Where the transport scheme specifies the RPC or Notification backend as -one of rabbit or amqp, etc. Oslo.messaging is deprecating the host, -port and auth configuration options. All drivers will get these -options via the transport_url. - - -Dependencies -============ - -Support for dual backends in and AMQP 1.0 driver integration -with the dispatch-router depends on oslo.messaging V5.10 or later. - - -Testing -======= - -In order to test this in CI, an environment will be needed where dual -messaging system backends (e.g. 
rabbitMQ server and dispatch-router -server) are deployed. Any existing hardware configuration should be -appropriate for the dual backend deployment. - - -Documentation Impact -==================== - -The deployment documentation will need to be updated to cover the -configuration of dual messaging system backends and the use of the -dispatch-router. TripleO Heat template examples should also help with -deployments using dual backends. - - -References -========== - -* [1] https://blueprints.launchpad.net/oslo.messaging/+spec/amqp-dispatch-router -* [2] http://qpid.apache.org/components/dispatch-router/ -* [3] http://docs.openstack.org/developer/oslo.messaging/AMQP1.0.html -* [4] https://etherpad.openstack.org/p/ocata-oslo-consistent-mq-backends -* [5] https://github.com/openstack/puppet-qdr diff --git a/specs/ocata/ssl-certmonger.rst b/specs/ocata/ssl-certmonger.rst deleted file mode 100644 index f9dd8e43..00000000 --- a/specs/ocata/ssl-certmonger.rst +++ /dev/null @@ -1,258 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================================ -PKI management of the overcloud using Certmonger -================================================ - -There is currently support for enabling SSL for the public endpoints of the -OpenStack services. However, certain use cases require the availability of SSL -everywhere. This spec proposes an approach to enable it. - -Problem Description -=================== - -Even though there is support for deploying both the overcloud and the -undercloud with TLS/SSL support for the public endpoints, there are deployments -that demand the usage of encrypted communications through all the interfaces. - -The current approach for deploying SSL in TripleO is to inject the needed -keys/certificates through Heat environment files; this requires the -pre-creation of those. While this approach works for the public-facing -services, as we attempt to secure the communication between different -services, and in different levels of the infrastructure, the amount of keys -and certificates grows. So, getting the deployer to generate all the -certificates and manage them will be quite cumbersome. - -On the other hand, TripleO is not meant to handle the PKI of the cloud. And -being the case that we will at some point need to enable the deployer to be -able to renew, revoke and keep track of the certificates and keys deployed in -the cloud, we are in need of a system with such capabilities. - -Instead of brewing an OpenStack-specific solution ourselves. I propose the -usage of already existing systems that will make this a lot easier. - -Proposed Change -=============== - -Overview --------- - -The proposal is to start using certmonger[1] in the nodes of the overcloud to -interact with a CA for managing the certificates that are being used. With this -tool, we can request the fetching of the needed certificates for interfaces -such as the internal OpenStack endpoints, the database cluster and the message -broker for the cloud. Those certificates will in turn have automatic tracking, -and for cases where there is a certificate to identify the node, it could -even automatically request a renewal of the certificate when needed. - -Certmonger is already available in several distributions (both Red Hat or -Debian based) and has the capability of interacting with several CAs, so if the -operator already has a working one, they could use that. 
On the other hand, -certmonger has the mechanism for registering new CAs, and executing scripts -(which are customizable) to communicate with those CAs. Those scripts are -language independent. But for means of the open source community, a solution -such as FreeIPA[2] or Dogtag[3] could be used to act as a CA and handle the -certificates and keys for us. Note that it's possible to write a plugin for -certmonger to communicate with Barbican or another CA, if that's what we would -like to go for. - -In the FreeIPA case, this will require a full FreeIPA system running either on -another node in the cluster or in the undercloud in a container[4]. - -For cases where the services are terminated by HAProxy, and the overcloud being -in an HA-deployment, the controller nodes will need to share a certificate that -HAProxy will present when accessed. In this case, the workflow will be as -following: - -#. Register the undercloud as a FreeIPA client. This configures the kerberos - environment and provides access control to the undercloud node. -#. Get keytab (credentials) corresponding to the undercloud in order to access - FreeIPA, and be able to register nodes. -#. Create a HAProxy service -#. Create a certificate/key for that service -#. Store the key in FreeIPA's Vault. -#. Create each of the controllers to be deployed as hosts in FreeIPA (Please - see note about this) -#. On each controller node get the certificate from service entry. -#. Fetch the key from the FreeIPA vault. -#. Set certmonger to keep track of the resulting certificates and - keys. - -.. note:: - - While the process of creating each node beforehand could sound cumbersome, - this can be automated to increase usability. The proposed approach is to - have a nova micro-service that automatically registers the nodes from the - overcloud when they are created [5]. This hook will not only register the - node in the system, but will also inject an OTP which the node can use to - fetch the required credentials and get its corresponding certificate and - key. The aforementioned OTP is only used for enrollment. Once enrollment - has already taken place, certmonger can already be used to fetch - certificates from FreeIPA. - - However, even if this micro-service is not in place, we could pass the OTP - via the TripleO Heat Templates (in the overcloud deployment). So it is - possible to have the controllers fetching their keytab and subsequently - request their certificates even if we don't have auto-enrollment in place. - -.. note:: - - Barbican could also be used instead of FreeIPA's Vault. With the upside of - it being an already accepted OpenStack service. However, Barbican will also - need to have a backend, which might be Dogtag in our case, since having an - HSM for the CI will probably not be an option. - -Now, for services such as the message broker, where an individual certificate -is required per-host, the process is much simpler, since the nodes will have -already been registered in FreeIPA and will be able to fetch their credentials. -Now we can just let certmonger do the work and request, and subsequently track -the appropriate certificates. - -Once the certificates and keys are present in the nodes, then we can let the -subsequent steps of the overcloud deployment process take place; So the -services will be configured to use those certificates and enable TLS where the -deployer specifies it. - -Alternatives ------------- - -The alternative is to take the same approach as we did for the public -endpoints. 
Which is to simply inject the certificates and keys to the nodes. -That would have the downside that the certificates and keys will be pasted in -heat environment files. This will be problematic for services such as RabbitMQ, -where we are giving a list of nodes for communication, because to enable SSL in -it, we need to have a certificate per-node serving as a message broker. -In this case two approaches could be taken: - -* We will need to copy and paste each certificate and key that is needed for - each of the nodes. With the downside being how much text needs to be copied, - and the difficulty of keeping track of the certificates. On the other hand, - each time a node is removed or added, we need to make sure we remember to add - a certificate and a key for it in the environment file. So this becomes a - scaling and a usability issue too. - -* We could also give in an intermediate certificate, and let TripleO create the - certificates and keys per-service. However, even if this fixes the usability - issue, we still cannot keep track of the specific certificates and keys that - are being deployed in the cloud. - -Security Impact ---------------- - -This approach enables better security for the overcloud, as it not only eases -us to enable TLS everywhere (if desired) but it also helps us keep track and -manage our PKI. On the other hand, it enables other means of security, such as -mutual authentication. In the case of FreeIPA, we could let the nodes have -client certificates, and so they would be able to authenticate to the services -(as is possible with tools such as HAProxy or Galera/MySQL). However, this can -come as subsequent work of this. - -Other End User Impact ---------------------- - -For doing this, the user will need to pass extra parameters to the overcloud -deployment, such as the CA information. In the case of FreeIPA, we will need to -pass the host and port, the kerberos realm, the kerberos principal of the -undercloud and the location of the keytab (the credentials) for the undercloud. - -However, this will be reflected in the documentation. - -Performance Impact ------------------- - -Having SSL everywhere will degrade the performance of the overcloud overall, as -there will be some overhead in each call. However, this is a known issue and -this is why SSL everywhere is optional. It should only be enabled for deployers -that really need it. - -The usage of an external CA or FreeIPA shouldn't impact the overcloud -performance, as the operations that it will be doing are not recurrent -operations (issuing, revoking or renewing certificates). - -Other Deployer Impact ---------------------- - -If a deployer wants to enable SSL everywhere, they will need to have a working -CA for this to work. Or if they don't they could install FreeIPA in a node. - -Developer Impact ----------------- - -Discuss things that will affect other developers working on OpenStack. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - jaosorior - - -Work Items ----------- - -* Enable certmonger and the FreeIPA client tools in the overcloud image - elements. - -* Include the host auto-join hook for nova in the undercloud installation. - -* Create nested templates that will be used in the existing places for the - NodeTLSData and NodeTLSCAData. These templates will do the certmonger - certificate fetching and tracking. - -* Configure the OpenStack internal endpoints to use TLS and make this optional - through a heat environment. 
- -* Configure the Galera/MySQL cluster to use TLS and make this optional through - a heat environment. - -* Configure RabbitMQ to use TLS (which means having a certificate for each - node) and make this optional through a heat environment - -* Create a CI gate for SSL everywhere. This will include a FreeIPA installation - and it will enable SSL for all the services, ending in the running of a - pingtest. For the FreeIPA preparations, a script running before the overcloud - deployment will add the undercloud as a client, configure the appropriate - permissions for it and deploy a keytab so that it can use the nova hook. - Subsequently it will create a service for the OpenStack internal endpoints, - and the database, which it will use to create the needed certificates and - keys. - - -Dependencies -============ - -* This requires the following bug to be fixed in Nova: - https://bugs.launchpad.net/nova/+bug/1518321 - -* Also requires the packaging of the nova hook. - - -Testing -======= - -We will need to create a new gate in CI to test this. - - -Documentation Impact -==================== - -The documentation on how to use an external CA and how to install and use -FreeIPA with TripleO needs to be created. - - -References -========== - -[1] https://fedorahosted.org/certmonger/ -[2] http://www.freeipa.org/page/Main_Page -[3] http://pki.fedoraproject.org/wiki/PKI_Main_Page -[4] http://www.freeipa.org/page/Docker -[5] https://github.com/richm/rdo-vm-factory/blob/use-centos/rdo-ipa-nova/novahooks.py diff --git a/specs/ocata/step-by-step-validation.rst b/specs/ocata/step-by-step-validation.rst deleted file mode 100644 index 7c3ac745..00000000 --- a/specs/ocata/step-by-step-validation.rst +++ /dev/null @@ -1,149 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================= -Step by step validation -======================= - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/step-by-step-validation - -Validate each step during the installation to be able to stop fast in -case of errors and provide feedback on which components are in error. - -Problem Description -=================== - -During deployment, problems are often spotted at the end of the -configuration and can accumulate on top of each other making it -difficult to find the root cause. - -Deployers and developers will benefit by having the installation -process fails fast and spotting the lowest level possible components -causing the problem. - -Proposed Change -=============== - -Overview --------- - -Leverage the steps already defined in Tripleo to run a validation tool -at the end of each step. - -During each step, collect assertions about what components are -configured on each host then at the end of the step, run a validation -tool consumming the assertions to report all the failed assertions. - -Alternatives ------------- - -We could use Puppet to add assertions in the code to validate what has -been configured. The drawback of this approach is the difficulty to -have a good reporting on what are the issues compared to a specialized -tool that can be run outside of the installer if needed. - -The other drawback to this approach is that it can't be reused in -future if/when we support non-puppet configuration and it probably -also can't be used when we use puppet to generate an external config -file for containers. 
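To make the assertion idea from the Overview more concrete, here is a minimal sketch of a generated assertion file and the trivial checks a validation tool could derive from it; the file layout and keys are invented for illustration and would be defined during implementation::

    # Illustrative only: an assertion file dropped at the end of a step...
    cat > /var/lib/tripleo/assertions/step2.yaml <<'EOF'
    assertions:
      - service: openstack-nova-api
        state: running
      - port: 8774
        state: listening
    EOF
    # ...and the kind of checks a validation tool could run against it.
    systemctl is-active openstack-nova-api
    ss -lnt | grep -q ':8774 '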
- -Security Impact --------------- - -* Some validations may require access to sensitive data such as passwords - or keys to access the components. - -Other End User Impact -------------------- - -This feature will be activated automatically in the installer. - -If needed, the deployer or developer will be able to launch the tool -by hand to validate a set of assertions. - -Performance Impact ---------------- - -We expect the validations to take less than one minute per step. - -Other Deployer Impact ------------------- - -The objective is to enable a faster iterative process by failing fast. - -Developer Impact -------------- - -Each configuration module will need to generate assertions to be -consumed by the validation tool. - - -Implementation -============== - -Note that this approach (multiple-step application of Ansible in -localhost mode via Heat) is already being prototyped for upgrades and it -will work well for validations too. - -https://review.openstack.org/#/c/393448/ - -Assignee(s) ---------- - -Primary assignee: - -Other contributors to help validate services: - - -Work Items --------- - -* generate assertions about the configured components on the server - being configured in yaml files. - -* implement the validation tool leveraging the work that has already - been done in ``tripleo-validations`` that will do the following - steps: - - 1. collect yaml files from the servers on the undercloud. - - 2. run validations in parallel on each server from the undercloud. - - 3. report all issues and exit with 0 if no error or 1 if there is at - least one error. - -Dependencies -============ - -To be added. - -Testing -======= - -The change will be used automatically in the CI so it will always be tested. - -Documentation Impact -==================== - -We'll need to document integration with whatever validation tool is -used, e.g. so that those integrating new services (or in future -out-of-tree additional services) can know how to integrate with the -validation. - -References -========== - -A similar approach was used in SpinalStack using serverspec. See -https://github.com/redhat-cip/config-tools/blob/master/verify-servers.sh - -A collection of Ansible playbooks to detect and report potential -issues during TripleO deployments: -https://github.com/openstack/tripleo-validations - -Prototype of composable upgrades with Heat+Ansible: -https://review.openstack.org/#/c/393448/ diff --git a/specs/ocata/third-party-gating-with-tripleo-quickstart.rst b/specs/ocata/third-party-gating-with-tripleo-quickstart.rst deleted file mode 100644 index 35f8973d..00000000 --- a/specs/ocata/third-party-gating-with-tripleo-quickstart.rst +++ /dev/null @@ -1,258 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================================================== -Make tripleo third party ci toolset tripleo-quickstart -====================================================== - -https://blueprints.launchpad.net/tripleo/+spec/use-tripleo-quickstart-and-tripleo-quickstart-extras-for-the-tripleo-ci-toolset - -Devstack, being the reference CI deployment of OpenStack, does a good job of -running both in CI and locally on development hardware. -TripleO-Quickstart (TQ)`[3]`_ and TripleO-QuickStart-Extras (TQE) can provide -an experience equivalent to devstack both in CI and on local development -hardware.
TQE does a nice job of breaking down the steps required to install an -undercloud and deploy an overcloud step by step by creating bash scripts on the -developer's system and then executing them in the correct order. - - -Problem Description -=================== - -Currently there is a population of OpenStack developers who are unfamiliar -with TripleO and our TripleO CI tools. It's critical that this population have -a tool which can provide a user experience similar to the one devstack currently -provides OpenStack developers. - -Recreating a deployment failure from TripleO-CI can be difficult for developers -outside of TripleO. Developers may need more than just a script that executes -a deployment. Ideally developers have a tool that provides a high level -overview, a step-by-step install process with documentation, and a way to inject -their local patches or patches from Gerrit into the build. - -Additionally there may be groups outside of TripleO that want to integrate -additional code or steps into a deployment. In this case the composability of the -CI code is critical to allow others to plug in, extend and create their own steps -for a deployment. - - -Proposed Change -=============== - -Overview ------- - -Replace the tools found in openstack-infra/tripleo-ci that drive the deployment -of TripleO with TQ and TQE. - -Alternatives ----------- - -One alternative is to break down TripleO-CI into composable shell scripts, and -improve the user experience `[4]`_. - -Security Impact ------------- - -No known additional security vulnerabilities at this time. - -Other End User Impact ------------------- - -We expect that newcomers to TripleO will have an enhanced experience -reproducing results from CI. - -Performance Impact ---------------- - -Using an undercloud image with preinstalled rpms should provide a faster -deployment end-to-end. - -Other Deployer Impact ------------------- - -None at this time. - -Developer Impact ------------- - -This is the whole point really and is discussed elsewhere in the spec. However, -this should provide a quality user experience for developers wishing to set up -TripleO. - -TQE provides a step-by-step, well-documented deployment of TripleO. -Furthermore, it is easy to launch and configure:: - - bash quickstart.sh -p quickstart-extras.yml -r quickstart-extras-requirements.txt --tags all - -Everything is executed via bash shell scripts, which are customized -via Jinja2 templates. Users can see the command prior to executing it when -running it locally. Documentation of the commands that were executed is -automatically generated for each run.
- -Node registration and introspection example: -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* Bash script:: - - https://ci.centos.org/artifacts/rdo/jenkins-tripleo-quickstart-promote-newton-delorean-minimal-31/undercloud/home/stack/overcloud-prep-images.sh - - -* Execution log:: - - https://ci.centos.org/artifacts/rdo/jenkins-tripleo-quickstart-promote-newton-delorean-minimal-31/undercloud/home/stack/overcloud_prep_images.log.gz - -* Generated rst documentation:: - - https://ci.centos.org/artifacts/rdo/jenkins-tripleo-quickstart-promote-newton-delorean-minimal-31/docs/build/overcloud-prep-images.html - -Overcloud Deployment example: -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -* Bash script:: - - https://ci.centos.org/artifacts/rdo/jenkins-tripleo-quickstart-promote-newton-delorean-minimal_pacemaker-31/undercloud/home/stack/overcloud-deploy.sh.gz - -* Execution log:: - - https://ci.centos.org/artifacts/rdo/jenkins-tripleo-quickstart-promote-newton-delorean-minimal_pacemaker-31/undercloud/home/stack/overcloud_deploy.log.gz - -* Generated rST documentation:: - - https://ci.centos.org/artifacts/rdo/jenkins-tripleo-quickstart-promote-master-current-tripleo-delorean-minimal-37/docs/build/overcloud-deploy.html - -Step by Step Deployment: -^^^^^^^^^^^^^^^^^^^^^^^^ - -There are times when a developer will want to walk through a deployment step-by-step, -run commands by hand, and try to figure out what exactly is involved with -a deployment. A developer may also want to tweak the settings or add a patch. -To do the above the deployment can not just run through end to end. - -TQE can setup the undercloud and overcloud nodes, and then just add add already -configured scripts to install the undercloud and deploy the overcloud -successfully. Essentially allowing the developer to ssh to the undercloud and -drive the installation from there with prebuilt scripts. - -* Example:: - - ./quickstart.sh --no-clone --bootstrap --requirements quickstart-extras-requirements.txt --playbook quickstart-extras.yml --skip-tags undercloud-install,undercloud-post-install,overcloud-deploy,overcloud-validate --release newton - -Composability: -^^^^^^^^^^^^^^ - -TQE is not a single tool, it's a collection of composable Ansible roles. These -Ansible roles can coexist in a single Git repository or be distributed to many -Git repositories. See "Additional References." - -Why have two projects? Why risk adding complexity? -One of the goals of the TQ and TQE is to not assume we are -writing code that works for everyone, on every deployment type, and in any -kind of infrastructure. To ensure that TQE developers can not block outside -contributions (roles, additions, and customization to either TQ or TQE), -it was thought best to uncouple as well and make it as composable -as possible. Ansible playbooks after all, are best used as a method to just -call roles so that anyone can create playbooks with a variety of roles in the -way that best suits their purpose. 
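As a sketch of that composition model, a consumer's playbook is little more than an ordered list of roles; the role names below are shortened forms of the repositories listed under "Additional References" and the exact selection is up to the playbook author::

    # Anyone can assemble their own deployment flow by picking roles;
    # adding a step means appending a role, not patching existing code.
    - name: Prepare and deploy the overcloud
      hosts: undercloud
      roles:
        - tripleo-overcloud-prep-config
        - tripleo-overcloud-prep-images
        - tripleo-overcloud-prep-flavors
        - tripleo-overcloud-prep-network
        - tripleo-overcloud
        - tripleo-overcloud-validate
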
- -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - - weshayutin - -Other contributors: - - trown - - sshnaidm - - gcerami - - adarazs - - larks - -Work Items ----------- - -- Enable third party testing `[1]`_ -- Enable TQE to run against the RH2 OVB OpenStack cloud `[2]`_ -- Move the TQE roles into one or many OpenStack Git Repositories, see the roles listed - in the "Additional References" - - -Dependencies -============ - -- A decision needs to be made regarding `[1]`_ -- The work to enable third party testing in rdoproject needs to be completed - -Testing -======= - -There is a work in progress testing TQE against the RH2 OVB cloud atm `[2]`_. TQE -has been vetted for quite some time with OVB on other clouds. - - -Documentation Impact -==================== - -What is the impact on the docs? Don't repeat details discussed above, but -please reference them here. - - -References -========== -* `[1]`_ -- http://lists.openstack.org/pipermail/openstack-dev/2016-October/105248.html -* `[2]`_ -- https://review.openstack.org/#/c/381094/ -* `[3]`_ -- https://etherpad.openstack.org/p/tripleo-third-party-ci-quickstart -* `[4]`_ -- https://blueprints.launchpad.net/tripleo/+spec/make-tripleo-ci-externally-consumable - -.. _[1]: http://lists.openstack.org/pipermail/openstack-dev/2016-October/105248.html -.. _[2]: https://review.openstack.org/#/c/381094/ -.. _[3]: https://etherpad.openstack.org/p/tripleo-third-party-ci-quickstart -.. _[4]: https://blueprints.launchpad.net/tripleo/+spec/make-tripleo-ci-externally-consumable - -Additional References -===================== - -TQE Ansible role library ------------------------- - -* Undercloud roles: - - * https://github.com/redhat-openstack/ansible-role-tripleo-baremetal-virt-undercloud - * https://github.com/redhat-openstack/ansible-role-tripleo-pre-deployment-validate ( under development ) - -* Overcloud roles: - - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-prep-config - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-prep-flavors - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-prep-images - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-prep-network - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud - * https://github.com/redhat-openstack/ansible-role-tripleo-ssl ( under development ) - -* Utility roles: - - * https://github.com/redhat-openstack/ansible-role-tripleo-cleanup-nfo - * https://github.com/redhat-openstack/ansible-role-tripleo-collect-logs - * https://github.com/redhat-openstack/ansible-role-tripleo-gate - * https://github.com/redhat-openstack/ansible-role-tripleo-provision-heat - * https://github.com/redhat-openstack/ansible-role-tripleo-image-build - -* Post Deployment roles: - - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-upgrade - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-scale-nodes - * https://github.com/redhat-openstack/ansible-role-tripleo-tempest - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-validate - * https://github.com/redhat-openstack/ansible-role-tripleo-validate-ipmi - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-validate-ha - -* Baremetal roles: - - * https://github.com/redhat-openstack/ansible-role-tripleo-baremetal-prep-virthost - * https://github.com/redhat-openstack/ansible-role-tripleo-overcloud-prep-baremetal \ No newline at end of file diff --git 
a/specs/ocata/tripleo-composable-upgrades.rst b/specs/ocata/tripleo-composable-upgrades.rst deleted file mode 100644 index 0be057d0..00000000 --- a/specs/ocata/tripleo-composable-upgrades.rst +++ /dev/null @@ -1,197 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================== -Composable Service Upgrades -=========================== - -https://blueprints.launchpad.net/tripleo/+spec/overcloud-upgrades-per-service - -In the Newton release TripleO delivered a new capability to deploy arbitrary -custom roles_ (groups of nodes) with a lot of flexibility of which services -are placed on which roles (using roles_data.yaml_). This means we can no -longer make the same assumptions about a specific service running on a -particular role (e.g Controller). - -The current upgrades workflow_ is organised around the node role determining -the order in which that given node and services deployed therein are upgraded. -The workflow dictates "swifts", before "controllers", before "cinders", before -"computes", before "cephs". The reasons for this ordering are beyond the scope -here and ultimately inconsequential, since the important point to note is -there is a hard coded relationship between a given service and a given node -with respect to upgrading that service (e.g. a script that upgrades all -services on "Compute" nodes). For upgrades from Newton to Ocata we can no -longer make these assumptions about services being tied to a specific role, -so a more composable model is needed. - -Consensus after the initial discussion during the Ocata design summit session_ -was that: - - * Re-engineering the upgrades workflow for Newton to Ocata is necessary - because 'custom roles' - * We should start by moving the upgrades logic into the composable service - templates in the tripleo-heat-templates (i.e. into each service) - * There is still a need for an over-arching workflow - albeit service - rather than role oriented. - * It is TBD what will drive that workflow. We will use whatever will be - 'easier' for a first iteration, especially given the Ocata development - time contraints. - -Problem Description -=================== - -As explained in the introduction above, the current upgrades workflow_ can no -longer work for composable service deployments. Right now the upgrade scripts -are organised around and indeed targetted at specific nodes: the upgrade -script for swifts_ is different to that for computes_ or for controllers (split -across a number_ of_ steps_) cinders_ or cephs_. These scripts are invoked -as part of a worfklow where each step is either a heat stack update or -invocation of the upgrade-non-controller.sh_ script to execute the node -specific upgrade script (delivered as one of the earlier steps in the workflow) -on non controllers. - -One way to handle this problem is to decompose the upgrades logic -from those monolithic per-node upgrade scripts into per-service upgrades logic. -This should live in the tripleo-heat-templates puppet services_ templates for -each service. 
For the upgrade of a give service we need to express: - - * any pre-upgrade requirements (run a migration, stop a service, pin RPC) - * any post upgrade (migrations, service starts/reload config) - * any dependencies on other services (upgrade foo only after bar) - -If we organise the upgrade logic in this manner the idea is to gain the -flexibility to combine this dynamically into the new upgrades workflow. -Besides the per-service upgrades logic the worklow will also need to handle -and provide for any deployment wide upgrades related operations such as -unpin of the RPC version once all services are successfully running Ocata, or -upgrading of services that aren't directly managed or configured by the -tripleo deployment (like openvswitch as just one example), or even the delivery -of a new kernel which will require a reboot on the given service node after -all services have been upgraded. - - -Proposed Change -=============== - -The first step is to work out where to add upgrades related configuration to -each service in the tripleo-heat-templates services_ templates. The exact -format will depend on what we end up using to drive the workflow. We could -include them in the *outputs* as 'upgrade_config', like:: - - outputs: - role_data: - description: Role data for the Nova Compute service. - value: - service_name: nova_compute - ... - upgrade_tasks: - - name: RPC pin nova-compute - exec: "crudini --set /etc/nova/nova.conf upgrade_levels compute $upgrade_level_nova_compute" - tags: step1 - - name: stop nova-compute - service: name=openstack-nova-compute state=stopped - tags: step2 - - name: update heat database - command: nova-manage db_sync - tags: step3 - - name: start nova-compute - service: name=openstack-nova-compute state=started - tags: step4 - ... - -The current proposal is for the upgrade snippets to be expressed in Ansible. -The initial focus will be to drive the upgrade via the existing tripleo -tooling, e.g heat applying ansible similar to how heat applies scripts for -the non composable implementation. In future it may also be possible to -expose the per-role ansible playbooks to enable advanced operators to drive -the upgrade workflow directly, perhaps used in conjunction with the dynamic -inventory provided for tripleo validations. - -One other point of note that was brought up in the Ocata design summit -session_ and which should factor into the design here is that operators may -wish to run the upgrade in stages rather than all at once. It could still be -the case that the new workflow can differentiate between 'controlplane' -vs 'non-controlplane' services. The operator could then upgrade controlplane -services as one stand-alone upgrade step and then later start to roll out the -upgrade of non-controlplane services. - -Alternatives ------------- - -One alternative is to have a stand-alone upgrades workflow driven by ansible. -Some early work and prototyping was done as well as a (linked from the -Ocata design summit session_). Ultimately the proposal was abandoned but it is -still possible that we will use ansible for the upgrade logic as described -above. We could also explore exposing the resulting ansible playbooks for -advanced operators to invoke as part of their own tooling. - -Other End User Impact ---------------------- -Significant change in the tripleo upgrades workflow. 
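For illustration, one way the new workflow could consume the collected ``upgrade_tasks`` shown above is to merge them into a single play per role and run that play once per step tag, so that, for example, every service is stopped at step2 before any database migration runs at step3. This is a sketch only; the merged file name and the driving mechanism are placeholders, not a settled interface::

    # Placeholder sketch of driving the merged per-role upgrade tasks.
    # The play would be applied repeatedly, filtered by step, e.g.:
    #   ansible-playbook upgrade_compute.yaml --tags step1
    #   ansible-playbook upgrade_compute.yaml --tags step2
    - hosts: localhost
      connection: local
      tasks:
        - include: compute_merged_upgrade_tasks.yaml
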
- -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: shardy - -Other contributors: marios, emacchi, matbu, chem, lbezdick, - - -Work Items ----------- -Some prototyping by shardy at -"WIP prototyping composable upgrades with Heat+Ansible" at -I39f5426cb9da0b40bec4a7a3a4a353f69319bdf9_ - - * Decompose the upgrades logic into each service template in the tht - * Design a workflow that incorporates migrations, the per-service upgrade - scripts and any deployment wide upgrades operations. - * Decide how this workflow is to be invoked (mistral? puppet? bash?) - * profit! - - -Dependencies -============ - - - -Testing -======= - -Hopefully we can use the soon to be added upgrades job_ to help with the -development and testing of this feature and obviously guard against changes -that break upgrades. Ideally we will expand that to include jobs for each of -the stable branches (upgrade M->N and N->O). The M->N would exercise the -previous upgrades workflow whereas N->O would be exercising the work developed -as part of this spec. - - -Documentation Impact -==================== - - -References -========== - - -.. _roles: https://blueprints.launchpad.net/tripleo/+spec/custom-roles -.. _roles_data.yaml: https://github.com/openstack/tripleo-heat-templates/blob/78500bc2e606bd1f80e05d86bf7da4d1d27f77b1/roles_data.yaml -.. _workflow: http://docs.openstack.org/developer/tripleo-docs/post_deployment/upgrade.html -.. _session: https://etherpad.openstack.org/p/ocata-tripleo-upgrades -.. _swifts: https://github.com/openstack/tripleo-heat-templates/blob/stable/newton/extraconfig/tasks/major_upgrade_object_storage.sh -.. _computes: https://github.com/openstack/tripleo-heat-templates/blob/stable/newton/extraconfig/tasks/major_upgrade_compute.sh -.. _number: https://github.com/openstack/tripleo-heat-templates/blob/stable/newton/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh -.. _of: https://github.com/openstack/tripleo-heat-templates/blob/stable/newton/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh -.. _steps: https://github.com/openstack/tripleo-heat-templates/blob/stable/newton/extraconfig/tasks/major_upgrade_controller_pacemaker_3.sh -.. _cinders: https://github.com/openstack/tripleo-heat-templates/blob/stable/newton/extraconfig/tasks/major_upgrade_block_storage.sh -.. _cephs: https://github.com/openstack/tripleo-heat-templates/blob/stable/newton/extraconfig/tasks/major_upgrade_ceph_storage.sh -.. _upgrade-non-controller.sh: https://github.com/openstack/tripleo-common/blob/01b68d0b0cdbd0323b7f006fbda616c12cbf90af/scripts/upgrade-non-controller.sh -.. _services: https://github.com/openstack/tripleo-heat-templates/tree/master/puppet/services -.. _I39f5426cb9da0b40bec4a7a3a4a353f69319bdf9 : https://review.openstack.org/#/c/393448/ -.. _job: https://bugs.launchpad.net/tripleo/+bug/1583125 diff --git a/specs/ocata/tripleo-opstools-performance-monitoring.rst b/specs/ocata/tripleo-opstools-performance-monitoring.rst deleted file mode 100644 index 5f9ba2fe..00000000 --- a/specs/ocata/tripleo-opstools-performance-monitoring.rst +++ /dev/null @@ -1,105 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Enable deployment of performace monitoring -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-opstools-performance-monitoring - -TripleO should have a possibility to automatically setup and install -the performance monitoring agent (collectd) to service the overcloud. - -Problem Description -=================== - -We need to easily enable operators to connect overcloud nodes to performance -monitoring stack. The possible way to do so is to install collectd agent -together with set of plugins, depending on a metrics we want to collect -from overcloud nodes. - -Summary of use cases: - -1. collectd deployed on each overcloud node reporting configured metrics -(via collectd plugins) to external collector. - -Proposed Change -=============== - -Overview --------- - -The collectd service will be deployed as a composable service on -the overcloud stack when it is explicitly stated via environment file. - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -Metric collection and transport to the monitoring node can create I/O which -might have performance impact on monitored nodes. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Lars Kellogg-Stedman (larsks) - -Other contributors: - Martin Magr (mmagr) - -Work Items ----------- - -* puppet-tripleo profile for collectd service -* tripleo-heat-templates composable service for collectd deployment - -Dependencies -============ - -* Puppet module for collectd service: puppet-collectd [1] -* CentOS Opstools SIG repo [2] - -Testing -======= - -We should consider creating CI job for deploying overcloud with monitoring -node to perform functional testing. - - -Documentation Impact -==================== - -New template parameters will have to be documented. - - -References -========== - -[1] https://github.com/voxpupuli/puppet-collectd -[2] https://wiki.centos.org/SpecialInterestGroup/OpsTools diff --git a/specs/ocata/tripleo-repos.rst b/specs/ocata/tripleo-repos.rst deleted file mode 100644 index 8a0a25c9..00000000 --- a/specs/ocata/tripleo-repos.rst +++ /dev/null @@ -1,139 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================== - TripleO Repo Management Tool -============================== - -https://blueprints.launchpad.net/tripleo/tripleo-repos - -Create a tool to handle the repo setup for TripleO - -Problem Description -=================== - -The documented repo setup steps for TripleO are currently: - -* 3 curls -* a sed -* a multi-line bash command -* a yum install -* (optional) another yum install and sed command - -These steps are also implemented in multiple other places, which means every -time a change needs to be made it has to be done in at least three different -places. The stable branches also need slightly different commands which further -complicates the documentation. They also need to appear in multiple places -in the docs (e.g. virt system setup, undercloud install, image build, -undercloud upgrade). 
- -Proposed Change -=============== - -Overview --------- - -My proposal is to abstract away the repo management steps into a standalone -tool. This would essentially change the repo setup from the process -described above to something like:: - - sudo yum install -y http://tripleo.org/tripleo-repos.rpm - sudo tripleo-repos current - -Historical note: The original proposal was called dlrn-repo because it was -dealing exclusively with dlrn repos. Now that we've started to add more -repos like Ceph that are not from dlrn, that name doesn't really make sense. - -This will mean that when repo setup changes are needed (which happen -periodically), they only need to be made in one place and will apply to both -developer and user environments. - -Alternatives ------------- - -Use tripleo.sh's repo setup. However, tripleo.sh is not intended as a -user-facing tool. It's supposed to be a thin wrapper that essentially -implements the documented deployment commands. - -Security Impact ---------------- - -The tool would need to make changes to the system's repo setup and install -packages. This is the same thing done by the documented commands today. - -Other End User Impact ---------------------- - -This would be a new user-facing CLI. - -Performance Impact ------------------- - -No meaningful change - -Other Deployer Impact ---------------------- - -Deployers would need to switch to this new method of configuring the -TripleO repos in their deployments. - -Developer Impact ----------------- - -There should be little to no developer impact because they are mostly using -other tools to set up their repos, and those tools should be converted to use -the new tool. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - bnemec - -Other contributors: - - -Work Items ----------- - -* Update the proposed tool to match the current repo setup -* Import code into gerrit -* Package tool -* Publish the package somewhere easily accessible -* Update docs to use tool -* Convert existing developer tools to use this tool - - -Dependencies -============ - -NA - -Testing -======= - -tripleo.sh would be converted to use this tool so it would be covered by -existing CI. - - -Documentation Impact -==================== - -Documentation would be simplified. - - -References -========== - -Original proposal: -http://lists.openstack.org/pipermail/openstack-dev/2016-June/097221.html - -Current version of the tool: -https://github.com/cybertron/dlrn-repo diff --git a/specs/ocata/undercloud-heat.rst b/specs/ocata/undercloud-heat.rst deleted file mode 100644 index f0846101..00000000 --- a/specs/ocata/undercloud-heat.rst +++ /dev/null @@ -1,177 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================ -composable-undercloud -================================ - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/heat-undercloud - -Deploy the undercloud with Heat instead of elements. This will allow us to use -composable services for the Undercloud and better fits with the architecture -of TripleO (providing a feedback loop between the Undercloud and Overcloud). -Furthermore this gets us a step closer to an HA undercloud and will help -us potentially convert the Undercloud to containers as work is ongoing -in t-h-t for containers as well. - -Problem Description -=================== - -The Undercloud today uses instack-undercloud. 
Instack undercloud is built -around the concept of 'instack' which uses elements to install service. - -* When instack-undercloud started we shared elements across the undercloud - and overcloud via the tripleo-image-elements project. This is no longer the - case, thus we have lost the feedback loop of using the same elements in - both the overcloud and undercloud. - -* We retro-fitted instack-undercloud with a single element called - puppet-stack-config that contains a single (large) puppet manifest for - all the services. Being able to compose the Undercloud would be more - scalable. - -* A maintenance problem. Ideally we could support the under and overcloud with the same tooling. - -Proposed Change -=============== - -Overview --------- - -We can use a single process Heat API/Engine in noauth mode to leverage -recent "composable services" work in the tripleo-heat-templates project. - -* A new heat-all launcher will be created. - -* We will run the heat-all launcher with "noauth" middleware to skip keystone - auth at a high level. - -* The heat-all process will use fake RPC driver and SQLite thus avoiding - the need to run RabbitMQ or MySQL on the deployment server for bootstrapping. - -* To satisfy client library requirements inside heat we will run a fake keystone - API (a thread in our installer perhaps), that will return just enough to - make these clients functionally work in noauth mode. - -* The new "deployed-server" feature in tripleo-heat-templates will make it - it possible to create Heat "server" objects and thus run - OS::Heat::SoftwareDeployment resources on pre-installed servers. - -* We will use os-collect-config to communicate with the local Heat API via - the Heat signal transport. We will run os-collect-config until the - stack finished processing and either completes or fails. - -Alternatives ------------- - -* Create another tool which can read composable services in - tripleo-heat-templates. This tool would be required to have feature - parity with Heat such that things like parameters, nested stacks, - environments all worked in a similar fashion so that we could share the - template work across the Undercloud and Overcloud. This approach isn't - really feasable. - -* Use an alternate tool like Ansible. This would creating duplicate services - in Ansible playbooks for each service we require in the Undercloud. This - approach isn't ideal in that it involves duplicate work across the Undercloud - and Overcloud. Ongoing work around multi-node configuration and containers - would need to be duplicated into both the Overcloud (tripleo-heat-templates) - and Undercloud (Ansible) frameworks. - -Security Impact ---------------- - -* The approach would run Heat on a single node in noauth mode. Heat - API and the fake Keystone stub would listen on 127.0.0.1 only. This - would be similar to other projects which allow noauth in local mode - as well. - -Other End User Impact ---------------------- - -* We would again have a single template language driving our Undercloud - and Overcloud tooling. Heat templates are very well documented. - -Performance Impact ------------------- - -* Initial testing shows the single process Heat API/Engine is quite light - taking only 70MB of RAM on a machine. - -* The approach is likely to be on-par with the performance of - instack-undercloud. - - -Other Deployer Impact ---------------------- - -* The format of undercloud.conf may change. 
We will add a - 'compat' layer which takes the format of 'undercloud.conf' today - and sets Heat parameters and or includes heat environments to give - feature parity and an upgrade path for existing users. Additional, - CI jobs will also be created to ensure users who upgrade from - previous instack environments can use the new tool. - -Developer Impact ----------------- - -* Developers would be able to do less work to maintain the UnderCloud by - sharing composable services. - -* Future work around composable upgrades could also be utilized and shared - across the Undercloud and Overcloud. - - -Implementation -============== - -Assignee(s) ------------ - -dprince (dan-prince on LP) - -Work Items ----------- - -* Create heat-all launcher. - -* Create python-tripleoclient command to run 'undercloud deploy'. - -* Create undercloud.yaml Heat templates. - - -Dependencies -============ - -* Heat all launcher and noauth middleware. - -Testing -======= - -Swapping in the new Undercloud as part of CI should allow us to fully test it. - -Additionally, we will have an upgrade job that tests an upgrade from -an instack-undercloud installation to a new t-h-t driven Undercloud install. - -Documentation Impact -==================== - -Documentation changes will need to be made that explains new config -interfaces (Heat parameters and environments). We could minimiz doc changes -by developing a 'compat' interface to process the legacy undercloud.conf -and perhaps even re-use the 'undercloud install' task in python-tripleoclient -as well so it essentially acts the same on the CLI. - -References -========== - -* Onward dark owl presentation: https://www.youtube.com/watch?v=y1qMDLAf26Q - -* https://etherpad.openstack.org/p/tripleo-composable-undercloud - -* https://blueprints.launchpad.net/tripleo/+spec/heat-undercloud diff --git a/specs/ocata/undercloud-ntp-server.rst b/specs/ocata/undercloud-ntp-server.rst deleted file mode 100644 index 4ae68f65..00000000 --- a/specs/ocata/undercloud-ntp-server.rst +++ /dev/null @@ -1,142 +0,0 @@ -============================= -TripleO Undercloud NTP Server -============================= - -The Undercloud should provide NTP services for when external NTP services are -not available. - -Problem Description -=================== - -NTP services are required to deploy with HA, but we rely on external services. -This means that TripleO can't be installed without Internet access or a local -NTP server. - -This has several drawbacks: - -* The NTP server is a potential point of failure, and it is an external - dependency. - -* Isolated deployments without Internet access are not possible without - additional effort (manually deploying an NTP server). - -* Infra CI is dependent on an external resource, leading to potential - false negative test runs or CI failures. - -Proposed Change -=============== - -Overview --------- - -In order to address this problem, the Undercloud installation process should -include setting up an NTP server on the local Undercloud. The use of this -NTP server would be optional, but we may wish to make it a default. Having -a default is better than none, since HA deployments will fail without time -synchronization between the controller cluster members. - -The operation of the NTP server on the Undercloud would be primarily of use -in small or proof-of-concept deployments. It is expected that sufficiently -large deployments will have an infrastructure NTP server already operating -locally. 
- -Alternatives ------------- - -The alternative is to continue to require external NTP services, or to -require manual steps to set up a local NTP server. - -Security Impact ---------------- - -Since the NTP server is required for syncing the HA, a skewed clock on one -controller (in relation to the other controllers) may make it ineligable to -participate in the HA cluster. If more than one controller's clock is skewed, -the entire cluster will fail to operate. This opens up an opportunity for -denial-of-service attacks against the cloud, either by causing NTP updates -to fail, or using a man-in-the-middle attack where deliberately false NTP -responses are returned to the controllers. - -Of course, operating the NTP server on the Undercloud moves that attack -vector down to the Undercloud, so sufficient security hardening should be done -on the Undercloud and/or the attached networks. We may wish to bind the NTP -server only to the provisioning (control plane) network. - -Other End User Impact ---------------------- - -This may make the life of the installer easier, since they don't need to open -a network connection to an NTP server or set up a local NTP server. - -Performance Impact ------------------- - -The operation of the NTP server should have a negligible impact on Undercloud -performance. It is a lightweight protocol and the daemon requires little -resources. - -Other Deployer Impact ---------------------- - -We now require that a valid NTP server be configured either in the templates -or on the deployment command-line. This requirement would be optional if we had -a default pointing to NTP services on the Undercloud. - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ -Primary assignees: - -* dsneddon@redhat.com -* bfournie@redhat.com - -Work Items ----------- - -The TripleO Undercloud installation scripts will have to be modified to include -the installation and configuration of an NTP server. This will likely be done -using a composable service for the Undercloud, with configuration data taken -from undercloud.conf. The configuration should include a set of default NTP -servers which are reachable on the public Internet for when no servers are -specified in undercloud.conf. - -Implement opening up iptables for NTP on the control plane network (bound to -only one IP/interface [ctlplane] if possible). - -Dependencies -============ - -The NTP server RPMs must be installed, and upstream NTP servers must be -identified (although we might configure a default such as pool.ntp.org) - -Testing -======= - -Since proper operation of the NTP services are required for successful -deployment of an HA overcloud, this functionality will be tested every time -a TripleO CI HA job is run. - -We may also want to implement a validation that ensures that the NTP server -can reach its upstream stratum 1 servers. This will ensure that the NTP -server is serving up the correct time. This is optional, however, since the -only dependency is that the overcloud nodes agree on the time, not that it -be correct. - -Documentation Impact -==================== - -The setup and configuration of the NTP server should be documented. Basic NTP -best practices should be communicated. 
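In terms of the composable service mentioned in the work items, the documented outcome would amount to a small piece of hiera data along these lines; the parameter and rule names here are assumptions, not a final interface::

    # Sketch: serve time on the control plane only and fall back to a
    # public pool when undercloud.conf lists no servers.
    config_settings:
      ntp::servers: {get_param: UndercloudNtpServers}  # hypothetical parameter
      tripleo.ntp.firewall_rules:
        '105 ntp':
          dport: 123
          proto: udp
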
- -References -========== - -* [1] - Administration Guide Draft/NTP - Fedora Project - https://fedoraproject.org/wiki/Administration_Guide_Draft/NTP diff --git a/specs/ocata/validations-in-workflows.rst b/specs/ocata/validations-in-workflows.rst deleted file mode 100644 index 00ac91a2..00000000 --- a/specs/ocata/validations-in-workflows.rst +++ /dev/null @@ -1,224 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================ -Validations in TripleO Workflows -================================ - -https://blueprints.launchpad.net/tripleo/+spec/validations-in-workflows - -The Newton release introduced TripleO validations -- a set of -extendable checks that identify potential deployment issues early and -verify that the deployed OpenStack is set up properly. These -validations are automatically being run by the TripleO UI, but there -is no support for the command line workflow and they're not being -exercised by our CI jobs either. - - -Problem Description -=================== - -When enabled, TripleO UI runs the validations at the appropriate phase -of the planning and deployment. This is done within the TripleO UI -codebase and therefore not available to python-tripleoclient or -the CI. - -The TripleO deployer can run the validations manually, but they need -to know at which point to do so and they will need to do it by calling -Mistral directly. - -This causes a disparity between the command line and GUI experience -and complicates the efforts to exercise the validations by the CI. - - -Proposed Change -=============== - -Overview --------- - -Each validation already advertises where in the planning/deployment -process it should be run. This is under the ``vars/metagata/groups`` -section. In addition, the ``tripleo.validations.v1.run_groups`` -Mistral workflow lets us run all validations belonging to a given -group. - -For each validation group (currently ``pre-introspection``, ``pre-deployment`` -and ``post-deployment``) we will update the appropriate workflow in -tripleo-common to optionally call ``run_groups``. - -Each of the workflows above will receive a new Mistral input called -``run_validations``. It will be a boolean value that indicates whether -the validations ought to be run as part of that workflow or not. - -To expose this functionality to the command line user, we will add an -option for enabling/disabling validations into python-tripleoclient -(which will set the ``run_validations`` Mistral input) and a way to -show the results of each validation to the screen output. - -When the validations are run, they will report their status to Zaqar -and any failures will block the deployment. The deployer can disable -validations if they wish to proceed despite failures. - -One unresolved question is the post-deployment validations. The Heat -stack create/update Mistral action is currently asynchronous and we -have no way of calling actions after the deployment has finished. -Unless we change that, the post-deployment validations may have to be -run manually (or via python-tripleoclient). - - -Alternatives ------------- - -1. Document where to run each group and how and leave it at that. This - risks that the users already familiar with TripleO may miss the - validations or that they won't bother. - - We would still need to find a way to run validations in a CI job, - though. - -2. 
Provide subcommands to run validations (and groups of validations) - into python-tripleoclient and rely on people running them manually. - - This is similar to 1., but provides an easier way of running a - validation and getting its result. - - Note that this may be a useful addition even if with the proposal - outlined in this specification. - -3. Do what the GUI does in python-tripleoclient, too. The client will - know when to run which validation and will report the results back. - - The drawback is that we'll need to implement and maintain the same - set of rules in two different codebases and have no API to do them. - I.e. what the switch to Mistral is supposed to solve. - - - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -We will need to modify python-tripleoclient to be able to display the -status of validations once they finished. TripleO UI already does this. - -The deployers may need to learn about the validations. - -Performance Impact ------------------- - -Running a validation can take about a minute (this depends on the -nature of the validation, e.g. does it check a configuration file or -does it need to log in to all compute nodes). - -This may can be a concern if we run multiple validations at the same -time. - -We should be able to run the whole group in parallel. It's possible -we're already doing that, but this needs to be investigated. -Specifically, does ``with-items`` run the tasks in sequence or in -parallel? - -There are also some options that would allow us to speed up the -running time of a validation itself, by using common ways of speeding -up Ansible playbooks in general: - -* Disabling the default "setup" task for validations that don't need - it (this task gathers hardware and system information about the - target node and it takes some time) -* Using persistent SSH connections -* Making each validation task run independently (by default, Ansible - runs a task on all the nodes, waits for its completion everywhere - and then moves on to another task) -* Each validation runs the ``tripleo-ansible-inventory`` script which - gathers information about deployed servers and configuration from - Mistral and Heat. Running this script can be slow. When we run - multiple validations at the same time, we should generate the - inventory only once and cache the results. - -Since the validations are going to be optional, the deployer can -always choose not to run them. On the other hand, any slowdown should -ideally outweigh the time spent investigating failed deployments. - -We will also document the actual time difference. This information -should be readily available from our CI environments, but we should -also provide measurements on the bare metal. - - -Other Deployer Impact ---------------------- - -Depending on whether the validations will be run by default or not, -the only impact should be an option that lets the deployer to run them -or not. - - -Developer Impact ----------------- - -The TripleO developers may need to learn about validations, where to -find them and how to change them. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - tsedovic - -Other contributors: - None - -Work Items ----------- - -Work items or tasks -- break the feature up into the things that need to be -done to implement it. Those parts might end up being done by different people, -but we're mostly trying to understand the timeline for implementation. 
- -* Add ``run_validations`` input and call ``run_groups`` from the - deployment and node registration workflows -* Add an option to run the validations to python-tripleoclient -* Display the validations results with python-tripleoclient -* Add or update a CI job to run the validations -* Add a CI job to tripleo-validations - - -Dependencies -============ - -None - - -Testing -======= - -This should make the validations testable in CI. Ideally, we would -verify the expected success/failure for the known validations given -the CI environment. But having them go through the testing machinery -would be a good first step to ensure we don't break anything. - - -Documentation Impact -==================== - -We will need to document the fact that we have validations, where they -live and when and how are they being run. - - -References -========== - -* http://docs.openstack.org/developer/tripleo-common/readme.html#validations -* http://git.openstack.org/cgit/openstack/tripleo-validations/ -* http://docs.openstack.org/developer/tripleo-validations/ diff --git a/specs/pike/aide-database.rst b/specs/pike/aide-database.rst deleted file mode 100644 index 0995c5d3..00000000 --- a/specs/pike/aide-database.rst +++ /dev/null @@ -1,185 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -==================================== -AIDE - Intrustion Detection Database -==================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-aide-database - -AIDE (Advanced Intrusion Detection Environment) is a file and directory -integrity verification system. It computes a checksum of object -attributes, which are then stored into a database. Operators can then -run periodic checks against the current state of defined objects and -verify if any attributes have been changed (thereby suggesting possible -malicious / unauthorised tampering). - -Problem Description -=================== - -Security Frameworks such as DISA STIG [1] / CIS [3] require that AIDE be -installed and configured on all Linux systems. - -To enable OpenStack operators to comply with the aforementioned security -requirements, they require a method of automating the installation of -AIDE and initialization of AIDE's integrity Database. They also require -a means to perform a periodic integrity verification run. - -Proposed Change -=============== - -Overview --------- - -Introduce a puppet-module to manage the AIDE service and ensure the AIDE -application is installed, create rule entries and a CRON job to allow -a periodic check of the AIDE database or templates to allow monitoring -via Sensu checks as part of OpTools. - -Create a tripleo-heat-template service to allow population of hiera data -to be consumed by the puppet-module managing AIDE. - -The proposed puppet-module is lhinds-aide [2] as this module will accept -rules declared in hiera data, initialize the Database and enables CRON -entries. Other puppet AIDE modules were missing hiera functionality or -other features (such as CRON population). 
- -Within tripleo-heat-templates, a composable service will be created to -feed a rule hash into the AIDE puppet module as follows: - - AIDERules: - description: Mapping of AIDE config rules - type: json - default: {} - -The Operator can then source an environment file and provide rule -information as a hash: - - parameter_defaults: - AIDERules: - 'Monitor /etc for changes': - content: '/etc p+sha256' - order : 1 - 'Monitor /boot for changes': - content: '/boot p+u+g+a' - order : 2 - -Ops Tool Integration --------------------- - -In order to allow active monitoring of AIDE events, a sensu check can -be created to perform an interval based verification of AIDE monitored -files (set using ``AIDERules``) against the last initialized database. - -Results of the Sensu activated AIDE verification checks will then be fed -to the sensu server for alerting and archiving. - -The Sensu clients (all overcloud nodes) will be configured with a -standalone/passive check via puppet-sensu module which is already -installed on overcloud image. - -If the Operator should choose not to use OpTools, then they can still -configure AIDE using the traditional method by means of a CRON entry. - -Alternatives ------------- - -Using a puppet-module coupled with a TripleO service is the most -pragmatic approach to populating AIDE rules and managing the AIDE -service. - -Security Impact ---------------- - -AIDE is an integrity checking application and therefore requires -Operators insure the security of AIDE's database is protected from -tampering. Should an attacker get access to the database, they could -attempt to hide malicious activity by removing records of file integrity -hashes. - -The default location is currently `/var/lib/aide/$database` which -puppet-aide sets with privileges of `0600` and ownership of -`root \ root`. - -AIDE itself introduces no security impact to any OpenStack projects -and has no interaction with any OpenStack services. - -Other End User Impact ---------------------- - -The service interaction will occur via heat templates and the TripleO -UI (should a capability map be present). - -Performance Impact ------------------- - -No Performance Impact - -Other Deployer Impact ---------------------- - -The service will be utlised by means of an environment file. Therefore, -should a deployer not reference the environment template using the -`openstack overcloud deploy -e` flag, there will be no impact. - -Developer Impact ----------------- - -No impact on other OpenStack Developers. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - lhinds - -Work Items ----------- - -1. Add puppet-aide [1] to RDO as a puppet package - -2. Create TripleO Service for AIDE - -3. Create Capability Map - -4. Create CI Job - -5. Submit documentation to tripleo-docs. - - -Dependencies -============ - -Dependency on lhinds-aide Puppet Module. - -Testing -======= - -Will be tested in TripleO CI by adding the service and an environment -template to a TripleO CI scenario. - -Documentation Impact -==================== - -Documentation patches will be made to explain how to use the service. 
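As a rough sketch, the tripleo-heat-templates service created in the work items above mostly translates ``AIDERules`` into hiera for the puppet module; the hiera key and puppet-tripleo profile name below are assumptions pending the actual implementation::

    outputs:
      role_data:
        description: Role data for the AIDE service
        value:
          service_name: aide
          config_settings:
            aide::rules: {get_param: AIDERules}  # assumed lhinds-aide hiera key
          step_config: |
            include ::tripleo::profile::base::aide  # profile to be added to puppet-tripleo
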
- -References -========== - -Original Launchpad issue: https://bugs.launchpad.net/tripleo/+bug/1665031 - -[1] https://www.stigviewer.com/stig/red_hat_enterprise_linux_6/2016-07-22/finding/V-38489 - -[2] https://forge.puppet.com/lhinds/aide - -[3] -file:///home/luke/project-files/tripleo-security-hardening/CIS_Red_Hat_Enterprise_Linux_7_Benchmark_v2.1.0.pdf - -[3] -file:///home/luke/project-files/tripleo-security-hardening/CIS_Red_Hat_Enterprise_Linux_7_Benchmark_v2.1.0.pdf diff --git a/specs/pike/container-healthchecks.rst b/specs/pike/container-healthchecks.rst deleted file mode 100644 index 2772ce9c..00000000 --- a/specs/pike/container-healthchecks.rst +++ /dev/null @@ -1,148 +0,0 @@ -=========================================== -Container Healthchecks for TripleO Services -=========================================== - -https://blueprints.launchpad.net/tripleo/+spec/container-healthchecks - -An OpenStack deployment involves many services spread across many -hosts. It is important that we provide tooling and APIs that make it -as easy as possible to monitor this large, distributed environment. -The move to containerized services in the overcloud [1] -brings with it many opportunities, such as the ability to bundle -services with their associated health checks and provide a standard -API for assessing the health of the service. - -[1]: https://blueprints.launchpad.net/tripleo/+spec/containerize-tripleo - -Problem Description -=================== - -The people who are in the best position to develop appropriate health -checks for a service are generally those people responsible for -developing the service. Unfortunately, the task of setting up -monitoring generally ends up in the hands of cloud operators or some -intermediary. - -I propose that we take advantage of the bundling offered by -containerized services and create a standard API with which an -operator can assess the health of a service. This makes life easier -for the operator, who can now provide granular service monitoring -without requiring detailed knowledge about every service, and it -allows service developers to ensure that services are monitored -appropriately. - -Proposed Change -=============== - -Overview --------- - -The Docker engine (since version 1.12), as well as most higher-level -orchestration frameworks, provide a standard mechanism for validating -the health of a container. Docker itself provides the -HEALTHCHECK_ directive, while Kubernetes has explicit -support for `liveness and readiness probes`_. Both -mechanisms work by executing a defined command inside the container, -and using the result of that executing to determine whether or not the -container is "healthy". - -.. _liveness and readiness probes: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/ -.. _healthcheck: https://docs.docker.com/engine/reference/builder/#healthcheck - -I propose that we explicitly support these interfaces in containerized -TripleO services through the following means: - -1. Include in every container a `/openstack/healthcheck` command that - will check the health of the containerized service, exit with - status ``0`` if the service is healthy or ``1`` if not, and provide - a message on ``stdout`` describing the nature of the error. - -2. Include in every Docker image an appropriate ``HEALTHCHECK`` - directive to utilize the script:: - - HEALTHCHECK CMD /openstack/healthcheck - -3. 
If Kubernetes becomes a standard part of the TripleO deployment - process, we may be able to implement liveness or readiness probes - using the same script:: - - livenessProbe: - exec: - command: - - /openstack/healthcheck - -Alternatives ------------- - -The alternative is the status quo: services do not provide a standard -healthcheck API, and service monitoring must be configured -individually by cloud operators. - -Security Impact ---------------- - -N/A - -Other End User Impact ---------------------- - -Users can explicitly run the healthcheck script to immediately assess -the state of a service. - -Performance Impact ------------------- - -This proposal will result in the periodic execution of tasks on the -overcloud hosts. When designing health checks, service developers -should select appropriate check intervals such that there is minimal -operational overhead from the health checks. - -Other Deployer Impact ---------------------- - -N/A - -Developer Impact ----------------- - -Developers will need to determine how best to assess the health of a -service and provide the appropriate script to perform this check. - -Implementation -============== - -Assignee(s) ------------ - -N/A - -Work Items ----------- - -N/A - -Dependencies -============ - -- This requires that we implement `containerize-tripleo-overcloud`_ - blueprint. - -.. _containerize-tripleo-overcloud: https://specs.openstack.org/openstack/tripleo-specs/specs/ocata/containerize-tripleo-overcloud.html - -Testing -======= - -TripleO CI jobs should be updated to utilize the healthcheck API to -determine if services are running correctly. - -Documentation Impact -==================== - -Any documentation describing the process of containerizing a service -for TripleoO must be updated to describe the healthcheck API. - -References -========== - -N/A - diff --git a/specs/pike/containerized-services-logs.rst b/specs/pike/containerized-services-logs.rst deleted file mode 100644 index 57526663..00000000 --- a/specs/pike/containerized-services-logs.rst +++ /dev/null @@ -1,305 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -==================================================== -Best practices for logging of containerized services -==================================================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/containerized-services-logs - -Containerized services shall persist its logs. There are many ways to address -that. The scope of this blueprint is to suggest best practices and intermediate -implementation steps for Pike release as well. - -Problem Description -=================== - -Pike will be released with a notion of hybrid deployments, which is some -services may be running in containers and managed by docker daemon, and -some may be managed by systemd or Pacemaker and placed on hosts directly. - -The notion of composable deployments as well assumes end users and -developers may want to deploy some services non-containerized and tripleo -heat templates shall not prevent them from doing so. - -Despite the service placement type, end users and developers shall get all -logs persisted, consistent and available for future analysis. - -Proposed Change -=============== - -Overview --------- - -.. note:: As the spec transitions from Pike, some of the sections below are - split into the Pike and Queens parts. 
- -The scope of this document for Pike is limited to recommendations for -developers of containerized services, bearing in mind use cases for hybrid -environments. It addresses only intermediate implementation steps for Pike and -smooth UX with upgrades from Ocata to Pike, and with future upgrades from Pike -as well. - -A `12factor `_ is the general guideline for logging -in containerized apps. Based on it, we rephrase our main design assumption as: -"each running process writes its only event stream to be persisted outside -of its container". And we put an additional design constraint: "each container -has its only running foreground process, nothing else requires persistent -logs that may outlast the container execution time". This assumes all streams -but the main event stream are ephemeral and live no longer than the container -instance does. - -.. note:: HA statefull services may require another approach, see the - alternatives section for more details. - -The scope for future releases, starting from Queens, shall include best -practices for collecting (shipping), storing (persisting), processing (parsing) -and accessing (filtering) logs of hybrid TripleO deployments with advanced -techniques like EFK (Elasticsearch, Fluentd, Kibana) or the like. Hereafter -those are referred as "future steps". - -Note, this is limited to OpenStack and Linux HA stack (Pacemaker and Corosync). -We can do nothing to the rest of the supporting and legacy apps like -webservers, load balancing revers proxies, database and message queue clusters. -Even if we could, this stays out of OpenStack specs scope. - -Here is a list of suggested best practices for TripleO developers for Pike: - -* Host services shall keep writing logs as is, having UIDs, logging configs, - rotation rules and target directories unchanged. - - .. note:: Host services changing its control plane to systemd or pacemaker - in Ocata to Pike upgrade process, may have logging configs, rules and - destinations changed as well, but this is out of the scope of this spec. - -* Containerized services that normally log to files under the `/var/log` dir, - shall keep logging as is inside of containers. The logs shall be persisted - with hostpath mounted volumes placed under the `/var/log/containers` path. - This is required because of the hybrid use cases. For example, containerized - nova services access `/var/log/nova` with different UIDs than the host - services would have. Given that, nova containers should have log volumes - mounted as ``-v /var/log/nova:/var/log/containers/nova`` in order to not - bring conflicts. Persisted log files then can be pulled by a node agent like - fluentd or rsyslog and forwarded to a central logging service. - -* Containerized services that can only log to syslog facilities: bind mount - /dev/log into all tripleo service containers as well so that the host - collects the logs via journald. This should be a standard component of our - container "API": we guarantee (a) a log directory and (b) a syslog socket - for *every* containerized service. Collected journald logs then can be pulled - by a node agent like fluentd or rsyslog and forwarded to a central logging - service. - -* Containerized services that leverage Kolla bootstrap, extended start and/or - config facilities, shall be templated with Heat deployment steps as the - following: - - * Host prep tasks to ensure target directories pre-created for hosts. - - * Kolla config's permissions to enforce ownership for log dirs (hostpath - mounted volumes). 
- - * Init containers steps to chown log directories early otherwise. Kolla - bootstrap and DB sync containers are normally invoked before the - `kolla_config` permissions to be set. Therefore come init containers. - -* Containerized services that do not use Kolla and run as root in containers - shall be running from a separate user namespace remapped to a non root host - user, for security reasons. No such services are currently deployed by - TripleO, though. - - .. note:: Docker daemon would have to be running under that remapped non root - user as well. See docker documentation for the ``--userns-remap`` option. - -* Containerized services that run under pacemaker (or pacemaker remote) - control plane and do not fall into any of the given cases: bind mount - /dev/log as well. At this stage the way services log is in line with the best - practice w.r.t "dedicated log directory to avoid conflicts". Pacemaker - bundles isolate the containerized resources' logs on the host into - `/var/log/pacemaker/bundles/{resource}`. - -Future steps TBD. - -Alternatives ------------- - -Those below come for future steps only. - -Alternatively to hostpath mounted volumes, create a directory structure such -that each container has a namespace for its logs somewhere under `/var/log`. -So, a container named 12345 would have *all its logs* in the -`/var/log/container-12345` directory structure (requires clarification). -This also alters the assumption that in general there is only one main log -per a container, which is the case for highly available containerized -statefull services bundled with pacemaker remote, with multiple logs to -capture, like `/var/log/pacemaker.log`, logs for cluster bootstrapping -events, control plane agents, helper tools like rsyncd, and the statefull -service itself. - -When we have control over the logging API (e.g. via oslo.log), we can forsake -hostpath mounted volumes and configure containerized services to output to -syslog (via bind mount `/dev/log`) so that the host collects the logs via -journald). Or configure services to log only to stdout, so that docker daemon -collects logs and ships them to the journald. - -.. note:: The "winning" trend is switching all (including openstack - services) to syslog and log nothing to the /var/log/, e.g. just bind-mount - ``-v /dev/null:/var/log`` for containers. - -Or use a specialized log driver like the oslo.log fluentd logging driver -(instead of the default journald or json-file) to output to a fluentd log agent -running on the host or containerized as well, which would then aggregate logs -from all containers, annotate with node metadata, and use the fluentd -`secure_forward` protocol to send the logs to a remote fluentd agent like -common logging. - -These are not doable for Pike as requiring too many changes impacting upgrade -UX as well. Although, this is the only recommended best practice and end goal for -future releases and future steps coming after Pike. - -Security Impact ---------------- - -As the spec transitions from Pike, the section is split into the Pike and -Queens parts. - -UID collisions may happen for users in containers to occasionally match another -user IDs on the host. And to allow those to access logs of foreign services. -This should be mitigated with SELinux policies. - -Future steps impact TBD. - -Other End User Impact ---------------------- - -As the spec transitions from Pike, the section is split into the Pike and -Queens parts. - -Containerized and host services will be logging under different paths. 
The former -to the `/var/log/containers/foo` and `/var/log/pacemaker/bundles/*`, the latter -to the `/var/log/foo`. This impacts logs collecting tools like -`sosreport `_ et al. - -Future steps impact TBD. - -Performance Impact ------------------- - -As the spec transitions from Pike, the section is split into the Pike and -Queens parts. - -Hostpath mounted volumes bring no performance overhead for containerized -services' logs. Host services are not affected by the proposed change. - -Future steps impact is that handling of the byte stream of stdout can -have a significant impact on performance. - -Other Deployer Impact ---------------------- - -As the spec transitions from Pike, the section is split into the Pike and -Queens parts. - -When upgrading from Ocata to Pike, containerized services will change its -logging destination directory as described in the end user impact section. -This also impacts logs collecting tools like sosreport et al. - -Logrotate scripts must be adjusted for the `/var/log/containers` and -`/var/log/pacemaker/bundles/*` as well. - -Future steps impact TBD. - -Developer Impact ----------------- - -As the spec transitions from Pike, the section is split into the Pike and -Queens parts. - -Developers will have to keep in mind the recommended intermediate best -practices, when designing heat templates for TripleO hybrid deployments. - -Developers will have to understand Kolla and Docker runtime internals, although -that's already the case once we have containerized services onboard. - -Future steps impact (to be finished): - -* The notion of Tracebacks in the events is difficult to handle as a byte - stream, because it becomes the responsibility of the apps to ensure output - of new-line separated text is not interleaved. That notion of Tracebacks - needs to be implemented apps side. - -* Oslo.log is really emitting a stream of event points, or trace points, with - rich metadata to describe those events. Capturing that metadata via a byte - stream later needs to be implemented. - -* Event streams of child processes, forked even temporarily, should or may need - to be captured by the parent events stream as well. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - bogdando - -Other contributors: - michele - flaper87 - larsks - dciabrin - -Work Items ----------- - -As the spec transitions from Pike, the work items are split into the Pike and -Queens parts: - -* Implement an intermediate logging solution for tripleo-heat-templates for - containerized services that log under `/var/log` (flaper87, bogdando). Done - for Pike. -* Come up with an intermediate logging solution for containerized services that - log to syslog only (larsks). Done for Pike. -* Come up with a solution for HA containerized services managed by Pacemaker - (michele). Done for Pike. -* Make sure that sosreport collects `/var/log/containers/*` and - `/var/log/pacemaker/bundles/*` (no assignee). Pending for Pike. -* Adjust logrotate scripts for the `/var/log/containers` and - `/var/log/pacemaker/bundles/*` paths (no assignee). Pending for Pike. -* Verify if the namespaced `/var/log/` for containers works and fits the case - (no assignee). -* Address the current state of OpenStack infrastructure apps as they are, and - gently move them towards these guidelines referred as "future steps" (no - assignee). - -Dependencies -============ - -None. - -Testing -======= - -Existing CI coverage fully fits the proposed change needs. 
-
-Documentation Impact
-====================
-
-The given best practices, and the intermediate solutions built from them, do
-not involve changes visible to end users other than those given in the end
-user impact section. The same is true for developers and dev docs.
-
-References
-==========
-
-* `Sosreport tool `_.
-* `Pacemaker container bundles `_.
-* `User namespaces in docker `_.
-* `Docker logging drivers `_.
-* `Engineering blog posts `_.
diff --git a/specs/pike/deployment-plan-management.rst b/specs/pike/deployment-plan-management.rst
deleted file mode 100644
index 409c6955..00000000
--- a/specs/pike/deployment-plan-management.rst
+++ /dev/null
@@ -1,230 +0,0 @@
-..
-  This work is licensed under a Creative Commons Attribution 3.0 Unported
-  License.
-
-  http://creativecommons.org/licenses/by/3.0/legalcode
-
-==================================
-Deployment Plan Management changes
-==================================
-
-https://blueprints.launchpad.net/tripleo/+spec/deployment-plan-management-refactor
-
-The goal of this work is to improve GUI and CLI interoperability by changing
-the way deployment configuration is stored, making it more compact and
-simplifying plan import and export.
-
-Problem Description
-===================
-
-The problem is broadly described in mailing list discussion [1]. This spec is
-a result of the agreement achieved in that discussion.
-
-The TripleO-Common library currently operates on the Mistral environment for
-storing plan configuration, although not all data are stored there, since
-there are additional files which define plan configuration (roles_data.yaml,
-network_data.yaml, capabilities-map.yaml) and which are currently used by the
-CLI to drive certain parts of deployment configuration. This imposes a problem
-of synchronizing the content of those files with the Mistral environment when
-a plan is imported or exported.
-
-TripleO-Common needs to be able to provide means for roles and networks
-management.
-
-Proposed Change
-===============
-
-Overview
---------
-
-TripleO plan configuration data should be stored in a single place rather than
-in multiple places (the Mistral environment plus plan meta files stored in the
-Swift container).
-
-TripleO-Common should move from using the Mistral environment to storing the
-information in a file (plan-environment.yaml) in the Swift container, so that
-all plan configuration data are stored in 'meta' files in Swift and
-tripleo-common provides an API to perform operations on this data.
-
-Plan meta files: capabilities-map.yaml, roles_data.yaml, network_data.yaml [3],
-plan-environment.yaml
-
-Proposed plan-environment.yaml file structure::
-
-  version: 1.0
-
-  name: A name of a plan which this file describes
-  description: >
-    A description of a plan, its usage and a potential summary of features it
-    provides
-  template: overcloud.yaml
-  environments:
-    - path: overcloud-resource-registry-puppet.yaml
-  parameter_defaults:
-    ControllerCount: 1
-  passwords:
-    TrovePassword: "vEPKFbdpTeesCWRmtjgH4s7M8"
-    PankoPassword: "qJJj3gTg8bTCkbtYtYVPtzcyz"
-    KeystoneCredential0: "Yeh1wPLUWz0kiugxifYU19qaf5FADDZU31dnno4gJns="
-
-With this solution the whole plan configuration is stored in the Swift
-container together with the rest of the plan files, which simplifies the plan
-import/export functionality as no synchronization is necessary between the
-Swift files and the Mistral environment. Plan configuration is more
-straightforward and CLI/GUI interoperability is improved.
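-
-For context, the other plan 'meta' files are likewise small YAML documents.
-The excerpts below are purely illustrative sketches of the kind of data the
-roles and networks actions described later operate on; the attribute names
-are abbreviated examples, not a complete schema::
-
-  # roles_data.yaml (excerpt)
-  - name: Controller
-    CountDefault: 1
-    ServicesDefault:
-      - OS::TripleO::Services::Keystone
-      - OS::TripleO::Services::HAproxy
-
-  # network_data.yaml (excerpt)
-  - name: Storage
-    vip: true
-    ip_subnet: '172.16.1.0/24'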
-
-Initially the plan configuration is going to be split into multiple 'meta'
-files (plan-environment.yaml, capabilities-map.yaml, roles_data.yaml,
-network_data.yaml), all stored in the Swift container. As a next step we can
-evaluate a solution which merges them all into plan-environment.yaml.
-
-Using the CLI workflow, the user works with local files. The plan, networks
-and roles are configured by making changes directly in the relevant files
-(plan-environment.yaml, roles_data.yaml, ...). The plan is created and
-templates are generated on the deploy command.
-
-The TripleO-Common library will implement CRUD actions for Roles and Networks
-management. This will allow clients to manage Roles and Networks and generate
-the relevant templates (see work items).
-
-TripleO UI and other clients use the tripleo-common library, which operates on
-the plan stored in the Swift container.
-
-
-Alternatives
-------------
-
-An alternative approach is treating the Swift 'meta' files as an input during
-plan creation and synchronizing them to the Mistral environment when the plan
-is imported, which is described initially in [1] and is used in the current
-plan import/export implementation [2].
-
-This solution needs to deal with multiple race conditions, makes plan
-import/export much more complicated, and the overall solution is not simple to
-understand. It should only be considered if using the Mistral environment as
-plan configuration storage has some marginal benefit over using a file in
-Swift, which is not the case according to the discussion [1].
-
-As a subsequent step to the proposed solution, it is possible to join all
-existing 'meta' files into a single one.
-
-Security Impact
----------------
-
-None.
-
-Other End User Impact
----------------------
-
-CLI/GUI interoperability is improved.
-
-Performance Impact
-------------------
-
-None.
-
-Other Deployer Impact
----------------------
-
-None.
-
-Developer Impact
-----------------
-
-This change makes the deployment plan import/export functionality much simpler
-and makes tripleo-common operate on the same set of files as the CLI does. It
-is much easier for CLI users to understand how tripleo-common works, as it
-does not do any Swift files -> Mistral environment synchronization in the
-background.
-
-TripleO-Common can introduce functionality to manage Roles and Networks which
-matches exactly how the CLI workflow does it.
-
-Implementation
-==============
-
-Assignee(s)
------------
-
-Primary assignee:
-  akrivoka
-
-Other contributors:
-  * d0ugal
-  * rbrady
-  * jtomasek
-
-Work Items
-----------
-
-* [tripleo-heat-templates] Update plan-environment.yaml to match the new
-  specification.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/update-plan-environment-yaml
-
-* [tripleo-common] Update the relevant actions to store data in
-  plan-environment.yaml in Swift instead of using the Mistral environment.
-  Migrate any existing data away from Mistral.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/stop-using-mistral-env
-
-* [tripleo-common] On plan creation/update, tripleo-common validates the plan
-  and checks that roles_data.yaml and network_data.yaml exist, as well as
-  validates their format. On successful plan creation/update, templates are
-  generated/regenerated.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/validate-roles-networks
-
-* [tripleo-common] Provide a GetRoles action to list the current roles in JSON
-  format by reading roles_data.yaml.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/get-roles-action
-
-* [tripleo-common] Provide a GetNetworks action to list the current networks
-  in JSON format by reading network_data.yaml.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/get-networks-action
-
-* [tripleo-common] Provide an UpdateRoles action to update Roles. It takes
-  data in JSON format, validates its contents and persists it in
-  roles_data.yaml; after a successful update, templates are regenerated.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/update-roles-action
-
-* [tripleo-common] Provide an UpdateNetworks action to update Networks. It
-  takes data in JSON format, validates its contents and persists it in
-  network_data.yaml.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/update-networks-action
-
-* [tripleo-ui] Provide a way to create/list/update/delete Roles by calling
-  tripleo-common actions.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/roles-crud-ui
-
-* [tripleo-ui] Provide a way to create/list/update/delete Networks by calling
-  tripleo-common actions.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/networks-crud-ui
-
-* [tripleo-ui] Provide a way to assign Networks to Roles.
-
-  blueprint: https://blueprints.launchpad.net/tripleo/+spec/networks-roles-assignment-ui
-
-* [python-tripleoclient] Update the CLI to use tripleo-common actions for
-  operations that currently modify the Mistral environment.
-
-  related bug: https://bugs.launchpad.net/tripleo/+bug/1635409
-
-Dependencies
-============
-
-None.
-
-Testing
-=======
-
-The feature will be tested as part of TripleO CI.
-
-Documentation Impact
-====================
-
-Documentation should be updated to reflect the new capabilities of the GUI
-(Roles/Networks management), the way to use plan-environment.yaml via the CLI
-workflow, and CLI/GUI interoperability using the plan import and export
-features.
-
-References
-==========
-
-[1] http://lists.openstack.org/pipermail/openstack-dev/2017-February/111433.html
-[2] https://specs.openstack.org/openstack/tripleo-specs/specs/ocata/gui-plan-import-export.html
-[3] https://review.openstack.org/#/c/409921/
diff --git a/specs/pike/environment-generator.rst b/specs/pike/environment-generator.rst
deleted file mode 100644
index a6db9e47..00000000
--- a/specs/pike/environment-generator.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-..
-  This work is licensed under a Creative Commons Attribution 3.0 Unported
-  License.
-
-  http://creativecommons.org/licenses/by/3.0/legalcode
-
-============================
-Sample Environment Generator
-============================
-
-A common tool to generate sample Heat environment files would be beneficial
-in two main ways:
-
-* Consistent formatting and details. Every environment file would include
-  parameter descriptions, types, defaults, etc.
-
-* Ease of updating. The parameters can be dynamically read from the templates,
-  which allows the sample environments to be updated automatically when
-  parameters are added or changed.
-
-Problem Description
-===================
-
-Currently our sample environments are hand-written, with no consistency in
-terms of what is included. Most do not include a description of what all
-the parameters do, and almost none include the types of the parameters or the
-default values for them.
-
-In addition, the environment files often get out of date because developers
-have to remember to manually update them any time they make a change to the
-parameters for a given feature or service.
This is tedious and error-prone. - -The lack of consistency in environment files is also a problem for the UI, -which wants to use details from environments to improve the user experience. -When environments are created manually, these details are likely to be missed. - -Proposed Change -=============== - -Overview --------- - -A new tool, similar to the oslo.config generator, will allow us to eliminate -these problems. It will take some basic information about the environment and -use the parameter definitions in the templates to generate the sample -environment file. - -The resulting environments should contain the following information: - -* Human-readable Title -* Description -* parameter_defaults describing all the available parameters for the - environment -* Optional resource_registry with any necessary entries - -Initially the title and description will simply be comments, but eventually we -would like to get support for those fields into Heat itself so they can be -top-level keys. - -Ideally the tool would be able to update the capabilities map automatically as -well. At some point there may be some refactoring done there to eliminate the -overlap, but during the transition period this will be useful. - -This is also a good opportunity to impose some organization on the environments -directory of tripleo-heat-templates. Currently it is mostly a flat directory -that contains all of the possible environments. It would be good to add -subdirectories that group related environments so they are easier to find. - -The non-generated environments will either be replaced by generated ones, -when that makes sense, or deprecated in favor of a generated environment. -In the latter case the old environments will be left for a cycle to allow -users transition time to the new environments. - -Alternatives ------------- - -We could add more checks to the yaml-validate tool to ensure environment files -contain the required information, but this still requires more developer -time and doesn't solve the maintenance problems as parameters change. - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -Users should get an improved deployment experience through more complete and -better documented sample environments. Existing users who are referencing -the existing sample environments may need to switch to the new generated -environments. - -Performance Impact ------------------- - -No runtime performance impact. Initial testing suggests that it may take a -non-trivial amount of time to generate all of the environments, but it's not -something developers should have to do often. - -Other Deployer Impact ---------------------- - -See End User Impact - -Developer Impact ----------------- - -Developers will need to write an entry in the input file for the tool rather -than directly writing sample environments. The input format of the tool will -be documented, so this should not be too difficult. - -When an existing environment is deprecated in favor of a generated one, a -release note should be written by the developer making the change in order to -communicate it to users. 
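-
-For illustration only, an entry in the generator's input file might look
-roughly like the following sketch; the file layout, key names and template
-paths shown here are assumptions made for the sake of example, and the
-documented input format of the tool is authoritative::
-
-  environments:
-    - name: services/time-ntp
-      title: Configure NTP
-      description: |
-        Expose the NTP related parameters in a generated sample environment.
-      files:
-        puppet/services/time/ntp.yaml:
-          parameters:
-            - NtpServer
-
-The generator would read the description, type and default of each listed
-parameter from the referenced template and emit a commented sample
-environment file containing the title, description and a complete
-``parameter_defaults`` section, plus an optional ``resource_registry``
-when needed.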
- - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - bnemec - -Other contributors: - jtomasek - -Work Items ----------- - -* Update the proposed tool to reflect the latest design decisions -* Convert existing environments to be generated - - -Dependencies -============ - -No immediate dependencies, but in the long run we would like to have some -added functionality from Heat to allow these environments to be more easily -consumed by the UI. However, it was agreed at the PTG that we would proceed -with this work and make the Heat changes in parallel so we can get some of -the benefits of the change as soon as possible. - - -Testing -======= - -Any environments used in CI should be generated with the tool. We will want -to add a job that exercises the tool as well, probably a job that ensures any -changes in the patch under test are reflected in the environment files. - - -Documentation Impact -==================== - -We will need to document the format of the input file. - - -References -========== - -`Initial proposed version of the tool -`_ - -https://etherpad.openstack.org/p/tripleo-environment-generator diff --git a/specs/pike/gui-logging.rst b/specs/pike/gui-logging.rst deleted file mode 100644 index 575dcd06..00000000 --- a/specs/pike/gui-logging.rst +++ /dev/null @@ -1,121 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========== -GUI logging -=========== - -The TripleO GUI currently has no way to persist logging information. - -Problem Description -=================== - -The TripleO GUI is a web application without its own dedicated backend. As -such, any and all client-side errors are lost when the End User reloads the page -or navigates away from the application. When things go wrong, the End User is -unable to retrieve client-side logs because this information is not persisted. - -Proposed Change -=============== - -Overview --------- - -I propose that we use Zaqar as a persistence backend for client-side logging. -At present, the web application is already communicating with Zaqar using -websockets. We can use this connection to publish new messages to a dedicated -logging queue. - -Zaqar messages have a TTL of one hour. So once every thirty minutes, Mistral -will query Zaqar using crontrigger, and retrieve all messages from the -``tripleo-ui-logging`` queue. Mistral will then look for a file called -``tripleo-ui-log`` in Swift. If this file exists, Mistral will check its size. -If the size exceeds a predetermined size (e.g. 10MB), Mistral will rename it to -``tripleo-ui-log-``, and create a new file in its place. The file -will then receive the messages from Zaqar, one per line. Once we reach, let's -say, a hundred archives (about 1GB) we can start removing dropping data in order -to prevent unnecessary data accumulation. - -To view the logging data, we can ask Swift for 10 latest messages with a prefix -of ``tripleo-ui-log``. These files can be presented in the GUI for download. -Should the user require, we can present a "View more" link that will display the -rest of the collected files. - -Alternatives ------------- - -None at this time - -Security Impact ---------------- - -There is a chance of logging sensitive data. I propose that we apply some -common scrubbing mechanism to the messages before they are stored in Swift. 
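-
-For illustration, the periodic drain-and-archive flow described in the
-overview, combined with the scrubbing step proposed here, might be sketched
-as a Mistral workflow along the following lines. The workflow, task and
-action names are assumptions made for this example (the scrub and publish
-actions in particular do not exist yet), not a final design::
-
-  version: '2.0'
-
-  tripleo.ui_logging.v1.archive_logs:
-    description: Intended to run from a cron trigger every thirty minutes.
-    tasks:
-      drain_queue:
-        # Assumed action that claims and removes all pending messages
-        # from the tripleo-ui-logging Zaqar queue.
-        action: zaqar.claim_messages queue_name="tripleo-ui-logging"
-        publish:
-          messages: <% task(drain_queue).result %>
-        on-success: scrub_messages
-      scrub_messages:
-        # Placeholder for the common scrubbing mechanism that strips
-        # potentially sensitive fields before anything is persisted.
-        action: tripleo.ui_logging.scrub messages=<% $.messages %>
-        publish:
-          scrubbed: <% task(scrub_messages).result %>
-        on-success: publish_to_swift
-      publish_to_swift:
-        # Assumed helper that appends one JSON message per line to the
-        # current tripleo-ui-log object in Swift and rotates it once it
-        # grows past roughly 10MB.
-        action: tripleo.ui_logging.publish messages=<% $.scrubbed %>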
-
-Other End User Impact
----------------------
-
-Performance Impact
-------------------
-
-Sending additional messages over an existing websocket connection should have
-a negligible performance impact on the web application. Likewise, running
-periodic cron tasks in Mistral shouldn't impose a significant burden on the
-undercloud machine.
-
-Other Deployer Impact
----------------------
-
-Developer Impact
-----------------
-
-Developers should also benefit from having a centralized logging system in
-place as a means of improving productivity when debugging.
-
-Implementation
-==============
-
-Assignee(s)
------------
-
-Primary assignee:
-  hpokorny
-
-Work Items
-----------
-
-* Introduce a central logging system (already in progress, see `blueprint`_)
-* Introduce a global error handler
-* Convert all logging messages to JSON using a standard format
-* Configuration: the name for the Zaqar queue to carry the logging data
-* Introduce a Mistral workflow to drain a Zaqar queue and publish the acquired
-  data to a file in Swift
-* Introduce GUI elements to download the log files
-
-Dependencies
-============
-
-Testing
-=======
-
-We can write unit tests for the code that handles sending messages over the
-websocket connection. We might be able to write an integration smoke test that
-will ensure that a message is received by the undercloud. We can also add some
-testing code to tripleo-common to cover the logic that drains the queue and
-publishes the log data to Swift.
-
-Documentation Impact
-====================
-
-We need to document the default name of the Zaqar queue, the maximum size of
-each log file, and how many log files can be stored at most. On the End User
-side, we should document the fact that a GUI-oriented log is available, and
-the way to get it.
-
-References
-==========
-
-.. _blueprint: https://blueprints.launchpad.net/tripleo/+spec/websocket-logging
diff --git a/specs/pike/send-mail-tool.rst b/specs/pike/send-mail-tool.rst
deleted file mode 100644
index fa0a4a5a..00000000
--- a/specs/pike/send-mail-tool.rst
+++ /dev/null
@@ -1,129 +0,0 @@
-..
-  This work is licensed under a Creative Commons Attribution 3.0 Unported
-  License.
-
-  http://creativecommons.org/licenses/by/3.0/legalcode
-
-================================================
-Tool to send email with TripleO tempest results
-================================================
-
-https://blueprints.launchpad.net/tripleo/+spec/send-mail-tool
-
-To speed up troubleshooting, debugging and reproducing TripleO tempest
-results, we should have a list of people responsible for receiving an email
-status report about tempest failures, containing a list of all the failures
-and of the failures that are known issues already covered by an open bug in
-Launchpad.
-
-Problem Description
-===================
-
-Currently there are periodic TripleO jobs running tempest, and nobody verifies
-whether these results are failing or passing. Even if there is someone
-responsible for verifying these runs, it is still a manual job to go to the
-logs web site, check which job is the latest, go to the logs, verify whether
-tempest ran, list the number of failures, check against a list whether these
-failures are known failures or new ones, and only after all these steps start
-to work on identifying the root cause of the problem.
-
-Proposed Change
-===============
-
-Overview
---------
-
-TripleO should provide a unified method for sending email to a list of
-users who would be responsible for taking action when something goes wrong
-with tempest results.
-The method should run at the end of every run, in the validate-tempest role,
-and read the results, either from the output generated by tempest or from the
-logs uploaded to the logs website, identify tempest failures, and report them
-by mail or save the mail content in a file to be verified later. The mail
-should contain information such as the list of failures, the list of known
-failures, the date, a link to the logs of the run, and any other information
-that might be relevant.
-
-Alternatives
-------------
-
-One of the alternatives would be openstack-health, where the user can
-subscribe to the RSS feed of one of the jobs using a third-party application.
-Right now, openstack-health doesn't support user subscriptions or sending
-emails.
-
-Security Impact
----------------
-
-None, since it will use an API running in some cloud service to send the
-email, so the username and password remain secure.
-
-Other End User Impact
----------------------
-
-None.
-
-Performance Impact
-------------------
-
-None.
-
-Other Deployer Impact
----------------------
-
-None.
-
-Developer Impact
-----------------
-
-Developers in different teams will be more involved in TripleO CI debugging.
-
-Implementation
-==============
-
-Assignee(s)
------------
-
-Primary assignee:
-  arxcruz
-
-
-Work Items
-----------
-
-* The script should be written in Python.
-* It should be part of the validate-tempest role in tripleo-quickstart-extras.
-* It should be able to read the logs of any run on http://logs.openstack.org.
-* Once it reads the log, it collects information about failures, passes and
-  known failures, or takes the tempest output and parses it directly.
-* It should be able to work with Jinja2 templates to send email, so it's
-  possible to have different templates for different types of job.
-* It reads the list of addresses that the report should be sent to.
-
-  * The list is a dictionary mapping each email address to a list of tests
-    and/or jobs the users are interested in.
-
-* It renders the template with the proper data.
-* It sends the report.
-
-
-Dependencies
-============
-
-None.
-
-Testing
-=======
-
-As part of CI testing, the new tool should be used to send a report to a list
-of interested people.
-
-Documentation Impact
-====================
-
-Documentation should be updated to reflect the standard way to send the
-report and to call the script at the end of every periodic run.
-
-References
-==========
-
-Sagi's mail tempest script:
-https://github.com/sshnaidm/various/blob/master/check_tests.py
-
diff --git a/specs/pike/tripleo-ceph-ansible-integration.rst b/specs/pike/tripleo-ceph-ansible-integration.rst
deleted file mode 100644
index 7af09620..00000000
--- a/specs/pike/tripleo-ceph-ansible-integration.rst
+++ /dev/null
@@ -1,571 +0,0 @@
-..
-  This work is licensed under a Creative Commons Attribution 3.0 Unported
-  License.
-
-  http://creativecommons.org/licenses/by/3.0/legalcode
-
-===============================================
- Enable TripleO to Deploy Ceph via Ceph Ansible
-===============================================
-
-https://blueprints.launchpad.net/tripleo/+spec/tripleo-ceph-ansible
-
-Enable TripleO to deploy Ceph via Ceph Ansible using a new Mistral
-workflow. This will make the Ceph installation less tightly coupled
-with TripleO, but the existing operator interfaces to deploy Ceph with
-TripleO will still be supported until the end of the Queens release.
-
-Problem Description
-===================
-
-The Ceph community maintains ceph-ansible to deploy and manage Ceph.
-Members of the TripleO community maintain similar tools too. This is
-a proposal to have TripleO trigger the Ceph community's tools via
-Mistral as an alternative method to deploy and manage Ceph.
-
-Benefits of using another project to deploy and manage Ceph
-===========================================================
-
-Avoid duplication of effort
----------------------------
-
-If there is a feature or bug fix in the Ceph community's tools that is
-not in the tools used by TripleO, then members of the TripleO community
-could allow deployers to use those features directly instead of writing
-their own implementation. If this proposal is successful, then it
-might result in not having to maintain two code bases (along with the
-included bug fixes and testing) in the future. For example, if
-ceph-ansible fixed a bug to correctly handle alternative system paths
-to block devices, e.g. /dev/disk/by-path/ in lieu of /dev/sdb, then
-the same bug would not need to be fixed in puppet-ceph. This detail
-would also be nicely abstracted from a deployer because this spec
-proposes maintaining parity with TripleO Heat Templates. Thus, the
-deployer would not need to change the `ceph::profile::params::osds`
-parameter, as the same list of OSDs would work.
-
-In taking this approach it's possible for there to be cases where
-TripleO's deployment architecture has unique features that don't
-exist within ceph-ansible. In these cases, effort may need to be
-taken to ensure such features remain in parity with this approach.
-In no way does this proposal enable a TripleO deployer to bypass
-TripleO and use ceph-ansible directly. Also, because Ceph is not an
-OpenStack service itself but a service that TripleO uses, this
-approach remains consistent with the TripleO mission.
-
-
-Consistency between OpenStack and non-OpenStack Ceph deployments
-----------------------------------------------------------------
-
-A deployer may seek assistance from the Ceph community with a Ceph
-deployment, and this process will be simplified if both deployments
-are done using the same tool.
-
-Enable Decoupling of Ceph management from TripleO
--------------------------------------------------
-
-The complexity of Ceph management can be moved to a different tool
-and abstracted, where appropriate, from TripleO, making the Ceph
-management aspect of TripleO less complex. Combining this with
-containerized Ceph would offer flexible deployment options. This
-is a deployer benefit that is difficult to deliver today.
-
-Features in the Ceph community's tools not in TripleO's tools
--------------------------------------------------------------
-
-The Ceph community tool, ceph-ansible [1]_, offers benefits to
-OpenStack users not found in TripleO's tool chain, including playbooks
-to deploy Ceph in containers and to migrate a non-containerized
-deployment to a containerized deployment without downtime. Also,
-making the Ceph deployment in TripleO less tightly coupled, by moving
-it into a new Mistral workflow, would make it easier in a future
-release to add a business logic layer through a tool like Tendrl [2]_,
-to offer additional Ceph policy-based configurations and possibly a
-graphical tool to see the status of the Ceph cluster. However, the
-scope of this proposal for Pike does not include Tendrl; instead it
-takes the first step towards deploying Ceph via a Mistral workflow by
-triggering ceph-ansible directly. After the Pike cycle is complete,
-triggering Tendrl may be considered in a future spec.
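-
-To make that first step more concrete, the kind of Mistral task this spec has
-in mind might look roughly like the sketch below. The action name is borrowed
-from the experimental mistral-ansible-actions project, and the playbook path,
-input names and variables are illustrative assumptions rather than a final
-interface; the Proposed Change section below describes how such a task would
-fit into the overall deployment::
-
-  deploy_ceph:
-    action: ansible-playbook
-    input:
-      playbook: /usr/share/ceph-ansible/site-docker.yml.sample
-      inventory: /usr/bin/tripleo-ansible-inventory
-      extra_vars:
-        fsid: <% $.ceph_fsid %>
-        devices: <% $.osd_devices %>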
- -Proposed Change -=============== - -Overview --------- - -The ceph-ansible [1]_ project provides a set of playbooks to deploy -and manage Ceph. A proof of concept [3]_ has been written which uses -two custom Mistral actions from the experimental -mistral-ansible-actions project [4]_ to have a Mistral workflow on the -undercloud trigger ceph-ansible to produce a working hyperconverged -overcloud. - -The deployer experience to stand up Ceph with TripleO at the end of -this cycle should be the following: - -#. The deployer chooses to deploy a role containing any of the - Ceph server services: CephMon, CephOSD, CephRbdMirror, CephRgw, - or CephMds. -#. The deployer provides the same Ceph parameters they provide today - in a Heat env file, e.g. a list of OSDs. -#. The deployer starts the deploy and gets an overcloud with Ceph - -Thus, the deployment experience remains the same for the deployer but -behind the scenes a Mistral workflow is started which triggers -ceph-ansible. The details of the Mistral workflow to accomplish this -follows. - -TripleO Ceph Deployment via Mistral ------------------------------------ - -TripleO's workflow to deploy a Ceph cluster would be changed so that -there are two ways to deploy a Ceph cluster; the way currently -supported by TripleO and the way described in this proposal. - -The workflow described here assumes the following: - -#. A deployer chooses to deploy Ceph server services from the - following list of five services found in THT's roles_data.yaml: - CephMon, CephOSD, CephRbdMirror, CephRgw, or CephMds. -#. The deployer chooses to include new Heat environment files which - will be in THT when this spec is implemented. The new Heat - environment file will change the implementation of any of the five - services from the previous step. Using storage-environment.yaml, - which defaults to Ceph deployed by puppet-ceph, will still trigger - the Ceph deployment by puppet-ceph. However, if the new Heat - environment files are included instead of storage-environment.yaml, - then the implementation of the service will be done by ceph-ansible - instead; which already configures these services for hosts under - the following roles in the Ansible inventory: mons, osds, mdss, - rgws, or rbdmirrors. -#. The undercloud has a directory called /usr/share/ceph-ansible - which contains the ceph-ansible playbooks described in this spec. - It will be present because its install will contain the - installation of the ceph-ansible package. -#. The Mistral on the Undercloud will contain to custom actions called - `ansible` and `ansible-playbook` (or similar) and will also contain - the workflow for each task below and can be observed by running - `openstack workflow list`. Assume this is the case because the - tripleo-common package will be modified to ship these actions and - they will be available after undercloud installation. -#. Heat will ship a new CustomResource type like - OS::Mistral::WorflowExecution [6]_, which will execute custom - Mistral workflows. - -The standard TripleO workflow, as executed by a deployer, will create -a custom Heat resource which starts an independent Mistral workflow to -interact with ceph-ansible. An example of such a Heat resource would be -OS::Mistral::WorflowExecution [6]_. - -Each independent Mistral workflow may be implemented directly in -tripleo-common/workbooks. 
A separate Mistral workbook will be created -for each goal described below: - -* Initial deployment of OpenStack and Ceph -* Adding additional Ceph OSDs to existing OpenStack and Ceph clusters - -The initial goal for the Pike cycle will be to maintain feature parity -with what is possible today in TripleO and puppet-ceph but with -containerized Ceph. Additional Mistral workflows may be written, time -permitting or in a future cycle to add new features to TripleO's Ceph -deployment which leverage ceph-ansible playbooks to shrink the Ceph -Cluster and safely remove an OSD or to perform maintenance on the -cluster by using Ceph's 'noout' flag so that the maintenance does not -result in more data migration than necessary. - -Initial deployment of OpenStack and Ceph ----------------------------------------- - -The sequence of events for this new Mistral workflow and Ceph-Ansible -to be triggered during initial deployment with TripleO follows: - -#. Define the Overcloud on the Undercloud in Heat. This includes the - Heat parameters that are related to storage which will later be - passed to ceph-ansible via a Mistral workflow. -#. Run `openstack overcloud deploy` with standard Ceph options but - including a new Heat environment file to make the implementation - of the service deployment use ceph-ansible. -#. The undercloud assembles and uploads the deployment plan to the - undercloud Swift and Mistral environment. -#. Mistral starts the workflow to deploy the Overcloud and interfaces - with Heat accordingly. -#. A point in the deployment is reached where the Overcloud nodes are - imaged, booted, and networked. At that point the undercloud has - access to the provisioning or management IPs of the Overcloud - nodes. -#. A new Heat Resource is created which starts a Mistral workflow to - Deploy Ceph on the systems with the any of the five Ceph server - services, including CephMon, CephOSD, CephRbdMirror, CephRgw, or - CephMds [6]_. -#. The servers which host Ceph services have their relevant firewall - ports opened according to the needs of their service, e.g. the Ceph - monitor firewalls are configured to accept connections on TCP - port 6789. [7]_. -#. The Heat resource is passed the same parameters normally found in - the tripleo-heat-templates environments/storage-environment.yaml - but instead through a new Heat environment file. Additional files - may be passed to include overrides, e.g. the list of OSD disks. -#. The Heat resource passes its parameters to the Mistral workflow as - parameters. This will include information about which hosts should - have which of the five Ceph server services. -#. The Mistral workflow translates these parameters so that they match - the parameters that ceph-ansible expects, e.g. - ceph::profile::params::osds would become devices though they'd have - the same content, which would be a list of block devices. The - translation entails building an argument list that may be passed - to the playbook by calling `ansible-playbook --extra-vars`. - Typically ceph-ansible uses modified files in the group_vars - directory but in this case, no files are modified and instead the - parameters are passed programmatically. Thus, the playbooks in - /usr/share/ceph-ansible may be run unaltered and that will be the - default directory. However, it will be possible to pass an - alternative location for the /usr/share/ceph-ansible playbook as - an argument. No playbooks are run yet at this stage. -#. 
The Mistral environment is updated to generate a new SSH key-pair
-   for ceph-ansible and the Overcloud nodes using the same process
-   that is used to create the SSH keys for TripleO validations and
-   install the public key on Overcloud nodes. After this environment
-   update it will be possible to run `mistral environment-get
-   ssh_keys_ceph` on the undercloud and see the public and private
-   keys in JSON.
-#. The Mistral Action Plugin `ansible-playbook` is called and passed
-   the list of parameters as described earlier. The dynamic ansible
-   inventory used by tripleo-validations is used with the `-i`
-   option. In order for ceph-ansible to work as usual there must be
-   groups called `[mons]` and `[osds]` in the inventory, in addition
-   to optional groups for `[mdss]`, `[rgws]`, or `[rbdmirrors]`.
-   Modifications to the tripleo-validations project's
-   tripleo-ansible-inventory script may be made to support this, or a
-   derivative work of the same as shipped by TripleO common. The SSH
-   private key for the heat-admin user and the provisioning or
-   management IPs of the Overcloud nodes are what Ansible will use.
-#. The Mistral workflow computes the number of forks in Ansible
-   according to the number of machines that are going to be
-   bootstrapped and will pass this number with `ansible-playbook
-   --forks`.
-#. Mistral verifies that the Ansible ping module can execute `ansible
-   $group -m ping` for any group in mons, osds, mdss, rgws, or
-   rbdmirrors that was requested by the deployer. For example, if the
-   deployer only specified the CephMon and CephOSD services, then
-   Mistral will only run `ansible mons -m ping` and `ansible osds -m
-   ping`. The Ansible ping module will SSH into each host as the
-   heat-admin user with the key which was generated as described
-   previously. If this fails, then the deployment fails.
-#. Mistral starts the Ceph install using the `ansible-playbook`
-   action.
-#. The Mistral workflow creates a Zaqar queue to send progress
-   information back to the client (CLI or web UI).
-#. The workflow posts messages to the "tripleo" Zaqar queue or the
-   queue name provided to the original deploy workflow.
-#. If there is a problem during the deploy, the status may be seen by
-   running `openstack workflow execution list | grep ceph` and in the
-   logs at /var/log/mistral/{engine.log,executor.log}. Running
-   `openstack stack resource list` would show the custom Heat resource
-   that started the Mistral workflow, but `openstack workflow
-   execution list` and `openstack workflow task list` would contain
-   more details about which steps completed within the Mistral
-   workflow.
-#. The Ceph deployment is done in containers in a way which must
-   prevent any configuration file conflict for any composed service,
-   e.g. if a Nova compute container (as deployed by TripleO) and a
-   Ceph OSD container are on the same node, then they must have
-   different ceph.conf files, even if those files have the same
-   content. Though ceph-ansible will manage ceph.conf for Ceph
-   services and puppet-ceph will still manage ceph.conf for OpenStack
-   services, neither tool will try to manage the same ceph.conf
-   because it will be in a different location on the container host
-   and bind mounted to /etc/ceph/ceph.conf within different
-   containers.
-#. After the Mistral workflow is completed successfully, the custom
-   Heat resource is considered successfully created. If the Mistral
-   workflow does not complete successfully, then the Heat resource
-   is not considered successfully created. TripleO should handle this
-   the same way that it handles any Heat resource that failed to be
-   created. For example, because the workflow is idempotent, if the
-   resource creation fails because the wrong parameter was passed or
-   because of a temporary network issue, the deployer could simply run
-   a stack-update and the Mistral workflow would run again; if the
-   issues which caused the first run to fail were resolved, the
-   deployment should succeed. Similarly, if a user updates a
-   parameter, e.g. a new disk is added to
-   `ceph::profile::params::osds`, then the workflow will run again
-   without breaking the state of the running Ceph cluster, but it will
-   configure the new disk.
-#. After the dependency of the previous step is satisfied, the TripleO
-   Ceph external Heat resource is created to configure the appropriate
-   Overcloud nodes as Ceph clients.
-#. For the CephRGW service, hieradata will be emitted so that it may
-   be used for the haproxy listener setup and keystone users setup.
-#. The Overcloud deployment continues as if it was using an external
-   Ceph cluster.
-
-Adding additional Ceph OSD Nodes to existing OpenStack and Ceph clusters
--------------------------------------------------------------------------
-
-The process to add an additional Ceph OSD node is similar to the
-process to deploy the OSDs along with the Overcloud:
-
-#. Introspect the new hardware to host the OSDs.
-#. In the Heat environment file containing the node counts, increment
-   the CephStorageCount.
-#. Run `openstack overcloud deploy` with standard Ceph options and the
-   environment file which specifies the implementation of the Ceph
-   deployment via ceph-ansible.
-#. The undercloud updates the deployment plan.
-#. Mistral starts the workflow to update the Overcloud and interfaces
-   with Heat accordingly.
-#. A point in the deployment is reached where the new Overcloud nodes
-   are imaged, booted, and networked. At that point the undercloud has
-   access to the provisioning or management IPs of the Overcloud
-   nodes.
-#. A new Heat Resource is created which starts a Mistral workflow to
-   add new Ceph OSDs.
-#. TCP ports 6800:7300 are opened on the OSD host [7]_.
-#. The Mistral environment already has an SSH key-pair as described in
-   the initial deployment scenario. The same process that is used to
-   install the public SSH key on Overcloud nodes for TripleO
-   validations is used to install the SSH keys for ceph-ansible.
-#. If necessary, the Mistral workflow updates the number of forks in
-   Ansible according to the new number of machines that are going to
-   be bootstrapped.
-#. The dynamic Ansible inventory will contain the new node.
-#. Mistral confirms that Ansible can execute `ansible osds -m ping`.
-   This causes Ansible to SSH as the heat-admin user into all of the
-   CephOsdAnsible nodes, including the new nodes. If this fails, then
-   the update fails.
-#. Mistral uses the Ceph variables found in Heat as described in the
-   initial deployment scenario.
-#. Mistral runs the osd-configure.yaml playbook from ceph-ansible to
-   add the extra Ceph OSD server.
-#. The OSDs on the server are each deployed in their own containers
-   and `docker ps` will list each OSD container.
-#. After the Mistral workflow is completed, the custom Heat resource
-   is considered to be updated.
-#. No changes are necessary for the TripleO Ceph external Heat
-   resource since the Overcloud Ceph clients only need information
-   about new OSDs from the Ceph monitors.
-#. The Overcloud deployment continues as if it was using an external
-   Ceph cluster.
-
-Containerization of configuration files
-----------------------------------------
-
-As described in the Containerize TripleO spec, configuration files
-for a containerized service will be generated by Puppet and then
-passed to the containerized service using a configuration volume [8]_.
-A similar containerization feature is already supported by
-ceph-ansible, which uses the following sequence to generate the
-ceph.conf configuration file:
-
-* Ansible generates a ceph.conf on a monitor node
-* Ansible runs the monitor container and bind mounts /etc/ceph
-* No modification is done to the ceph.conf
-* Ansible copies the ceph.conf to the Ansible server
-* Ansible copies the ceph.conf and keys to the appropriate machine
-* Ansible runs the OSD container and bind mounts /etc/ceph
-* No modification is done to the ceph.conf
-
-These similar processes are compatible, even in the case of container
-hosts which run more than one OpenStack service but which each need
-their own copy of the configuration file per container. For example,
-consider a containerization node which hosts both Nova compute and
-Ceph OSD services. In this scenario, the Nova compute service would be
-a Ceph client and puppet-ceph would generate its ceph.conf, and the
-Ceph OSD service would be a Ceph server and ceph-ansible would
-generate its ceph.conf. It is necessary for Puppet to configure the
-Ceph client because Puppet configures the other OpenStack related
-configuration files as is already provided by TripleO. Both generated
-ceph.conf files would need to be stored in separate directories on the
-containerization hosts to avoid conflicts, and the directories could
-be mapped to specific containers. For example, host0 could have the
-following versions of foo.conf for two different containers::
-
-  host0:/container1/etc/foo.conf <--- generated by conf tool 1
-  host0:/container2/etc/foo.conf <--- generated by conf tool 2
-
-When each container is started on the host, the different
-configuration files could then be mapped to the different containers::
-
-  docker run container1 ... /container1/etc/foo.conf:/etc/foo.conf
-  docker run container2 ... /container2/etc/foo.conf:/etc/foo.conf
-
-In the above scenario, it is necessary for both configuration files
-to be generated from the same parameters. I.e. both Puppet and Ansible
-will use the same values from the Heat environment file, but will
-generate the configuration files differently. After the configuration
-programs have run it won't matter that Puppet idempotently updated
-lines of the ceph.conf and that Ansible used a Jinja2 template. What
-will matter is that both configuration files have the same values,
-e.g. the same FSID.
-
-Configuration files generated as described in the Containerize TripleO
-spec will not be stored in the container host's /etc directory before
-being passed to the container guest with a bind mount. By default,
-ceph-ansible generates the initial ceph.conf in the container host's
-/etc directory before it uses a bind mount to pass it through to the
-container. In order to be consistent with the Containerize TripleO
-spec, ceph-ansible will get a new feature for deploying Ceph in
-containers so that it will not generate the ceph.conf in the container
-host's /etc directory. The same option will need to apply when
-generating Ceph key rings, which will be stored in /etc/ceph in the
-container, but not on the container host.
- -Because Mistral on the undercloud runs the ansible playbooks, the -user "mistral" on the undercloud will be the one that SSH's into the -overcloud nodes to run ansible playbooks. Care will need to be taken -to ensure that user doesn't make changes which are out of scope. - -Alternatives ------------- - -From a high level, this proposal is an alternative to the current -method of deploying Ceph with TripleO and offers the benefits listed -in the problem description. - -From a lower level, how this proposal is implemented as described in -the Workflow section should be considered. - -#. In a split-stack scenario, after the hardware has been provisioned - by the first Heat stack and before the configuration Heat stack is - created, a Mistral workflow like the one in the POC [3]_ could be - run to configured Ceph on the Ceph nodes. This scenario would be - more similar to the one where TripleO is deployed using the TripleO - Heat Templates environment file puppet-ceph-external.yaml. This - could be an alternative to a new OS::Mistral::WorflowExecution Heat - resource [6]_. -#. Trigger the ceph-ansible deployment before the OpenStack deployment - In the initial workflow section, it is proposed that "A new - Heat Resource is created which starts a Mistral workflow to Deploy - Ceph". This may be difficult because, in general, composable services - currently define snippets of puppet data which is then later combined - to define the deployment steps, and there is not yet a way to support - running an arbitrary Mistral workflow at a given step of a deployment. - Thus, the Mistral workflow could be started first and then it could - wait for what is described in step 6 of the overview section. - -Security Impact ---------------- - -* A new SSH key pair will be created on the undercloud and will be - accessible in the Mistral environment via a command like - `mistral environment-get ssh_keys_ceph`. The public key of this - pair will be installed in the heat-admin user's authorized_keys - file on all Overcloud nodes which will be Ceph Monitors or OSDs. - This process will follow the same pattern used to create the SSH - keys used for TripleO validations so nothing new would happen in - that respect; just another instance on the same type of process. -* An additional tool would do configuration on the Overcloud, though - the impact of this should be isolated via Containers. -* Regardless of how Ceph services are configured, they require changes - to the firewall. This spec will implement parity in fire-walling for - Ceph services [7]_. - -Other End User Impact ---------------------- - -None. - -Performance Impact ------------------- - -The following applies to the undercloud: - -* Mistral will need to run an additional workflow -* Heat's role in deploying Ceph would be lessened so the Heat stack - would be smaller. - -Other Deployer Impact ---------------------- - -Ceph will be deployed using a method that is proven but who's -integration is new to TripleO. - -Developer Impact ----------------- - -None. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - fultonj - -Other contributors: - gfidente - leseb - colonwq - d0ugal (to review Mistral workflows/actions) - -Work Items ----------- - -* Prototype a Mistral workflow to independently install Ceph on - Overcloud nodes [3]_. [done] -* Prototype a Heat Resource to start an independent Mistral Workflow - [6]_. 
[done] -* Expand mistral-ansible-actions with necessary options (fultonj) -* Parametize mistral workflow (fultonj) -* Update and have merged Heat CustomResource [6]_ (gfidente) -* Have ceph-ansible create openstack pools and keys for containerized - deployments: https://github.com/ceph/ceph-ansible/issues/1321 (leseb) -* get ceph-ansible packaged in ceph.com and push to centos cbs - (fultonj / leseb) -* Make undercloud install produce /usr/share/ceph-ansible by modifying - RDO's instack RPM's spec file to add a dependency (fultonj) -* Submit mistral workflow and ansible-mistral-actions to - tripleo-common (fultonj) -* Prototype new service plugin interface that defines per-service - workflows (gfidente / shardy / fultonj) -* Submit new services into tht/roles_data.yaml so users can use it. - This should include a change to the tripleo-heat-templates - ci/environments/scenario001-multinode.yaml to include the new - service, e.g. CephMonAnsible so that CI is tested. This may not - work unless it all co-exists in a single overcloud deploy. - If it works, we use it to get started. The initial plan is for - scenario004 to keep using puppet-ceph. -* Implement the deleting the Ceph Cluster scenario -* Implement the adding additional Ceph OSDs to existing OpenStack and - Ceph clusters scenario -* Implement the removing Ceph OSD nodes scenario -* Implement the performing maintenance on Ceph OSD nodes (optional) - -Dependencies -============ - -Containerization of the Ceph services provided by ceph-ansible is -used to ensure the configuration tools aren't competing. This -will need to be compatible with the Containerize TripleO spec -[9]_. - -Testing -======= - -A change to tripleo-heat-templates' scenario001-multinode.yaml will be -submitted which includes deployment of the new services CephMonAnsible -and CephOsdAnsible (note that these role names will be changed when -fully working). This testing scenario may not work unless all of the -services may co-exist; however, preliminary testing indicates that -this will work. Initially scenario004 will not be modified and will be -kept using puppet-ceph. We may start by changing ovb-nonha scenario -first as we believe this may be faster. When the CI move to -tripleo-quickstart happens and there is a containers only scenario we -will want to add a hyperconverged containerized deployment too. - -Documentation Impact -==================== - -A new TripleO Backend Configuration document "Deploying Ceph with -ceph-ansible" would be required. - -References -========== - -.. [1] `ceph-ansible `_ -.. [2] `Tendrl `_ -.. [3] `POC tripleo-ceph-ansible `_ -.. [4] `Experimental mistral-ansible-actions project `_ -.. [6] `Proposed new Heat resource OS::Mistral::WorflowExecution `_ -.. [7] `These firewall changes must be managed in a way that does not conflict with TripleO's mechanism for managing host firewall rules and should be done before the Ceph servers are deployed. We are working on a solution to this problem.` -.. [8] `Configuration files generated by Puppet and passed to a containerized service via a config volume `_ -.. [9] `Spec to Containerize TripleO `_ diff --git a/specs/pike/tripleo-derive-parameters.rst b/specs/pike/tripleo-derive-parameters.rst deleted file mode 100644 index ab42a540..00000000 --- a/specs/pike/tripleo-derive-parameters.rst +++ /dev/null @@ -1,440 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================== -Deriving TripleO Parameters -=========================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-derive-parameters - -This specification proposes a generic interface for automatically -populating environment files with parameters which were derived from -formulas; where the formula's input came from introspected hardware -data, workload type, and deployment type. It also provides specific -examples of how this interface may be used to improve deployment of -overclouds to be used in DPDK or HCI usecases. Finally, it proposes -how this generic interface may be shared and extended by operators -who optionally chose to have certain parameters prescribed so that -future systems tuning expertise may be integrated into TripleO. - -Problem Description -=================== - -Operators must populate parameters for a deployment which may be -specific to hardware and deployment type. The hardware information -of a node is available to the operator once the introspection of the -node is completed. However, the current process requires that the -operator manually read the introspected data, make decisions based on -that data and then update the parameters in an environment file. This -makes deployment preparation unnecessarily complex. - -For example, when deploying for DPDK, the operator must provide the -list of CPUs which should be assigned to the DPDK Poll Mode Driver -(PMD) and the CPUs should be provided from the same NUMA node on which -the DPDK interface is present. In order to provide the correct -parameters, the operator must cross check all of these details. - -Another example is the deployment of HCI overclouds, which run both -Nova compute and Ceph OSD services on the same nodes. In order to -prevent contention between compute and storage services, the operator -may manually apply formulas, provided by performance tuning experts, -which take into account available hardware, type of workload, and type -of deployment, and then after computing the appropriate parameters -based on those formulas, manually store them in environment files. - -In addition to the complexity of the DPDK or HCI usecase, knowing the -process to assign CPUs to the DPDK Poll Mode Driver or isolate compute -and storage resources for HCI is, in itself, another problem. Rather -than document the process and expect operators to follow it, the -process should be captured in a high level language with a generic -interface so that performance tuning experts may easily share new -similar processes for other use cases with operators. - -Proposed Change -=============== - -This spec aims to make three changes to TripleO outlined below. - -Mistral Workflows to Derive Parameters --------------------------------------- - -A group of Mistral workflows will be added for the features which are -complex to determine the deployment parameters. Features like DPDK, -SR-IOV and HCI require, input from the introspection data to be -analyzed to compute the deployment parameters. This derive parameters -workflow will provide a default set of computational formulas by -analyzing the introspected data. Thus, there will be a hard dependency -with node introspection for this workflow to be successful. - -During the first iterations, all the roles in a deployment will be -analyzed to find a service associated with the role, which requires -parameter derivation. 
Various options of using this and the final -choice for the current iteration is discussed in below section -`Workflow Association with Services`_. - -This workflow assumes that all the nodes in a role have a homegenous -hardware specification and introspection data of the first node will -be used for processing the parameters for the entire role. This will -be reexamined in later iterations, based on the need for node specific -derivations. The workflow will consider the flavor-profile association -and nova placement scheduler to identify the nodes associated with a -role. - -Role-specific parameters are an important requirement for this workflow. -If there are multiple roles with the same service (feature) enabled, -the parameters which are derived from this workflow will be applied -only on the corresponding role. - -The input sources for these workflows are the ironic database and ironic -introspection data stored in Swift, in addition to the Deployment plan stored -in Swift. Computations done to derive the parameters within the Mistral -workflow will be implemented in YAQL. These computations will be a separate -workflow on per feature basis so that the formulas can be customizable. If an -operator has to modify the default formulas, he or she has to update only this -workflow with customized formula. - - -Applying Derived Parameters to the Overcloud --------------------------------------------- - -In order for the resulting parameters to be applied to the overcloud, -the deployment plan, which is stored in Swift on the undercloud, -will be modified with the Mistral `tripleo.parameters.update` action -or similar. - -The methods for providing input for derivation and the update of -parameters which are derivation output should be consistent with the -Deployment Plan Management specification [1]_. The implementation of -this spec with respect to the interfaces to set and get parameters may -change as it is updated. However, the basic workflow should remain the -same. - -Trigger Mistral Workflows with TripleO --------------------------------------- - -Assuming that workflows are in place to derive parameters and update the -deployment plan as described in the previous two sections, an operator may -take advantage of this optional feature by enabling it via ``plan- -environment.yaml``. A new section ``workflow_parameters`` will be added to -the ``plan-environments.yaml`` file to accomodate the additional parameters -required for executing workflows. With this additional section, we can ensure -that the workflow specific parameters are provide only to the workflow, -without polluting the heat environments. It will also be possible to provide -multiple plan environment files which will be merged in the CLI before plan -creation. - -These additional parameters will be read by the derive params workflow -directly from the merged ``plan-environment.yaml`` file stored in Swift. - -It is possible to modify the created plan or modify the profile-node -association, after the derive parameters workflow execution. As of -now, we assume that there no such alterations done, but it will be -extended after the initial iteration, to fail the deployment with -some validations. - -An operator should be able to derive and view parameters without doing a -deployment; e.g. "generate deployment plan". If the calculation is done as -part of the plan creation, it would be possible to preview the calculated -values. 
Alternatively the workflow could be run independently of the overcloud -deployment, but how that will fit with the UI workflow needs to be determined. - -Usecase 1: Derivation of DPDK Parameters -======================================== - -A part of the Mistral workflow which uses YAQL to derive DPDK -parameters based on introspection data, including NUMA [2]_, exists -and may be seen on GitHub [3]_. - -Usecase 2: Derivation Profiles for HCI -====================================== - -This usecase uses HCI, running Ceph OSD and Nova Compute on the same node. HCI -derive parameters workflow works with a default set of configs to categorize -the type of the workload that the role will host. An option will be provide to -override the default configs with deployment specific configs via ``plan- -environment.yaml``. - -In case of HCI deployment, the additional plan environment used for the -deployment will look like:: - - workflow_parameters: - tripleo.workflows.v1.derive_parameters: - # HCI Derive Parameters - HciProfile: nfv-default - HciProfileConfig: - default: - average_guest_memory_size_in_mb: 2048 - average_guest_CPU_utilization_percentage: 50 - many_small_vms: - average_guest_memory_size_in_mb: 1024 - average_guest_CPU_utilization_percentage: 20 - few_large_vms: - average_guest_memory_size_in_mb: 4096 - average_guest_CPU_utilization_percentage: 80 - nfv_default: - average_guest_memory_size_in_mb: 8192 - average_guest_CPU_utilization_percentage: 90 - -In the above example, the section ``workflow_parameters`` is used to provide -input parameters for the workflow in order to isolate Nova and Ceph -resources while maximizing performance for different types of guest -workloads. An example of the derivation done with these inputs is -provided in nova_mem_cpu_calc.py on GitHub [4]_. - - -Other Integration of Parameter Derivation with TripleO -====================================================== - -Users may still override parameters ------------------------------------ - -If a workflow derives a parameter, e.g. cpu_allocation_ratio, but the -operator specified a cpu_allocation_ratio in their overcloud deploy, -then the operator provided value is given priority over the derived -value. This may be useful in a case where an operator wants all of the -values that were derived but just wants to override a subset of those -parameters. - -Handling Cross Dependency Resources ------------------------------------ - -It is possible that multiple workflows will end up deriving parameters based -on the same resource (like CPUs). When this happens, it is important to have a -specific order for the workflows to be run considering the priority. - -For example, let us consider the resource CPUs and how it should be used -between DPDK and HCI. DPDK requires a set of dedicated CPUs for Poll Mode -Drivers (NeutronDpdkCoreList), which should not be used for host process -(ComputeHostCpusList) and guest VM's (NovaVcpuPinSet). HCI requires the CPU -allocation ratio to be derived based on the number of CPUs that are available -for guest VMs (NovaVcpuPinSet). Priority is given to DPDK, followed by HOST -parameters and then HCI parameters. 
In this case, the workflow execution -starts with a pool of CPUs, then: - -* DPDK: Allocate NeutronDpdkCoreList -* HOST: Allocate ComputeHostCpusList -* HOST: Allocate NovaVcpuPinSet -* HCI: Fix the cpu allocation ratio based on NovaVcpuPinSet - -Derived parameters for specific services or roles -------------------------------------------------- - -If an operator only wants to configure Enhanced Placement Awareness (EPA) -features like CPU pinning or huge pages, which are not associated with any -feature like DPDK or HCI, then it should be associated with just the compute -service. - -Workflow Association with Services ----------------------------------- - -The optimal way to associate the derived parameter workflows with -services, is to get the list of the enabled services on a given role, -by previewing Heat stack. With the current limitations in Heat, it is -not possible fetch the enabled services list on a role. Thus, a new -parameter will be introduced on the service which is associated with a -derive parameters workflow. If this parameter is referenced in the -heat resource tree, on a specific role, then the corresponding derive -parameter workflow will be invoked. For example, the DPDK service will -have a new parameter "EnableDpdkDerivation" to enable the DPDK -specific workflows. - -Future integration with TripleO UI ----------------------------------- - -If this spec were implemented and merged, then the TripleO UI could -have a menu item for a deployment, e.g. HCI, in which the deployer may -choose a derivation profile and then deploy an overcloud with that -derivation profile. - -The UI could better integrate with this feature by allowing a deployer -to use a graphical slider to vary an existing derivation profile and -then save that derivation profile with a new name. The following -cycle could be used by the deployer to tune the overcloud. - -* Choose a deployment, e.g. HCI -* Choose an HCI profile, e.g. many_small_vms -* Run the deployment -* Benchmark the planned workload on the deployed overcloud -* Use the sliders to change aspects of the derivation profile -* Update the deployment and re-run the benchmark -* Repeat as needed -* Save the new derivation profile as the one to be deployed in the field - -The implementation of this spec would enable the TripleO UI to support -the above. - -Alternatives ------------- - -The simplest alternative is for operators to determine what tunings -are appropriate by testing or reading documentation and then implement -those tunings in the appropriate Heat environment files. For example, -in an HCI scenario, an operator could run nova_mem_cpu_calc.py [4]_ -and then create a Heat environment file like the following with its -output and then deploy the overcloud and directly reference this -file:: - - parameter_defaults: - ExtraConfig: - nova::compute::reserved_host_memory: 75000 - nova::cpu_allocation_ratio: 8.2 - -This could translate into a variety of overrides which would require -initiative on the operator's part. - -Another alternative is to write separate tools which generate the -desired Heat templates but don't integrate them with TripleO. 
For -example, nova_mem_cpu_calc.py and similar, would produce a set of Heat -environment files as output which the operator would then include -instead of output containing the following: - -* nova.conf reserved_host_memory_mb = 75000 MB -* nova.conf cpu_allocation_ratio = 8.214286 - -When evaluating the above, keep in mind that only two parameters for -CPU allocation and memory are being provided as an example, but that -a tuned deployment may contain more. - -Security Impact ---------------- - -There is no security impact from this change as it sits at a higher -level to automate, via Mistral and Heat, features that already exist. - -Other End User Impact ---------------------- - -Operators need not manually derive the deployment parameters based on the -introspection or hardware specification data, as it is automatically derived -with pre-defined formulas. - -Performance Impact ------------------- - -The deployment and update of an overcloud may take slightly longer if -an operator uses this feature because an additional Mistral workflow -needs to run to perform some analytics before applying configuration -updates. However, the performance of the overcloud would be improved -because this proposal aims to make it easier to tune the overcloud for -performance. - -Other Deployer Impact ---------------------- - -A new configuration option is being added, but it has to be explicitly -enabled, and thus it would not take immediate effect after its merged. -Though, if a deployer chooses to use it and there is a bug in it, then -it could affect the overcloud deployment. If a deployer uses this new -option, and had a deploy in which they set a parameter directly, -e.g. the Nova cpu_allocation_ratio, then that parameter may be -overridden by a particular tuning profile. So that is something a -deployer should be aware of when using this proposed feature. - -The config options being added will ship with a variety of defaults -based on deployments put under load in a lab. The main idea is to make -different sets of defaults, which were produced under these -conditions, available. The example discussed in this proposal and to -be made available on completion could be extended. - -Developer Impact ----------------- - -This spec proposes modifying the deployment plan which, if there was a -bug, could introduce problems into a deployment. However, because the -new feature is completely optional, a developer could easily disable -it. 
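-
-To make the HCI example above more concrete, the following is a minimal
-sketch of the kind of calculation nova_mem_cpu_calc.py [4]_ performs. The
-constants, parameter names and exact formulas here are illustrative
-assumptions only, not the precise logic that the workflow will ship with::
-
-    MB_PER_GB = 1024
-    GB_PER_OSD = 3                # assumed memory set aside per Ceph OSD
-    GB_OVERHEAD_PER_GUEST = 0.5   # assumed per-guest hypervisor overhead
-    CORES_PER_OSD = 1.0           # assumed physical cores dedicated per OSD
-
-    def derive_hci_params(host_mem_gb, host_cpus, num_osds,
-                          average_guest_memory_size_in_mb,
-                          average_guest_CPU_utilization_percentage):
-        """Derive Nova reservations for a Nova compute + Ceph OSD node."""
-        # Memory left for guests once the OSDs have taken their share.
-        guest_mem_gb = host_mem_gb - (GB_PER_OSD * num_osds)
-        avg_guest_gb = average_guest_memory_size_in_mb / float(MB_PER_GB)
-        num_guests = int(guest_mem_gb /
-                         (avg_guest_gb + GB_OVERHEAD_PER_GUEST))
-
-        # Reserve for the host everything that guests cannot use.
-        reserved_host_memory_mb = MB_PER_GB * int(
-            (GB_PER_OSD * num_osds) +
-            (num_guests * GB_OVERHEAD_PER_GUEST))
-
-        # Cores left after the OSDs, shared according to how busy the
-        # average guest is expected to be.
-        guest_cpus = host_cpus - (CORES_PER_OSD * num_osds)
-        utilization = average_guest_CPU_utilization_percentage / 100.0
-        cpu_allocation_ratio = (guest_cpus / utilization) / host_cpus
-
-        return {
-            'nova::compute::reserved_host_memory': reserved_host_memory_mb,
-            'nova::cpu_allocation_ratio': round(cpu_allocation_ratio, 2),
-        }
-
-The result maps directly onto the ``ExtraConfig`` keys shown in the
-Alternatives section; the real workflow is expected to perform an
-equivalent computation in YAQL and write the values into the deployment
-plan instead of a Heat environment file.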
- -Implementation -============== - -Assignee(s) ------------ - -Primary assignees: - skramaja - fultonj - -Other contributors: - jpalanis - abishop - shardy - gfidente - -Work Items ----------- - -* Derive Params start workflow to find list of roles -* Workflow run for each role to fetch the introspection data and trigger - individual features workflow -* Workflow to identify if a service associated with a features workflow is - enabled in a role -* DPDK Workflow: Analysis and concluding the format of the input data (jpalanis) -* DPDK Workflow: Parameter deriving workflow (jpalanis) -* HCI Workflow: Run a workflow that calculates the parameters (abishop) -* SR-IOV Workflow -* EPA Features Workflow -* Run the derive params workflow from CLI -* Add CI scenario testing if workflow with produced expected output - -Dependencies -============ - -* NUMA Topology in introspection data (ironic-python-agent) [5]_ - -Testing -======= - -Create a new scenario in the TripleO CI in which a deployment is done -using all of the available options within a derivation profile called -all-derivation-options. A CI test would need to be added that would -test this new feature by doing the following: - -* A deployment would be done with the all-derivation-options profile -* The deployment would be checked that all of the configurations had been made -* If the configuration changes are in place, then the test passed -* Else the test failed - -Relating the above to the HCI usecase, the test could verify one of -two options: - -1. A Heat environment file created with the following syntactically - valid Heat:: - - parameter_defaults: - ExtraConfig: - nova::compute::reserved_host_memory: 75000 - nova::cpu_allocation_ratio: 8.2 - -2. The compute node was deployed such that the commands below return - something like the following:: - - [root@overcloud-osd-compute-0 ~]# grep reserved_host_memory /etc/nova/nova.conf - reserved_host_memory_mb=75000 - [root@overcloud-osd-compute-0 ~]# grep cpu_allocation_ratio /etc/nova/nova.conf - cpu_allocation_ratio=8.2 - [root@overcloud-osd-compute-0 ~]# - -Option 1 would put less load on the CI infrastructure and produce a -faster test but Option 2 tests the full scenario. - -If a new derived parameter option is added, then the all-derivation-options -profile would need to be updated and the test would need to be updated -to verify that the new options were set. - -Documentation Impact -==================== - -A new chapter would be added to the TripleO document on deploying with -derivation profiles. - -References -========== - -.. [1] `Deployment Plan Management specification `_ -.. [2] `Spec for Ironic to retrieve NUMA node info `_ -.. [3] ``_ -.. [4] `nova_mem_cpu_calc.py `_ -.. [5] `NUMA Topology in introspection data (ironic-python-agent) `_ diff --git a/specs/pike/tripleo-realtime.rst b/specs/pike/tripleo-realtime.rst deleted file mode 100644 index 31c35aaf..00000000 --- a/specs/pike/tripleo-realtime.rst +++ /dev/null @@ -1,235 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================================== -Add real-time compute nodes to TripleO -====================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-realtime - -Real-time guest VMs require compute nodes with a specific configuration to -control the sources of latency spikes. 
- -Problem Description -=================== - -Manual configuration of compute nodes to support real-time guests is possible. -However this is complex and time consuming where there is large number of -compute nodes to configure. - -On a real-time compute node a subset of the available physical CPUs (pCPUs) are -isolated and dedicated to real-time tasks. The remaining pCPUs are dedicated to -general housekeeping tasks. This requires a real-time Linux Kernel and real-time -KVM that allow their housekeeping tasks to be isolated. The real-time and -housekeeping pCPUs typically reside on different NUMA nodes. - -Huge pages are also reserved for guest VMs to prevent page faults, either via -the kernel command line or via sysfs. Sysfs is preferable as it allows the -reservation on each individual NUMA node to be set. - -A real-time Linux guest VM is partitioned in a similar manner, having one or -more real-time virtual CPUs (vCPUs) and one or more general vCPUs to handle -the non real-time housekeeping tasks. - -A real-time vCPU is pinned to a real-time pCPU while a housekeeping vCPU is -pinned to a housekeeping pCPUS. - -It is expected that operators would require both real-time and non real-time -compute nodes on the same overcloud. - -Use Cases ---------- - -The primary use-case is NFV appliances deployed by the telco community which -require strict latency guarantees. Other latency sensitive applications should -also benefit. - -Proposed Change -=============== - -This spec proposes changes to automate the deployment of real-time capable -compute nodes using TripleO. - -* a custom overcloud image for the real-time compute nodes, which shall include: - - * real-time Linux Kernel - * real-time KVM - * real-time tuned profiles - -* a new real-time compute role that is a variant of the existing compute role - - * huge pages shall be enabled on the real-time compute nodes. - * huge pages shall be reserved for the real-time guests. - * CPU pinning shall be used to isolate kernel housekeeping tasks from the - real-time tasks by configuring tuned. - * CPU pinning shall be used to isolate virtualization housekeeping tasks from - the real-time tasks by configuring nova. - -Alternatives ------------- - -None - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -Worse-case latency in real-time guest VMs should be significantly reduced. -However a real-time configuration potentially reduces the overall throughput of -a compute node. - -Other Deployer Impact ---------------------- - -The operator will remain responsible for: - -* appropriate BIOS settings on compute node. 
-* setting appropriate parameters for the real-time role in an environment file -* post-deployment configuration - - * creating/modifying overcloud flavors to enable CPU pinning, hugepages, - dedicated CPUs, real-time policy - * creating host aggregates for real-time and non real-time compute nodes - - - -Developer Impact ----------------- - -None - -Implementation -============== - -Real-time ``overcloud-full`` image creation: - -* create a disk-image-builder element to include the real-time packages -* add support for multiple overcloud images in python-tripleoclient CLIs:: - - openstack overcloud image build - openstack overcloud image upload - -Real-time compute role: - -* create a ``ComputeRealtime`` role - - * variant of the ``Compute`` role that can be configued and scaled - independently - * allows a different image and flavor to be used for real-time nodes - * includes any additional parameters/resources that apply to real-time nodes - -* create a ``NovaRealtime`` service - - * contains a nested ``NovaCompute`` service - * allows parameters to be overridden for the real-time role only - -Nova configuration: - -* Nova ``vcpu_pin_set`` support is already implemented. See NovaVcpuPinSet in - :ref:`references` - -Kernel/system configuration: - -* hugepages support - - * set default hugepage size (kernel cmdline) - * number of hugepages of each size to reserve at boot (kernel cmdline) - * number of hugepages of each size to reserve post boot on each NUMA node - (sysfs) - -* Kernel CPU pinning - - * isolcpu option (kernel cmdline) - -Ideally this can be implemented outside of TripleO in the Tuned profiles, where -it is possible to set the kernel command line and manage sysfs. TripleO would -then manage the Tuned profile config files. -Alternatively the grub and systemd config files can be managed directly. - -.. note:: - - This requirement is shared with OVS-DPDK. The development should be - coordinated to ensure a single implementation is implemented for - both use-cases. - Managing the grub config via a UserData script is the current approach used - for OVS-DPDK. See OVS-DPDK documentation in :ref:`references`. - -Assignee(s) ------------ - -Primary assignee: - owalsh - -Other contributors: - ansiwen - -Work Items ----------- - -As outlined in the proposed changes. - -Dependencies -============ - -* Libvirt real time instances - https://blueprints.launchpad.net/nova/+spec/libvirt-real-time -* Hugepages enabled in the Compute nodes. - https://bugs.launchpad.net/tripleo/+bug/1589929 -* CPU isolation of real-time and non real-time tasks. - https://bugs.launchpad.net/tripleo/+bug/1589930 -* Tuned - https://fedorahosted.org/tuned/ - -Testing -======= - -Genuine real-time guests are unlikely to be testable in CI: - -* specific BIOS settings are required. -* images with real-time Kernel and KVM modules are required - -However the workflow to deploy these guest should be testable in CI. - -Documentation Impact -==================== - -Manual steps performed by the operator shall be documented: - -* BIOS settings for low latency -* Real-time overcloud image creation - - .. note:: - - CentOS repos do not include RT packages. The CERN CentOS RT repository is an - alternative. -* Flavor and profile creation -* Parameters required in a TripleO environment file -* Post-deployment configuration - -.. 
_references: - -References -========== - -Nova blueprint `"Libvirt real time instances" -`_ - -The requirements are similar to :doc:`../newton/tripleo-ovs-dpdk` - -CERN CentOS 7 RT repo http://linuxsoft.cern.ch/cern/centos/7/rt/ - -NoveVcpuPinSet parameter added: https://review.openstack.org/#/c/343770/ - -OVS-DPDK documentation (work-in-progress): https://review.openstack.org/#/c/395431/ diff --git a/specs/pike/tripleo-routed-networks-ironic-inspector.rst b/specs/pike/tripleo-routed-networks-ironic-inspector.rst deleted file mode 100644 index cc730abb..00000000 --- a/specs/pike/tripleo-routed-networks-ironic-inspector.rst +++ /dev/null @@ -1,386 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================================== -Modify TripleO Ironic Inspector to PXE Boot Via DHCP Relay -========================================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-routed-networks-ironic-inspector - -This blueprint is part of the series tripleo-routed-networks-deployment [0]_. - -This spec describes adding features to the Undercloud to support Ironic -Inspector performing PXE boot services for multiple routed subnets (with -DHCP relay on the routers forwarding the requests). The changes required -to support this will be in the format of ``undercloud.conf`` and in the Puppet -script that writes the ``dnsmasq.conf`` configuration for Ironic Inspector. - -TripleO uses Ironic Inspector to perform baremetal inspection of overcloud -nodes prior to deployment. Today, the ``dnsmasq.conf`` that is used by Ironic -Inspector is generated by Puppet scripts that run when the Undercloud is -configured. A single subnet and IP allocation range is entered in -``undercloud.conf`` in the parameter ``inspection_iprange``. This spec would -implement support for multiple subnets in one provisioning network. - -Background Context -================== - -For a detailed description of the desired topology and problems being -addresssed, please reference the parent blueprint -triplo-routed-networks-deployment [0]_. - -Problem Descriptions -==================== - -Ironic Inspector DHCP doesn't yet support DHCP relay. This makes it -difficult to do introspection when the hosts are not on the same L2 domain -as the controllers. The dnsmasq process will actually function across a DHCP -relay, but the configuration must be edited by hand. - -Possible Solutions, Ideas, or Approaches: - -1. Add support for DHCP scopes and support for DHCP relays. -2. Use remote DHCP/PXE boot but provide L3 routes back to the introspection server -3. Use Neutron DHCP agent to PXE boot nodes for introspection (the Neutron - dhcp-agent already supports multiple subnets, and can be modified to support - DHCP relay). Note that there has been discussion about moving to Neutron for - Ironic Introspection on this bug [3]_. This is currently infeasible due to - Neutron not being able to issue IPs for unknown MACs. The related patch has - been abandoned [5]_. - - -Solution Implementation - -The Ironic Inspector DHCP server uses dnsmasq, but only configures one subnet. -We need to modify the Ironic Inspector DHCP configuration so that we can -configure DHCP for multiple Neutron subnets and allocation pools. Then we -should be able to use DHCP relay to send DHCP requests to the Ironic -Inspector DHCP server. 
In the long term, we can likely leverage the Routed -Networks work being done in Neutron to represent the subnets and allocation -pools that would be used for the DHCP range sets below. This spec only covers -the minimum needed for TripleO, so the work can be achieved simply by modifying -the Undercloud Puppet scripts. The following has been tested and shown -to result in successful introspection across two subnets, one local and one -across a router configured with DHCP relay:: - - Current dnsmasq.conf representing one network (172.20.0.0/24), which is - configured in the "inspection_iprange" in undercloud.conf: - port=0 - interface=br-ctlplane - bind-interfaces - dhcp-range=172.21.0.100,172.21.0.120,29 - dhcp-sequential-ip - dhcp-match=ipxe,175 - # Client is running iPXE; move to next stage of chainloading - dhcp-boot=tag:ipxe,http://172.20.0.1:8088/inspector.ipxe - dhcp-boot=undionly.kpxe,localhost.localdomain,172.20.0.1 - - Multiple-subnet dnsmasq.conf representing multiple subnets: - port=0 - interface=br-ctlplane - bind-interfaces - # Ranges and options - dhcp-range=172.21.0.100,172.21.0.120,29 - dhcp-range=set:leaf1,172.20.0.100,172.20.0.120,255.255.255.0,29 - dhcp-option=tag:leaf1,option:router,172.20.0.254 - dhcp-range=set:leaf2,172.19.0.100,172.19.0.120,255.255.255.0,29 - dhcp-option=tag:leaf2,option:router,172.19.0.254 - - dhcp-sequential-ip - dhcp-match=ipxe,175 - # Client is running iPXE; move to next stage of chainloading - dhcp-boot=tag:ipxe,http://172.20.0.1:8088/inspector.ipxe - dhcp-boot=undionly.kpxe,localhost.localdomain,172.20.0.1 - -In the above configuration, a router is supplied for all subnets, including -the subnet to which the Undercloud is attached. Note that the router is not -required for nodes on the same subnet as the inspector host, but if it gets -automatically generated it won't hurt anything. - -This file is created by the Puppet file located in [1]_. That is where the -changes will have to be made. - -As discussed above, using a remote DHCP/PXE server is a possibility only if we -have support in the top-of-rack switches, or if there is a system or VM -listening on the remote subnet to relay DHCP requests. This configuration of -dnsmasq will allow it to send DHCP offers to the DHCP relay, which forwards the -offer on to the requesting host. After the offer is accepted, the host can -communicate directly with the Undercloud, since it has already received the -proper gateway address for packets to be forwarded. It will send a DHCP request -directly based on the offer, and the DHCP ACK will be sent directly from the -Undercloud to the client. Downloading of the PXE images is then done via TFTP -and HTTP, not through the DHCP relay. - -An additional problem is that Ironic Inspector blacklists nodes that have -already been introspected using iptables rules blocking traffic from -particular MAC addresses. Since packets relayed via DHCP relay will come -from the MAC address of the router (not the original NIC that sent the packet), -we will need to blacklist MACs based on the contents of the relayed DHCP -packet. If possible, this blacklisting would be done using dnsmasq, which -would provide the ability to decode the DHCP Discover packets and act on the -contents. In order to do blacklisting directly with ``dnsmasq`` instead of -using iptables, we need to be able to influence the ``dnsmasq`` configuration -file. - -Proposed Change -=============== -The proposed changes are discussed below. 
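-
-Before detailing the changes, the following sketch illustrates the
-relationship between per-subnet data and the multi-subnet configuration
-shown above. It is illustrative only: the data structure and names are
-assumptions, and the actual change is made in the Puppet template [1]_
-rather than in Python::
-
-    def render_inspector_dhcp_ranges(subnets, inspector_ip):
-        """Render dhcp-range/dhcp-option lines for one or more subnets.
-
-        ``subnets`` is assumed to be a list of dicts with ``tag``,
-        ``start``, ``end``, ``netmask`` and ``gateway`` keys taken from
-        the (new, multi-subnet) inspection settings in undercloud.conf.
-        """
-        lines = [
-            "port=0",
-            "interface=br-ctlplane",
-            "bind-interfaces",
-        ]
-        for net in subnets:
-            lines.append("dhcp-range=set:%(tag)s,%(start)s,%(end)s,"
-                         "%(netmask)s,29" % net)
-            lines.append("dhcp-option=tag:%(tag)s,option:router,%(gateway)s"
-                         % net)
-        lines += [
-            "dhcp-sequential-ip",
-            "dhcp-match=ipxe,175",
-            "dhcp-boot=tag:ipxe,http://%s:8088/inspector.ipxe" % inspector_ip,
-            "dhcp-boot=undionly.kpxe,localhost.localdomain,%s" % inspector_ip,
-        ]
-        return "\n".join(lines) + "\n"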
- -Overview --------- - -The Puppet modules will need to be refactored to output a multi-subnet -``dnsmasq.conf`` from a list of subnets in undercloud.conf. - -The blacklisting functionality will need to be updated. Filtering by MAC -address won't work for DHCP requests that are relayed by a router. In that -case, the source MAC address will be the router interface that sent the -relayed request. There are methods to blacklist MAC addresses within dnsmasq, -such as this configuration:: - - dhcp-mac=blacklist, - dhcp-ignore=blacklist - -Or this configuration:: - - # Never offer DHCP service to a machine whose Ethernet - # address is 11:22:33:44:55:66 - dhcp-host=11:22:33:44:55:66,ignore - -The configuration could be placed into the main ``dnsmasq.conf`` file, or into -a file in ``/etc/dnsmasq.d/``. Either way, dnsmasq will have to be restarted -in order to re-read the configuration files. This is due to a security feature -in dnsmasq to prevent foreign configuration being loaded as root. Since DHCP -has a built-in retry mechanism, the brief time it takes to restart dnsmasq -should not impact introspection, as long as we don't restart dnsmasq too -many times in any 60-second period. - -It does not appear that the dnsmasq DBus interface can be used to set the -"dhcp-ignore" option for individual MAC addresses [4]_ [6]_. - -Alternatives ------------- - -One alternative approach is to use DHCP servers to assign IP addresses on all -hosts on all interfaces. This would simplify configuration within the Heat -templates and environment files. Unfortunately, this was the original approach -of TripleO, and it was deemed insufficient by end-users, who wanted stability -of IP addresses, and didn't want to have an external dependency on DHCP. - -Another approach which was considered was simply trunking all networks back -to the Undercloud, so that dnsmasq could respond to DHCP requests directly, -rather than requiring a DHCP relay. Unfortunately, this has already been -identified as being unacceptable by some large operators, who have network -architectures that make heavy use of L2 segregation via routers. This also -won't work well in situations where there is geographical separation between -the VLANs, such as in split-site deployments. - -Another approach is to use the DHCP server functionality in the network switch -infrastructure in order to PXE boot systems, then assign static IP addresses -after the PXE boot is done via DHCP. This approach would require configuration -at the switch level that influenced where systems PXE boot, potentially opening -up a security hole that is not under the control of OpenStack. This approach -also doesn't lend itself to automation that accounts for things like changes -to the PXE image that is being served to hosts. - -It is not necessary to use hardware routers to forward DHCP packets. There -are DHCP relay and DHCP proxy packages available for Linux. It is possible -to place a system or a VM on both the Provisioning network and the remote -network in order to forward DHCP requests. This might be one method for -implementing CI testing. Another method might trunk all remote provisioning -networks back to the Undercloud, with DHCP relay running on the Undercloud -forwarding to the local br-ctlplane. - -Security Impact ---------------- - -One of the major differences between spine-and-leaf and standard isolated -networking is that the various subnets are connected by routers, rather than -being completely isolated. 
This means that without proper ACLs on the routers, -private networks may be opened up to outside traffic. - -This should be addressed in the documentation, and it should be stressed that -ACLs should be in place to prevent unwanted network traffic. For instance, the -Internal API network is sensitive in that the database and message queue -services run on that network. It is supposed to be isolated from outside -connections. This can be achieved fairly easily if *supernets* are used, so that -if all Internal API subnets are a part of the ``172.19.0.0/16`` supernet, an -ACL rule will allow only traffic between Internal API IPs (this is a simplified -example that could be applied on all Internal API router VLAN interfaces -or as a global ACL):: - - allow traffic from 172.19.0.0/16 to 172.19.0.0/16 - deny traffic from * to 172.19.0.0/16 - -In the case of Ironic Inspector, the TFTP server is a potential point of -vulnerability. TFTP is inherently unauthenticated and does not include an -access control model. The network(s) where Ironic Inspector is operating -should be secured from remote access. - -Other End User Impact ---------------------- - -Deploying with spine-and-leaf will require additional parameters to -provide the routing information and multiple subnets required. This will have -to be documented. Furthermore, the validation scripts may need to be updated -to ensure that the configuration is validated, and that there is proper -connectivity between overcloud hosts. - -Performance Impact ------------------- - -Much of the traffic that is today made over layer 2 will be traversing layer -3 routing borders in this design. That adds some minimal latency and overhead, -although in practice the difference may not be noticeable. One important -consideration is that the routers must not be too overcommitted on their -uplinks, and the routers must be monitored to ensure that they are not acting -as a bottleneck, especially if complex access control lists are used. - -The DHCP process is not likely to be affected, however delivery of system -images via TFTP may suffer a performance degredation. Since TFTP does not -deal well with packet loss, deployers will have to take care not to -oversaturate the links between routing switches. - -Other Deployer Impact ---------------------- - -A spine-and-leaf deployment will be more difficult to troubleshoot than a -deployment that simply uses a set of VLANs. The deployer may need to have -more network expertise, or a dedicated network engineer may be needed to -troubleshoot in some cases. - -Developer Impact ----------------- - -Spine-and-leaf is not easily tested in virt environments. This should be -possible, but due to the complexity of setting up libvirt bridges and -routes, we may want to provide a simulation of spine-and-leaf for use in -virtual environments. This may involve building multiple libvirt bridges -and routing between them on the Undercloud, or it may involve using a -DHCP relay on the virt-host as well as routing on the virt-host to simulate -a full routing switch. A plan for development and testing will need to be -formed, since not every developer can be expected to have a routed -environment to work in. It may take some time to develop a routed virtual -environment, so initial work will be done on bare metal. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Dan Sneddon - -Final assignees to be determined. 
- -Approver(s) ------------ - -Primary approver: - Emilien Macchi - -Work Items ----------- - -1. Modify Ironic Inspector ``dnsmasq.conf`` generation to allow export of - multiple DHCP ranges. The patch enabling this has merged [7]_. -2. Modify the Ironic Inspector blacklisting mechanism so that it supports DHCP - relay, since the DHCP requests forwarded by the router will have the source - MAC address of the router, not the node being deployed. -3. Modify the documentation in ``tripleo-docs`` to cover the spine-and-leaf case. -4. Add an upstream CI job to test booting across subnets (although - hardware availability may make this a long-term goal). - -[*] Note that depending on the timeline for Neutron/Ironic integration, it might -make sense to implement support for multiple subnets via changes to the Puppet -modules which process ``undercloud.conf`` first, then follow up with a patch -to integrate Neutron networks into Ironic Inspector later on. - -Implementation Details ----------------------- - -Workflow for introspection and deployment: - -1. Network Administrator configures all provisioning VLANs with IP address of - Undercloud server on the ctlplane network as DHCP relay or "helper-address". -2. Operator configures IP address ranges and default gateways in - ``undercloud.conf``. Each subnet will require its own IP address range. -3. Operator imports baremetal instackenv.json. -4. When introspection or deployment is run, the DHCP server receives the DHCP - request from the baremetal host via DHCP relay. -5. If the node has not been introspected, reply with an IP address from the - introspection pool and the inspector PXE boot image. -6. Introspection is performed. LLDP collection [2]_ is performed to gather - information about attached network ports. -7. The node is blacklisted in ``dnsmasq.conf`` (or in ``/etc/dnsmasq.d``), - and dnsmasq is restarted. -8. On the next boot, if the MAC address is blacklisted and a port exists in - Neutron, then Neutron replies with the IP address from the Neutron port - and the overcloud-full deployment image. -9. The Heat templates are processed which generate os-net-config templates, and - os-net-config is run to assign static IPs from the correct subnets, as well - as routes to other subnets via the router gateway addresses. - -When using spine-and-leaf, the DHCP server will need to provide an introspection -IP address on the appropriate subnet, depending on the information contained in -the DHCP relay packet that is forwarded by the segment router. dnsmasq will -automatically match the gateway address (GIADDR) of the router that forwarded -the request to the subnet where the DHCP request was received, and will respond -with an IP and gateway appropriate for that subnet. - -The above workflow for the DHCP server should allow for provisioning IPs on -multiple subnets. - -Dependencies -============ - -There will be a dependency on routing switches that perform DHCP relay service -for production spine-and-leaf deployments. Since we will not have routing -switches in our virtual testing environment, a DHCP proxy may be set up as -described in the testing section below. - -Testing -======= - -In order to properly test this framework, we will need to establish at least -one CI test that deploys spine-and-leaf. As discussed in this spec, it isn't -necessary to have a full routed bare metal environment in order to test this -functionality, although there is some work required to get it working in virtual -environments such as OVB. 
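-
-The essential job of any stand-in for the routers is small: receive the
-broadcast DHCP request on the leaf network, copy its own leaf-side
-address into the BOOTP ``giaddr`` field (which is what dnsmasq uses to
-pick the matching ``dhcp-range``), and forward the request to the
-Undercloud. The sketch below shows only the client-to-server direction
-and uses example addresses; in practice one of the off-the-shelf relay
-or proxy packages mentioned in the Alternatives section would be used::
-
-    import socket
-
-    RELAY_IP = "172.19.0.254"   # assumed leaf-side address, becomes giaddr
-    SERVER_IP = "172.20.0.1"    # assumed Undercloud br-ctlplane address
-
-    def relay_requests():
-        # Listen for broadcast DHCP requests on port 67 (requires root).
-        listener = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-        listener.bind(("", 67))
-
-        sender = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        while True:
-            data, _client = listener.recvfrom(4096)
-            if len(data) < 240:          # too short to be a BOOTP/DHCP packet
-                continue
-            packet = bytearray(data)
-            # giaddr occupies bytes 24-27 of the BOOTP header; set it only
-            # if no other relay has filled it in already.
-            if packet[24:28] == b"\x00\x00\x00\x00":
-                packet[24:28] = socket.inet_aton(RELAY_IP)
-            sender.sendto(bytes(packet), (SERVER_IP, 67))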
- -For virtual testing, it is sufficient to trunk all VLANs back to the -Undercloud, then run DHCP proxy on the Undercloud to receive all the -requests and forward them to br-ctlplane, where dnsmasq listens. This -will provide a substitute for routers running DHCP relay. - -Documentation Impact -==================== - -The TripleO docs will need to be updated to include detailed instructions -for deploying in a spine-and-leaf environment, including the environment -setup. Covering specific vendor implementations of switch configurations -is outside this scope, but a specific overview of required configuration -options should be included, such as enabling DHCP relay (or "helper-address" -as it is also known) and setting the Undercloud as a server to receive -DHCP requests. - -The updates to TripleO docs will also have to include a detailed discussion -of choices to be made about IP addressing before a deployment. If supernets -are to be used for network isolation, then a good plan for IP addressing will -be required to ensure scalability in the future. - -References -========== - -.. [0] `Spec: Routed Networks for Neutron `_ -.. [1] `Source Code: inspector_dnsmasq_http.erb `_ -.. [2] `Review: Add LLDP processing hook and new CLI commands `_ -.. [3] `Bug: [RFE] Implement neutron routed networks support in Ironic `_ -.. [4] `Wikibooks: Python Programming: DBus `_ -.. [5] `Review: Enhanced Network/Subnet DHCP Options `_ -.. [6] `Documentation: DBus Interface for dnsmasq `_ -.. [7] `Review: Multiple DHCP Subnets for Ironic Inspector `_ diff --git a/specs/policy-template.rst b/specs/policy-template.rst deleted file mode 100644 index 442d35d5..00000000 --- a/specs/policy-template.rst +++ /dev/null @@ -1,126 +0,0 @@ -.. - This template should be in ReSTructured text. For help with syntax, - see http://sphinx-doc.org/rest.html - - To test out your formatting, build the docs using tox, or see: - http://rst.ninjs.org - - The filename in the git repository should match the launchpad URL, - for example a URL of - https://blueprints.launchpad.net/oslo?searchtext=awesome-thing should be - named awesome-thing.rst. - - For specs targeted at a single project, please prefix the first line - of your commit message with the name of the project. For example, - if you're submitting a new feature for oslo.config, your git commit - message should start something like: "config: My new feature". - - Wrap text at 79 columns. - - Do not delete any of the sections in this template. If you have - nothing to say for a whole section, just write: None - - If you would like to provide a diagram with your spec, ascii diagrams are - required. http://asciiflow.com/ is a very nice tool to assist with making - ascii diagrams. The reason for this is that the tool used to review specs is - based purely on plain text. Plain text will allow review to proceed without - having to look at additional files which can not be viewed in gerrit. It - will also allow inline feedback on the diagram itself. - -========================= - The title of the policy -========================= - -Introduction paragraph -- why are we doing anything? - -Problem Description -=================== - -A detailed description of the problem. - -Policy -====== - -Here is where you cover the change you propose to make in detail. How do you -propose to solve this problem? - -If the policy seeks to modify a process or workflow followed by the -team, explain how and why. - -If this is one part of a larger effort make it clear where this piece ends. 
In -other words, what's the scope of this policy? - -Alternatives & History -====================== - -What other ways could we do this thing? Why aren't we using those? This doesn't -have to be a full literature review, but it should demonstrate that thought has -been put into why the proposed solution is an appropriate one. - -If the policy changes over time, summarize the changes here. The exact -details are always available by looking at the git history, but -summarizing them will make it easier for anyone to follow the desired -policy and understand when and why it might have changed. - -Implementation -============== - -Author(s) ---------- - -Who is leading the writing of the policy? If more than one person is -working on it, please designate the primary author and contact. - -Primary author: - - -Other contributors: - - -Milestones ----------- - -When will the policy go into effect? - -If there is a built-in deprecation period for the policy, or criteria -that would trigger it no longer being in effect, describe them. - -Work Items ----------- - -List any concrete steps we need to take to implement the policy. - -References -========== - -Please add any useful references here. You are not required to have -any references. Moreover, this policy should still make sense when -your references are unavailable. Examples of what you could include -are: - -* Links to mailing list or IRC discussions - -* Links to notes from a summit session - -* Links to relevant research, if appropriate - -* Related policies as appropriate - -* Anything else you feel it is worthwhile to refer to - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - - - Introduced - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/adding-ci-jobs.rst b/specs/policy/adding-ci-jobs.rst deleted file mode 100644 index a6b23a10..00000000 --- a/specs/policy/adding-ci-jobs.rst +++ /dev/null @@ -1,146 +0,0 @@ -==================== - Adding New CI Jobs -==================== - -New CI jobs need to be added following a specific process in order to ensure -they don't block patches unnecessarily and that they aren't ignored by -developers. - -Problem Description -=================== - -We need to have a process for adding CI jobs that is not going to result -in a lot of spurious failures due to the new jobs. Bogus CI results force -additional rechecks and reduce developer/reviewer confidence in the results. - -In addition, maintaining CI jobs is a non-trivial task, and each one we add -increases the load on the team. Hopefully having a process that requires the -involvement of the new job's proposer makes it clear that the person/team -adding the job has a responsibility to help maintain it. CI is everyone's -problem. - -Policy -====== - -The following steps should be completed in the order listed when adding a new -job: - -#. Create an experimental job or hijack an existing job for a single Gerrit - change. See the references section for details on how to add a new job. - This job should be passing before moving on to the next step. - -#. Verify that the new job is providing a reasonable level of logging. Not - too much, not too little. Important logs, such as the OpenStack service - logs and basic system logs, are necessary to determine why jobs fail. 
- However, OpenStack Infra has to store the logs from an enormous number of - jobs, so it is also important to keep our log artifact sizes under control. - When in doubt, try to capture about the same amount of logs as the existing - jobs. - -#. Promote the job to check non-voting. While the job should have been - passing prior to this, it most likely has not been run a significant number - of times, so the overall stability is still unknown. - - "Stable" in this case would be defined as not having significantly more - spurious failures than the ovb-ha job. Due to the additional complexity of - an HA deployment, that job tends to fail for reasons unrelated to the patch - being tested more often than the other jobs. We do not want to add any - jobs that are less stable. Note that failures due to legitimate problems - being caught by the new job should not count against its stability. - - .. important:: Before adding OVB jobs to the check queue, even as - non-voting, please check with the CI admins to ensure there is enough - OVB capacity to run a large number of new jobs. As of this writing, - the OVB cloud capacity is significantly more constrained than regular - OpenStack Infra. - - A job should remain in this state until it has been proven stable over a - period of time. A good rule of thumb would be that after a week of - stability the job can and should move to the next step. - - .. important:: Jobs should not remain non-voting indefinitely. This causes - reviewers to ignore the results anyway, so the jobs become a waste of - resources. Once a job is believed to be stable, it should be made - voting as soon as possible. - -#. To assist with confirming the stability of a job, it should be added to the - `CI Status `_ page at this point. This - can actually be done at any time after the job is moved to the check queue, - but must be done before the job becomes voting. - - Additionally, contact Sagi Shnaidman (sshnaidm on IRC) to get the job - added to the `Extended CI Status `_ - page. - -#. Send an e-mail to openstack-dev, tagged with [tripleo], that explains the - purpose of the new job and notifies people that it is about to be made - voting. - -#. Make the job voting. At this point there should be sufficient confidence - in the job that reviewers can trust the results and should not merge - anything which does not pass it. - - In addition, be aware that voting multinode jobs are also gating. If the - job fails the patch cannot merge. This means a broken job can block all - TripleO changes from merging. - -#. Keep an eye on the `CI Status `_ page to - ensure the job keeps running smoothly. If it starts to fail an unusual - amount, please investigate. - -Alternatives & History -====================== - -Historically, a number of jobs have been added to the check queue when they -were completely broken. This is bad and reduces developer and reviewer -confidence in the CI results. It can also block TripleO changes from merging -if the broken job is gating. - -We also have a bad habit of leaving jobs in the non-voting state, which makes -them fairly worthless since reviewers will not respect the results. Per -this policy, we should clean up all of the non-voting jobs by either moving -them back to experimental, or stabilizing them and making them voting. - -Implementation -============== - -Author(s) ---------- - -Primary author: - bnemec - -Milestones ----------- - -This policy would go into effect immediately. 
- -Work Items ----------- - -This policy is mostly targeted at new jobs, but we do have a number of -non-voting jobs that should be brought into compliance with it. - -References -========== - -`OpenStack Infra Manual `_ - -`Adding a New Job `_ - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - Pike - - Introduced - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/bug-tagging.rst b/specs/policy/bug-tagging.rst deleted file mode 100644 index 6fce6f31..00000000 --- a/specs/policy/bug-tagging.rst +++ /dev/null @@ -1,150 +0,0 @@ -======== -Bug tags -======== - -The main TripleO bug tracker is used to keep track of bugs for multiple -projects that are all parts of TripleO. In order to reduce confusion, -we are using a list of approved tags to categorize them. - -Problem Description -=================== - -Given the heavily interconnected nature of the various TripleO -projects, there is a desire to track all the related bugs in a single -bug tracker. However when it is needed, it can be difficult to narrow -down the bugs related to a specific aspect of the project. Launchpad -bug tags can help us here. - -Policy -====== - -The Launchpad official tags list for TripleO contains the following -tags. Keeping them official in Launchpad means the tags will -auto-complete when users start writing them. A bug report can have any -combination of these tags, or none. - -Proposing new tags should be done via policy update (proposing a change -to this file). Once such a change is merged, a member of the driver -team will create/delete the tag in Launchpad. - -Tags ----- - -+-------------------------------+----------------------------------------------------------------------------+ -| Tag | Description | -+===============================+============================================================================+ -| alert | For critical bugs requiring immediate attention. 
Triggers IRC notification | -+-------------------------------+----------------------------------------------------------------------------+ -| ci | A bug affecting the Continuous Integration system | -+-------------------------------+----------------------------------------------------------------------------+ -| ci-reproducer | A bug affecting local recreation of Continuous Integration environments | -+-------------------------------+----------------------------------------------------------------------------+ -| config-agent | A bug affecting os-collect-config, os-refresh-config, os-apply-config | -+-------------------------------+----------------------------------------------------------------------------+ -| containers | A bug affecting container based deployments | -+-------------------------------+----------------------------------------------------------------------------+ -| depcheck | A bug affecting 3rd party dependencies, for example ceph-ansible, podman | -+-------------------------------+----------------------------------------------------------------------------+ -| deployment-time | A bug affecting deployment time | -+-------------------------------+----------------------------------------------------------------------------+ -| documentation | A bug that is specific to documentation issues | -+-------------------------------+----------------------------------------------------------------------------+ -| edge | A bug that correlates to EDGE computing cases by network/scale etc. areas | -+-------------------------------+----------------------------------------------------------------------------+ -| i18n | A bug related to internationalization issues | -+-------------------------------+----------------------------------------------------------------------------+ -| low-hanging-fruit | A good starter bug for newcomers | -+-------------------------------+----------------------------------------------------------------------------+ -| networking | A bug that is specific to networking issues | -+-------------------------------+----------------------------------------------------------------------------+ -| promotion-blocker | Bug that is blocking promotion job(s) | -+-------------------------------+----------------------------------------------------------------------------+ -| puppet | A bug affecting the TripleO Puppet templates | -+-------------------------------+----------------------------------------------------------------------------+ -| quickstart | A bug affecting tripleo-quickstart or tripleo-quickstart-extras | -+-------------------------------+----------------------------------------------------------------------------+ -| selinux | A bug related to SELinux | -+-------------------------------+----------------------------------------------------------------------------+ -| tech-debt | A bug related to TripleO tech debt | -+-------------------------------+----------------------------------------------------------------------------+ -| tempest | A bug related to tempest running on TripleO | -+-------------------------------+----------------------------------------------------------------------------+ -| tripleo-common | A bug affecting tripleo-common | -+-------------------------------+----------------------------------------------------------------------------+ -| tripleo-heat-templates | A bug affecting the TripleO Heat Templates | -+-------------------------------+----------------------------------------------------------------------------+ -| tripleoclient | A bug 
affecting python-tripleoclient | -+-------------------------------+----------------------------------------------------------------------------+ -| ui | A bug affecting the TripleO UI | -+-------------------------------+----------------------------------------------------------------------------+ -| upgrade | A bug affecting upgrades | -+-------------------------------+----------------------------------------------------------------------------+ -| ux | A bug affecting user experience | -+-------------------------------+----------------------------------------------------------------------------+ -| validations | A bug affecting the Validations | -+-------------------------------+----------------------------------------------------------------------------+ -| workflows | A bug affecting the Mistral workflows | -+-------------------------------+----------------------------------------------------------------------------+ -| xxx-backport-potential | Cherry-pick request for the stable team | -+-------------------------------+----------------------------------------------------------------------------+ - -Alternatives & History -====================== - -The current ad-hoc system is not working well, as people use -inconsistent subject tags and other markers. Likewise, with the list -not being official Launchpad tags do not autocomplete and quickly -become inconsistent, hence not as useful. - -We could use the wiki to keep track of the tags, but the future of the -wiki is in doubt. By making tags an official policy, changes to the -list can be reviewed. - -Implementation -============== - -Author(s) ---------- - -Primary author: - jpichon - -Milestones ----------- - -Newton-3 - -Work Items ----------- - -Once the policy has merged, someone with the appropriate Launchpad -permissions should create the tags and an email should be sent to -openstack-dev referring to this policy. - -References -========== - -Launchpad page to manage the tag list: -https://bugs.launchpad.net/tripleo/+manage-official-tags - -Thread that led to the creation of this policy: -http://lists.openstack.org/pipermail/openstack-dev/2016-July/099444.html - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - Newton - - Introduced - * - Queens - - tech-debt tag added - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/ci-team-structure.rst b/specs/policy/ci-team-structure.rst deleted file mode 100644 index b50bf14d..00000000 --- a/specs/policy/ci-team-structure.rst +++ /dev/null @@ -1,246 +0,0 @@ -CI Team Structure -================= - -Problem Description -------------------- -The soft analysis over the past one to two years is that landing major new -features and function in CI is difficult while being interrupted by a constant -stream of issues. Each individual is siloed in their own work, feature or -section of the production chain and there is very little time for thoughtful -peer review and collaborative development. - -Policy ------- - -Goals -^^^^^ - - * Increase developer focus, decrease distractions, interruptions, and time - slicing. - * Encourage collaborative team development. - * Better and faster code reviews - -Team Structure -^^^^^^^^^^^^^^ - * The Ruck - * The Rover - * The Sprint Team - -The Ruck -^^^^^^^^ -One person per week will be on the front lines reporting failures found in CI. 
-The Ruck & Rover switch roles in the second week of the sprint. - - * Primary focus is to watch CI, report bugs, improve debug documentation. - * Does not participate in the sprint - * Attends the meetings where the team needs to be represented - * Responds to pings on #oooq / #tripleo regarding CI - * Reviews and improves documentation - * Attends meetings for the group where possible - * For identification, use the irc nick $user|ruck - -The Rover -^^^^^^^^^ -The primary backup for the Ruck. The Ruck should be catching all the issues -in CI and passing the issues to the Rover for more in depth analysis or -resolution of the bug. - - * Back up for the Ruck - * Workload is driven from the tripleo-quickstart bug queue, the Rover is - not monitoring CI - * A secondary input for work is identified technical debt defined in the - Trello board. - * Attends the sprint meetings, but is not responsible for any sprint work - * Helps to triage incoming gerrit reviews - * Responds to pings on irc #oooq / #tripleo - * If the Ruck is overwhelmed with any of their responsibilities the - Rover is the primary backup. - * For identification, use the irc nick $user|rover - -The Sprint Team -^^^^^^^^^^^^^^^ -The team is defined at the beginning of the sprint based on availability. -Members on the team should be as focused on the sprint epic as possible. -A member of team should spend 80% of their time on sprint goals and 20% -on any other duties like code review or incoming high priority bugs that -the Rover can not manage alone. - - * hand off interruptions to the Ruck and Rover as much as possible - * focus as a team on the sprint epic - * collaborate with other members of the sprint team - * seek out peer review regarding sprint work - * keep the Trello board updated daily - * One can point to Trello cards in stand up meetings for status - -The Squads -^^^^^^^^^^ -The squads operate as a subunit of the sprint team. Each squad will operate -with the same process and procedures and are managed by the team catalyst. - - * Current Squads - * CI - * Responsible for the TripleO CI system ( non-infra ) and build - verification. - * Tempest - * Responsible for tempest development. - -Team Leaders ------------- - -The team catalyst (TC) -^^^^^^^^^^^^^^^^^^^^^^ -The member of the team responsible organizing the group. The team will elect or -appoint a team catalyst per release. - - * organize and plan sprint meetings - * collect status and send status emails - -The user advocate (UA) -^^^^^^^^^^^^^^^^^^^^^^ -The member of the team responsible for help to prioritize work. The team will -elect or appoint a user advocate per release. - - * organize and prioritize the Trello board for the sprint planning - * monitor the board during the sprint. - * ensure the right work is being done. - -The Squads -^^^^^^^^^^ -There are two squads on the CI team. - - * tripleo ci - * tempest development - -Each squad has a UA and they share a TC. Both contribute to Ruck and Rover rotations. - - -Current Leaders for Rocky -^^^^^^^^^^^^^^^^^^^^^^^^^^ - * team catalyst (ci, tempest) - Matt Young - * user advocate (ci) - Gabriele Cerami - * user advocate (tempest) - Chandan Kumar - -Sprint Structure -^^^^^^^^^^^^^^^^ -The goal of the sprint is to define a narrow and focused feature called an epic -to work on in a collaborative way. Work not completed in the sprint will be -added to the technical debt column of Trello. - -**Note:** Each sprint needs a clear definition of done that is documented in -the epic used for the sprint. 
- -Sprint Start ( Day 1 ) - 2.5 hours -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - * Sprints are three weeks in length - * A planning meeting is attended by the entire team including the Ruck and - Rover - * Review PTO - * Review any meetings that need to be covered by the Ruck/Rover - * The UA will present options for the sprint epic - * Discuss the epic, lightly breaking each one down - * Vote on an epic - * The vote can be done using a doodle form - * Break down the sprint epic into cards - * Review each card - * Each card must have a clear definition of done - * As a group include as much detail in the card as to provide enough - information for an engineer with little to no background with the task. - - -Sprint End ( Day 15 ) - 2.5 hours -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - * Retrospective - * team members, ruck and rover only - * Document any technical debt left over from the sprint - * Ruck / Rover hand off - * Assign Ruck and Rover positions - * Sprint demo - when available - * Office hours on irc - -Scrum meetings - 30 Min -^^^^^^^^^^^^^^^^^^^^^^^ - * Planning meeting, video conference - * Sprint End, video and irc #oooq on freenode - * 2 live video conference meetings per week - * sprint stand up - * Other days, post status to the team's Trello board and/or cards - - -TripleoO CI Community meeting -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - * A community meeting should be held once a week. - * The meeting should ideally be conveniently scheduled immediately after - the TripleO community meeting on #tripleo (OFTC) - * The CI meeting should be announced as part of the TripleO community meeting - to encourage participation. - -Alternatives & History ----------------------- - -In the past the CI team has worked as individuals or by pairing up for distinct -parts of the CI system and for certain features. Neither has been -overwhelmingly successful for delivering features on a regular cadence. - -Implementation --------------- - -Primary author: Wes Hayutin weshayutin at gmail - -Other contributors: - * Ronelle Landy rlandy at redhat - * Arx Cruz acruz at redhat - * Sagi Shnaidman at redhat - - -Milestones ----------- - -This document is likely to evolve from the feedback discussed in sprint -retrospectives. An in depth retrospective should be done at the end of each -upstream cycle. - - -References ----------- - -Trello -^^^^^^ -A Trello board will be used to organize work. The team is expected to keep the -board and their cards updated on a daily basis. - - * https://trello.com/b/U1ITy0cu/tripleo-ci-squad - -Dashboards -^^^^^^^^^^ -A number of dashboards are used to monitor the CI - - * http://cistatus.tripleo.org/ - * https://dashboards.rdoproject.org/rdo-dev - * http://zuul-status.tripleo.org/ - -Team Notes -^^^^^^^^^^ - - * https://etherpad.openstack.org/p/tripleo-ci-squad-meeting - -Bug Queue -^^^^^^^^^ - * http://tinyurl.com/yag6y9ne - - -Revision History ----------------- - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - Rocky - - April 16 2018 - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. 
http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/expedited-approvals.rst b/specs/policy/expedited-approvals.rst deleted file mode 100644 index 00faeadc..00000000 --- a/specs/policy/expedited-approvals.rst +++ /dev/null @@ -1,122 +0,0 @@ -===================== - Expedited Approvals -===================== - -In general, TripleO follows the standard "2 +2" review standard, but there are -situations where we want to make an exception. This policy is intended to -document those exceptions. - -Problem Description -=================== - -Core reviewer time is precious, and there is never enough of it. In some -cases, requiring 2 +2's on a patch is a waste of that core time, so we need -to be reasonable about when to make exceptions. While core reviewers are -always free to use their judgment about when to merge or not merge a patch, -it can be helpful to list some specific situations where it is acceptable and -even expected to approve a patch with a single +2. - -Part of this information is already in the wiki, but the future of the wiki -is in doubt and it's better to put policies in a place that they can be -reviewed anyway. - -Policy -====== - -Single +2 Approvals -------------------- - -A core can and should approve patches without a second +2 under the following -circumstances: - -* The change has multiple +2's on previous patch sets, indicating an agreement - from the other cores that the overall design is good, and any alterations to - the patch since those +2's must be minor implementation details only - - trivial rebases, minor syntax changes, or comment/documentation changes. -* Backports proposed by another core reviewer. Backports should already have - been reviewed for design when they merged to master, so if two cores agree - that the backport is good (one by proposing, the other by reviewing), they - can be merged with a single +2 review. -* Requirements updates proposed by the bot. -* Translation updates proposed by the bot. (See also `reviewing - translation imports - `_.) - -Co-author +2 ------------- - -Co-authors on a patch are allowed to +2 that patch, but at least one +2 from a -core not listed as a co-author is required to merge the patch. For example, if -core A pushes a patch with cores B and C as a co-authors, core B and core C are -both allowed to +2 that patch, but another core is required to +2 before the -patch can be merged. - -Self-Approval -------------- - -It is acceptable for a core to self-approve a patch they submitted if it has the -requisite 2 +2's and a CI pass. However, this should not be done if there is any -dispute about the patch, such as on a change with 2 +2's and an unresolved -1. - -Note on CI ----------- - -This policy does not affect CI requirements. Patches must still pass CI before -merging. - -Alternatives & History -====================== - -This policy has been in effect for a while now, but not every TripleO core is -aware of it, so it is simply being written down in an official location for -reference. - -Implementation -============== - -Author(s) ---------- - -Primary author: - bnemec - -Milestones ----------- - -The policy is already in effect. - -Work Items ----------- - -Ensure all cores are aware of the policy. Once the policy has merged, an email -should be sent to openstack-dev referring to it. 
- -References -========== - -Existing wiki on review guidelines: -https://wiki.openstack.org/wiki/TripleO/ReviewGuidelines - -Previous spec that implemented some of this policy: -https://specs.openstack.org/openstack/tripleo-specs/specs/kilo/tripleo-review-standards.html - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - Newton - - Introduced - * - Newton - - Added co-author +2 policy - * - Ocata - - Added note on translation imports - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - https://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/first-principles.rst b/specs/policy/first-principles.rst deleted file mode 100644 index 332c66ea..00000000 --- a/specs/policy/first-principles.rst +++ /dev/null @@ -1,257 +0,0 @@ -.. - -======================== -TripleO First Principles -======================== - -The TripleO first principles are a set of principles that guide decision making -around future direction with TripleO. The principles are used to evaluate -choices around changes in direction and architecture. Every impactful decision -does not necessarily have to follow all the principles, but we use them to make -informed decisions about trade offs when necessary. - -Problem Description -=================== - -When evaluating technical direction within TripleO, a better and more -consistent method is needed to weigh pros and cons of choices. Defining the -principles is a step towards addressing that need. - - -Policy -====== - - -Definitions ------------ - -Framework - The functional implementation which exposes a set of standard enforcing - interfaces that can be consumed by a service to describe that service's - deployment and management. The framework includes all functional pieces that - implement such interfaces, such as CLI's, API's, or libraries. - - Example: tripleoclient/tripleo-common/tripleo-ansible/tripleo-heat-templates - -Service - The unit of deployment. A service will implement the necessary framework - interfaces in order to describe it's deployment. - - The framework does not enforce a particular service boundary, other than by - prescribing best practices. For example, a given service implementation could - deploy both a REST API and a database, when in reality the API and database - should more likely be deployed as their own services and expressed as - dependencies. - - Example: Keystone, MariaDB, RabbitMQ - -Third party integrations - Service implementations that are developed and maintained outside of the - TripleO project. These are often implemented by vendors aiming to add support - for their products within TripleO. - - Example: Cinder drivers, Neutron plugins - -First Principles ----------------- - -#. [UndercloudMigrate] No Undercloud Left Behind - - #. TripleO itself as the deployment tool can be upgraded. We do - not immediately propose what the upgrade will look like or the technology - stack, but we will offer an upgrade path or a migration path. - -#. [OvercloudMigrate] No Overcloud Left Behind - - #. An overcloud deployed with TripleO can be upgraded to the next major version - with either an in place upgrade or migration. - -#. [DefinedInterfaces] TripleO will have a defined interface specification. - - #. We will document clear boundaries between internal and external - (third party integrations) interfaces. - #. 
We will document the supported interfaces of the framework in the same - way that a code library or API would be documented. - #. Individual services of the framework can be deployed and tested in - isolation from other services. Service dependencies are expressed per - service, but do not preclude using the framework to deploy a service - isolated from its dependencies. Whether that is successful or not - depends on how the service responds to missing dependencies, and that is - a behavior of the service and not the framework. - #. The interface will offer update and upgrade tasks as first class citizens - #. The interface will offer validation tasks as first class citizens - -#. [OSProvisioningSeparation] Separation between operating system provisioning - and software configuration. - - #. Baremetal configuration, network configuration and base operating system - provisioning is decoupled from the software deployment. - #. The software deployment will have a defined set of minimal requirements - which are expected to be in-place before it begins the software deployment. - - #. Specific linux distributions - #. Specific linux distribution versions - #. Password-less access via ssh - #. Password-less sudo access - #. Pre-configured network bridges - -#. [PlatformAgnostic] Platform agnostic deployment tooling. - - #. TripleO is sufficiently isolated from the platform in a way that allows - for use in a variety of environments (baremetal/virtual/containerized/OS - version). - #. The developer experience is such that it can easily be run in - isolation on developer workstations - -#. [DeploymentToolingScope] The deployment tool has a defined scope - - #. Data collection tool. - - #. Responsible for collecting host and state information and posting to a - centralized repository. - #. Handles writes to central repository (e.g. read information from - repository, do aggregation, post to central repository) - - #. A configuration tool to configure software and services as part of the - deployment - - #. Manages Software Configuration - - #. Files - #. Directories - #. Service (containerized or non-containerized) state - #. Software packages - - #. Executes commands related to “configuration” of a service - Example: Configure OpenStack AZ's, Neutron Networks. - #. Isolated executions that are invoked independently by the orchestration tool - #. Single execution state management - - #. Input is configuration data/tasks/etc - #. A single execution produces the desired state or reports failure. - #. Idempotent - - #. Read-only communication with centralized data repository for configuration data - - #. The deployment process depends on an orchestration tool to handle various - task executions. - - #. Task graph manager - #. Task transport and execution tracker - #. Aware of hosts and work to be executed on the hosts - #. Ephemeral deployment tooling - #. Efficient execution - #. Scale and reliability/durability are first class citizens - -#. [CI/CDTooling] TripleO functionality should be considered within the context - of being directly invoked as part of a CI/CD pipeline. - -#. [DebuggableFramework] Diagnosis of deployment/configuration failures within - the framework should be quick and simple. Interfaces should be provided to - enable debuggability of service failures. - -#. [BaseOSBootstrap] TripleO can start from a base OS and go to full cloud - - #. It should be able to start at any point after base OS, but should be able - to handle the initial OS bootstrap - -#. 
[PerServiceManagement] TripleO can manage individual services in isolation, - and express and rely on dependencies and ordering between services. - -#. [Predictable/Reproducible/Idempotent] The deployment is predictable - - #. The operator can determine what changes will occur before actually applying - those changes. - #. The deployment is reproducible in that the operator can re-run the - deployment with the same set of inputs and achieve the same results across - different environments. - #. The deployment is idempotent in that the operator can re-run the - deployment with the same set of inputs and the deployment will not change other - than when it was first deployed. - #. In the case where a service needs to restart a process, the framework - will have an interface that the service can use to notify of the - needed restart. In this way, the restarts are predictable. - #. The interface for service restarts will allow for a service to describe - how it should be restarted in terms of dependencies on other services, - simultaneous restarts, or sequential restarts. - -Non-principles --------------- - -#. [ContainerImageManagement] The framework does not manage container images. - Other than using a given container image to start a container, the framework - does not encompass common container image management to include: - - #. Building container images - #. Patching container images - #. Serving or mirroring container images - #. Caching container images - - Specific tools for container image and runtime management and that need to - leverage the framework during deployment are expected to be implemented as - services. - -#. [SupportingTooling] Tools and software executed by the framework to deploy - services or tools required prior to service deployment by the framework are - not considered part of the framework itself. - - Examples: podman, TCIB, image-serve, nova-less/metalsmith - -Alternatives & History -====================== - -Many, if not all, the principles are already well agreed upon and understood as -core to TripleO. Writing them down as policy makes them more discoverable and -official. - -Historically, there have been instances when decisions have been guided by -desired technical implementation or outcomes. Recording the principles does not -necessarily mean those decisions would stop, but it does allow for a more -reasonable way to think about the trade offs. - -We do not need to adopt any principles, or record them. However, there is no -harm in doing so. - -Implementation -============== - -Author(s) ---------- - -Primary author: - James Slagle - -Other contributors: - - -Milestones ----------- - -None. - -Work Items ----------- - -None. - -References -========== - -None. - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - v0.0.1 - - Introduced - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/patch-abandonment.rst b/specs/policy/patch-abandonment.rst deleted file mode 100644 index 89d94150..00000000 --- a/specs/policy/patch-abandonment.rst +++ /dev/null @@ -1,109 +0,0 @@ -================= -Patch Abandonment -================= - -Goal -==== - -Provide basic policy that core reviewers can apply to outstanding reviews. As -always, it is up to the core reviewers discretion on whether a patch should or -should not be abandoned. 
This policy is just a baseline with some basic rules. - -Problem Description -=================== - -TripleO consists of many different projects in which many patches become stale -or simply forgotten. This can lead to problems when trying to review the -current patches for a given project. - -When to Abandon -=============== - -If a proposed patch has been marked -1 WIP by the author but has sat idle for -more than 180 days, a core reviewer should abandon the change with a reference -to this policy. - -If a proposed patch is submitted and given a -2 and the patch has sat idle for -90 days with no effort to address the -2, a core reviewer should abandon the -change with a reference to this policy. - -If a proposed patch becomes stale by ending up with a -1 from CI for 90 days -and no activity to resolve the issues, a core reviewer should abandon the -change with a reference to this policy. - -If a proposed patch with no activity for 90 days is in merge conflict, even -with a +1 from CI, a core reviewer should abandon the change with a reference -to this policy. - -When NOT to Abandon -=================== - -If a proposed patch has no feedback but is +1 from CI, a core reviewer should -not abandon such changes. - -If a proposed patch a given a -1 by a reviewer but the patch is +1 from CI and -not in merge conflict and the author becomes unresponsive for a few weeks, -reviewers can leave a reminder comment on the review to see if there is -still interest in the patch. If the issues are trivial then anyone should feel -welcome to checkout the change and resubmit it using the same change ID to -preserve original authorship. Core reviewers should not abandon such changes. - -Restoration -=========== - -Feel free to restore your own patches. If a change has been abandoned -by a core reviewer, anyone can request the restoration of the patch by -asking a core reviewer on IRC in #tripleo on OFTC or by sending a -request to the openstack-dev mailing list. Should the patch again -become stale it may be abandoned again. - -Alternative & History -===================== - -This topic was previously brought up on the openstack mailing list [1]_ along -with proposed code to use for automated abandonment [2]_. Similar policies are -used by the Puppet OpenStack group [3]_. - -Implementation -============== - -Author(s) ---------- - -Primary author: - aschultz - -Other contributors: - bnemec - -Milestones ----------- - -Pike-2 - -Work Items ----------- - -References -========== - -.. [1] http://lists.openstack.org/pipermail/openstack-dev/2015-October/076666.html -.. [2] https://github.com/cybertron/tripleo-auto-abandon -.. [3] https://docs.openstack.org/developer/puppet-openstack-guide/reviews.html#abandonment - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - Pike - - Introduced - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/spec-review.rst b/specs/policy/spec-review.rst deleted file mode 100644 index fc9c40df..00000000 --- a/specs/policy/spec-review.rst +++ /dev/null @@ -1,163 +0,0 @@ -========================= - Spec Review Process -========================= - -Document the existing process to help reviewers, especially newcomers, -understand how to review specs. This is migrating the existing wiki -documentation into a policy. 
- -Problem Description -=================== - -Care should be taken when approving specs. An approved spec, and an -associated blueprint, indicate that the proposed change has some -priority for the TripleO project. We don't want a bunch of approved -specs sitting out there that no community members are owning or working -on. We also want to make sure that our specs and blueprints are easy to -understand and have sufficient enough detail to effectively communicate -the intent of the change. The more effective the communication, the -more likely we are to elicit meaningful feedback from the wider -community. - -Policy -====== - -To this end, we should be cognizant of the following checklist when -reviewing and approving specs. - -* Broad feedback from interested parties. - - * We should do our best to elicit feedback from operators, - non-TripleO developers, end users, and the wider OpenStack - community in general. - * Mail the appropriate lists, such as opentack-operators and - openstack-dev to ask for feedback. Respond to feedback on the list, - but also encourage direct comments on the spec itself, as those - will be easier for other spec reviewers to find. - -* Overall consensus - - * Check for a general consensus in the spec. - * Do reviewers agree this change is meaningful for TripleO? - * If they don't have a vested interest in the change, are they at - least not objecting to the change? - -* Review older patchsets to make sure everything has been addressed - - * Have any reviewers raised objections in previous patchsets that - were not addressed? - * Have any potential pitfalls been pointed out that have not been - addressed? - -* Impact/Security - - * Ensure that the various Impact (end user, deployer, etc) and - Security sections in the spec have some content. - * These aren't sections to just gloss off over after understanding - the implementation and proposed change. They are actually the most - important sections. - * It would be nice if that content had elicited some feedback. If it - didn't, that's probably a good sign that the author and/or - reviewers have not yet thought about these sections carefully. - -* Ease of understandability - - * The spec should be easy to understand for those reviewers who are - familiar with the project. While the implementation may contain - technical details that not everyone will grasp, the overall - proposed change should be able to be understood by folks generally - familiar with TripleO. Someone who is generally familiar with - TripleO is likely someone who has run through the undercloud - install, perhaps contributed some code, or participated in reviews. - * To aid in comprehension, grammar nits should generally be corrected - when they have been pointed out. Be aware though that even nits can - cause disagreements, as folks pointing out nits may be wrong - themselves. Do not bikeshed over solving disagreements on nits. - -* Implementation - - * Does the implementation make sense? - * Are there alternative implementations, perhaps easier ones, and if - so, have those been listed in the Alternatives section? - * Are reasons for discounting the Alternatives listed in the spec? - -* Ownership - - * Is the spec author the primary assignee? - * If not, has the primary assignee reviewed the spec, or at least - commented that they agree that they are the primary assignee? - -* Reviewer workload - - * Specs turn into patches to codebases. 
- * A +2 on a spec means that the core reviewer intends to review the - patches associated with that spec in addition to their other core - commitments for reviewer workload. - * A +1 on a spec from a core reviewer indicates that the core - reviewer is not necessarily committing to review that spec's - patches. - * It's fine to +2 even if the spec also relates to other repositories - and areas of expertise, in addition to the reviewer's own. We - probably would not want to merge any spec that spanned multiple - specialties without a representative from each group adding their - +2. - * Have any additional (perhaps non-core) reviewers volunteered to - review patches that implement the spec? - * There should be a sufficient number of core reviewers who have - volunteered to go above and beyond their typical reviewer workload - (indicated by their +2) to review the relevant patches. A - "sufficient number" is dependent on the individual spec and the - scope of the change. - * If reviewers have said they'll be reviewing a spec's patches - instead of patches they'd review otherwise, that doesn't help much - and is actually harmful to the overall project. - -Alternatives & History -====================== - -This is migrating the already agreed upon policy from the wiki. - -Implementation -============== - -Author(s) ---------- - -Primary author: - james-slagle (from the wiki history) - -Other contributors: - jpichon - -Milestones ----------- - -None - -Work Items ----------- - -Once the policy has merged, an email should be sent to openstack-dev -referring to this document. - -References -========== - -* Original documentation: https://wiki.openstack.org/wiki/TripleO/SpecReviews - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - Ocata - - Migrated from wiki - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/squads.rst b/specs/policy/squads.rst deleted file mode 100644 index 8f2636a0..00000000 --- a/specs/policy/squads.rst +++ /dev/null @@ -1,141 +0,0 @@ -============== -TripleO Squads -============== - -Scaling-up a team is a common challenge in OpenStack. -We always increase the number of projects, with more contributors -and it often implies some changes in the organization. -This policy is intended to document how we will address this challenge in -the TripleO project. - -Problem Description -=================== - -Projects usually start from a single git repository and very often grow to -dozen of repositories, doing different things. As long as a project gets -some maturity, people who work together on a same topic needs some space -to collaborate the open way. -Currently, TripleO is acting as a single team where everyone meets -on IRC once a week to talk about bugs, CI status, release management. -Also, it happens very often that new contributors have hard time to find -an area of where they could quickly start to contribute. -Time is precious for our developers and we need to find a way to allow -them to keep all focus on their area of work. - -Policy -====== - -The idea of this policy is to create squads of people who work on the -same topic and allow them to keep focus with low amount of external -distractions. - -* Anyone would be free to join and leave a squad at will. - Right now, there is no size limit for a squad as this is something - experimental. 
If we realize a squad is too big (more than 10 people),
-  we might re-consider the squad's area of focus.
-* Anyone can join one or multiple squads at the same time. Squads will be
-  documented in a place anyone can contribute.
-* Squads are free to organize themselves a weekly meeting.
-* #tripleo remains the official IRC channel. We won't add more channels.
-* Squads will have to choose a representative, who would be a squad liaison
-  with the TripleO PTL.
-* The TripleO weekly meeting will still exist; anyone is encouraged to join,
-  but topics would stay high level. Some examples of topics: release
-  management, horizontal discussion between squads, CI status, etc.
-  The meeting would be a TripleO cross-projects meeting.
-
-We might need to test the idea for at least 1 or 2 months and invest some
-time to reflect on what is working and what could be improved.
-
-Benefits
---------
-
-* More collaboration is expected between people working on the same topic.
-  It will reflect officially what we have nearly done over the last cycles.
-* People working on the same area of TripleO would have the possibility
-  to hold public and open meetings, where anyone would be free to join.
-* Newcomers would more easily understand what the TripleO project delivers,
-  since squads would provide a good overview of the work we do. It would also
-  be an opportunity for people who want to learn about a specific area of
-  TripleO to join a new squad and learn from others.
-* Open more possibilities, like setting up a mentoring program for each squad
-  or specific docs to get involved more quickly.
-
-Challenges
-----------
-
-* We need to avoid creating silos and keep horizontal collaboration.
-  Working in a squad doesn't mean you need to ignore other squads.
-
-Squads
-------
-
-The list tends to be dynamic over the cycles, depending on which topics
-the team is working on. The list below is subject to change as squads change.
- -+-------------------------------+----------------------------------------------------------------------------+ -| Squad | Description | -+===============================+============================================================================+ -| ci | Group of people focusing on Continuous Integration tooling and system | -+-------------------------------+----------------------------------------------------------------------------+ -| upgrade | Group of people focusing on TripleO upgrades | -+-------------------------------+----------------------------------------------------------------------------+ -| validations | Group of people focusing on TripleO validations tooling | -+-------------------------------+----------------------------------------------------------------------------+ -| networking | Group of people focusing on networking bits in TripleO | -+-------------------------------+----------------------------------------------------------------------------+ -| integration | Group of people focusing on configuration management (eg: services) | -+-------------------------------+----------------------------------------------------------------------------+ -| security | Group of people focusing on security | -+-------------------------------+----------------------------------------------------------------------------+ -| edge | Group of people focusing on Edge/multi-site/multi-cloud | -| | https://etherpad.openstack.org/p/tripleo-edge-squad-status | -+-------------------------------+----------------------------------------------------------------------------+ -| transformation | Group of people focusing on converting heat templates / puppet to Ansible | -| | within the tripleo-ansible framework | -+-------------------------------+----------------------------------------------------------------------------+ - -.. note:: - - Note about CI: the squad is about working together on the tooling used - by OpenStack Infra to test TripleO, though every squad has in charge of - maintaining the good shape of their tests. - - -Alternatives & History -====================== - -One alternative would be to continue that way and keep a single horizontal -team. As long as we try to welcome in the team and add more projects, we'll -increase the problem severity of scaling-up TripleO project. -The number of people involved and the variety of topics that makes it really difficult to become able to work on everything. - -Implementation -============== - -Author(s) ---------- - -Primary author: - emacchi - -Milestones ----------- - -Ongoing - -Work Items ----------- - -* Work with TripleO developers to document the area of work for every squad. -* Document the output. -* Document squads members. -* Setup Squad meetings if needed. -* For each squad, find a liaison or a squad leader. - - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/policy/tech-debt-tracking.rst b/specs/policy/tech-debt-tracking.rst deleted file mode 100644 index ec3e7446..00000000 --- a/specs/policy/tech-debt-tracking.rst +++ /dev/null @@ -1,113 +0,0 @@ -================== -Tech Debt Tracking -================== - -Goal -==== - -Provide a basic policy for tracking and being able to reference tech debt -related changes in TripleO. - -Problem Description -=================== - -During the development of TripleO, sometimes tech debt is acquired due to time -or resource constraints that may exist. 
Without a solid way of tracking when -we intentially add tech debt, it is hard to quantify how much tech debt is -being self inflicted. Additionally tech debt gets lost in the code and without -a way to remember where we left it, it is almost impossible to remember when -and where we need to go back to fix some known issues. - -Proposed Change -=============== - -Tracking Code Tech Debt with Bugs ---------------------------------- - -Intentionally created tech debt items should have a bug [1]_ created with the -`tech-debt` tag added to it. Additionally the commit message of the change -should reference this `tech-debt` bug and if possible a comment should be added -into the code referencing who put it in there. - -Example Commit Message:: - - Always exit 0 because foo is currently broken - - We need to always exit 0 because the foo process eroneously returns - 42. A bug has been reported upstream but we are not sure when it - will be addressed. - - Related-Bug: #1234567 - -Example Comment:: - - # TODO(aschultz): We need this because the world is falling apart LP#1234567 - foo || exit 0 - -Triaging Bugs as Tech Debt --------------------------- - -If an end user reports a bug that we know is a tech debt item, the person -triaging the bug should add the `tech-debt` tag to the bug. - -Reporting Tech Debt -------------------- - -With the `tech-debt` tag on bugs, we should be able to keep a running track -of the bugs we have labeled and should report on this every release milestone -to see trends around how much is being added and when. As part of our triaging -of bugs, we should strive to add net-zero tech-debt bugs each major release if -possible. - - -Alternatives ------------- - -We continue to not track any of these things and continue to rely on developers -to remember when they add code and circle back around to fix it themselves or -when other developers find the issue and remove it. - -Implementation -============== - -Core reviewers should request that any tech debt be appropriately tracked and -feel free to -1 any patches that are adding tech debt without proper -attribution. - -Author(s) ---------- - -Primary author: - aschultz - -Milestones ----------- - -Queens-1 - -Work Items ----------- - -* aschultz to create tech-debt tag in Launchpad. - -References -========== - -.. [1] https://docs.openstack.org/tripleo-docs/latest/contributor/contributions.html#reporting-bugs - -Revision History -================ - -.. list-table:: Revisions - :header-rows: 1 - - * - Release Name - - Description - * - Queens - - Introduced - -.. note:: - - This work is licensed under a Creative Commons Attribution 3.0 - Unported License. - http://creativecommons.org/licenses/by/3.0/legalcode diff --git a/specs/queens/fast-forward-upgrades.rst b/specs/queens/fast-forward-upgrades.rst deleted file mode 100644 index 26861876..00000000 --- a/specs/queens/fast-forward-upgrades.rst +++ /dev/null @@ -1,351 +0,0 @@ -. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -===================== -Fast-forward upgrades -===================== - -https://blueprints.launchpad.net/tripleo/+spec/fast-forward-upgrades - -Fast-forward upgrades are upgrades that move an environment from release `N` to -`N+X` in a single step, where `X` is greater than `1` and for fast-forward -upgrades is typically `3`. This spec outlines how such upgrades can be -orchestrated by TripleO between the Newton and Queens OpenStack releases. 
- -Problem Description -=================== - -OpenStack upgrades are often seen by operators as problematic [1]_ [2]_. -Whilst TripleO upgrades have improved greatly over recent cycles many operators -are still reluctant to upgrade with each new release. - -This often leads to a situation where environments remain on the release used -when first deployed. Eventually this release will come to the end of its -supported life (EOL), forcing operators to upgrade to the next supported -release. There can also be restrictions imposed on an environment that simply -do not allow upgrades to be performed ahead of the EOL of a given release, -forcing operators to again wait until the release hits EOL. - -While it is possible to then linearly upgrade to a supported release with the -cadence of upstream releases, downstream distributions providing long-term -support (LTS) releases may not be able to provide the same path once the -initially installed release reaches EOL. Operators in such a situation may also -want to avoid running multiple lengthy linear upgrades to reach their desired -release. - -Proposed Change -=============== - -Overview --------- - -TripleO support for fast-forward upgrades will first target `N` to `N+3` -upgrades between the Newton and Queens releases: - -.. code-block:: bash - - Newton Ocata Pike Queens - +-----+ +-----+ +-----+ +-----+ - | | | N+1 | | N+2 | | | - | N | ---------------------> | N+3 | - | | | | | | | | - +-----+ +-----+ +-----+ +-----+ - - -This will give the impression of the Ocata and Pike releases being skipped with -the fast-forward upgrade moving the environment from Newton to Queens. In -reality as OpenStack projects with the `supports-upgrade` tag are only required -to support `N` to `N+1` upgrades [3]_ the upgrade will still need to move -through each release, completing database migrations and a limited set of other -tasks. - -Caveats -------- - -Before outlining the suggested changes to TripleO it is worth highlighting the -following caveats for fast-forward upgrades: - -* The control plane is inaccessible for the duration of the upgrade -* The data plane and active workloads must remain available for the duration of - the upgrade. - -Prerequisites -------------- - -Prior to the overcloud fast-forward upgrade starting the following prerequisite -tasks must be completed: - -* Rolling minor update of the overcloud on `N` - -This is a normal TripleO overcloud update [4]_ and should bring each node in -the environment up to the latest supported version of the underlying OS and -pulling in the latest packages. Operators can then reboot the nodes as -required. The reboot ensuring that the latest kernel, openvswitch, QEMU and any -other reboot dependant package is reloaded before proceeding with the upgrade. -This can happen well in advance of the overcloud fast-forward upgrade and -should remove the need for additional reboots during the upgrade. - -* Upgrade undercloud from `N` to `N+3` - -The undercloud also needs to be upgraded to `N+3` ahead of any overcloud -upgrade. Again this can happen well in advance of the overcloud upgrade. For -the time being this is a traditional, linear upgrade between `N` and `N+1` -releases until we reach the target `N+3` Queens release. - -* Container images cached prior to the start of the upgrade - -With the introduction of containerised TripleO overclouds in Pike operators -will need to cache the required container images prior to the fast-forward -upgrade if they wish to end up with a containerised Queens overcloud. 
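As a rough illustration only, the image caching step could look something like the
following; the exact client commands and options depend on the tripleoclient version
in use and should be read as assumptions here rather than as part of this spec.

.. code-block:: bash

   # Sketch only: build the container image list and a matching Heat environment
   # file, then pre-push the images to the undercloud registry before starting
   # the fast-forward upgrade. Option names follow the Pike/Queens-era client
   # and may differ in other versions.
   openstack overcloud container image prepare \
     --output-images-file overcloud_containers.yaml \
     --output-env-file containers-default-parameters.yaml
   openstack overcloud container image upload \
     --config-file overcloud_containers.yaml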
- -High level flow ---------------- - -At a high level the following actions will be carried out by the fast-forward -upgrade to move the overcloud from `N` to `N+3`: - -* Stop all OpenStack control and compute services across all roles - -This will bring down the OpenStack control plane, leaving infrastructure -services such as the databases running, while allowing any workloads to -continue running without interruption. For HA environments this will disable -the cluster, ensuring that OpenStack services are not restarted. - -* Upgrade a single host from `N` to `N+1` then `N+1` to `N+2` - -As alluded to earlier, OpenStack projects currently only support `N` to `N+1` -upgrades and so fast-forward upgrades still need to cycle through each release in -order to complete data migrations and any other tasks that are required before -these migrations can be completed. This part of the upgrade is limited to a -single host per role to ensure this is completed as quickly as possible. - -* Optional upgrade and deployment of single canary compute host to `N+3` - -As fast-forward upgrades aim to ensure workloads are online and accessible -during the upgrade we can optionally upgrade all control service hosting roles -_and_ a single canary compute to `N+3` to verify that workloads will remain -active and accessible during the upgrade. - -A canary compute node will be selected at the start of the upgrade and have -instances launched on it to validate that both it and the data plane remain -active during the upgrade. The upgrade will halt if either become inaccessible -with a recovery procedure being provided to move all hosts back to `N+1` -without further disruption to the active workloads on the untouched compute -hosts. - -* Upgrade and deployment of all roles to `N+3` - -If the above optional canary compute host upgrade is not used then the final -action in the fast-forward upgrade will be a traditional `N` to `N+1` migration -between `N+2` and `N+3` followed by the deployment of all roles on `N+3`. This -final action essentially being a redeployment of the overcloud to containers on -`N+3` (Queens) as previously seen when upgrading TripleO environments from -Ocata to Pike. - -A python-tripleoclient command and associated Mistral workflow will control if -this final step is applied to all roles in parallel (default), all hosts in a -given role or selected hosts in a given role. The latter being useful if a user -wants to control the order in which computes are moved from `N+1` to `N+3` etc. - -Implementation --------------- - -As with updates [5]_ and upgrades [6]_ specific fast-forward upgrade Ansible -tasks associated with the first two actions above will be introduced into the -`tripleo-heat-template` service templates for each service as `RoleConfig` -outputs. - -As with `upgrade_tasks` each task is associated with a particular step in the -process. For `fast_forward_upgrade_tasks` these steps are split between prep -tasks that apply to all hosts and bootstrap tasks that only apply to a single -host for a given role. 
-
-Prep step tasks will map to the following actions:
-
-- Step=1: Disable the overall cluster
-- Step=2: Stop OpenStack services
-- Step=3: Update host repositories
-
-Bootstrap step tasks will map to the following actions:
-
-- Step=4: Take OpenStack DB backups
-- Step=5: Pre package update commands
-- Step=6: Update required packages
-- Step=7: Post package update commands
-- Step=8: OpenStack service DB sync
-- Step=9: Validation
-
-As with `update_tasks`, each task will use simple `when` conditionals to
-identify which step and release(s) it is associated with, ensuring these tasks
-are executed at the correct point in the upgrade.
-
-For example, a step 2 Ocata `fast_forward_upgrade_tasks` entry is listed below:
-
-.. code-block:: yaml
-
-    fast_forward_upgrade_tasks:
-      - name: Example Ocata step 2 task
-        command: /bin/foo bar
-        when:
-          - step|int == 2
-          - release == 'ocata'
-
-
-These tasks will then be collated into role-specific Ansible playbooks via the
-RoleConfig output of the `overcloud` heat template, with step and release
-variables being fed in to ensure tasks are executed in the correct order.
-
-As with `major upgrades` [8]_ a new Mistral workflow and tripleoclient command
-will be introduced to generate and execute the associated Ansible tasks.
-
-.. code-block:: bash
-
-    openstack overcloud fast-forward-upgrade --templates [..path to latest THT..] \
-      [..original environment arguments..] \
-      [..new container environment arguments..]
-
-Operators will also be able to generate [7]_, download and review the
-playbooks ahead of time using the latest version of `tripleo-heat-templates`
-with the following commands:
-
-.. code-block:: bash
-
-    openstack overcloud deploy --templates [..path to latest THT..] \
-      [..original environment arguments..] \
-      [..new container environment arguments..] \
-      -e environments/fast-forward-upgrade.yaml \
-      -e environments/noop-deploy-steps.yaml
-    openstack overcloud config download
-
-
-Dev workflow
-------------
-
-The existing tripleo-upgrade Ansible role will be used to automate the
-fast-forward upgrade process for use by developers and CI, including the
-initial overcloud minor update, the undercloud upgrade to `N+3` and the
-fast-forward upgrade itself.
-
-Developers working on fast_forward_upgrade_tasks will also be able to deploy
-minimal overclouds via `tripleo-quickstart` using release configs
-also used by CI.
-
-Further, when developing tasks, developers will be able to manually render and
-run `fast_forward_upgrade_tasks` as standalone Ansible playbooks, allowing them
-to run a subset of the tasks against specific nodes using
-`tripleo-ansible-inventory`. Examples of how to do this will be documented,
-hopefully ensuring a smooth development experience for anyone looking to
-contribute tasks for specific services.
-
-Alternatives
-------------
-
-* Continue to force operators to upgrade linearly through each major release
-* Parallel cloud migrations.
-
-Security Impact
----------------
-
-N/A
-
-Other End User Impact
----------------------
-
-* The control plane will be down for the duration of the upgrade
-* The data plane and workloads will remain up.
-
-Performance Impact
-------------------
-
-N/A
-
-Other Deployer Impact
----------------------
-
-N/A
-
-Developer Impact
-----------------
-
-* Third party service template providers will need to provide
-  fast_forward_upgrade_tasks in their THT service configurations, as sketched
-  below.
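The exact shape of a third party integration is not prescribed by this spec; the
following is a minimal, hypothetical sketch of what such a service template could
look like, reusing the `fast_forward_upgrade_tasks` convention described above.
The service and task names are illustrative only, not an existing service.

.. code-block:: yaml

   # Hypothetical third party service template snippet (illustrative only).
   # The role_data output mirrors the existing upgrade_tasks/update_tasks
   # convention; the service and task names are assumptions, not a real service.
   outputs:
     role_data:
       description: Role data for an example vendor backend service
       value:
         service_name: vendor_backend
         fast_forward_upgrade_tasks:
           - name: Stop the vendor backend agent
             service:
               name: vendor-backend-agent
               state: stopped
             when:
               - step|int == 2
               - release == 'ocata'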
- -Implementation -============== - -Assignee(s) ------------ - -Primary assignees: - -* lbezdick -* marios -* chem - -Other contributors: - -* shardy -* lyarwood - -Work Items ----------- - -* Introduce fast_forward_upgrades_playbook.yaml to RoleConfig -* Introduce fast_forward_upgrade_tasks in each service template -* Introduce a python-tripleoclient command and associated Mistral workflow. - -Dependencies -============ - -* TripleO - Ansible upgrade Workflow with UI integration [9]_ - -The new major upgrade workflow being introduced for Pike to Queens upgrades -will obviously impact what fast-forward upgrades looks like to Queens. At -present the high level flow for fast-forward upgrades assumes that we can reuse -the current `upgrade_tasks` between N+2 and N+3 to disable and then potentially -remove baremetal services. This is likely to change as the major upgrade -workflow is introduced and so it is likely that these steps will need to be -encoded in `fast_forward_upgrade_tasks`. - -Testing -======= - -* Third party CI jobs will need to be created to test Newton to Queens using - RDO given the upstream EOL of stable/newton with the release of Pike. - -* These jobs should cover the initial undercloud upgrade, overcloud upgrade and - optional canary compute node checks. - -* An additional third party CI job will be required to verify that a Queens - undercloud can correctly manage a Newton overcloud, allowing the separation - of the undercloud upgrade and fast-forward upgrade discussed under - prerequisites. - -* Finally, minimal overcloud roles should be used to verify the upgrade for - certain services. For example, when changes are made to the - `fast_forward_upgrade_tasks` of Nova via changes to - `docker/services/nova-*.yaml` files then a basic overcloud deployment of - Keystone, Glance, Swift, Cinder, Neutron and Nova could be used to quickly - verify the changes in regards to fast-forward upgrades. - -Documentation Impact -==================== - -* This will require extensive developer and user documentation to be written, - most likely in a new section of the docs specifically detailing the - fast-forward upgrade flow. - -References -========== -.. [1] https://etherpad.openstack.org/p/MEX-ops-migrations-upgrades -.. [2] https://etherpad.openstack.org/p/BOS-forum-skip-level-upgrading -.. [3] https://governance.openstack.org/tc/reference/tags/assert_supports-upgrade.html -.. [4] http://tripleo.org/install/post_deployment/package_update.html -.. [5] https://github.com/openstack/tripleo-heat-templates/blob/master/puppet/services/README.rst#update-steps -.. [6] https://github.com/openstack/tripleo-heat-templates/blob/master/puppet/services/README.rst#upgrade-steps -.. [7] https://review.openstack.org/#/c/495658/ -.. [8] https://review.openstack.org/#/q/topic:major-upgrade+(status:open+OR+status:merged) -.. [9] https://specs.openstack.org/openstack/tripleo-specs/specs/queens/tripleo_ansible_upgrades_workflow.html diff --git a/specs/queens/instance-ha.rst b/specs/queens/instance-ha.rst deleted file mode 100644 index 0e0da811..00000000 --- a/specs/queens/instance-ha.rst +++ /dev/null @@ -1,145 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -========================== -Instance High Availability -========================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/instance-ha - -A very often requested feature by operators and customers is to be able to -automatically resurrect VMs that were running on a compute node that failed (either -due to hardware failures, networking issues or general server problems). -Currently we have a downstream-only procedure which consists of many manual -steps to configure Instance HA: -https://access.redhat.com/documentation/en/red-hat-openstack-platform/9/paged/high-availability-for-compute-instances/chapter-1-overview - -What we would like to implement here is basically an optional opt-in automatic -deployment of a cloud that has Instance HA support. - -Problem Description -=================== - -Currently if a compute node has a hardware failure or a kernel panic all the -instances that were running on the node, will be gone and manual intervention -is needed to resurrect these instances on another compute node. - -Proposed Change -=============== - -Overview --------- - -The proposed change would be to add a few additional puppet-tripleo profiles that would help -us configure the pacemaker resources needed for instance HA. Unlike in previous iterations -we won't need to move nova-compute resources under pacemaker's management. We managed to -achieve the same result without touching the compute nodes (except by setting -up pacemaker_remote on the computes, but that support exists already) - -Alternatives ------------- - -There are a few specs that are modeling host recovery: - -Host Recovery - https://review.openstack.org/#/c/386554/ -Instances auto evacuation - https://review.openstack.org/#/c/257809 - -The first spec uses pacemaker in a very similar way but is too new -and too high level to really be able to comment at this point in time. -The second one has been stalled for a long time and it looks like there -is no consensus yet on the approaches needed. The longterm goal is -to morph the Instance HA deployment into the spec that gets accepted. -We are actively working on both specs as well. In any case we have -discussed the long-term plan with SuSe and NTT and we agreed -on a long-term plan of which this spec is the first step for TripleO. - -Security Impact ---------------- - -No additional security impact. - -Other End User Impact ---------------------- - -End users are not impacted except for the fact that VMs can be resurrected -automatically on a non-failed compute node. - -Performance Impact ------------------- - -There are no performance related impacts as compared to a current deployment. - -Other Deployer Impact ---------------------- - -So this change does not affect the default deployments. What it does it adds a boolean -and some additional profiles so that a deployer can have a cloud configured with Instance -HA support out of the box. - -* One top-level parameter to enable the Instance HA deployment - -* Although fencing configuration is already currently supported by tripleo, we will need - to improve bits and pieces so that we won't need an extra command to generate the - fencing parameters. - -* Upgrades will be impacted by this change in the sense that we will need to make sure to test - them when Instance HA is enabled. - -Developer Impact ----------------- - -No developer impact is planned. 
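To make the opt-in nature of the deployment concrete, the boolean mentioned under
Other Deployer Impact could be exposed through an environment file along the
following lines; the parameter name here is a placeholder assumption, not a
committed interface of this spec.

.. code-block:: yaml

   # Hypothetical opt-in environment file (parameter name is an assumption);
   # passing this with -e at deploy time would enable the Instance HA profiles.
   parameter_defaults:
     EnableInstanceHA: true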
- -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - michele - -Other contributors: - cmsj, abeekhof - -Work Items ---------- - -* Make the fencing configuration fully automated (this is mostly done already; we need oooq integration - and some optimization) - -* Add the logic and needed resources on the control-plane - -* Test the upgrade path when Instance HA is configured - - -Testing -======= - -Testing this manually is fairly simple: - -* Deploy with Instance HA configured and two compute nodes - -* Spawn a test VM - -* Crash the compute node where the VM is running - -* Observe the VM being resurrected on the other compute node - -Testing this in CI is doable but might be a bit more challenging due to resource constraints. - -Documentation Impact -==================== - -A section under advanced configuration is needed explaining the deployment of -a cloud that supports Instance HA. - -References -========== - -* https://access.redhat.com/documentation/en/red-hat-openstack-platform/9/paged/high-availability-for-compute-instances/ diff --git a/specs/queens/ipsec.rst b/specs/queens/ipsec.rst deleted file mode 100644 index e9e81eb3..00000000 --- a/specs/queens/ipsec.rst +++ /dev/null @@ -1,189 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================== -IPSEC encrypted networks -======================== - -https://blueprints.launchpad.net/tripleo/+spec/ipsec - -This proposes the usage of IPSEC tunnels for encrypting all communications in a -TripleO cloud. - -Problem Description -=================== - -Having everything in the network encrypted is a hard requirement for certain -use-cases. While TLS everywhere provides support for this, not everyone wants a -full-fledged CA. IPSEC provides an alternative which requires one component -fewer (the CA) while still fulfilling the security requirements, with the -downside that IPSEC tunnel configurations can get quite verbose. - - -Proposed Change -=============== - -Overview --------- - -As mentioned on the mailing list [1], for OSP10 we already worked on an ansible -role that runs on top of a TripleO deployment [2]. - -It does the following: - -* Installs IPSEC if it's not available in the system. - -* Sets up the firewall rules. - -* Based on a hard-coded set of networks, it discovers the IP addresses for each - of them. - -* Based on a hard-coded set of networks, it discovers the Virtual IP addresses - (including the Redis VIP). - -* It puts up an IPSEC tunnel for most IPs in each network. - - - Regular IPs are handled as a point-to-point IPSEC tunnel. - - - Virtual IPs are handled with road-warrior configurations. This means that - the VIP's tunnel listens for any connections. This enables easier - configuration of the tunnel, as the VIP-holder doesn't need to be aware of nor - configure each tunnel. - - - Similarly to TLS everywhere, this focuses on service-to-service - communication, so we explicitly skip the tenant network, or, - as it was in the original ansible role, compute-to-compute communication. - This significantly reduces the number of tunnels we need to set up, but - leaves application security to the deployer. - - - Authentication for the tunnels is done via a Pre-Shared Key (PSK), which is - shared between all nodes. - -* Finally, it creates an OCF resource that tracks each VIP and puts up or down - its corresponding IPSEC tunnel depending on the VIP's location.
- - - While this resource is still in the repository [3], it has now landed - upstream [4]. Once this resource is available in the packaged version of - the resource agents, the preferred version will be the packaged one. - - - This resource effectively handles VIP fail-overs: by detecting that a VIP - is no longer hosted by the node, it cleanly puts down the IPSEC tunnel and - enables it where the VIP is now hosted. - -All of this work is already part of the role; however, to have better -integration with the current state of TripleO, the following work is needed: - -* Support for composable networks. - - - Now that composable networks are supported, we can no longer rely on the - hard-coded values we had in the role. - - - Fortunately, this is information we can get from the tripleo dynamic - inventory, so we would need to add information about the available networks - and the VIPs. - -* Configurable skipping of networks. - - - In order to address the tenant network skipping, we need to somehow make it - configurable. - -* Add the IPSEC package as part of the image. - -* Configure firewall rules the TripleO way. - - - Currently the role handles the firewall rule setup. However, it should be - fairly simple to configure these rules the same way other services - configure theirs (using the tripleo..firewall_rules entry). This - will require the usage of a composable service template. - -* As mentioned above, we will need to create a composable service template. - - - This could make use of the recently added `external_deploy_tasks` section - of the templates, which will work similarly to the Kubernetes configuration - and would rely on the config-download mechanism [5]. - -Alternatives ------------- - -Deployers can already use TLS everywhere, but a few are already using the -aforementioned ansible role, so this would provide a seamless upgrade path for -them. - -Security Impact ---------------- - -This by itself is a security enhancement, as it enables encryption in the -network. - -The PSK being shared by all the nodes is not ideal and could be addressed by -per-network PSKs. However, this work could be done in further iterations. - -Other End User Impact ---------------------- - -Currently, the deployer needs to provide their PSK. However, this could be -automated as part of the tasks that TripleO does. - -Performance Impact ------------------- - -Same as with TLS everywhere, adding encryption in the network will have a -performance impact. We currently don't have concrete data on what this impact -actually is. - -Other Deployer Impact ---------------------- - -This would be added as a composable service, so it would be something that the -deployer would need to enable via an environment file. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - jaosorior - -Work Items ---------- - -* Add the libreswan (IPSEC's frontend) package to the overcloud-full image. - -* Add required information to the dynamic inventory (networks and VIPs) - -* Based on the inventory, create the IPSEC tunnels dynamically, and not based - on the hardcoded networks. - -* Add the tripleo-ipsec ansible role as part of the TripleO umbrella. - -* Create the composable service. - - -Dependencies -============ - -* This requires the tripleo-ipsec role to be available. For this, it will be - moved to the TripleO umbrella and packaged as such. - - -Testing -======= - -Given that this doesn't require an extra component, we could test this as part -of our upstream tests.
The only requirement is that the deployment has -network-isolation enabled. - - -References -========== - -[1] http://lists.openstack.org/pipermail/openstack-dev/2017-November/124615.html -[2] https://github.com/JAORMX/tripleo-ipsec -[3] https://github.com/JAORMX/tripleo-ipsec/blob/master/files/ipsec-resource-agent.sh -[4] https://github.com/ClusterLabs/resource-agents/blob/master/heartbeat/ipsec -[5] https://github.com/openstack/tripleo-heat-templates/blob/master/extraconfig/services/kubernetes-master.yaml#L58 diff --git a/specs/queens/network-configuration.rst b/specs/queens/network-configuration.rst deleted file mode 100644 index 76535cfd..00000000 --- a/specs/queens/network-configuration.rst +++ /dev/null @@ -1,115 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -===================== -Network configuration -===================== - -Network configuration for the TripleO GUI - -Problem Description -=================== - -Currently, it's not possible to make advanced network configurations using the -TripleO GUI. - -Proposed Change -=============== - -Overview --------- - -In the GUI, we will provide a wizard to guide the user through configuring the -networks of their deployment. The user will be able to assign networks to -roles, and configure additional network parameters. We will use the -``network_data.yaml`` file in the `TripleO Heat Templates`_. The idea is to expose -the data in ``network_data.yaml`` via the web interface. - -In addition to the wizard, we will implement a dynamic network topology diagram -to visually present the configured networks. This will enable the Deployer to -quickly validate their work. The diagram will rely on ``network_data.yaml`` -and ``roles_data.yaml`` for the actual configuration. - -For details, please see the `wireframes`_. - -.. _wireframes: https://openstack.invisionapp.com/share/UM87J4NBQ#/screens -.. _TripleO Heat Templates: https://review.openstack.org/#/c/409921/ - -Alternatives ------------- - -As an alternative, heat templates can be edited manually to allow customization -before uploading. - -Security Impact ---------------- - -The Deployer could accidentally misconfigure the network topology, and thereby -cause data to be exposed. - -Other End User Impact ---------------------- - -Performance Impact ------------------- - -The addition of the configuration wizard and the network topology diagram should -have no performance impact on the amount of time needed to run a deployment. - -Other Deployer Impact ---------------------- - -Developer Impact ----------------- - -As with any new substantial feature, the impact on the developer is cognitive. -We will have to gain a detailed understanding of network configuration in -``network_data.yaml``. Also, testing will add overhead to our efforts. - -Implementation -============== - -We can proceed with implementation immediately.
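For orientation, the kind of ``network_data.yaml`` entry the wizard would read and write looks roughly like the sketch below. The field names follow the tripleo-heat-templates proposal referenced above, but the exact schema is defined by that project and may differ::

    # Illustrative network_data.yaml entry (not the authoritative schema).
    - name: InternalApi
      name_lower: internal_api
      vip: true
      vlan: 20
      ip_subnet: '172.16.2.0/24'
      allocation_pools: [{'start': '172.16.2.4', 'end': '172.16.2.250'}]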
- -Assignee(s) ------------ - -Primary assignee: - hpokorny - -Work Items ----------- - -* Network configuration wizard - - Reading data from the backend - - Saving changes - - UI based on wireframes -* Network topology diagram - - Investigate suitable javascript libraries - - UI based on wireframes - -Dependencies -============ - -* The presence of ``roles_data.yaml`` and ``network_data.yaml`` in the plan -* A javascript library for drawing the diagram - -Testing -======= - -Testing shouldn't pose any real challenges with the exception of the network -topology diagram rendering. At best, this is currently unknown as it depends on -the chosen javascript library. Verifying that the correct diagram is displayed -using automated testing might be non-trivial. - -Documentation Impact -==================== - -We should document the new settings introduced by the wizard. The documentation -should be transferable between the heat template project, and TripleO UI. - -References -========== diff --git a/specs/queens/tripleo-messaging.rst b/specs/queens/tripleo-messaging.rst deleted file mode 100644 index c73a33e3..00000000 --- a/specs/queens/tripleo-messaging.rst +++ /dev/null @@ -1,316 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================================== -Tripleo RPC and Notification Messaging Support -============================================== - -https://blueprints.launchpad.net/tripleo - -This specification proposes changes to tripleo to enable the selection -and configuration of separate messaging backends for oslo.messaging -RPC and Notification communications. This proposal is a derivative of -the work associated with the original blueprint [1]_ and specification -[2]_ to enable dual backends for oslo.messaging in tripleo. - -Most of the groundwork to enable dual backends was implemented during -the pike release and the introduction of an alternative messaging -backend (qdrouterd) service was made. Presently, the deployment of this -alternative messaging backend is accomplished by aliasing the rabbitmq -service as the tripleo implementation does not model separate -messaging backends. - -Problem Description -=================== - -The oslo.messaging library supports the deployment of dual messaging -system backends for RPC and Notification communications. However, tripleo -currently deploys a single rabbitmq server (cluster) that serves as a -single messaging backend for both RPC and Notifications. - -:: - - +------------+ +----------+ - | RPC Caller | | Notifier | - +-----+------+ +----+-----+ - | | - +--+ +--+ - | | - v v - +-+---------------+-+ - | RabbitMQ Service | - | Messaging Backend | - | | - +-+---------------+-+ - ^ ^ - | | - +--+ +--+ - | | - v v - +------+-----+ +------+-------+ - | RPC | | Notification | - | Server | | Server | - +------------+ +--------------+ - - -To support two separate and distinct messaging backends, tripleo needs -to "duplicate" the set of parameters needed to specify each messaging -system. The oslo.messaging library in OpenStack provides the API to the -messaging services. It is proposed that the implementation model the -RPC and Notification messaging services in place of the backend -messaging server (e.g. rabbitmq). 
- -:: - - +------------+ +----------+ - | RPC Caller | | Notifier | - +-----+------+ +----+-----+ - | | - | | - v v - +-------------------+ +-------------------+ - | RPC | | Notification | - | Messaging Service | | Messaging Service | - | | | | - +--------+----------+ +--------+----------+ - | | - | | - v v - +------------+ +------+-------+ - | RPC | | Notification | - | Server | | Server | - +------------+ +--------------+ - - -Introducing the separate messaging services and associated parameters in place -of the rabbitmq server is not a major rework but special consideration -must be made to upgrade paths and capabilities to ensure that existing -configurations are not impacted. - -Having separate messaging backends for RPC and Notification -communications provides a number of benefits. These benefits include: - -* tuning the backend to the messaging patterns -* increased aggregate message capacity -* reduced applied load to messaging servers -* increased message throughput -* reduced message latency -* etc. - - -Proposed Change -=============== - -A number of issues need to be resolved in order to express RPC -and Notification messaging services on top of the backend messaging systems. - -Overview --------- - -The proposed change is similar to the concept of a service "backend" -that is configured by tripleo. A number of existing services support -such a backend (or plugin) model. The implementation of a messaging -service backend model should account for the following requirements: - -* deploy a single messaging backend for both RPC and Notifications -* deploy a messaging backend twice, once for RPC and once for - Notifications -* deploy a messaging backend for RPC and a different messaging backend - for Notifications -* deploy an external messaging backend for RPC -* deploy an external messaging backend for Notifications - -Generally, the parameters that were required for deployment of the -rabbitmq service should be duplicated and renamed to "RPC Messaging" -and "Notify Messaging" backend service definitions. Individual backend -files would exist for each possible backend type (e.g. rabbitmq, -qdrouterd, zeromq, kafka or external). The backend selected will -correspondingly define the messaging transport for the messaging -system. - -* transport specifier -* username -* password (and generation) -* host -* port -* virtual host(s) -* ssl (enabled) -* ssl configuration -* health checks - -Tripleo should continue to have a default configuration that deploys -RPC and Notifications messaging services on top of a single rabbitmq -backend server (cluster). Tripleo upgrades should map the legacy -rabbitmq service deployment onto the RPC and Notification messaging -services model. - - -Alternatives ------------- - -The configuration of separate messaging backends could be post -overcloud deployment (e.g. external to tripleo framework). This would -be problematic over the lifecycle of deployments e.g. during upgrades etc. - -Security Impact ---------------- - -The deployment of dual messaging backends for RPC and Notification -communications should be the same from a security standpoint. This -assumes the backends have parity from a security feature -perspective, e.g authentication and encryption. 
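To make the parameter duplication described in the overview concrete, the rabbitmq-oriented parameters could be split into "RPC" and "Notify" variants along the following lines. These names are illustrative assumptions, not the final tripleo-heat-templates interface::

    # Hypothetical parameter_defaults showing duplicated messaging parameters
    # for the two messaging services (passwords omitted; they would be
    # generated as today).
    parameter_defaults:
      RpcUserName: guest
      RpcPort: 5672
      RpcUseSSL: false
      NotifyUserName: guest
      NotifyPort: 5672
      NotifyUseSSL: false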
- -Other End User Impact ---------------------- - -Depending on the configuration of the messaging backend deployment, -there could be a number of end user impacts including the following: - -* monitoring of separated messaging backend services -* understanding differences in functionality/behaviors between different - messaging backends (e.g. broker versus router, etc.) -* handling exceptions (e.g. different places for logs, etc.) - -Performance Impact ------------------- - -Using separate messaging systems for RPC and Notifications should -have a positive impact on performance and scalability by: - -* separating RPC and Notification messaging loads -* increased parallelism in message processing -* increased aggregate message transfer capacity -* tuned backend configuration aligned to messaging patterns - -Other Deployer Impact ---------------------- - -The deployment of hybrid messaging will be new to OpenStack -operators. Operators will need to learn the architectural differences -as compared to a single backend deployment. This will include capacity -planning, monitoring, troubleshooting and maintenance best practices. - -Developer Impact ----------------- - -Discuss things that will affect other developers working on OpenStack. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - -* Andy Smith - -* John Eckersberg - -Work Items ---------- - -tripleo-heat-templates: - -* Modify *puppet/services/base.yaml* to introduce separate RPC and - Notification Messaging parameters (e.g. replace 'rabbit' parameters) -* Support two ssl environments (e.g. one for RPC and one for - Notification when separate backends are deployed) -* Consider example backend model such as the following: - -:: - - tripleo-heat-templates - | - +--+ /environments - | | - | +--+ /messaging - | | - | +--+ messaging-(rpc/notify)-rabbitmq.yaml - | +--+ messaging-(rpc/notify)-qdrouterd.yaml - | +--+ messaging-(rpc/notify)-zmq.yaml - | +--+ messaging-(rpc/notify)-kafka.yaml - +--+ /puppet - | | - | +--+ /services - | | - | +--+ messaging-(rpc/notify)-backend-rabbitmq.yaml - | +--+ messaging-(rpc/notify)-backend-qdrouterd.yaml - | +--+ messaging-(rpc/notify)-backend-zmq.yaml - | +--+ messaging-(rpc/notify)-backend-kafka.yaml - | - +--+ /roles - - -puppet-tripleo: - -* Replace rabbitmq_node_names with messaging_rpc_node_names and - messaging_notify_node_names or similar -* Add vhost support -* Consider example backend model such as the following: - -:: - - puppet-tripleo - | - +--+ /manifests - | - +--+ /profile - | - +--+ /base - | - +--+ /messaging - | - +--+ backend.pp - +--+ rpc.pp - +--+ notify.pp - | - +--+ /backend - | - +--+ rabbitmq.pp - +--+ qdrouterd.pp - +--+ zmq.pp - +--+ kafka.pp - - -tripleo-common: - -* Add user and password management for the RPC and Notify messaging services -* Support distinct health checks for separated messaging backends - -pacemaker: - -* Determine what should happen when two separate rabbitmq clusters - are deployed. Does this result in two pacemaker services or one? - Some experimentation may be required. - -Dependencies -============ - -None. - -Testing -======= - -In order to test this in CI, an environment will be needed where separate -messaging system backends (e.g. RabbitMQ server and dispatch-router -server) are deployed. Any existing hardware configuration should be -appropriate for the dual backend deployment.
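Building on the environment layout sketched in the work items above, a CI job could select split backends by including one RPC and one Notify environment file. The file name and resource mapping below are a sketch only and follow the proposed layout rather than a final interface::

    # Hypothetical environments/messaging/messaging-rpc-qdrouterd.yaml; a
    # matching messaging-notify-rabbitmq.yaml would map the Notify backend
    # in the same way.
    resource_registry:
      OS::TripleO::Services::MessagingRpcBackend: ../../puppet/services/messaging-rpc-backend-qdrouterd.yaml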
- - -Documentation Impact -==================== - -The deployment documentation will need to be updated to cover the -configuration of the separate messaging (RPC and Notify) services. - - -References -========== - -.. [1] https://blueprints.launchpad.net/tripleo/+spec/om-dual-backends -.. [2] https://review.openstack.org/#/c/396740/ diff --git a/specs/queens/tripleo-ptp.rst b/specs/queens/tripleo-ptp.rst deleted file mode 100644 index 30818cfd..00000000 --- a/specs/queens/tripleo-ptp.rst +++ /dev/null @@ -1,141 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================================= -TripleO PTP (Precision Time Protocol) Support -============================================= - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-ptp - -This spec introduces support for a time synchronization method called PTP [0] -which provides better time accuracy than NTP in general. With hardware -timestamping support on the host, PTP can achieve clock accuracy in the -sub-microsecond range, making it suitable for measurement and control systems. - -Problem Description -=================== - -Currently tripleo deploys NTP services by default, which provide millisecond -level time accuracy, but this is not enough for some cases: - -* Fault/Error events will include timestamps placed on the associated event - messages by detectors, with the purpose of accurately identifying - the time that the event occurred. Given that the target Fault Management - cycle timelines are in the tens of milliseconds for most critical faults, event - ordering may be reversed relative to actual time if the precision and accuracy - of clock synchronization are at the same level. - -* NFV C-RAN (Cloud Radio Access Network) is looking for better time - synchronization and distribution with micro-second level accuracy as an alternative - to NTP; PTP has been evaluated as one of the candidate technologies. - -This spec is not intended to cover all the possible ways of using PTP; rather, it -provides a basic deployment path for PTP in tripleo with the default -configuration set to support a PTP Ordinary Clock (slave mode); the master mode -PTP clock configuration is not in the scope of this spec, but shall be deployed -by the user to provide the time source for the PTP Ordinary Clock. Full support -of the PTP capability can be enhanced further based on this spec. - -Users should be aware that NTP and PTP cannot be configured together -on the same node without a coordinator program like timemaster, which is also -provided by the linuxptp package. How to configure and use timemaster is not in the -scope of this spec. - -Proposed Change -=============== - -Overview --------- - -Provide the capability to configure PTP as the time synchronization method: - -* Add PTP configuration file path in overcloud resource registry. - -* Add puppet-tripleo profile for PTP services. - -* Add tripleo-heat-templates composable service for PTP. - -Retain the current default behavior to deploy NTP as time synchronization -source: - -* The NTP services remain unchanged as the default time synchronization method. - -* The NTP services must be disabled on nodes where PTP is deployed. - -Alternatives ------------- - -The alternative is to continue to use NTP. - -Security Impact ---------------- - -Security issues originating from PTP will need to be considered.
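To illustrate the opt-in deployment path described in the overview, enabling the PTP composable service (and disabling NTP on the same nodes) could look roughly like this. The resource names, file path and parameter name are placeholders for this sketch, not a final interface::

    # Hypothetical environment file enabling PTP instead of NTP.
    resource_registry:
      OS::TripleO::Services::Ntp: OS::Heat::None
      OS::TripleO::Services::Ptp: ../puppet/services/time/ptp.yaml

    parameter_defaults:
      # PTP capable interface; 'nic1' is only the default placeholder.
      PtpInterface: nic1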
- -Other End User Impact ---------------------- - -Users will get more accurate time from PTP. - -Performance Impact ------------------- - -No impact with the default deployment mode, which uses NTP as the time source. - -Other Deployer Impact ---------------------- - -The operator who wants to use PTP should identify and provide the PTP capable -network interface name and make sure NTP is not deployed on the nodes where PTP -will be deployed. The default PTP network interface name is set to 'nic1'; the -user should change it to match the real interface name. By default, PTP will -not be deployed unless explicitly configured. - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - zshi - -Work Items ---------- - -* Puppet-tripleo profile for PTP services -* Tripleo-heat-templates composable service for PTP deployment - -Dependencies -============ - -* Puppet module for PTP services: ptp [1] -* The linuxptp RPM must be installed, and a PTP-capable NIC must be identified. -* Refer to linuxptp project page [2] for the list of drivers that support the - PHC (Physical Hardware Clock) subsystem. - -Testing -======= - -The deployment of PTP should be testable in CI. - -Documentation Impact -==================== - -The deployment documentation will need to be updated to cover the configuration of -PTP. - -References -========== - -* [0] https://standards.ieee.org/findstds/standard/1588-2008.html -* [1] https://github.com/redhat-nfvpe/ptp -* [2] http://linuxptp.sourceforge.net diff --git a/specs/queens/tripleo-routed-networks-deployment.rst b/specs/queens/tripleo-routed-networks-deployment.rst deleted file mode 100644 index c89ad8d8..00000000 --- a/specs/queens/tripleo-routed-networks-deployment.rst +++ /dev/null @@ -1,733 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================================== -TripleO Routed Networks Deployment (Spine-and-Leaf Clos) -======================================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-routed-networks-deployment - -TripleO uses shared L2 networks today, so each node is attached to the -provisioning network, and any other networks are also shared. This -significantly reduces the complexity required to deploy on bare metal, -since DHCP and PXE booting are simply done over a shared broadcast domain. -This also makes the network switch configuration easy, since there is only -a need to configure VLANs and ports, with no added complexity from dynamic -routing between all switches. - -This design has limitations, however, and becomes unwieldy beyond a certain -scale. As the number of nodes increases, the background Broadcast, -Unknown unicast, and Multicast (BUM) traffic also increases. This design also requires -all top-of-rack switches to trunk the VLANs back to the core switches, which -centralizes the layer 3 gateway, usually on a single core switch. That creates -a bottleneck which is not present in a Clos architecture. - -This spec serves as a detailed description of the overall problem set, and -applies to the master blueprint. The sub-blueprints for the various -implementation items also have their own associated specs. - -Problem Description -=================== - -Where possible, modern high-performance datacenter networks typically use -routed networking to increase scalability and reduce failure domains.
Using -routed networks makes it possible to optimize a Clos (also known as -"spine-and-leaf") architecture for scalability:: - - ,=========. ,=========. - | spine 1 |__ __| spine 2 | - '==|\=====\_ \__________________/ _/=====/|==' - | \_ \___ / \ ___/ _/ | ^ - | \___ / \ _______ / \ ___/ | |-- Dynamic routing (BGP, OSPF, - | / \ / \ / \ | v EIGRP) - ,------. ,------ ,------. ,------. - |leaf 1|....|leaf 2| |leaf 3|....|leaf 4| ======== Layer 2/3 boundary - '------' '------' '------' '------' - | | | | - | | | | - |-[serv-A1]=-| |-[serv-B1]=-| - |-[serv-A2]=-| |-[serv-B2]=-| - |-[serv-A3]=-| |-[serv-B3]=-| - Rack A Rack B - - - -In the above diagram, each server is connected via an Ethernet bond to both -top-of-rack leaf switches, which are clustered and configured as a virtual -switch chassis. Each leaf switch is attached to each spine switch. Within each -rack, all servers share a layer 2 domain. The subnets are local to the rack, -and the default gateway is the top-of-rack virtual switch pair. Dynamic routing -between the leaf switches and the spine switches permits East-West traffic -between the racks. - -This is just one example of a routed network architecture. The layer 3 routing -could also be done only on the spine switches, or there may even be distribution -level switches that sit in between the top-of-rack switches and the routed core. -The distinguishing feature that we are trying to enable is segregating local -systems within a layer 2 domain, with routing between domains. - -In a shared layer-2 architecture, the spine switches typically have to act in an -active/passive mode to act as the L3 gateway for the single shared VLAN. All -leaf switches must be attached to the active switch, and the limit on North-South -bandwidth is the connection to the active switch, so there is an upper bound on -the scalability. The Clos topology is favored because it provides horizontal -scalability. Additional spine switches can be added to increase East-West and -North-South bandwidth. Equal-cost multipath routing between switches ensures -that all links are utlized simultaneously. If all ports are full on the spine -switches, an additional tier can be added to connect additional spines, -each with their own set of leaf switches, providing hyperscale expandability. - -Each network device may be taken out of service for maintenance without the entire -network being offline. This topology also allows the switches to be configured -without physical loops or Spanning Tree, since the redundant links are either -delivered via bonding or via multiple layer 3 uplink paths with equal metrics. -Some advantages of using this architecture with separate subnets per rack are: - -* Reduced domain for broadcast, unknown unicast, and multicast (BUM) traffic. -* Reduced failure domain. -* Geographical separation. -* Association between IP address and rack location. -* Better cross-vendor support for multipath forwarding using equal-cost - multipath forwarding (ECMP) via L3 routing, instead of proprietary "fabric". - -This topology is significantly different from the shared-everything approach that -TripleO takes today. 
- -Problem Descriptions -==================== - -As this is a complex topic, it will be easier to break the problems down into -their constituent parts, based on which part of TripleO they affect: - -**Problem #1: TripleO uses DHCP/PXE on the Undercloud provisioning net (ctlplane).** - -Neutron on the undercloud does not yet support DHCP relays and multiple L2 -subnets, since it does DHCP/PXE directly on the provisioning network. - -Possible Solutions, Ideas, or Approaches: - -1. Modify Ironic and/or Neutron to support multiple DHCP ranges in the dnsmasq - configuration, use DHCP relay running on top-of-rack switches which - receives DHCP requests and forwards them to dnsmasq on the Undercloud. - There is a patch in progress to support that [11]_. -2. Modify Neutron to support DHCP relay. There is a patch in progress to - support that [10]_. - -Currently, if one adds a subnet to a network, Neutron DHCP agent will pick up -the changes and configure separate subnets correctly in ``dnsmasq``. For instance, -after adding a second subnet to the ``ctlplane`` network, here is the resulting -startup command for Neutron's instance of dnsmasq:: - - dnsmasq --no-hosts --no-resolv --strict-order --except-interface=lo \ - --pid-file=/var/lib/neutron/dhcp/aae53442-204e-4c8e-8a84-55baaeb496cf/pid \ - --dhcp-hostsfile=/var/lib/neutron/dhcp/aae53442-204e-4c8e-8a84-55baaeb496cf/host \ - --addn-hosts=/var/lib/neutron/dhcp/aae53442-204e-4c8e-8a84-55baaeb496cf/addn_hosts \ - --dhcp-optsfile=/var/lib/neutron/dhcp/aae53442-204e-4c8e-8a84-55baaeb496cf/opts \ - --dhcp-leasefile=/var/lib/neutron/dhcp/aae53442-204e-4c8e-8a84-55baaeb496cf/leases \ - --dhcp-match=set:ipxe,175 --bind-interfaces --interface=tap4ccef953-e0 \ - --dhcp-range=set:tag0,172.19.0.0,static,86400s \ - --dhcp-range=set:tag1,172.20.0.0,static,86400s \ - --dhcp-option-force=option:mtu,1500 --dhcp-lease-max=512 \ - --conf-file=/etc/dnsmasq-ironic.conf --domain=openstacklocal - -The router information gets put into the dhcp-optsfile, here are the contents -of /var/lib/neutron/dhcp/aae53442-204e-4c8e-8a84-55baaeb496cf/opts:: - - tag:tag0,option:classless-static-route,172.20.0.0/24,0.0.0.0,0.0.0.0/0,172.19.0.254 - tag:tag0,249,172.20.0.0/24,0.0.0.0,0.0.0.0/0,172.19.0.254 - tag:tag0,option:router,172.19.0.254 - tag:tag1,option:classless-static-route,169.254.169.254/32,172.20.0.1,172.19.0.0/24,0.0.0.0,0.0.0.0/0,172.20.0.254 - tag:tag1,249,169.254.169.254/32,172.20.0.1,172.19.0.0/24,0.0.0.0,0.0.0.0/0,172.20.0.254 - tag:tag1,option:router,172.20.0.254 - -The above options file will result in separate routers being handed out to -separate IP subnets. Furthermore, Neutron appears to "do the right thing" with -regard to routes for other subnets on the same network. We can see that the -option "classless-static-route" is given, with pointers to both the default -route and the other subnet(s) on the same Neutron network. - -In order to modify Ironic-Inspector to use multiple subnets, we will need to -extend instack-undercloud to support network segments. There is a patch in -review to support segments in instack undercloud [0]_. - -**Potential Workaround** - -One possibility is to use an alternate method to DHCP/PXE boot, such as using -DHCP configuration directly on the router, or to configure a host on the remote -network which provides DHCP and PXE URLs, then provides routes back to the -ironic-conductor and metadata server as part of the DHCP response. 
- -It is not always feasible for groups doing testing or development to configure -DHCP relay on the switches. For proof-of-concept implementations of -spine-and-leaf, we may want to configure all provisioning networks to be -trunked back to the Undercloud. This would allow the Undercloud to provide DHCP -for all networks without special switch configuration. In this case, the -Undercloud would act as a router between subnets/VLANs. This should be -considered a small-scale solution, as this is not as scalable as DHCP relay. -The configuration file for dnsmasq is the same whether all subnets are local or -remote, but dnsmasq may have to listen on multiple interfaces (today it only -listens on br-ctlplane). The dnsmasq process currently runs with -``--bind-interface=tap-XXX``, but the process will need to be run with either -binding to multiple interfaces, or with ``--except-interface=lo`` and multiple -interfaces bound to the namespace. - -For proof-of-concept deployments, as well as testing environments, it might -make sense to run a DHCP relay on the Undercloud, and trunk all provisioning -VLANs back to the Undercloud. This would allow dnsmasq to listen on the tap -interface, with DHCP requests from each provisioning VLAN forwarded to that -interface. The -downside of this approach is that the Undercloud would need to have IP -addresses on each of the trunked interfaces. - -Another option is to configure dedicated hosts or VMs to be used as DHCP relay -and router for subnets on multiple VLANs, all of which would be trunked to the -relay/router host, thus acting exactly like routing switches. - ------------- - -**Problem #2: Neutron's model for a segmented network that spans multiple L2 -domains uses the segment object to allow multiple subnets to be assigned to -the same network. This functionality needs to be integrated into the -Undercloud.** - -Possible Solutions, Ideas, or Approaches: - -1. Implement Neutron segments on the undercloud. - -The spec for Neutron routed network segments [1]_ provides a schema that we can -use to model a routed network. By implementing support for network segments, we -can assign Ironic nodes to networks on routed subnets. This allows us -to continue to use Neutron for IP address management, as ports are assigned by -Neutron and tracked in the Neutron database on the Undercloud. See approach #1 -below. - -2. Multiple Neutron networks (1 set per rack), to model all L2 segments. - -Using a different set of networks in each rack provides us with -the flexibility to use different network architectures on a per-rack basis. -Each rack could have its own set of networks, and we would no longer have -to provide all networks in all racks. Additionally, a split-datacenter -architecture would naturally have a different set of networks in each -site, so this approach makes sense. This is detailed in approach #2 below. - -3. Multiple subnets per Neutron network. - -This is probably the best approach for provisioning, since Neutron is -already able to handle DHCP relay with multiple subnets as part of the -same network. Additionally, this allows a clean separation between local -subnets associated with provisioning, and networks which are used -in the overcloud (such as External networks in two different datacenters). -This is covered in more detail in approach #3 below. - -4. Use another system for IPAM, instead of Neutron.
- -Although we could use a database, flat file, or some other method to keep -track of IP addresses, Neutron as an IPAM back-end provides many integration -benefits. Neutron integrates DHCP, hardware switch port configuration (through -the use of plugins), integration in Ironic, and other features such as -IPv6 support. This has been deemed to be infeasible due to the level of effort -required in replacing both Neutron and the Neutron DHCP server (dnsmasq). - -**Approaches to Problem #2:** - -Approach 1 (Implement Neutron segments on the Undercloud): - -The Neutron segments model provides a schema in Neutron that allows us to -model the routed network. Using multiple subnets provides the flexibility -we need without creating exponentially more resources. We would create the same -provisioning network that we do today, but use multiple segments associated -to different routed subnets. The disadvantage to this approach is that it makes -it impossible to represent network VLANs with more than one IP subnet (Neutron -technically supports more than one subnet per port). Currently TripleO only -supports a single subnet per isolated network, so this should not be an issue. - -Approach 2 (Multiple Neutron networks (1 set per rack), to model all L2 segments): - -We will be using multiple networks to represent isolated networks in multiple -L2 domains. One sticking point is that although Neutron will configure multiple -routes for multiple subnets within a given network, we need to be able to both -configure static IPs and routes, and be able to scale the network by adding -additional subnets after initial deployment. - -Since we control addresses and routes on the host nodes using a -combination of Heat templates and os-net-config, it is possible to use -static routes to supernets to provide L2 adjacency. This approach only -works for non-provisioning networks, since we rely on Neutron DHCP servers -providing routes to adjacent subnets for the provisioning network. - -Example: -Suppose 2 subnets are provided for the Internal API network: ``172.19.1.0/24`` -and ``172.19.2.0/24``. We want all Internal API traffic to traverse the Internal -API VLANs on both the controller and a remote compute node. The Internal API -network uses different VLANs for the two nodes, so we need the routes on the -hosts to point toward the Internal API gateway instead of the default gateway. -This can be provided by a supernet route to 172.19.x.x pointing to the local -gateway on each subnet (e.g. 172.19.1.1 and 172.19.2.1 on the respective -subnets). This could be represented in os-net-config with the following:: - - - - type: interface - name: nic3 - addresses: - - - ip_netmask: {get_param: InternalApiIpSubnet} - routes: - - - ip_netmask: {get_param: InternalApiSupernet} - next_hop: {get_param: InternalApiRouter} - -Where InternalApiIpSubnet is the IP address on the local subnet, -InternalApiSupernet is '172.19.0.0/16', and InternalApiRouter is either -172.19.1.1 or 172.19.2.1 depending on which local subnet the host belongs to. - -The end result of this is that each host has a set of IP addresses and routes -that isolate traffic by function. In order for the return traffic to also be -isolated by function, similar routes must exist on both hosts, pointing to the -local gateway on the local subnet for the larger supernet that contains all -Internal API subnets. 
- -The downside of this is that we must require proper supernetting, and this may -lead to larger blocks of IP addresses being used to provide ample space for -scaling growth. For instance, in the example above an entire /16 network is set -aside for up to 255 local subnets for the Internal API network. This could be -changed into a more reasonable space, such as /18, if the number of local -subnets will not exceed 64, etc. This will be less of an issue with native IPv6 -than with IPv4, where scarcity is much more likely. - -Approach 3 (Multiple subnets per Neutron network): - -The approach we will use for the provisioning network will be to use multiple -subnets per network, using Neutron segments. This will allow us to take -advantage of Neutron's ability to support multiple networks with DHCP relay. -The DHCP server will supply the necessary routes via DHCP until the nodes are -configured with a static IP post-deployment. - ---------- - -**Problem #3: Ironic introspection DHCP doesn't yet support DHCP relay** - -This makes it difficult to do introspection when the hosts are not on the same L2 -domain as the controllers. Patches are either merged or in review to support -DHCP relay. - -Possible Solutions, Ideas, or Approaches: - -1. A patch to support a dnsmasq PXE filter driver has been merged. This will - allow us to support selective DHCP when using DHCP relay (where the packet - is not coming from the MAC of the host but rather the MAC of the switch) - [12]_. - -2. A patch has been merged to puppet-ironic to support multiple DHCP subnets - for Ironic Inspector [13]_. - -3. A patch is in review to add support for multiple subnets for the - provisioning network in the instack-undercloud scripts [14]_. - -For more information about solutions, please refer to the -tripleo-routed-networks-ironic-inspector blueprint [5]_ and spec [6]_. - -------- - -**Problem #4: The IP addresses on the provisioning network need to be -static IPs for production.** - -Possible Solutions, Ideas, or Approaches: - -1. Dan Prince wrote a patch [9]_ in Newton to convert the ctlplane network - addresses to static addresses post-deployment. This will need to be - refactored to support multiple provisioning subnets across routers. - -Solution Implementation - -This work is done and merged for the legacy use case. During the -initial deployment, the nodes receive their IP address via DHCP, but during -Heat deployment the os-net-config script is called, which writes static -configuration files for the NICs with static IPs. - -This work will need to be refactored to support assigning IPs from the -appropriate subnet, but the work will be part of the TripleO Heat Template -refactoring listed in Problems #6, and #7 below. - -For the deployment model where the IPs are specified (ips-from-pool-all.yaml), -we need to develop a model where the Control Plane IP can be specified -on multiple deployment subnets. This may happen in a later cycle than the -initial work being done to enable routed networks in TripleO. For more -information, reference the tripleo-predictable-ctlplane-ips blueprint [7]_ -and spec [8]_. - ------- - -**Problem #5: Heat Support For Routed Networks** - -The Neutron routed networks extensions were only added in recent releases, and -there was a dependency on TripleO Heat Templates. - -Possible Solutions, Ideas or Approaches: - -1. Add the required objects to Heat. 
At minimum, we will probably have to - add ``OS::Neutron::Segment``, which represents layer 2 segments; - ``OS::Neutron::Network`` will be updated to support the ``l2-adjacency`` - attribute, and ``OS::Neutron::Subnet`` and ``OS::Neutron::Port`` would be extended - to support the ``segment_id`` attribute. - -Solution Implementation: - -Heat now supports the OS::Neutron::Segment resource. For example:: - - heat_template_version: 2015-04-30 - ... - resources: - ... - the_resource: - type: OS::Neutron::Segment - properties: - description: String - name: String - network: String - network_type: String - physical_network: String - segmentation_id: Integer - -This work has been completed in Heat with this review [15]_. - ------- - -**Problem #6: Static IP assignment: Choosing static IPs from the correct -subnet** - -Some roles, such as Compute, can likely be placed in any subnet, but we will -need to keep certain roles co-located within the same set of L2 domains. For -instance, whatever role is providing Neutron services will need all controllers -in the same L2 domain for VRRP to work properly. - -The network interfaces will be configured using templates that create -configuration files for os-net-config. The IP addresses that are written to each -node's configuration will need to be on the correct subnet for each host. In -order for Heat to assign ports from the correct subnets, we will need to have a -host-to-subnets mapping. - -Possible Solutions, Ideas or Approaches: - -1. The simplest implementation of this would probably be a mapping of role/index - to a set of subnets, so that it is known to Heat that Controller-1 is in - subnet set X and Compute-3 is in subnet set Y. -2. We could associate particular subnets with roles, and then use one role - per L2 domain (such as per-rack). -3. The roles and templates should be refactored to allow for dynamic IP - assignment within subnets associated with the role. We may wish to evaluate - the possibility of storing the routed subnets in Neutron using the routed - networks extensions that are still under development. This would provide - additional flexibility, but is probably not required to implement separate - subnets in each rack. -4. A scalable long-term solution is to map which subnet the host is on - during introspection. If we can identify the correct subnet for each - interface, then we can correlate that with IP addresses from the correct - allocation pool. This would have the advantage of not requiring a static - mapping of role to node to subnet. In order to do this, additional - integration would be required between Ironic and Neutron (to make Ironic - aware of multiple subnets per network, and to add the ability to make - that association during introspection). - -Solution Implementation: - -Solutions 1 and 2 above have been implemented in the "composable roles" series -of patches [16]_. The initial implementation uses separate Neutron networks -for different L2 domains. These templates are responsible for assigning the -isolated VLANs used for data plane and overcloud control planes, but do not -address the provisioning network. Future work may refactor the non-provisioning -networks to use segments, but for now non-provisioning networks must use -different networks for different roles. - -Ironic autodiscovery may allow us to determine the subnet where each node -is located without manual entry. More work is required to automate this -process.
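As an illustration of the "one role per L2 domain" approach described in solutions 1 and 2, a per-rack Compute role could reference rack-local networks in ``roles_data.yaml``. The role and network names below are examples only; the exact schema is defined by the composable roles/networks work::

    # Illustrative roles_data.yaml fragment: a Compute role bound to the
    # networks of rack 1 (role and network names are hypothetical).
    - name: ComputeRack1
      HostnameFormatDefault: '%stackname%-compute-rack1-%index%'
      networks:
        - InternalApiRack1
        - StorageRack1
        - TenantRack1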
- ------- -**Problem #7: Isolated Networking Requires Static Routes to Ensure Correct VLAN -is Used** - -In order to continue using the Isolated Networks model, routes will need to be -in place on each node, to steer traffic to the correct VLAN interfaces. The -routes are written when os-net-config first runs, but may change. We -can't just rely on the specific routes to other subnets, since the number of -subnets will increase or decrease as racks are added or taken away. Rather than -try to deal with constantly changing routes, we should use static routes that -will not need to change, to avoid disruption on a running system. - -Possible Solutions, Ideas or Approaches: - -1. Require that supernets are used for various network groups. For instance, - all the Internal API subnets would be part of a supernet; for example, - 172.17.0.0/16 could be used and broken up into many smaller subnets, such - as /24. This would simplify the routes, since only a single route for - 172.17.0.0/16 would be required pointing to the local router on the - 172.17.x.0/24 network. -2. Modify os-net-config so that routes can be updated without bouncing - interfaces, and then run os-net-config on all nodes when scaling occurs. - A review for this functionality was considered and abandoned [3]_. - The patch was determined to have the potential to lead to instability. - -os-net-config configures static routes for each interface. If we can keep the -routing simple (one route per functional network), then we would be able to -isolate traffic onto functional VLANs like we do today. - -It would be a change to the existing workflow to have os-net-config run on -updates as well as deployment, but if this were a non-impacting event (the -interfaces didn't have to be bounced), that would probably be OK. - -At a later time, the possibility of using dynamic routing should be considered, -since it reduces the possibility of user error and is better suited to -centralized management. SDN solutions are one way to provide this, or other -approaches may be considered, such as setting up OVS tunnels. - -Proposed Change -=============== -The proposed changes are discussed below. - -Overview --------- - -In order to provide spine-and-leaf networking for deployments, several changes -will have to be made to TripleO: - -1. Support for DHCP relay in Ironic and Neutron DHCP servers. Implemented in - patch [15]_ and the patch series [17]_. -2. Refactoring of TripleO Heat Templates network isolation to support multiple - subnets per isolated network, as well as per-subnet and supernet routes. - The bulk of this work is done in the patch series [16]_ and in patch [10]_. -3. Changes to Infra CI to support testing. -4. Documentation updates. - -Alternatives ------------- - -The approach outlined here is very prescriptive, in that the networks must be -known ahead of time, and the IP addresses must be selected from the appropriate -pool. This is due to the reliance on static IP addresses provided by Heat. - -One alternative approach is to use DHCP servers to assign IP addresses on all -hosts on all interfaces. This would simplify configuration within the Heat -templates and environment files. Unfortunately, this was the original approach -of TripleO, and it was deemed insufficient by end-users, who wanted stability -of IP addresses, and didn't want to have an external dependency on DHCP.
- -Another approach is to use the DHCP server functionality in the network switch -infrastructure in order to PXE boot systems, then assign static IP addresses -after the PXE boot is done via DHCP. This approach only solves for part of the -requirement: the net booting. It does not solve the desire to have static IP -addresses on each network. This could be achieved by having static IP addresses -in some sort of per-node map. However, this approach is not as scalable as -programatically determining the IPs, since it only applies to a fixed number of -hosts. We want to retain the ability of using Neutron as an IP address -management (IPAM) back-end, ideally. - -Another approach which was considered was simply trunking all networks back -to the Undercloud, so that dnsmasq could respond to DHCP requests directly, -rather than requiring a DHCP relay. Unfortunately, this has already been -identified as being unacceptable by some large operators, who have network -architectures that make heavy use of L2 segregation via routers. This also -won't work well in situations where there is geographical separation between -the VLANs, such as in split-site deployments. - -Security Impact ---------------- - -One of the major differences between spine-and-leaf and standard isolated -networking is that the various subnets are connected by routers, rather than -being completely isolated. This means that without proper ACLs on the routers, -networks which should be private may be opened up to outside traffic. - -This should be addressed in the documentation, and it should be stressed that -ACLs should be in place to prevent unwanted network traffic. For instance, the -Internal API network is sensitive in that the database and message queue -services run on that network. It is supposed to be isolated from outside -connections. This can be achieved fairly easily if *supernets* are used, so -that if all Internal API subnets are a part of the ``172.19.0.0/16`` supernet, -an ACL rule will allow only traffic between Internal API IPs (this is a -simplified example that could be applied to any Internal API VLAN, or as a -global ACL):: - - allow traffic from 172.19.0.0/16 to 172.19.0.0/16 - deny traffic from * to 172.19.0.0/16 - -Other End User Impact ---------------------- - -Deploying with spine-and-leaf will require additional parameters to -provide the routing information and multiple subnets required. This will have -to be documented. Furthermore, the validation scripts may need to be updated -to ensure that the configuration is validated, and that there is proper -connectivity between overcloud hosts. - -Performance Impact ------------------- - -Much of the traffic that is today made over layer 2 will be traversing layer -3 routing borders in this design. That adds some minimal latency and overhead, -although in practice the difference may not be noticeable. One important -consideration is that the routers must not be too overcommitted on their -uplinks, and the routers must be monitored to ensure that they are not acting -as a bottleneck, especially if complex access control lists are used. - -Other Deployer Impact ---------------------- - -A spine-and-leaf deployment will be more difficult to troubleshoot than a -deployment that simply uses a set of VLANs. The deployer may need to have -more network expertise, or a dedicated network engineer may be needed to -troubleshoot in some cases. - -Developer Impact ----------------- - -Spine-and-leaf is not easily tested in virt environments. 
This should be -possible, but due to the complexity of setting up libvirt bridges and -routes, we may want to provide a simulation of spine-and-leaf for use in -virtual environments. This may involve building multiple libvirt bridges -and routing between them on the Undercloud, or it may involve using a -DHCP relay on the virt-host as well as routing on the virt-host to simulate -a full routing switch. A plan for development and testing will need to be -developed, since not every developer can be expected to have a routed -environment to work in. It may take some time to develop a routed virtual -environment, so initial work will be done on bare metal. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Dan Sneddon - -Approver(s) ------------ - -Primary approver: - Emilien Macchi - -Work Items ----------- - -1. Add static IP assignment to Control Plane [DONE] -2. Modify Ironic Inspector ``dnsmasq.conf`` generation to allow export of - multiple DHCP ranges, as described in Problem #1 and Problem #3. -3. Evaluate the Routed Networks work in Neutron, to determine if it is required - for spine-and-leaf, as described in Problem #2. -4. Add OS::Neutron::Segment and l2-adjacency support to Heat, as described - in Problem #5. This may or may not be a dependency for spine-and-leaf, based - on the results of work item #3. -5. Modify the Ironic-Inspector service to record the host-to-subnet mappings, - perhaps during introspection, to address Problem #6. -6. Add parameters to Isolated Networking model in Heat to support supernet - routes for individual subnets, as described in Problem #7. -7. Modify Isolated Networking model in Heat to support multiple subnets, as - described in Problem #8. -8. Add support for setting routes to supernets in os-net-config NIC templates, - as described in the proposed solution to Problem #2. -9. Implement support for iptables on the Controller, in order to mitigate - the APIs potentially being reachable via remote routes. Alternatively, - document the mitigation procedure using ACLs on the routers. -10. Document the testing procedures. -11. Modify the documentation in tripleo-docs to cover the spine-and-leaf case. - - -Implementation Details ----------------------- - -Workflow: - -1. Operator configures DHCP networks and IP address ranges -2. Operator imports baremetal instackenv.json -3. When introspection or deployment is run, the DHCP server receives the DHCP - request from the baremetal host via DHCP relay -4. If the node has not been introspected, reply with an IP address from the - introspection pool* and the inspector PXE boot image -5. If the node already has been introspected, then the server assumes this is - a deployment attempt, and replies with the Neutron port IP address and the - overcloud-full deployment image -6. The Heat templates are processed which generate os-net-config templates, and - os-net-config is run to assign static IPs from the correct subnets, as well - as routes to other subnets via the router gateway addresses. - -* The introspection pool will be different for each provisioning subnet. - -When using spine-and-leaf, the DHCP server will need to provide an introspection -IP address on the appropriate subnet, depending on the information contained in -the DHCP relay packet that is forwarded by the segment router. 
dnsmasq will -automatically match the gateway address (GIADDR) of the router that forwarded -the request to the subnet where the DHCP request was received, and will respond -with an IP and gateway appropriate for that subnet. - -The above workflow for the DHCP server should allow for provisioning IPs on -multiple subnets. - -Dependencies -============ - -There may be a dependency on the Neutron Routed Networks. This won't be clear -until a full evaluation is done on whether we can represent spine-and-leaf -using only multiple subnets per network. - -There will be a dependency on routing switches that perform DHCP relay service -for production spine-and-leaf deployments. - -Testing -======= - -In order to properly test this framework, we will need to establish at least -one CI test that deploys spine-and-leaf. As discussed in this spec, it isn't -necessary to have a full routed bare metal environment in order to test this -functionality, although there is some work to get it working in virtual -environments such as OVB. - -For bare metal testing, it is sufficient to trunk all VLANs back to the -Undercloud, then run DHCP proxy on the Undercloud to receive all the -requests and forward them to br-ctlplane, where dnsmasq listens. This -will provide a substitute for routers running DHCP relay. For Neutron -DHCP, some modifications to the iptables rule may be required to ensure -that all DHCP requests from the overcloud nodes are received by the -DHCP proxy and/or the Neutron dnsmasq process running in the dhcp-agent -namespace. - -Documentation Impact -==================== - -The procedure for setting up a dev environment will need to be documented, -and a work item mentions this requirement. - -The TripleO docs will need to be updated to include detailed instructions -for deploying in a spine-and-leaf environment, including the environment -setup. Covering specific vendor implementations of switch configurations -is outside this scope, but a specific overview of required configuration -options should be included, such as enabling DHCP relay (or "helper-address" -as it is also known) and setting the Undercloud as a server to receive -DHCP requests. - -The updates to TripleO docs will also have to include a detailed discussion -of choices to be made about IP addressing before a deployment. If supernets -are to be used for network isolation, then a good plan for IP addressing will -be required to ensure scalability in the future. - -References -========== - -.. [0] `Review: TripleO Heat Templates: Tripleo routed networks ironic inspector, and Undercloud `_ -.. [1] `Spec: Routed Networks for Neutron `_ -.. [3] `Review: Modify os-net-config to make changes without bouncing interface `_ -.. [5] `Blueprint: Modify TripleO Ironic Inspector to PXE Boot Via DHCP Relay `_ -.. [6] `Spec: Modify TripleO Ironic Inspector to PXE Boot Via DHCP Relay `_ -.. [7] `Blueprint: User-specifiable Control Plane IP on TripleO Routed Isolated Networks `_ -.. [8] `Spec: User-specifiable Control Plane IP on TripleO Routed Isolated Networks `_ -.. [9] `Review: Configure ctlplane network with a static IP `_ -.. [10] `Review: Neutron: Make "on-link" routes for subnets optional `_ -.. [11] `Review: Ironic Inspector: Make "on-link" routes for subnets optional `_ -.. [12] `Review: Ironic Inspector: Introducing a dnsmasq PXE filter driver `_ -.. [13] `Review: Multiple DHCP Subnets for Ironic Inspector `_ -.. [14] `Review: Instack Undercloud: Add support for multiple inspection subnets `_ -.. 
[15] `Review: DHCP Agent: Separate local from non-local subnets `_
-.. [16] `Review Series: topic:bp/composable-networks `_
-.. [17] `Review Series: project:openstack/networking-baremetal `_
diff --git a/specs/queens/tripleo_ansible_upgrades_workflow.rst b/specs/queens/tripleo_ansible_upgrades_workflow.rst
deleted file mode 100644
index ca3839ee..00000000
--- a/specs/queens/tripleo_ansible_upgrades_workflow.rst
+++ /dev/null
@@ -1,190 +0,0 @@
-
-..
-   This work is licensed under a Creative Commons Attribution 3.0 Unported
-   License.
-
-   http://creativecommons.org/licenses/by/3.0/legalcode
-
-==========================================================
-TripleO - Ansible upgrade Workflow with UI integration
-==========================================================
-
-Include the URL of your launchpad blueprint:
-
-https://blueprints.launchpad.net/tripleo/+spec/major-upgrade-workflow
-
-During the Pike cycle the minor update and some parts of the major upgrade
-are significantly different from any previous cycle, in that they are *not*
-being delivered onto nodes via Heat stack update. Rather, Heat stack update is
-used only to collect, but not execute, the relevant ansible tasks defined in
-each of the service manifests_ as upgrade_tasks_ or update_tasks_ accordingly.
-These tasks are then written as stand-alone ansible playbooks in the stack
-outputs_.
-
-These 'config' playbooks are then downloaded using the *openstack overcloud
-config download* utility_ and finally executed to deliver the actual
-upgrade or update. See bugs 1715557_ and 1708115_ for more information
-(or pointers/reviews) about this mechanism as used during the P cycle.
-
-For Queens, and as discussed at the Denver PTG_, we aim to extend this approach
-to include the controlplane upgrade too. That is, instead of using Heat
-SoftwareConfig and Deployments_ to invoke_ ansible, we should instead collect
-the upgrade_tasks for the controlplane nodes into ansible playbooks that can
-then be invoked to deliver the actual upgrade.
-
-Problem Description
-===================
-
-Whilst it has continually improved in each cycle, the complexity and difficulty
-of debugging or understanding what has been executed at any given point of the
-upgrade remain one of the biggest complaints from operators about the TripleO
-upgrades workflow. In the P cycle and as discussed above, the minor version
-update and some parts of the 'non-controller' upgrade have already moved to the
-model being proposed here, i.e. generate ansible playbooks with an initial heat
-stack update and then execute them.
-
-If we are to use this approach for all parts of the upgrade, including the
-controlplane services, then we will also need a mistral workbook that can
-handle the download and execution of the ansible-playbook invocations. With
-this kind of ansible-driven workflow, executed by mistral actions/workbooks, we
-can for the first time consider integration with the UI for upgrades and
-updates. This aligns well with the effort_ by the UI team for feature parity in
-CLI/UI for Queens.
It should also be noted that there is already some work underway to -adding the required mistral actions, at least for the minor update for Pike -deployments in changes 487488_ and 487496_ - -Implementing a fully ansible-playbook delivered workflow for the entire major -upgrade workflow will offer a number of benefits: - - * very short initial heat stack update to generate the playbooks - * easier to follow and understand what is happening at a given step of the upgrade - * easier to debug and re-run any particular step of the upgrade - * implies full python-tripleoclient and mistral workbook support for the - ansible-playbook invocations. - * can consider integrating upgrades/updates into the UI, for the first time - -Proposed Change -=============== - -We will need an initial heat stack update to populate the -upgrade_tasks_playbook into the overcloud stack output (the cli is just a -suggestion): - - * openstack overcloud upgrade --init --init-commands [ "sudo curl -L -o /etc/yum.repos.d/delorean-pike.repo https://trunk.rdoproject.org/centos7-ocata/current/pike.repo", - "sudo yum install my_package", ... ] - -The first step of the upgrade will be used to deliver any required common -upgrade initialisation, such as switching repos to the target version, -installing any new packages required during the upgrade, and populating the upgrades playbooks. - -Then the operator will run the upgrade targeting specific nodes: - - * openstack overcloud upgrade --nodes [overcloud-novacompute-0, overcloud-novacompute-1] or - openstack overcloud upgrade --nodes "Compute" - -Download and execute the ansible playbooks on particular specified set of -nodes. Ideally we will make it possible to specify a role name with the -playbooks being invoked in a rolling fashion on each node. - -One of the required changes is to convert all the service templates to have -'when' conditionals instead of the current 'stepN'. For Pike we did this in -the client_ to avoid breaking the heat driven upgrade workflow still in use -for the controlplane during the Ocata to Pike upgrade. This will allow us to -use the 'ansible-native' loop_ control we are currently using in the generated -ansible playbooks. - - -Other End User Impact ---------------------- - -There will be significant changes to the workflow and cli the operator uses -for the major upgrade as documented above. - -Performance Impact ------------------- - -The initial Heat stack update will not deliver any of the puppet or docker -config to nodes since the DeploymentSteps will be disabled_ as we currently -do for Pike minor update. This will mean a much shorter heat stack update - -exact numbers TBD but 'minutes not hours'. - -Developer Impact ----------------- - -Should make it easier for developers to debug particular parts of the upgrades -workflow. - - -Implementation -============== - -Assignee(s) ------------ -Contributors: -Marios Andreou (marios) -Mathieu Bultel (matbu) -Sofer Athlang Guyot (chem) -Steve Hardy (shardy) -Carlos Ccamacho (ccamacho) -Jose Luis Franco Arza (jfrancoa) -Marius Cornea (mcornea) -Yurii Prokulevych (yprokule) -Lukas Bezdicka (social) -Raviv Bar-Tal (rbartal) -Amit Ugol (amitu) - -Work Items ----------- - - * Remove steps and add when for all the ansible upgrade tasks, minor - update tasks, deployment steps, post_upgrade_tasks - * Need mistral workflows that can invoke the required stages of the - workflow (--init and --nodes). There is some existing work in this - direction in 463765_. - * CLI/python-tripleoclient changes required. 
Related to the previous - item there is some work started on this in 463728_. - * UI work - we will need to collaborate with the UI team for the - integration. We have never had UI driven upgrade or updates. - * CI: Implement a simple job (one nodes, just controller, which does the - heat-setup-output and run ansible --nodes Controller) with keystone - only upgrade. Then iterate on this as we can add service upgrade_tasks. - * Docs! - -Testing -======= - -We will aim to land a 'keystone-only' job asap which will be updated as the various -changes required to deliver this spec are closer to merging. For example we -may deploy only a very small subset of services (e.g. first keystone) and then iterate as changes -related to this spec are proposed. - -Documentation Impact -==================== - -We should also track changes in the documented upgrades workflow since as -described here it is going to change significantly both internally as well as -the interface exposed to an operator. - -References -========== -Check the source_ for links - -.. _manifests: https://github.com/openstack/tripleo-heat-templates/tree/master/docker/services -.. _upgrade_tasks: https://github.com/openstack/tripleo-heat-templates/blob/211d7f32dc9cda261e96c3f5e0e1e12fc0afdbb5/docker/services/nova-compute.yaml#L147 -.. _update_tasks: https://github.com/openstack/tripleo-heat-templates/blob/60f3f10442f3b4cedb40def22cf7b6938a39b391/puppet/services/tripleo-packages.yaml#L59 -.. _outputs: https://github.com/openstack/tripleo-heat-templates/blob/3dcc9b30e9991087b9e898e25685985df6f94361/common/deploy-steps.j2#L324-L372 -.. _utility: https://github.com/openstack/python-tripleoclient/blob/27bba766daa737a56a8d884c47cca1c003f16e3f/tripleoclient/v1/overcloud_config.py#L26-L154 -.. _1715557: https://bugs.launchpad.net/tripleo/+bug/1715557 -.. _1708115: https://bugs.launchpad.net/tripleo/+bug/1708115 -.. _PTG: https://etherpad.openstack.org/p/tripleo-ptg-queens-upgrades -.. _Deployments: https://github.com/openstack/tripleo-heat-templates/blob/f4730632a51dec2b9be6867d58184fac3b8a11a5/common/major_upgrade_steps.j2.yaml#L132-L173 -.. _invoke: https://github.com/openstack/tripleo-heat-templates/blob/f4730632a51dec2b9be6867d58184fac3b8a11a5/puppet/upgrade_config.yaml#L21-L50 -.. _effort: http://lists.openstack.org/pipermail/openstack-dev/2017-September/122089.html -.. _487488: https://review.openstack.org/#/c/487488/ -.. _487496: https://review.openstack.org/#/c/487496/ -.. _client: https://github.com/openstack/python-tripleoclient/blob/4d342826d6c3db38ee88dccc92363b655b1161a5/tripleoclient/v1/overcloud_config.py#L63 -.. _loop: https://github.com/openstack/tripleo-heat-templates/blob/fe2acfc579295965b5f39c5ef7a34bea35f3d6bf/common/deploy-steps.j2#L364-L365 -.. _disabled: https://review.openstack.org/#/c/487496/21/tripleo_common/actions/package_update.py@63 -.. _source: https://raw.githubusercontent.com/openstack/tripleo-specs/master/specs/queens/tripleo_ansible_upgrades_workflow.rst -.. _463728: https://review.openstack.org/#/c/463728/ -.. _463765: https://review.openstack.org/#/c/463765/ diff --git a/specs/queens/triplo-ovs-hw-offload.rst b/specs/queens/triplo-ovs-hw-offload.rst deleted file mode 100644 index 03e4b8f6..00000000 --- a/specs/queens/triplo-ovs-hw-offload.rst +++ /dev/null @@ -1,141 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Adding OVS Hardware Offload to TripleO -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-ovs-hw-offload - -OVS Hardware Offload leverages SR-IOV technology to control the SR-IOV -VF using VF representor port. OVS 2.8.0 supports the hw-offload option which -allows to offload OVS datapath rule to hardware using linux traffic control -tool and the VF representor port. This feature accelerates the OVS -with a SR-IOV NIC which support switchdev mode. - -Problem Description -=================== - -Today the installation and configuration of OVS hardware offload feature is -done manually after overcloud deployment. It shall be automated via tripleo. - -Proposed Change -=============== - -Overview --------- - -* Configure the SR-IOV NIC to be in switchdev mode using the following - syntax :: for NeutronSriovNumVFs. - mode can be legacy or switchdev -* Configure the OVS with other_config:hw-offload. The options can - be added for the cluster without side effects, because if then NIC doesn't - support OVS will fall-back to kernel datapath. - -* Nova scheduler should be configured to use the PciPassthroughFilter - (same SR-IOV) -* Nova compute should be configured with passthrough_whitelist (same SR-IOV) - -Alternatives ------------- - -None - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -* OVS Hardware Offload leverage the SR-IOV technology to provides near - native I/O performance for each virtual machine that managed by OpenVswitch. - -Other Deployer Impact ---------------------- - -* The operator shall ensure that the BIOS supports VT-d/IOMMU virtualization - technology on the compute nodes. - -* IOMMU needs to be enabled in the Compute+SR-IOV nodes. Boot parameters - (intel_iommu=on or amd_iommu=pt) shall be added in the grub.conf, using the - PreNetworkConfig. - -* Post deployment, operator shall - - * Create neutron ports prior to creating VM’s (nova boot) - openstack port create --vnic-type direct --binding-profile '{"capabilities": ["switchdev"]}' port1 - - * Create the VM with the required flavor and SR-IOV port id - openstack server create --image cirros-mellanox_sriov --port=port1 --flavor m1.tiny vm_a1 - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - waleedm (waleedm@mellanox.com - -Other contributors: - moshele (moshele@mellanox.com) - -Work Items ----------- - -* Update tripleo::host::sriov::numvfs_persistence to allow configure SR-IOV - in switchdev mode. extending the vf_defs to - ::. Mode can be legacy which is default - SR-IOV or switchdev which is used for ovs hardware offload. -* Add a template parameter called NeutronOVSHwOffload to enable. -* provide environment YAML for OVS hardware offload in tripleo-heat-templates. - -Dependencies -============ - -None - - -Testing -======= - -* Since SR-IOV needs specific hardware support, this feature can be tested - under third party CI. We hope to provide Mellanox CI to SR-IOV and this - feature. 
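-
-As a point of reference for such a third party CI job (or for manual
-verification), the setting this spec automates corresponds roughly to the
-following OVS commands. This is only an illustrative sketch and not
-necessarily what the generated TripleO configuration will run::
-
-   # enable hardware offload in OVS; if the NIC does not support switchdev
-   # mode, OVS falls back to the kernel datapath
-   sudo ovs-vsctl set Open_vSwitch . other_config:hw-offload=true
-   sudo systemctl restart openvswitch
-
-   # verify the option is set; the expected output is "true"
-   sudo ovs-vsctl get Open_vSwitch . other_config:hw-offload
-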
- -Documentation Impact -==================== - -None - -References -========== - -* Introduction to SR-IOV - http://goo.gl/m7jP3 - -* SR-IOV OVS hardware offload netdevconf - http://netdevconf.org/1.2/papers/efraim-gerlitz-sriov-ovs-final.pdf - -* OVS hardware offload in OpenVswitch - https://mail.openvswitch.org/pipermail/ovs-dev/2017-April/330606.html - -* OpenStack OVS mechanism driver support in neutron/nova/os-vif - https://review.openstack.org/#/c/398265/ - https://review.openstack.org/#/c/275616/ - https://review.openstack.org/#/c/460278/ diff --git a/specs/rocky/custom-validations.rst b/specs/rocky/custom-validations.rst deleted file mode 100644 index 374fc979..00000000 --- a/specs/rocky/custom-validations.rst +++ /dev/null @@ -1,160 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Add Support for Custom TripleO Validations -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/custom-validations - -All validations are currently stored in a single directory. This makes -it inconvenient to try and write new validations, update from a remote -repository or to add an entirely new (perhaps private) source. - - -Problem Description -=================== - -* The deployer wants to develop and test their own validations in a - personal checkout without risking changes to the default ones. - -* The deployer wants to use a stable release of TripleO but consume - the latest validations because they are non-disruptive and check for - more stuff. - -* A third party has developed validations specific to their product - that they don't want to or can't include in the tripleo-validations - repository. - - - -Proposed Change -=============== - -Overview --------- - -We will store a default set of TripleO validations in a Swift container called -``tripleo-validations``. These will be shared across all plans and are not -expected to be updated by the deployer. This container should be created on -initial undercloud deployment. - -We will provide a mechanism for deployers to add a custom set of validations -per deployment plan. These plan-specific validations will be stored in a -``custom-validations`` subdirectory in the plan's Swift container. Storing them -together with the plan makes sense as these validations can be specific to -particular deployment plan configuration, as well as makes the import/export -easier. - -Since custom validation will be stored as part of the plan, no additional -workflows/actions to perform CRUD operations for them will be necessary; we can -simply use the existing plan create/update for this purpose. - -The validation Mistral actions (e.g. ``list`` and ``run_validation``) -will need to be updated to take into account this new structure of -validations. They will need to look for validations in the -``tripleo-validations`` Swift container (for default validations) and the -plan's ``custom-validations`` subdirectory (for custom validations), instead of -sourcing them from a directory on disk, as they are doing now. - -If a validation with the same name is found both in default in custom -validations, we will always pick the one stored in custom validations. - -.. note:: As a further iteration, we can implement validations as per-service - tasks in standalone service Ansible roles. They can then be consumed - by tripleo-heat-templates service templates. 
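-
-As an illustration of the intended layout (not a prescribed workflow), a
-deployer could add a custom validation to a plan named ``overcloud`` by
-uploading it into the plan's Swift container under the ``custom-validations/``
-prefix using the standard object commands. The validation file name below is
-hypothetical::
-
-   # local file: custom-validations/check-rabbitmq-limits.yaml
-   openstack object create overcloud custom-validations/check-rabbitmq-limits.yaml
-
-   # default validations live in the shared container, custom ones in the plan
-   openstack object list tripleo-validations
-   openstack object list overcloud --prefix custom-validations/
-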
- -Alternatives ------------- - -* Do nothing. The deployers can already bring in additional - validations, it's just less convenient and potentially error-prone. - -* We could provide a know directory structure conceptually similar to - ``run-parts`` where the deployers could add their own validation - directories. - - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -In order to add their own validations, the deployer will need to -update the deployment plan by adding a ``custom-validations`` directory to it, -and making sure this directory contains the desired custom validations. The -plan update operation is already supported in the CLI and the UI. - -Performance Impact ------------------- - -Since the validation sources will now be Swift containers, downloading -validations will potentially be necessary on each run. We will have to keep an -eye on this an potentially introduce caching if this turns out to be a problem. - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Testing and developing new validations in both development and -production environments will be easier with this change. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignees: - * akrivoka - -Other contributors: - * florianf - -Work Items ----------- -* Move to using Swift as default storage for tripleo-validations ([1]_). - -* Update ``load_validations`` and ``find_validation`` functions to - read validations from all the sources specified in this document. - -Dependencies -============ - -In order to be able to implement this new functionality, we first need to have -the validations use Swift as the default storage. In other words, this spec -depends on the blueprint [1]_. - -Testing -======= - -The changes will be unit-tested in all the tripleo repos that related -changes land in (tripleo-common, instack-undercloud, tripleo-heat-templates, -etc). - -We could also add a new CI scenario that would have a custom-validations -directory within a plan set up. - - -Documentation Impact -==================== - -We will need to document the format of the new custom-validations plan -subdirectory and the new behaviour this will introduce. - - -References -========== - -.. [1] https://blueprints.launchpad.net/tripleo/+spec/store-validations-in-swift diff --git a/specs/rocky/logging-stdout.rst b/specs/rocky/logging-stdout.rst deleted file mode 100644 index 1a8c6fd3..00000000 --- a/specs/rocky/logging-stdout.rst +++ /dev/null @@ -1,172 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================================= -Enable logging to stdout/journald for rsyslog -============================================= - -https://blueprints.launchpad.net/tripleo/+spec/logging-stdout-rsyslog - -We can optimize the current logging implementation to take advantage -of metadata that our default logging driver (journald) adds to the -logs. - -Problem Description -=================== - -Currently, we put all the logs of the containers into a directory in -the host (/var/log/containers/). While this approach works, it relies -on mounting directories from the host itself. This makes it harder for -logging forwarders, since we need to configure them to track all those -files. With every service that we add, we end up having to write -configuration for that service for those specific files. 
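-
-To illustrate the kind of per-service configuration this implies, a file-based
-forwarder such as rsyslog needs an input stanza for every log file it should
-follow, along the lines of the following sketch (the paths shown are only
-examples)::
-
-   module(load="imfile")
-
-   # one block like this is needed for every service log file to forward
-   input(type="imfile"
-         File="/var/log/containers/nova/nova-api.log"
-         Tag="nova-api")
-   input(type="imfile"
-         File="/var/log/containers/glance/api.log"
-         Tag="glance-api")
-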
Furthermore, we lose important metadata with this approach. We can
-figure out what service wrote what log, but we lose the container name and ID,
-which are very useful. Both can easily be obtained just by using the default
-docker logging mechanism.
-
-Instead of relying on the host filesystem for our logs, we can adopt a
-simpler solution that preserves the important metadata discarded by the
-current implementation and that supports most services without requiring
-per-service configuration.
-
-Proposed Change
-===============
-
-Overview
---------
-
-The proposal is to configure containerized services to log to
-stdout/stderr as is common practice for containerized applications.
-This allows the logs to get picked up by the docker logging driver,
-and thus we can use "docker logs" to view the logs of a service as one
-would usually expect. It will also help us decouple the
-containers from the host, since we will no longer be relying on host
-filesystem mounts for log collection.
-
-In the case of services where it's difficult or not possible to log to
-stdout or stderr, we will place log files in a docker volume, and this
-volume will be shared with a sidecar container that will output the
-logs to stdout so they are consumable by the logging driver. This will
-also apply to containers that log only to syslog (such as HAProxy).
-We will stop mounting ``/dev/log`` from the host, and will instead add a
-sidecar container that outputs those logs.
-
-Additionally, since our default logging driver is journald, we will
-get all the container logs accessible via ``journalctl`` and the
-journald libraries. So one would be able to do ``journalctl
-CONTAINER_NAME=<container-name>`` to get the logs of a specific
-container on the node. Furthermore, we would get extra metadata
-information for each log entry [1]. We would benefit from
-getting the container name (as the ``CONTAINER_NAME`` metadata item)
-and the container ID (as the ``CONTAINER_ID`` and
-``CONTAINER_ID_FULL`` metadata items) from each journald log entry
-without requiring extra processing. Adding extra tags to the
-containers is possible [2], and would get reflected via the
-``CONTAINER_TAG`` metadata entry. These tags can optionally describe the
-application that emitted the logs or the platform that it
-comes from.
-
-This will also make it easier for us to forward logs, since there will
-be a centralized service (journald) on each host from which we can
-collect the logs. When we add a new service, it will be a matter of
-following the same logging pattern, and we will automatically be able
-to forward those logs without requiring specific configuration to
-track a new set of log files.
-
-With this solution in place, we also need to provide tooling to
-integrate with centralized logging solutions. This will then cover
-integration with the OpenShift Logging Stack [3] and ViaQ [4]. We are
-proposing the use of rsyslog for message collection, manipulation, and
-log forwarding. This will also be done in a containerized fashion,
-where rsyslog will be a "system container" that reads from the host
-journal. Rsyslog will perform metadata extraction from log messages
-(such as extracting the user, project, and domain from standard oslo
-format logs), and will then finally forward the logs to a central
-collector.
-
-Pluggable implementation
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-The implementation needs to be done in a pluggable manner.
This is because -end-users have already created automation based on the assumption that logs -exist in the ``/var/log/`` / ``/var/log/containers/*`` directories -that we have been providing. For this reason, logging to stdout/stderr will be -optional, and we'll keep logging to files in the host as a default for now. -This will then be optionally enabled via an environment file. - -Example -~~~~~~~ - -nova-api container: - -In the proposed solution, the standard nova logs will go to the -nova_api container's stdout/stderr. However, since we are also -interested in the apache access logs, we will then create a docker -volume where the access logs will be hosted. A sidecar container will -mount this volume, create a FIFO (named pipe) and output whatever it -gets from that file. Note that this sidecar container will need to be -started before the actual nova_api container. - -For each log file generated in the main container, we will create a -sidecar container that outputs that log. This will make it easier to -associate log messages with the originating service. - -Alternatives ------------- - -Keep logging to files in the hosts' directory. - -We can still use the current solution; however, it is not ideal as it -violates container logging best practices, relies heavily on -directories on the host (which we want to avoid) and is inconsistent -in the way we can get logging from services (some in files, some in -syslog). - -Other End User Impact ---------------------- - -Since we're not getting rid of the previous logging solution, users won't be -impacted. They will, however, get another way of getting logs and interacting -with them in the host system, and further create automation from that if -needed. - -Performance Impact ------------------- - -* TODO: Any performance considerations on getting everything to journald? - -Implementation -============== - -Primary assignees: - jaosorior - jbadiapa - larsks - -Work Items ----------- - -* Allow services to log to stdout/stderr (if possible). - -* Implement pluggable logging for each service in t-h-t. - -* Add Rsyslog container. - -Testing -======= - -TODO: Evaluate how can we log to an EFK stack in upstream CI. Do we have one -available? - -References -========== - -[1] https://docs.docker.com/engine/admin/logging/journald/ -[2] https://docs.docker.com/engine/admin/logging/log_tags/ -[3] https://docs.openshift.com/container-platform/3.5/install_config/aggregate_logging.html -[4] https://github.com/ViaQ/Main/blob/master/README-install.md diff --git a/specs/rocky/split-controlplane.rst b/specs/rocky/split-controlplane.rst deleted file mode 100644 index d4f2b53d..00000000 --- a/specs/rocky/split-controlplane.rst +++ /dev/null @@ -1,248 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================================== -TripleO Split Control Plane from Compute/Storage Support -======================================================== - -https://blueprints.launchpad.net/tripleo/+spec/split-controlplane - -This spec introduces support for a mode of deployment where the controlplane -nodes are deployed and then batches of compute/storage nodes can be added -independently. - -Problem Description -=================== - -Currently tripleo deploys all services, for all roles (groups of nodes) in -a single heat stack. 
This works quite well for small to medium sized deployments, but for very large
-environments there is considerable benefit in deploying the nodes in separate
-batches, e.g. when deploying many hundreds or thousands of compute nodes.
-
-* Scalability can be improved when deploying a fairly static controlplane and
-  then adding batches of e.g. compute nodes when demand requires scaling out.
-  The overhead of updating all the nodes in every role for any scale-out
-  operation is non-trivial, and although this is somewhat mitigated by the
-  split from heat-deployed servers to config download & ansible for
-  configuration, making modular deployments easier is of benefit when needing
-  to scale deployments to very large environments.
-
-* Risk reduction - there are often requests to avoid any update to controlplane
-  nodes when adding capacity for e.g. compute or storage, and modular
-  deployments make this easier as no modification is required to the
-  controlplane nodes to, e.g., add compute nodes.
-
-This spec is not intended to cover all the possible ways of achieving modular
-deployments, but instead to outline the requirements and give an overview of
-the interfaces we need to consider to enable this flexibility.
-
-Proposed Change
-===============
-
-Overview
---------
-
-To enable incremental changes, I'm assuming we could still deploy the
-controlplane nodes via the existing architecture, e.g. Heat deploys the
-nodes/networks and we then use config download to configure those nodes via
-ansible.
-
-To deploy compute nodes, we have several options:
-
-1. Deploy multiple "compute only" heat stacks, which would generate
-   ansible playbooks via config download, and consume some output data
-   from the controlplane stack.
-
-2. Deploy additional nodes via mistral, then configure them via
-   ansible (today this still requires heat to generate the
-   playbooks/inventory even if it's a transient stack).
-
-3. Deploy nodes via ansible, then configure them via ansible (again,
-   with the config download mechanism we have available today we'd
-   need heat to generate the configuration data).
-
-The above doesn't consider a "pure ansible" solution, as we would first have to
-make ansible role equivalents available for all the composable service
-templates, and that effort is out of scope for this spec.
-
-Scope and Phases
-----------------
-
-The three items listed in the overview cover an incremental approach
-and the first phase is to implement the first item. Though this item
-adds an additional dependency on Heat, this is done only to allow the
-desired functionality using what is available today. In future phases
-any additional dependency on Heat will need to be addressed, and any
-changes done during the first phase should be minimal and focus on
-parameter exposure between Heat stacks. Implementation of the other
-items in the overview could span multiple OpenStack development cycles
-and additional details may need to be addressed in future
-specifications.
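-
-As a rough illustration of the first phase (not a committed interface), the
-deployer-facing flow could look like two separate deploy calls against the same
-undercloud, reusing the existing ``--stack`` option. The role and environment
-file names below are hypothetical, and defining how data is shared between the
-two stacks is exactly what the work items need to address::
-
-   # deploy the controlplane stack first
-   openstack overcloud deploy --stack overcloud-controllers \
-       --templates -r controller-roles.yaml -e controllers-env.yaml
-
-   # later, deploy compute nodes as an independent stack which consumes
-   # outputs (endpoints, VIPs, passwords) from the controller stack
-   openstack overcloud deploy --stack overcloud-computes \
-       --templates -r compute-roles.yaml -e computes-env.yaml \
-       -e controller-stack-outputs.yaml
-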
- -If a deployer is able to do the following simple scenario, then this -specification is implemented as phase 1 of the larger feature: - -- Deploy a single undercloud with one control-plane network -- Create a Heat stack called overcloud-controllers with 0 compute nodes -- Create a Heat stack called overcloud-computes which may be used by the controllers -- Use the APIs of the controllers to boot an instance on the computes deployed from the overcloud-computes Heat stack - -In the above scenario the majority of the work involves exposing the -correct parameters between Heat stacks so that a controller node is -able to use a compute node as if it were an external service. This is -analogous to how TripleO provides a template where properties of an -external Ceph cluster may be used by TripleO to configure a service -like Cinder which uses the external Ceph cluster. - -The simple scenario above is possible without network isolation. In -the more complex workload site vs control site scenario, described -in the following section, network traffic will not be routed through -the controller. How the networking aspect of that deployment scenario -is managed will need to be addressed in a separate specification and -the overall effort will likely to span multiple OpenStack development -cycles. - -For the phase of implementation covered in this specification, the -compute nodes will be PXE booted by Ironic from the same provisioning -network as the controller nodes during deployment. Instances booted on -these compute nodes could connect to a provider network to which their -compute nodes have direct access. Alternatively these compute nodes -could be deployed with physical access to the network which hosts -the overlay networks. The resulting overcloud should look the same as -one in which the compute nodes were deployed as part of the overcloud -Heat stack. Thus, the controller and compute nodes will run the same -services they normally would regardless of if the deployment were -split between two undercloud Heat stacks. The services on the -controller and compute nodes could be composed to multiple servers -but determining the limits of composition is out of scope for the -first phase. - -Example Usecase Scenario: Workload vs Control Sites ---------------------------------------------------- - -One application of this feature includes the ability to deploy -separate workload and control sites. A control site provides -management and OpenStack API services, e.g. the Nova API and -Scheduler. A workload site provides resources needed only by the -workload, e.g. Nova compute resources with local storage in -availability zones which directly serve workload network traffic -without routing back to the control site. Though there would be -additional latency between the control site and workload site with -respect to managing instances, there would be no reason that the -workload itself could not perform adequately once running and each -workload site would have a smaller footprint. - -.. image:: ../../../../images/split-controlplane/ceph-details.png - :height: 445px - :width: 629px - :alt: Diagram of an example control site with multiple workload sites - :align: center - -This scenario is included in this specification as an example -application of the feature. 
This specification does not aim to address
-all of the details of operating separate control and workload sites,
-but only to describe how the proposed TripleO feature, *deployment of
-independent controlplane and compute nodes*, could be
-built upon to simplify deployment of such sites in future versions of
-TripleO. For example, the blueprint to make it possible to deploy
-multiple Ceph clusters in the overcloud [1]_ could be applied to
-provide a separate Ceph cluster per workload site, but its scope
-focuses only on the role changes needed to enable that feature; it
-is orthogonal to this proposal.
-
-Alternatives
-------------
-
-Alternatives to the incremental change outlined in the overview include
-reimplementing service configuration in ansible, such that nodes can be
-configured via playbooks without a dependency on the existing heat+ansible
-architecture. Work is ongoing in this area, e.g. the ansible roles to deploy
-services on k8s, but this spec is primarily concerned with finding an interim
-solution that enables our current architecture to scale to very large
-deployments.
-
-Security Impact
----------------
-
-Potentially sensitive data such as passwords will need to be shared between the
-controlplane stack and the compute-only deployments. Given the admin-only
-nature of the undercloud I think this is OK.
-
-Other End User Impact
----------------------
-
-Users will have more flexibility and control with regard to how they
-choose to scale their deployments. An example of this includes
-separate control and workload sites as mentioned in the example use
-case scenario.
-
-Performance Impact
-------------------
-
-Potentially better performance at scale, although the total time could be
-increased assuming each scale-out is serialized.
-
-Other Deployer Impact
----------------------
-
-None
-
-
-Developer Impact
-----------------
-
-It is already possible to deploy multiple overcloud Heat stacks from
-one undercloud, but if there are parts of the TripleO tool-chain which
-assume a single Heat stack, they may need to be updated.
-
-Implementation
-==============
-
-Assignee(s)
------------
-
-Primary assignee:
-  shardy
-
-Other assignees:
-  gfidente
-  fultonj
-
-
-Work Items
-----------
-
-* Proof of concept showing how to deploy independent controlplane and compute nodes using already landed patches [2]_ and by overriding the EndpointMap
-* If there are problems with overriding the EndpointMap, rework all-nodes-config to output the "all nodes" hieradata and vip details, such that they could span stacks
-* Determine what data are missing in each stack and propose patches to expose the missing data to each stack that needs it
-* Modify the proof of concept to support adding a separate and minimal ceph cluster (mon, mgr, osd) through a heat stack separate from the controller node's heat stack.
-* Refine how the data is shared between each stack to improve the user experience
-* Update the documentation to include an example of the new deployment method
-* Retrospect and write a follow-up specification covering details necessary for the next phase
-
-
-Dependencies
-============
-
-None.
-
-Testing
-=======
-
-Ideally scale testing will be performed to validate the scalability
-aspects of this work. For the first phase, any changes done to enable
-the simple scenario described under Scope and Phases will be tested
-manually, and the existing CI will ensure they do not break current
-functionality. Changes implemented in the follow-up phases could have
-CI scenarios added.
- -Documentation Impact -==================== - -The deployment documation will need to be updated to cover the configuration of -split controlplane environments. - -References -========== - -.. [1] `Make it possible to deploy multiple Ceph clusters in the overcloud `_ -.. [2] `Topic: topic:compute_only_stack2 `_ diff --git a/specs/rocky/tripleo-barometer-integration.rst b/specs/rocky/tripleo-barometer-integration.rst deleted file mode 100644 index b5c19f42..00000000 --- a/specs/rocky/tripleo-barometer-integration.rst +++ /dev/null @@ -1,112 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -==================================================================== -Support Barometer(Software Fastpath Service Quality Metrics) Service -==================================================================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-barometer-integration - -The scope of the [Barometer]_ project is to provide interfaces to support -monitoring of the NFVI. The project has plugins for telemetry frameworks -to enable the collection of platform stats and events and relay gathered -information to fault management applications or the VIM. The scope is -limited to collecting/gathering the events and stats and relaying them -to a relevant endpoint. - -The consumption of performance and traffic-related information/events -provided by this project should be a logical extension of any existing -VNF/NFVI monitoring framework. - -Problem Description -=================== - -Integration of Barometer in TripleO is a benefit for building the OPNFV platform. -The Barometer project is complementary to the Doctor project to build the fault -management framework with [Apex_Installer]_ installer which is an OPNFV installation and -deployment tool based on TripleO. - -Proposed Change -=============== - -Overview --------- - -This spec proposes changes to automate the deployment of Barometer using TripleO. - -* Add puppet-barometer package to the overcloud-full image. - -* Define Barometer Service in THT. - -* Add how and when to deploy Barometer in puppet-tripleo. - -Alternatives ------------- - -None - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -Barometer service is default disabled in a Deployment. Need to enable it -if deployer wants to use it. - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Akhila Kishore - -Work Items ----------- - -As outlined in the proposed changes. - -Dependencies -============ - -The Barometer RPM package must be in RDO repo. - -Testing -======= - -Add the test for CI scenarios. - -Documentation Impact -==================== - -The setup and configuration of the Barometer service should be documented. - -References -========== - -.. [Barometer] https://wiki.opnfv.org/display/fastpath/Barometer+Home -.. [Apex_Installer] https://wiki.opnfv.org/display/apex diff --git a/specs/rocky/tripleo-ha-utils.rst b/specs/rocky/tripleo-ha-utils.rst deleted file mode 100644 index 3ef95fb5..00000000 --- a/specs/rocky/tripleo-ha-utils.rst +++ /dev/null @@ -1,143 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -============================================= -TripleO tools for testing HA deployments -============================================= - -We need a way to verify a Highly Available TripleO deployment with proper tests -that check if the HA bits are behaving correctly. - -Problem Description -=================== - -Currently, we test HA behavior of TripleO deployments only by deploying -environments with three controllers and see if we're able to spawn an instance, -but this is not enough. - -There should be a way to verify the HA capabilities of deployments, and if the -behavior of the environment is still correct after inducted failures, -simulated outages and so on. - -This tool should be a standalone component to be included by the user if -necessary, without breaking any of the dynamics present in TripleO. - -Proposed Change -=============== - -Overview --------- - -The proposal is to create an Ansible based project named tripleo-ha-utils that -will be consumable by the various tools that we use to deploy TripleO -environments like tripleo-quickstart or infrared or by manual deployments. - -The project will initially cover three principal roles: - -* **stonith-config**: a playbook used to automate the creation of fencing - devices in the overcloud; -* **instance-ha**: a playbook that automates the seventeen manual steps needed - to configure instance HA in the overcloud, test them via rally and verify - that instance HA works appropriately; -* **validate-ha**: a playbook that runs a series of disruptive actions in the - overcloud and verifies it always behaves correctly by deploying a - heat-template that involves all the overcloud components; - -Today the project exists outside the TripleO umbrella, and it is named -tripleo-quickstart-utils [1] (see "Alternatives" for the historical reasons of -this name). It is used internally inside promotion pipelines, and has -also been tested with success in RDOCloud. - -Pluggable implementation -~~~~~~~~~~~~~~~~~~~~~~~~ - -The base principle of the project is to give people the ability to integrate -the first roles with whatever kind of test. For example, today we're using -a simple bash framework to interact with the cluster (so pcs commands and -other interactions), rally to test instance-ha and Ansible itself to simulate -full power outage scenarios. -The idea is to keep this pluggable approach leaving the final user the choice -about what to use. - -Retro compatibility -~~~~~~~~~~~~~~~~~~~ - -One of the aims of this project is to be retro-compatible with the previous -version of OpenStack. Starting from Liberty, we cover instance-ha and -stonith-config Ansible playbooks for all the releases. -The same happens while testing HA since all the tests are plugged in depending -on the release. - -Alternatives ------------- - -While evaluating alternatives, the first thing to consider is that this -project aims to be a TripleO-centric set of tools for HA, not a generic -OpenStack's one. -We want tools to help the user answer questions like "Is the Galera bundle -cluster resource able to tolerate a stop and a consecutive start without -affecting the environment capabilities?" or "Is the environment able to -evacuate instances after being configured for Instance HA?". And the answer we -want is YES or NO. - -* *tripleo-validations*: the most logical place to put this, at least - looking at the name, would be tripleo-validations. 
By talking with folks - working on it, it came out that the meaning of tripleo-validations project is - not doing disruptive tests. Integrating this stuff would be out of scope. - -* *tripleo-quickstart-extras*: apart from the fact that this is not - something meant just for quickstart (the project supports infrared and - "plain" environments as well) even if we initially started there, in the - end, it came out that nobody was looking at the patches since nobody was - able to verify them. The result was a series of reviews stuck forever. - So moving back to extras would be a step backward. - -Other End User Impact ---------------------- - -None. The good thing about this solution is that there's no impact for anyone -unless the solution gets loaded inside an existing project. Since this will be -an external project, it will not impact anything of the current stuff. - -Performance Impact ------------------- - -None. Unless the deployments, the CI runs or whatever include the roles there -will be no impact, and so the performances will not change. - -Implementation -============== - -Primary assignees: - -* rscarazz - -Work Items ----------- - -* Import the tripleo-quickstart-utils [1] as a new repository and start new - deployments from there. - -Testing -======= - -Due to the disruptive nature of these tests, the TripleO CI should not be -updated to include these tests, mostly because of timing issues. -This project should remain optionally usable by people when needed, or in -specific CI environments meant to support longer than usual jobs. - -Documentation Impact -==================== - -All the implemented roles are today fully documented in the -tripleo-quickstart-utils [1] project, so importing its repository as is will -also give its full documentation. - -References -========== - -[1] Original project to import as new - https://github.com/redhat-openstack/tripleo-quickstart-utils diff --git a/specs/rocky/tripleo-rsyslog-remote-logging.rst b/specs/rocky/tripleo-rsyslog-remote-logging.rst deleted file mode 100644 index cfe8d70f..00000000 --- a/specs/rocky/tripleo-rsyslog-remote-logging.rst +++ /dev/null @@ -1,276 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== - TripleO Remote Logging -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/remote-logging - -This spec is meant to extend the tripleo-logging spec also for queens to -address key issues about log transport and storage that are separate from -the technical requirements created by logging for containerized processes. - -Problem Description -=================== - -Having logs stuck on individual overcloud nodes isn't a workable solution -for a modern system deployed at scale. But log aggregation is complex both -to implement and to scale. TripleO should provide a robust, well documented, -and scalable solution that will serve the majority of users needs and be -easily extensible for others. - - -Proposed Change -=============== - -Overview --------- - -In addition to the rsyslog logging to stdout defined for containers in the -triple-logging spec this spec outlines how logging to remote targets should -work in detail. - -Essentially this comes down to a set of options for the config -of the rsyslog container. Other services will have a fixed rsyslog config -that forwards messages to the rsyslog container to pick up over journald. - -1. 
Logging destination, local, remote direct, or remote aggregator. - -Remote direct means to go direct to a storage solution, in this case -Elasticsearch or plaintext on the disk. Remote aggregator is a design where -the processing, formatting, and insertion of the logs is a task left to the -aggregator server. Using aggregators it's possible to scale log collection to -hundreds of overcloud nodes without overwhelming the storage backend with -inefficient connections. - -2. Log caching for remote targets - -In the case of remote targets a caching system can be setup, where logs are -stored temporarily on the local machine in a configurable disk or memory cache -until they can be uploaded to an aggregator or storage system. While some in -memory cache is mandatory users may select a disk cache depending on how -important it is that all logs be saved and stored. This allows recovery -without loss of messages during network outages or service outages. - - -3. Log security in transit - -In some cases encryption during transit may be required. rsyslog offers -ssl based encryption that should be easily deployable. - -4. Standard and extensible format - -By default logs should be formatted as outlined by the Redhat common logging -initiative. By standardizing logging format where possible various tools -and analytics become more portable. - -Mandatory fields for this standard formatting include. - -version: the version of the logging template -level: loglevel -message: the log message -tags: user specific tagging info - -Additional fields must be added in the format of - -. - -See an example by rsyslog for storage in Elasticsearch below. - -@timestamp November 27th 2017, 08:54:40.091 -@version 2016.01.06-0 -_id AV_9wiWQzdGOuK5_zY5J -_index logstash-2017.11.27.08 -_score -_type rsyslog -browbeat.cloud_name openstack-12-noncontainers-beta -hostname lorenzo.perf.lab.eng.rdu.redhat.com -level info -message Stopping LVM2 PV scan on device 8:2... -pid 1 -rsyslog.appname systemd -rsyslog.facility daemon -rsyslog.fromhost-ip 10.12.20.155 -rsyslog.inputname imptcp -rsyslog.protocol-version 1 -syslog.timegenerated November 27th 2017, 08:54:40.092 -systemd.t.BOOT_ID 1e99848dbba047edaf04b150313f67a8 -systemd.t.CAP_EFFECTIVE 1fffffffff -systemd.t.CMDLINE /usr/lib/systemd/systemd --switched-root --system --deserialize 21 -systemd.t.COMM systemd -systemd.t.EXE /usr/lib/systemd/systemd -systemd.t.GID 0 -systemd.t.MACHINE_ID 0d7fed5b203f4664b0b4be90e4a8a992 -systemd.t.SELINUX_CONTEXT system_u:system_r:init_t:s0 -systemd.t.SOURCE_REALTIME_TIMESTAMP 1511790880089672 -systemd.t.SYSTEMD_CGROUP / -systemd.t.TRANSPORT journal -systemd.t.UID 0 -systemd.u.CODE_FILE src/core/unit.c -systemd.u.CODE_FUNCTION unit_status_log_starting_stopping_reloading -systemd.u.CODE_LINE 1417 -systemd.u.MESSAGE_ID de5b426a63be47a7b6ac3eaac82e2f6f -systemd.u.UNIT lvm2-pvscan@8:2.service -tags - -As a visual aid here's a quick diagram of the flow of data. - - -> -> -> - -In the process container logs from the application are packaged with metadata -from systemd and other components depending on how rsyslog is configured, -journald acts as a transport aggregating this input across all containers for -the rsyslog container which formats this data into storable json and handles -things like transforming fields and adding additional metadta as desired. -Finally the data is inserted into elasticsearch or further held by an -aggrebator for a few seconds before being bulk inserted into Elasticsearch. 
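-
-For illustration only, the "remote direct" case with a disk-assisted cache
-could be expressed with an rsyslog v8 configuration roughly like the following.
-The server name is a placeholder and the actual configuration template shipped
-by TripleO may differ::
-
-   module(load="imjournal")        # read container logs from the host journal
-   module(load="mmjsonparse")      # parser for structured (CEE/JSON) messages
-   module(load="omelasticsearch")  # direct output to Elasticsearch
-
-   action(type="mmjsonparse")      # normalize structured payloads before forwarding
-
-   action(type="omelasticsearch"
-          server="elasticsearch.example.com"
-          serverport="9200"
-          searchIndex="overcloud-logs"
-          bulkmode="on"                    # batch inserts, not one request per message
-          queue.type="LinkedList"          # in-memory cache for the remote target ...
-          queue.filename="es_forward"      # ... spilling to disk when it fills up
-          queue.saveOnShutdown="on"
-          action.resumeRetryCount="-1")    # keep retrying across network outages
-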
- - -Alternatives ------------- - -TripleO already has some level of FluentD integration, but performance issues -make it unusable at scale. Furthermore it's not well prepared for container -logging. - -Ideally FluentD as a logging backend would be maintained, improved, and modified -to use the common logging format for easy swapping of solutions. - -Security Impact ---------------- - -The security of remotely stored data and the log storage database is outside -of the scope of this spec. The major remaining concerns are security in -in transit and the changes required to systemd for rsyslog to send data -remotely. - -A new systemd policy will have to be put into place to ensure that systemd -can successfully log to remote targets. By default the syslog rules prevent -any outside world access or port access, both of which are required for -log forwarding. - -For log encryption in transit a ssl certificate will have to be generated and -distributed to all nodes in the cloud securely, probably during deployment. -Special care should be taken to ensure that any misconfigured instance of -rsyslog without a certificate where one is required do not transmit logs -by accident. - - -Other End User Impact ---------------------- - -Ideally users will read some documentation and pass an extra 5-6 variables to -TripleO to deploy with logging aggregation. It's very important that logging -be easy to setup with sane defaults and no requirement on the user to implement -their own formatting or template. - -Users may also have to setup a database for log storage and an aggregator if -their deployment is large enough that they need one. Playbooks to do this -automatically will be provided, but probably don't belong in TripleO. - -Special care will have to be taken to size storage and aggregation hardware -to the task, while rsyslog is very efficient storage quickly becomes a problem -when a cloud can generate 100gb of logs a day. Especially since log storage -systems leave it up to the user to put in place rotation rules. - - -Performance Impact ------------------- - -For small clouds rsyslog direct to Elasticsearch will perform just fine. -As scale increases an aggregator (also running rsyslog, except configured -to accept and format input) is required. I have yet to test a large enough -cloud that an aggregator was at all stressed. Hundreds of gigs of logs a day -are possible with a single 32gb ram VM as an Elastic instance. - -For the Overcloud nodes forwarding their logs the impact is variable depending -on the users configuration. CPU requirements don't exceed single digits of a -single core even under heavy load but storage requirements can balloon if a -large on disk cache was specified and connectivity with the aggregator or -database is lost for prolonged periods. - -Memory usage is no more than a few hundred mb and most of that is the default -in memory log cache. Which once again could be expanded by the user. - - -Other Deployer Impact ---------------------- - -N/A - -Developer Impact ----------------- - -N/A - -Implementation -============== - -Assignee(s) ------------ - -Who is leading the writing of the code? Or is this a blueprint where you're -throwing it out there to see who picks it up? - -If more than one person is working on the implementation, please designate the -primary author and contact. 
- -Primary assignee: - jkilpatr - -Other contributors: - jaosorior - -Work Items ----------- - -rsyslog container - jaosorior - -rsyslog templating and deployment role - jkilpatr - -aggregator and storage server deployment tooling - jkilpatr - - -Dependencies -============ - -Blueprint dependencies: - -https://blueprints.launchpad.net/tripleo/+spec/logging-stdout-rsyslog - -Package dependencies: - -rsyslog, rsyslog-elasticsearch, rsyslog-mmjsonparse - -specifically version 8 of rsyslog, which is the earliest -supported by rsyslog-elasticsearch, these are packaged in -Centos and rhel 7.4 extras. - -Testing -======= - -Logging aggregation can be tested in CI by deploying it during any existing CI job. - -For extra validation have a script to check the output into Elasticsearch. - - -Documentation Impact -==================== - -Documentation will need to be written about the various modes and tunables for -logging and how to deploy them. As well as sizing recommendations for the log -storage system and aggregators where required. - - -References -========== - -https://review.openstack.org/#/c/490047/ - -https://review.openstack.org/#/c/521083/ - -https://blueprints.launchpad.net/tripleo/+spec/logging-stdout-rsyslog diff --git a/specs/rocky/tripleo-upgrade.rst b/specs/rocky/tripleo-upgrade.rst deleted file mode 100644 index 372c81f5..00000000 --- a/specs/rocky/tripleo-upgrade.rst +++ /dev/null @@ -1,100 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -===================================================== -A unified tool for upgrading TripleO based deploments -===================================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-upgrade - -In order to avoid work duplication and automation code being out of sync with the -official documentation we would like to create a single repository hosting the upgrade -automation code that can be run on top of deployments done with various tools. - -Problem Description -=================== -Currently automation code for TripleO upgrades is spread across several repositories -and it is tightly coupled with the framework being used for deployment, e.g. tripleo- -quickstart or Infrared. - -Proposed Change -=============== - -Overview --------- - -Our proposal is to decouple the upgrade automation code and make it deployment tool -agnostic. This way it could be consumed in different scenarios such as CI, automated -or manual testing. - -Alternatives ------------- - -For the previous releases the automation code has been hosted in diffrent repositories -such as tripleo-quickstart-extras, infrared or private repos. This is not convenient -as they all cover basically the same workflow so we are duplicating work. We would like -to avoid this and collaborate on a single repository. - -Security Impact ---------------- - -None. - -Other End User Impact ---------------------- - -This tool allows the users to run the TripleO upgrade in an automated fashion or -semi-automatic by creating scripts for each upgrade step which can be later run manually -by the user. - -Performance Impact ------------------- - -None. - -Other Deployer Impact ---------------------- - -None. - -Developer Impact ----------------- - -This tools helps developers by providing a quick way to run TripleO upgrades. This could -be useful when reproducing and debugging reported issues. 
- - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - matbu, mcornea - -Work Items ----------- - -* Create new repository in Openstack Git -* Migrate repository with its history from https://github.com/redhat-openstack/tripleo-upgrade - -Dependencies -============ - -* ansible - -Testing -======= - - -Documentation Impact -==================== - - -References -========== - diff --git a/specs/rocky/tripleo-vitrage-integration.rst b/specs/rocky/tripleo-vitrage-integration.rst deleted file mode 100644 index 38f19358..00000000 --- a/specs/rocky/tripleo-vitrage-integration.rst +++ /dev/null @@ -1,119 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================================== -Support Vitrage(Root Cause Analysis, RCA) Service -================================================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-vitrage-integration - -[Vitrage]_ is the official OpenStack RCA project. It can perfectly organizes, -analyzes and visualizes the holistic view of the Cloud. - -Vitrage provides functions as follows: - -* A clear view of the Cloud Topology - -* Deduced alarms and states - -* RCA for alarms/events - -Via Vitrage, the end users can understand what happened in a complex cloud -environment, get the root cause of problems and then resolve issues in time. - -Problem Description -=================== - -Currently the installation and configuration of Vitrage in openstack is done -manually or using devstack. It shall be automated via tripleo. - -Integration Vitrage in TripleO is benefit for building the OPNFV platform. -It helps the OPNFV [Doctor]_ project using Vitrage as inspector component to -build the fault management framework with [Apex]_ installer which is an OPNFV -installation and deployment tool based on TripleO. - -Proposed Change -=============== - -Overview --------- - -This spec proposes changes to automate the deployment of Vitrage using TripleO. - -* Add puppet-vitrage package to overcloud-full image. - -* Define Vitrage Service in THT. - -* Add how and when to deploy Vitrage in puppet-tripleo. - -Alternatives ------------- - -None - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -Vitrage service is default disabled in a Deployment. Need to enable it -if deployer want to use it. - -Developer Impact ----------------- - -None - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - dong wenjuan - -Work Items ----------- - -As outlined in the proposed changes. - -Dependencies -============ - -The Vitrage RPM package must be in RDO repo. - -Testing -======= - -Add the test for CI scenarios. - -Documentation Impact -==================== - -The setup and configuration of the Vitrage server should be documented. - -References -========== - -.. [Vitrage] https://wiki.openstack.org/wiki/Vitrage -.. [Apex] https://wiki.opnfv.org/display/apex -.. [Doctor] https://wiki.opnfv.org/display/doctor diff --git a/specs/rocky/ui-automation-testing.rst b/specs/rocky/ui-automation-testing.rst deleted file mode 100644 index 59d06d56..00000000 --- a/specs/rocky/ui-automation-testing.rst +++ /dev/null @@ -1,123 +0,0 @@ -.. 
- This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -UI Automation Testing -========================================== - -https://blueprints.launchpad.net/tripleo/+spec/automated-ui-testing - -We would like to introduce a suite of automated integration tests for the -TripleO UI. This will prevent regressions, and will lead to more stable -software. - -Problem Description -=================== - -At the moment, upstream CI only tests for lint errors, and runs our unit tests. -We'd like to add more integration tests for tripleo-ui to the CI pipeline. This -will include a selenium-based approach. This allows us to simulate a browser by -using a headless browser when running in CI, and we can detect a lot more -problems than we ever could with just unit testing. - -Proposed Change -=============== - -Overview --------- - -We would like write a Tempest plugin for tripleo-ui which uses Selenium to drive -a headless browser to execute the tests. We chose Tempest because it's a -standard in OpenStack, and gives us nice error reporting. - -We already have the `tempest-tripleo-ui`_ project set up. - -We plan to write a CI job to run our code in Tempest. In the initial -implementation, this will only cover checking for presence of certain UI -elements, and no deployments will actually be run. - -Alternatives ------------- - -The alternative is that we do all of our testing manually, waste time, have -lower velocity, and have more bugs. - -Security Impact ---------------- - -The security impact of this is minimal as it's CI-specific, and not user-facing. - -Other End User Impact ---------------------- - -End users won't interact with this feature. - -Performance Impact ------------------- - -This feature will only consume CI resources. There should be no negative -resource impact on the End User. - -Other Deployer Impact ---------------------- - -Our goal is to produce software that is more stable. But we're not changing any -features, per se. - -Developer Impact ----------------- - -Developers will gain a higher degree of confidence in their software. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - hpokorny - -Other contributors: - ukalifon - akrivoka - -Work Items ----------- - -* Write Selenium tests -* Write Tempest plugin code to run Selenium tests -* Write a new openstack-infra job to run the Tempest plugin on ``check`` and - ``gate``. At first, this will be a simple sanity job to make sure that the UI - has been rendered. The CI job won't run a deployment. - -Dependencies -============ - -* Tempest -* Selenium - -Testing -======= - -This is a bit meta. - -Documentation Impact -==================== - -We will document how a developer who is new to the tripleo-ui project can get -started with writing new integration tests. - -References -========== - -.. _tempest-tripleo-ui: https://github.com/openstack/tempest-tripleo-ui - -openstack-dev mailing list discussion: - -* http://lists.openstack.org/pipermail/openstack-dev/2017-June/119185.html -* http://lists.openstack.org/pipermail/openstack-dev/2017-July/119261.html diff --git a/specs/stein/all-in-one-upgrades-jobs.rst b/specs/stein/all-in-one-upgrades-jobs.rst deleted file mode 100644 index 697e41d4..00000000 --- a/specs/stein/all-in-one-upgrades-jobs.rst +++ /dev/null @@ -1,233 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -=============================================================== -Improve upgrade_tasks CI coverage with the standalone installer -=============================================================== - -https://blueprints.launchpad.net/tripleo/+spec/upgrades-ci-standalone - -The main goal of this work is to improve coverage of service upgrade_tasks in -tripleo ci upgrades jobs, by making use of the Standalone_installer_work_. -Using a standalone node as a single node 'overcloud' allows us to exercise -both controlplane and dataplane services in the same job and within current -resources of 2 nodes and 3 hours. Furthermore and once proven successful -this approach can be extended to include even single service upgrades testing -to vastly improve on the current coverage with respect to all the service -upgrade_tasks defined in the tripleo-heat-templates (which is currently minimal). - -Traditionally upgrades jobs have been restricted by resource constraints -(nodes and walltime). For example the undercloud and overcloud upgrade are -never exercised in the same job, that is an overcloud upgrade job uses an undercloud that is already on the target version (so called mixed version deployment). - -A further example is that upgrades jobs have typically exercised either -controlplane or dataplane upgrades (i.e. controllers only, or compute only) -and never both in the same job, again because constraints. The currently running -tripleo-ci-centos-7-scenario000-multinode-oooq-container-upgrades_ job for -example has 2 nodes, where one is undercloud and one is overcloud controller. -The workflow *is* being exercised, but controller only. Furthermore, whilst -the current_upgrade_ci_scenario_ is only exercising a small subset of the -controlplane services, it is still running at well over 140 minutes. So there -is also very little coverage with respect to the upgrades_tasks across the -many different service templates defined in the tripleo-heat-templates. - -Thus the main goal of this work is to use the standalone installer to define -ci jobs that test the service upgrade_tasks for a one node 'overcloud' with -both controlplane and dataplane services. This approach is composable as the -services in the stand-alone are fully configurable. Thus after the first -iteration of compute/control, we can also define per-service ci jobs and over -time hopefully reach coverage for all the services deployable by TripleO. - -Finally it is worth emphasising that the jobs defined as part of this work will not -be testing the TripleO upgrades *workflow* at all. Rather this is about testing -the service upgrades_tasks specifically. The workflow instead will be tested -using the existing ci upgrades job (tripleo-ci-centos-7-scenario000-multinode-oooq-container-upgrades_) subject to modifications to strip it down to a bare -minimum required (e.g. hardly any services). There are more pointers to this -from the discussion at the TripleO-Stein-PTG_ but ultimately we will have two -approximations of the upgrade tested in ci - the service upgrade_tasks as -described by this spec, and the workflow itself using a different ci job or -modifying the existing one. - -.. _Standalone_installer_work: http://lists.openstack.org/pipermail/openstack-dev/2018-June/131135.html -.. _tripleo-ci-centos-7-scenario000-multinode-oooq-container-upgrades: https://github.com/openstack-infra/tripleo-ci/blob/4101a393f29c18a84f64cd95a28c41c8142c5b05/zuul.d/multinode-jobs.yaml#L384 -.. 
_current_upgrade_ci_scenario: https://github.com/openstack/tripleo-heat-templates/blob/9f1d855627cf54d26ee540a18fc8898aaccdda51/ci/environments/scenario000-multinode-containers.yaml#L21 -.. _TripleO-Stein-PTG: https://etherpad.openstack.org/p/tripleo-ptg-stein - -Problem Description -=================== - -As described above we have not been able to have control and dataplane -services upgraded as part of the same tripleo ci job. Such a job would -have to be 3 nodes for starters (undercloud,controller,compute). - -A *full* upgrade workflow would need the following steps: - - * deploy undercloud, deploy overcloud - * upgrade undercloud - * upgrade prepare the overcloud (heat stack update generates playbooks) - * upgrade run controllers (ansible-playbook via mistral workflow) - * upgrade run computes/storage etc (repeat until all done) - * upgrade converge (heat stack update). - -The problem being solved here is that we can run only some approximation of -the upgrade workflow, specifically the upgrade_tasks, for a composed set -of services and do so within the ci timeout. The first iteration will focus on -modelling a one node 'overcloud' with both controller and compute services. If -we prove this to be successful we can also consider single-service upgrades -jobs (a job for testing just nova,or glance upgrade tasks for example) for -each of services that we want to test the upgrades tasks. Thus even though -this is just an approximation of the upgrade (upgrade_tasks only, not the full -workflow), it can hopefully allow for a wider coverage of services in ci -than is presently possible. - -One of the early considerations when writing this spec was how we could enforce -a separation of services with respect to the upgrade workflow. That is, enforce -that controlplane upgrade_tasks and deploy_steps are executed first and then -dataplane compute/storage/ceph as is usually the case with the upgrade workflow. -However review comments on this spec as well as PTG discussions around it, in -particular that this is just some approximation of the upgrade (service -upgrade tasks, not workflow) in which case it may not be necessary to artificially -induce this control/dataplane separation here. This may need to be revisited -once implementation begins. - -Another core challenge that needs solving is how to collect ansible playbooks -from the tripleo-heat-templates since we don't have a traditional undercloud -heat stack to query. This will hopefully be a lesser challenge assuming we can -re-use the transient heat process used to deploy the standalone node. Futhermore -discussion around this point at the TripleO-Stein-PTG_ has informed us of a way -to keep the heat stack after deployment with keep-running_ so we could just -re-use it as we would with a 'normal' deployment. - -Proposed Change -=============== - -Overview --------- - -We will need to define a new ci job in the tripleo-ci_zuul.d_standalone-jobs_ -(preferably following the currently ongoing ci_v3_migrations_ define this as -v3 job). - -For the generation of the playbooks themselves we hope to use the ephemeral -heat service that is used to deploy the stand-alone node, or use the keep-running_ -option to the stand-alone deployment to keep the stack around after deployment. - -As described in the problem statement we hope to avoid the task of having to -distinguish between control and dataplane services in order to enforce that -controlplane services are upgraded first. - -.. 
_tripleo-ci_zuul.d_standalone-jobs: https://github.com/openstack-infra/tripleo-ci/blob/4101a393f29c18a84f64cd95a28c41c8142c5b05/zuul.d/standalone-jobs.yaml -.. _ci_v3_migrations: https://review.openstack.org/#/c/578432/8 -.. _keep-running: https://github.com/openstack/python-tripleoclient/blob/a57531382535e92e2bfd417cee4b10ac0443dfc8/tripleoclient/v1/tripleo_deploy.py#L911 - -Alternatives ------------- - -Add another node and have 3 node upgrades jobs together with increasing the -walltime but this is not scalable in the long term assuming limited -resources! - - -Security Impact ---------------- - -None - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -More coverage of services should mean less breakage because of upgrades -incompatible things being merged. - -Developer Impact ----------------- - -Might be easier for developers too who may have limited access to resources -to take the reproducer script with the standalone jobs and get a dev env for -testing upgrades. - -Implementation -============== - -Assignee(s) ------------ - -tripleo-ci and upgrades squads - -Work Items ----------- - -First we must solve the problem of generating the ansible playbooks, that -will include all the latest configuration from the tripleo-heat-templates at -the time of upgrade (including all upgrade_tasks etc) when there is no -undercloud Heat stack to query. - -We might consider some non-heat solution by parsing the tripleo-heat-templates -but I don't think that is a feasible solution (re-inventing wheels). There is -ongoing work to transfer tasks to roles which is promising and that is another -area to explore. - -One obvious mechanism to explore given the current tools is to re-use the -same ephemeral heat process that the stand-alone uses in deploying the -overcloud, but setting the usual 'upgrade-init' environment files for a short -stack 'update'. This is not tested at all yet so needs to be investigated -further. As identified earlier there is now in fact a keep-running_ option to the -tripleoclient that will keep this heat process around - -For the first iteration of this work we will aim to use the minimum possible combination -of services to implement a 'compute'/'control' overcloud. That is, using the existing -services from the current current_upgrade_ci_scenario_ with the addition of nova-compute -and any dependencies. - -Finally a third major consideration is how to execute this service upgrade, that -is how to invoke the playbook generation and then run the resulting playbooks -(it probably doesn't need to converge if we are just interested in the upgrades -tasks). One consideration might be to re-use the existing python-tripleoclient -"openstack overcloud upgrade" prepare and run sub-commands. However the first -and currently favored approach will be to use the existing stand-alone client -commands (tripleo_upgrade_ tripleo_deploy_). So one work item is to try these -and discover any modifications we might need to make them work for us. - -Items: - * Work out/confirm generation the playbooks for the standalone upgrade tasks. - * Work out any needed changes in the client/tools to execute the ansible playbooks - * Define new ci job in the tripleo-ci_zuul.d_standalone-jobs_ with control and - compute services, that will exercise upgrade_tasks, deployment_tasks and - post_upgrade_tasks playbooks. 
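
As a rough illustration of the last work item above, the new entry in
tripleo-ci's ``zuul.d/standalone-jobs.yaml`` might look like the sketch
below. The job name, parent and settings are assumptions for illustration,
not a final definition.

.. code-block:: yaml

    # Sketch only: name, parent and settings are placeholders.
    - job:
        name: tripleo-ci-centos-7-standalone-upgrade
        parent: tripleo-ci-base-standalone
        voting: false
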
- -Once this first iteration is complete we can then consider defining multiple -jobs for small subsets of services, or even for single services. - -.. _tripleo_upgrade: https://github.com/openstack/python-tripleoclient/blob/6b0f54c07ae8d0dd372f16684c863efa064079da/tripleoclient/v1/tripleo_upgrade.py#L33 -.. _tripleo_deploy: https://github.com/openstack/python-tripleoclient/blob/6b0f54c07ae8d0dd372f16684c863efa064079da/tripleoclient/v1/tripleo_deploy.py#L80 - -Dependencies -============ - -This obviously depends on stand-alone installer - -Testing -======= - -There will be at least one new job defined here - -Documentation Impact -==================== - -None - -References -========== diff --git a/specs/stein/inflight-validations.rst b/specs/stein/inflight-validations.rst deleted file mode 100644 index f65b6c13..00000000 --- a/specs/stein/inflight-validations.rst +++ /dev/null @@ -1,142 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================= -In-flight Validations for the overcloud -======================================= - - -https://blueprints.launchpad.net/tripleo/+spec/inflight-validations - -Currently, we don't have any way to run validations inside a deploy run. This -spec aims to provide the necessary information on how to implement such -in-flight validations for an overcloud deploy. - -Problem Description -=================== - -Currently, operators and developers have to wait a long time before getting an -error in case a service isn't running as expected. - -This leads to loss of time and resources. - -Proposed Change -=============== - -Overview --------- - -After each container/service is started, a new step is added to run one or more -validations on the deployed host in order to ensure the service is actually -working as expected at said step. - -These validations must not use Mistral Workflow, in order to provide support -for the undercloud/standalone case. - -The best way to push those validations would be through the already existing -``deploy_steps_tasks`` keywork. A validation should be either at the start -of the next step, or at the end of the current step we want to check. - -The validations should point to an external playbook, for instance hosted in -``tripleo-validations``. If there isn't real use to create a playbook for the -validation, it might be inline - but it must be short, for example a single test -for an open port. - -Alternatives ------------- - -There isn't really other alternative. We might think running the validation -ansible playbook directly is a good idea, but it will break the wanted -convergence with the UI. - -For now, there isn't such validations, we can start fresh. - -Security Impact ---------------- - -No security impact. - -Upgrade Impact --------------- - -If a service isn't starting properly, the upgrade might fail. This is also true -for a fresh deploy. - -We might want different validation tasks/workflows if we're in an upgrade -state. - -Other End User Impact ---------------------- - -End user will get early failure in case of issues detected by the validations. -This is an improvement, as for now it might fail at a later step, and might -break things due to the lack of valid state. - -Performance Impact ------------------- - -Running in-flight validation WILL slow the overall deploy/upgrade process, but -on the other hand, it will ensure we have a clean state before each step. 
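
To illustrate the inline form described under Proposed Change, a minimal
in-flight check added through ``deploy_steps_tasks`` in a service template
could look like the sketch below. The port, host and step number are
arbitrary examples, not part of this spec.

.. code-block:: yaml

    deploy_steps_tasks:
      # Arbitrary example: fail early if the service is not listening yet.
      - name: Verify the service port is open before continuing
        wait_for:
          port: 3306
          host: localhost
          timeout: 60
        when: step|int == 2
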
- -Other Deployer Impact ---------------------- - -No other deployer impact. - -Developer Impact ----------------- - -Validations will need to be created and documented in order to get proper runs. - - -Implementation -============== - -Assignee(s) ------------ - -Who is leading the writing of the code? Or is this a blueprint where you're -throwing it out there to see who picks it up? - -If more than one person is working on the implementation, please designate the -primary author and contact. - -Primary assignee: - cjeanner - -Other contributors: - - -Work Items ----------- - -* Add new hook for the ``validation_tasks`` -* Provide proper documentation on its use - -Dependencies -============ - -* Please keep in mind the Validation Framework spec when implementing things: - https://review.openstack.org/589169 - - -Testing -======= - -TBD - - -Documentation Impact -==================== - -What is the impact on the docs? Don't repeat details discussed above, but -please reference them here. - - -References -========== - -* https://review.openstack.org/589169 diff --git a/specs/stein/nova-less-deploy.rst b/specs/stein/nova-less-deploy.rst deleted file mode 100644 index 01e315c5..00000000 --- a/specs/stein/nova-less-deploy.rst +++ /dev/null @@ -1,638 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================= -Provision nodes without Nova and Glance -======================================= - -https://blueprints.launchpad.net/tripleo/+spec/nova-less-deploy - -Currently TripleO undercloud uses Heat, Nova, Glance, Neutron and Ironic for -provisioning bare metal machines. This blueprint proposes excluding Heat, Nova -and Glance from this flow, removing Nova and Glance completely from the -undercloud. - -Problem Description -=================== - -Making TripleO workflows use Ironic directly to provision nodes has quite a few -benefits: - -#. First and foremost, getting rid of the horrible "no valid hosts found" - exception. The scheduling will be much simpler and the errors will be - clearer. - - .. note:: - This and many other problems with using Nova in the undercloud come from - the fact that Nova is cloud-oriented software, while the undercloud is - more of a traditional installer. In the "pet vs cattle" metaphore, Nova - handles the "cattle" case, while the undercloud is the "pet" case. - -#. Also important for the generic provisioner case, we'll be able to get rid of - Nova and Glance, reducing the memory footprint. - -#. We'll get rid of pre-deploy validations that currently try to guess what - Nova scheduler will expect. - -#. We'll be able to combine nodes deployed by Ironic with pre-deployed servers. - -#. We'll become in charge of building the configdrive, potentially putting more - useful things there. - -#. Hopefully, scale-up will be less error-prone. - -Also in the future we may be able to: - -#. Integrate things like building RAID on demand much easier. - -#. Use introspection data in scheduling and provisioning decisions. - Particularly, we can automate handling root device hints. - -#. Make Neutron optional and use static DHCP and/or *os-net-config*. - -Proposed Change -=============== - -Overview --------- - -This blueprint proposes removal replacing the triad Heat-Nova-Glance with -Ironic driven directly by Mistral. 
To avoid placing Ironic-specific code into -tripleo-common, a new library metalsmith_ has been developed and accepted into -the Ironic governance. - -As part of the implementation, this blueprint proposes completely separting the -bare metal provisioning process from software configuration, including the CLI -level. This has two benefits: - -#. Having a clear separation between two error-prone processes simplifies - debugging for operators. - -#. Reusing the existing *deployed-server* workflow simplifies the - implementation. - -In the distant future, the functionality of metalsmith_ may be moved into -Ironic API itself. In this case it will be phased out, while keeping the same -Mistral workflows. - -Operator workflow ------------------ - -As noted in Overview_, the CLI/GUI workflow will be split into hardware -provisioning and software configuration parts (the former being optional). - -#. In addition to existing Heat templates, a new file - baremetal_deployment.yaml_ will be populated by an operator with the bare - metal provisioning information. - -#. Bare metal deployment will be conducted by a new CLI command or GUI - operation using the new `deploy_roles workflow`_:: - - openstack overcloud node provision \ - -o baremetal_environment.yaml baremetal_deployment.yaml - - This command will take the input from baremetal_deployment.yaml_, provision - requested bare metal machines and output a Heat environment file - baremetal_environment.yaml_ to use with the *deployed-server* feature. - -#. Finally, the regular deployment is done, including the generated file:: - - openstack overcloud deploy \ - \ - -e baremetal_environment.yaml \ - -e /usr/share/openstack-tripleo-heat-templates/environments/deployed-server-environment.yaml \ - -e /usr/share/openstack-tripleo-heat-templates/environments/deployed-server-bootstrap-environment-centos.yaml \ - -r /usr/share/openstack-tripleo-heat-templates/deployed-server/deployed-server-roles-data.yaml - -For simplicity the two commands can be combined:: - - openstack overcloud deploy \ - \ - -b baremetal_deployment.yaml \ - -e /usr/share/openstack-tripleo-heat-templates/environments/deployed-server-environment.yaml \ - -e /usr/share/openstack-tripleo-heat-templates/environments/deployed-server-bootstrap-environment-centos.yaml \ - -r /usr/share/openstack-tripleo-heat-templates/deployed-server/deployed-server-roles-data.yaml - -The new argument ``--baremetal-deployment``/``-b`` will accept the -baremetal_deployment.yaml_ and do the deployment automatically. - -Breakdown of the changes ------------------------- - -This section describes the required changes in depth. - -Image upload -~~~~~~~~~~~~ - -As Glance will no longer be used, images will have to be served from other -sources. Ironic supports HTTP and file sources from its images. For the -undercloud case, the file source seems to be the most straightforward, also the -*Edge* case may require using HTTP images. - -To make both cases possible, the ``openstack overcloud image upload`` command -will now copy the three overcloud images (``overcloud-full.qcow2``, -``overcloud-full.kernel`` and ``overcloud-full.ramdisk``) to -``/var/lib/ironic/httpboot/overcloud-images``. This will allow referring to -images both via ``file:///var/lib/ironic/httpboot/overcloud.images/...`` and -``http(s)://:/overcloud-images/...``. 
- -Finally, a checksum file will be generated from the copied images using:: - - cd /var/lib/ironic/httpboot/overcloud-images - md5sum overcloud-full.* > MD5SUMS - -This is required since the checksums will no longer come from Glance. - -baremetal_deployment.yaml -~~~~~~~~~~~~~~~~~~~~~~~~~ - -This file will describe which the bare metal provisioning parameters. It will -provide the information that is currently implicitly deduced from the Heat -templates. - -.. note:: - We could continue extracting it from the templates well. However, a separate - file will avoid a dependency on any Heat-specific logic, potentially - benefiting standalone installer cases. It also provides the operators with - more control over the provisioning process. - -The format of this file resembles one of the ``roles_data`` file. It describes -the deployment parameters for each role. The file contains a list of roles, -each with a ``name``. Other accepted parameters are: - -``count`` - number of machines to deploy for this role. Defaults to 1. -``profile`` - profile (``compute``, ``control``, etc) to use for this role. Roughly - corresponds to a flavor name for a Nova based deployment. Defaults to no - profile (any node can be picked). -``hostname_format`` - a template for generating host names. This is similar to - ``HostnameFormatDefault`` of a ``roles_data`` file and should use - ``%index%`` to number the nodes. The default is ``%stackname%--%index%``. -``instances`` - list of instances in the format accepted by `deploy_instances workflow`_. - This allows to tune parameters per instance. - -Examples -^^^^^^^^ - -Deploy one compute and one control with any profile: - -.. code-block:: yaml - - - name: Compute - - name: Controller - -HA deployment with two computes and profile matching: - -.. code-block:: yaml - - - name: Compute - count: 2 - profile: compute - hostname_format: compute-%index%.example.com - - name: Controller - count: 3 - profile: control - hostname_format: controller-%index%.example.com - -Advanced deployment with custom hostnames and parameters set per instance: - -.. code-block:: yaml - - - name: Compute - profile: compute - instances: - - hostname: compute-05.us-west.example.com - nics: - - network: ctlplane - fixed_ip: 10.0.2.5 - traits: - - HW_CPU_X86_VMX - - hostname: compute-06.us-west.example.com - nics: - - network: ctlplane - fixed_ip: 10.0.2.5 - traits: - - HW_CPU_X86_VMX - - name: Controller - profile: control - instances: - - hostname: controller-1.us-west.example.com - swap_size_mb: 4096 - - hostname: controller-2.us-west.example.com - swap_size_mb: 4096 - - hostname: controller-3.us-west.example.com - swap_size_mb: 4096 - -deploy_roles workflow -~~~~~~~~~~~~~~~~~~~~~ - -The workflow ``tripleo.baremetal_deploy.v1.deploy_roles`` will accept the -information from baremetal_deployment.yaml_, convert it into the low-level -format accepted by the `deploy_instances workflow`_ and call the -`deploy_instances workflow`_ with it. - -It will accept the following mandatory input: - -``roles`` - parsed baremetal_deployment.yaml_ file. - -It will accept one optional input: - -``plan`` - plan/stack name, used for templating. Defaults to ``overcloud``. - -It will return the same output as the `deploy_instances workflow`_ plus: - -``environment`` - the content of the generated baremetal_environment.yaml_ file. - -Examples -^^^^^^^^ - -The examples from baremetal_deployment.yaml_ will be converted to: - -.. 
code-block:: yaml - - - hostname: overcloud-compute-0 - - hostname: overcloud-controller-0 - -.. code-block:: yaml - - - hostname: compute-0.example.com - profile: compute - - hostname: compute-1.example.com - profile: compute - - hostname: controller-0.example.com - profile: control - - hostname: controller-1.example.com - profile: control - - hostname: controller-2.example.com - profile: control - -.. code-block:: yaml - - - hostname: compute-05.us-west.example.com - nics: - - network: ctlplane - fixed_ip: 10.0.2.5 - profile: compute - traits: - - HW_CPU_X86_VMX - - hostname: compute-06.us-west.example.com - nics: - - network: ctlplane - fixed_ip: 10.0.2.5 - profile: compute - traits: - - HW_CPU_X86_VMX - - hostname: controller-1.us-west.example.com - profile: control - swap_size_mb: 4096 - - hostname: controller-2.us-west.example.com - profile: control - swap_size_mb: 4096 - - hostname: controller-3.us-west.example.com - profile: control - swap_size_mb: 4096 - -deploy_instances workflow -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The workflow ``tripleo.baremetal_deploy.v1.deploy_instances`` is a thin wrapper -around the corresponding metalsmith_ calls. - -The following inputs are mandatory: - -``instances`` - list of requested instances in the format described in `Instance format`_. -``ssh_keys`` - list of SSH public keys contents to put on the machines. - -The following inputs are optional: - -``ssh_user_name`` - SSH user name to create, defaults to ``heat-admin`` for compatibility. -``timeout`` - deployment timeout, defaults to 3600 seconds. -``concurrency`` - deployment concurrency - how many nodes to deploy at the same time. Defaults - to 20, which matches introspection. - -Instance format -^^^^^^^^^^^^^^^ - -The instance record format closely follows one of the `metalsmith ansible -role`_ with only a few TripleO-specific additions and defaults changes. - -Either or both of the following fields must be present: - -``hostname`` - requested hostname. It is used to identify the deployed instance later on. - Defaults to ``name``. -``name`` - name of the node to deploy on. If ``hostname`` is not provided, ``name`` is - also used as the hostname. - -The following fields will be supported: - -``capabilities`` - requested node capabilities (except for ``profile`` and ``boot_option``). -``conductor_group`` - requested node's conductor group. This is primary for the *Edge* case when - nodes managed by the same Ironic can be physically separated. -``nics`` - list of requested NICs, see metalsmith_ documentation for details. Defaults - to ``{"network": "ctlplane"}`` which requests creation of a port on the - ``ctlplane`` network. -``profile`` - profile to use (e.g. ``compute``, ``control``, etc). -``resource_class`` - requested node's resource class, defaults to ``baremetal``. -``root_size_gb`` - size of the root partition in GiB, defaults to 49. -``swap_size_mb`` - size of the swap partition in MiB, if needed. -``traits`` - list of requested node traits. -``whole_disk_image`` - boolean, whether to treat the image (``overcloud-full.qcow2`` or provided - through the ``image`` field) as a whole disk image. Defaults to false. - -The following fields will be supported, but the defaults should work for all -but the most extreme cases: - -``image`` - file or HTTP URL of the root partition or whole disk image. -``image_kernel`` - file or HTTP URL of the kernel image (partition images only). -``image_ramdisk`` - file or HTTP URL of the ramdisk image (partition images only). 
-``image_checksum`` - checksum of URL of checksum of the root partition or whole disk image. - -Certificate authority configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If TLS is used in the undercloud, we need to make the nodes trust -the Certificate Authority (CA) that signed the TLS certificates. -If ``/etc/pki/ca-trust/source/anchors/cm-local-ca.pem`` exists, it will be -included in the generated configdrive, so that the file is copied into the same -location on target systems. - -Outputs -^^^^^^^ - -The workflow will provide the following outputs: - -``ctlplane_ips`` - mapping of host names to their respective IP addresses on the ``ctlplane`` - network. -``instances`` - mapping of host names to full instance representations with fields: - - ``node`` - Ironic node representation. - ``ip_addresses`` - mapping of network names to list of IP addresses on them. - ``hostname`` - instance hostname. - ``state`` - `metalsmith instance state`_. - ``uuid`` - Ironic node uuid. - -Also two subdicts of ``instances`` are provided: - -``existing_instances`` - only instances that already existed. -``new_instances`` - only instances that were deployed. - -.. note:: - Instances are distinguised by their hostnames. - -baremetal_environment.yaml -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This file will serve as an output of the bare metal provisioning process. It -will be fed into the overcloud deployment command. Its goal is to provide -information for the *deployed-server* workflow. - -The file will contain the ``HostnameMap`` generated from role names and -hostnames, e.g. - -.. code-block:: yaml - - parameter_defaults: - HostnameMap: - overcloud-controller-0: controller-1.us-west.example.com - overcloud-controller-1: controller-2.us-west.example.com - overcloud-controller-2: controller-3.us-west.example.com - overcloud-novacompute-0: compute-05.us-west.example.com - overcloud-novacompute-1: compute-06.us-west.example.com - -undeploy_instances workflow -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The workflow ``tripleo.baremetal_deploy.v1.undeploy_instances`` will take a -list of hostnames and undeploy the corresponding nodes. - -Novajoin replacement --------------------- - -The *novajoin* service is currently used to enroll nodes into IPA and provide -them with TLS certificates. Unfortunately, it has hard dependencies on Nova, -Glance and Metadata API, even though the information could be provided via -other means. Actually, the metadata API cannot always be provided with Ironic -(notably, it may not be available when using isolated provisioning networks). - -A potential solution is to provide the required information via a configdrive, -and make the nodes register themselves instead. - -Alternatives ------------- - -* Do nothing, continue to rely on Nova and work around cases when it does - match our goals well. See `Problem Description`_ for why it is not desired. - -* Avoid metalsmith_, use OpenStack Ansible modules or Bifrost. They currently - lack features (such as VIF attach/detach API) and do not have any notion of - scheduling. Implementing sophisticated enough scheduling in pure Ansible - seems a serious undertaking. - -* Avoid Mistral, drive metalsmith_ via Ansible. This is a potential future - direction of this work, but currently it seems much simpler to call - metalsmith_ Python API from Mistral actions. We would anyway need Mistral ( - (or Ansible Tower) to drive Ansible, because we need some API level. - -* Remove Neutron in the same change. 
Would reduce footprint even further, but - some operators may find the presence of an IPAM desirable. Also setting up - static DHCP would increase the scope of the implementation substantially and - complicate the upgrade even further. - -* Keep Glance but remove Nova. Does not make much sense, since Glance is only a - requirement because of Nova. Ironic can deploy from HTTP or local file - locations just as well. - -Security Impact ---------------- - -* Overcloud images will be exposed to unauthenticated users via HTTP. We need - to communicate it clearly that secrets must not be built into images in plain - text and should be delivered via *configdrive* instead. If it proves - a problem, we can limit ourselves to providing images via local files. - - .. note:: - This issue exists today, as images are transferred via insecure medium in - all supported deploy methods. - -* Removing two services from the undercloud will reduce potential attack - surface and simplify audit. - -Upgrade Impact --------------- - -The initial version of this feature will be enabled for new deployments only. - -The upgrade procedure will happen within a release, not between releases. -It will go roughly as follows: - -#. Upgrade to a release where undercloud without Nova and Glance is supported. - -#. Make a full backup of the undercloud. - -#. Run ``openstack overcloud image upload`` to ensure that the - ``overcloud-full`` images are available via HTTP(s). - -The next steps will probably be automated via an Ansible playbook or a Mistral -workflow: - -#. Mark deployed nodes *protected* in Ironic to prevent undeploying them - by mistake. - -#. Run a Heat stack update replacing references to Nova servers with references - to deployed servers. This will require telling Heat not to remove the - instances. - -#. Mark nodes as managed by *metalsmith* (optional, but simplifies - troubleshooting). - -#. Update node's ``instance_info`` to refer to images over HTTP(s). - - .. note:: This may require temporary moving nodes to maintenance. - -#. Run an undercloud update removing Nova and Glance. - -Other End User Impact ---------------------- - -* Nova CLI will no longer be available for troubleshooting. It should not be a - big problem in reality, as most of the problems it is used for are caused by - using Nova itself. - - metalsmith_ provides a CLI tool for troubleshooting and advanced users. We - will document using it for tasks like determining IP addresses of nodes. - -* It will no longer be possible to update images via Glance API, e.g. from GUI. - It should not be a bit issue, as most of users use pre-built images. Advanced - operators are likely to resort to CLI anyway. - -* *No valid host found* error will no longer be seen by operators. metalsmith_ - provides more detailed errors, and is less likely to fail because of its - scheduling approach working better with the undercloud case. - -Performance Impact ------------------- - -* A substantial speed-up is expected for deployments because of removing - several layers of indirection. The new deployment process will also fail - faster if the scheduling request cannot be satisfied. - -* Providing images via local files will remove the step of downloading them - from Glance, providing even more speed-up for larger images. - -* An operator will be able to tune concurrency of deployment via CLI arguments - or GUI parameters, other than ``nova.conf``. 
- -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -New features for bare metal provisioning will have to be developed with this -work in mind. It may mean implementing something in metalsmith_ code instead of -relying on Nova servers or flavors, or Glance images. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Dmitry Tantsur, IRC: dtantsur, LP: divius - -Work Items ----------- - -Phase 1 (Stein, technical preview): - -#. Update ``openstack overcloud image upload`` to copy images into the HTTP - location and generate checksums. - -#. Implement `deploy_instances workflow`_ and `undeploy_instances workflow`_. - -#. Update validations to not fail if Nova and/or Glance are not present. - -#. Implement `deploy_roles workflow`_. - -#. Provide CLI commands for the created workflows. - -#. Provide an experimental OVB CI job exercising the new approach. - -Phase 2 (T+, fully supported): - -#. Update ``openstack overcloud deploy`` to support the new workflow. - -#. Support scaling down. - -#. Provide a `Novajoin replacement`_. - -#. Provide an upgrade workflow. - -#. Consider deprecating provisioning with Nova and Glance. - -Dependencies -============ - -* metalsmith_ library will be used for easier access to Ironic+Neutron API. - -Testing -======= - -Since testing this feature requires bare metal provisioning, a new OVB job will -be created for it. Initially it will be experimental, and will move to the -check queue before the feature is considered fully supported. - -Documentation Impact -==================== - -Documentation will have to be reworked to explain the new deployment approach. -Troubleshooting documentation will have to be updated. - -References -========== - -.. _metalsmith: https://docs.openstack.org/metalsmith/latest/ -.. _metalsmith ansible role: https://docs.openstack.org/metalsmith/latest/user/ansible.html#instance -.. _metalsmith instance state: https://docs.openstack.org/metalsmith/latest/reference/api/metalsmith.html#metalsmith.Instance.state diff --git a/specs/stein/ostempest-tripleo.rst b/specs/stein/ostempest-tripleo.rst deleted file mode 100644 index 045a6f86..00000000 --- a/specs/stein/ostempest-tripleo.rst +++ /dev/null @@ -1,154 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================================== -Integrate os_tempest role with TripleO -====================================== - -Launchpad Blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/os-tempest-tripleo - -Tempest provides a set of API and integrations tests with batteries -included in order to validate the OpenStack Deployment. In TripleO -project, we are working towards using a unified tempest role i.e. -`os_tempest` provided by OpenStack Ansible project in TripleO CI -in order to foster collaboration with multiple deployment tools and -improve our testing strategies within OpenStack Community. - -Problem Description -=================== - -In the OpenStack Ecosystem, we have multiple *ansible based* deployment tools -that use their own roles for install/configure and running tempest testing. -Each of these roles is trying to do similar stuff tied to the different -deployment tools. 
For example: `validate-tempest` ansible role on TripleO CI -provides most of the stuff but it is tied with the TripleO deployment and -provides some nice feature (Like: bugcheck, failed tests email notification, -stackviz, python-tempestconf support for auto tempest.conf generation) which -are missing in other roles. It is leading to duplication and reduces what -tempest tests are not working across them, leading to no collaboration on -the Testing side. - -The OpenStack Ansible team provides `os_tempest` role for installing/ -configuring/running tempest and post tempest results processing and there -is a lot of duplication between their work and the roles used for testing -by the various deployment tools.It almost provides most of the stuff -provided by each of the deployment tool specific tempest roles. There are -few stuffs which are missing can be added in the role and make it useable -so that other deployment tools can consume it. - -Proposed Change -=============== - -Using unified `os_tempest` ansible role in TripleO CI will help to maintain -one less role within TripleO project and help us to collaborate with -openstack-ansible team in order to share/improve tests strategies across -OpenStack ecosystem and solve tempest issues fastly. - -In order to achieve that, we need: - * Improve `os_tempest` role to add support for package/container install, - python-tempestconf, stackviz, skip list, bugcheck, tempest - log collection at the proper place. - - * Have a working CI job on standalone running tempest from `os_tempest` - role as well as on OSA side. - - * Provide an easy migration path from validate-tempest role. - -Alternatives ------------- - -If we do not use the existing `os_tempest` role then we need to re-write the -`validate-tempest` role which will result in again duplication and it will -cost too much time and it also requires another set of efforts for adoption -in the community which does not seems to feasible. - -Security Impact ---------------- - -None - -Upgrade Impact --------------- - -None - -Other End User Impact ---------------------- - -We need to educate users for migrating to `os_tempest`. - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -Helps more collaboration and improves testing. - -Implementation -============== - -Assignee(s) ------------ - - -Primary assignee: - * Arx Cruz (arxcruz) - * Chandan Kumar (chkumar246) - * Martin Kopec (mkopec) - - -Work Items ----------- - -* Install tempest and it's dependencies from Distro packages -* Running tempest from containers -* Enable stackviz -* python-tempestconf support -* skiplist management -* Keeping all tempest related files at one place -* Bugcheck -* Standalone based TripleO CI job consuming os_tempest role -* Migration path from validate-tempest to os_tempest role -* Documentation update on How to use it -* RDO packaging - -Dependencies -============ - -Currently, os_tempest role depends on `python_venv_build` role when -tempest is installed from source (git, pip, venv). We need to package it in RDO. - -Testing -======= - -The unified tempest role `os_tempest` will replace validate-tempest -role with much more improvements. - - -Documentation Impact -==================== - -Documentation on how to consume `os_tempest` needs to be updated. 
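
As a rough sketch of what that documentation might show, consuming the role
from a playbook could look like the following. The variable names are
assumptions based on the role's defaults and may differ in the final
integration.

.. code-block:: yaml

    - hosts: undercloud
      become: true
      roles:
        - role: os_tempest
          # Assumed variable names, shown for illustration only.
          tempest_install_method: distro
          tempest_run: yes
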
- - -References -========== - -* Unified Tempest role creation & calloboration email: - http://lists.openstack.org/pipermail/openstack-dev/2018-August/133838.html - -* os_tempest role: - http://git.openstack.org/cgit/openstack/openstack-ansible-os_tempest diff --git a/specs/stein/podman.rst b/specs/stein/podman.rst deleted file mode 100644 index caf6d8d4..00000000 --- a/specs/stein/podman.rst +++ /dev/null @@ -1,322 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================= -Podman support for container management -======================================= - -Launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/podman-support - -There is an ongoing desire to manage TripleO containers with a set of tools -designed to solve complex problems when deploying applications. -The containerization of TripleO started with a Docker CLI implementation -but we are looking at how we could leverage the container orchestration -on a Kubernetes friendly solution. - - -Problem Description -=================== - -There are three problems that this document will cover: - -* There is an ongoing discussion on whether or not Docker will be - maintained on future versions of Red Hat platforms. There is a general - move on OCI (Open Containers Initiative) conformant runtimes, as CRI-O - (Container Runtime Interface for OCI). - -* The TripleO community has been looking at how we could orchestrate the - containers lifecycle with Kubernetes, in order to bring consistency with - other projects like OpenShift for example. - -* The TripleO project aims to work on the next version of Red Hat platforms, - therefore we are looking at Docker alternatives in Stein cycle. - - -Proposed Change -=============== - -Introduction ------------- - -The containerization of TripleO has been an ongoing effort since a few releases -now and we've always been looking at a step-by-step approach that tries to -maintain backward compatibility for the deployers and developers; and also -in a way where upgrade from a previous release is possible, without too much -pain. With that said, we are looking at a proposed change that isn't too much -disruptive but is still aligned with the general roadmap of the container -story and hopefully will drive us to manage our containers with Kubernetes. -We use Paunch project to provide an abstraction in our container integration. -Paunch will deal with container configurations formats with backends support. - -Integrate Podman CLI --------------------- - -The goal of Podman is to allow users to run standalone (non-orchestrated) -containers which is what we have been doing with Docker until now. -Podman also allows users to run groups of containers called Pods where a Pod is -a term developed for the Kubernetes Project which describes an object that -has one or more containerized processes sharing multiple namespaces -(Network, IPC and optionally PID). -Podman doesn't have any daemon which makes it lighter than Docker and use a -more traditional fork/exec model of Unix and Linux. -The container runtime used by Podman is runc. -The CLI has a partial backward compatibility with Docker so its integration -in TripleO shouldn't be that painful. - -It is proposed to add support for Podman CLI (beside Docker CLI) in TripleO -to manage the creation, deletion, inspection of our containers. 
-We would have a new parameter called ContainerCli in TripleO, that if set to -'podman', will make the container provisioning done with Podman CLI and not -Docker CLI. - -Because there is no daemon, there are some problems that we needs to solve: - -* Automatically restart failed containers. -* Automatically start containers when the host is (re)booted. -* Start the containers in a specific order during host boot. -* Provide an channel of communication with containers. -* Run container healthchecks. - -To solve the first 3 problems, it is proposed to use Systemd: - -* Use Restart so we can configure a restart policy for our containers. - Most of our containers would run with Restart=always policy, but we'll - have to support some exceptions. -* The systemd services will be enabled by default so the containers start - at boot. -* The ordering will be managed by Wants which provides Implicit Dependencies - in Systemd. Wants is a weaker version of Requires. It'll allow to make sure - we start HAproxy before Keepalived for example, if they are on the same host. - Because it is a weak dependency, they will only be honored if the containers - are running on the same host. -* The way containers will be managed (start/stop/restart/status) will be - familiar for our operators used to control Systemd services. However - we probably want to make it clear that this is not our long term goal to - manage the containers with Systemd. - -The Systemd integration would be: - -* complete enough to cover our use-cases and bring feature parity with the - Docker implementation. -* light enough to be able to migrate our container lifecycle with Kubernetes - in the future (e.g. CRI-O). - - -For the fourth problem, we are still investigating the options: - -* varlink: interface description format and protocol that aims to make services - accessible to both humans and machines in the simplest feasible way. -* CRI-O: CI-based implementation of Kubernetes Container Runtime Interface - without Kubelet. For example, we could use a CRI-O Python binding to - communicate with the containers. -* A dedicated image which runs the rootwrap daemon, with rootwrap filters to only run the allowed - commands. The controlling container will have the rootwrap socket mounted in so that it can - trigger allowed calls in the rootwrap container. For pacemaker, the rootwrap container will allow - image tagging. For neutron, the rootwrap container will spawn the processes inside the container, - so it will need to be a long-lived container that is managed outside paunch. - - +---------+ +----------+ - | | | | - | L3Agent +-----+ Rootwrap | - | | | | - +---------+ +----------+ - - In this example, the L3Agent container has mounted in the rootwrap daemon socket so that it can - run allowed commands inside the rootwrap container. - -Finally, the fifth problem is still an ongoing question. -There are some plans to support healthchecks in Podman but nothing has been -done as of today. We might have to implement something on our side with -Systemd. - -Alternatives -============ - -Two alternatives are proposed. - -CRI-O Integration ------------------ - -CRI-O is meant to provide an integration path between OCI conformant runtimes -and the kubelet. Specifically, it implements the Kubelet Container Runtime -Interface (CRI) using OCI conformant runtimes. Note that the CLI utility for -interacting with CRI-O isn't meant to be used in production, so managing -the containers lifecycle with a CLI is only possible with Docker or Podman. 
So instead of a smooth migration from the Docker CLI to the Podman CLI, we
could go straight to Kubernetes integration and convert our TripleO services
to work with a standalone Kubelet managed by CRI-O.
We would have to generate YAML files for each container in a Pod format,
so CRI-O can manage them.
It wouldn't require Systemd integration, as the containers would be managed
by Kubelet.
The operator would control the container lifecycle by using kubectl
commands, and the automated deployment & upgrade process would happen in
Paunch with a Kubelet backend.

While this implementation would help us to move to a multi-node,
Kubernetes-friendly environment, it remains the riskiest option in terms of
the quantity of work that needs to happen versus the time that we have to
design, implement, test and ship the next tooling before the end of the
Stein cycle.

We also need to keep in mind that CRI-O and Podman share the
containers/storage and containers/image libraries, so the issues that we
have had with Podman will be hit with CRI-O as well.

Keep Docker
-----------

We could keep Docker around and not change anything in the way we manage
containers. We could also keep Docker and make it work with CRI-O.
The only risk here is that Docker tooling might not be supported in the
future by Red Hat platforms, and we would be on our own if any issue with
Docker arises.
The TripleO community is always seeking a healthy and long-term
collaboration with the communities of the projects we are interacting with.

Proposed roadmap
================

In Stein:

* Make Paunch support Podman as an alternative to Docker.
* Get our existing services fully deployable on Podman, with parity to
  what we had with Docker.
* If we have time, add Podman pod support to Paunch.

In the "T" cycle:

* Rewrite all of our container yaml to the pod format.
* Add a Kubelet backend to Paunch (or change our agent tooling to call
  Kubelet directly from Ansible).
* Get our existing services fully deployable via Kubelet, with parity to
  what we had with Podman / Docker.
* Evaluate switching to Kubernetes proper.


Security Impact
===============

The TripleO containers will rely on Podman security.
If we don't use CRI-O or varlink to communicate with containers, we'll have
to consider running some containers in privileged mode and mounting
/var/lib/containers into the containers. This is a security concern and
we'll have to evaluate it.
Also, we'll have to make the proposed solution work with SELinux in
Enforcing mode.

The Docker solution doesn't enforce SELinux separation between containers.
Podman does, and there's currently no easy way to deactivate that globally.
So we'll basically get more secure containers with Podman, as we have to
support separation from the very beginning.

Upgrade Impact
==============

The containers that were managed by Docker Engine will be removed and
provisioned into the new runtime. This process will happen when Paunch
generates and executes the new container configuration.
The operator shouldn't have to perform any manual action and the migration
will be automated, mainly by Paunch.
The Containerized Undercloud upgrade job will test upgrading an Undercloud
running Docker containers on Rocky to Podman containers on Stein.
The Overcloud upgrade jobs will test this as well.

Note: as the Docker runtime doesn't have the SELinux separation,
some chcon/relabelling might be needed prior to the move to the Podman
runtime.
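As an illustration only, such a relabelling step could be expressed as an
Ansible task along the following lines; the path and the SELinux file type
are assumptions to be validated, not the final implementation:

.. code-block:: yaml

   # Hypothetical sketch: relabel host paths that are bind-mounted into
   # containers so that Podman's SELinux separation still allows access.
   - name: Relabel config-data before switching to the podman runtime
     become: true
     command: chcon -R -t container_file_t /var/lib/config-data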
End User Impact
===============

The operators won't be able to run the Docker CLI like before and instead
will have to use the Podman CLI, where some backward compatibility is
guaranteed.

Performance Impact
==================

There are different aspects of performance that we'll need to investigate:

* Container performance (relying on Podman).
* How Systemd + Podman work together and how restarts work versus the
  Docker engine.

Deployer Impact
===============

There shouldn't be much impact for the deployer, as we aim to make this
change as transparent as possible. The only option (so far) that will be
exposed to the deployer will be "ContainerCli", where only 'docker' and
'podman' will be supported. If 'podman' is chosen, the transition will be
automated.

Developer Impact
================

There shouldn't be much impact for the developer of TripleO services, except
that some things in Podman behave slightly differently than in Docker. For
example, Podman won't create missing directories when bind-mounting into the
containers, while Docker creates them.

Implementation
==============

Contributors
------------

* Bogdan Dobrelya
* Cédric Jeanneret
* Emilien Macchi
* Steve Baker

Work Items
----------

* Update TripleO services to work with Podman (e.g. fix bind-mount issues).
* SELinux separation (relates to bind-mount rights plus some other issues
  when we're calling iptables or other host commands from a container).
* Systemd integration.
* Healthcheck support.
* Socket / runtime: varlink? CRI-O?
* Upgrade workflow.
* Testing.
* Documentation for operators.


Dependencies
============

* The Podman integration depends a lot on how stable the tool is and how
  often it is released and shipped so we can test it in CI.
* The healthchecks interface depends on Podman's roadmap.

Testing
=======

First of all, we'll switch the Undercloud jobs to use Podman, and this work
should be done by milestone-1. Both the deployment and upgrade jobs should
be switched and actually working.
The Overcloud jobs should be switched by milestone-2.

We'll keep Docker testing support for as long as we keep testing on the
CentOS 7 platform.

Documentation Impact
====================

We'll need to document the new commands (mainly the same as Docker), and
the differences in how containers should be managed (Systemd instead of the
Docker CLI, for example).


References
==========

* https://www.projectatomic.io/blog/2018/02/reintroduction-podman/
* https://github.com/kubernetes-sigs/cri-o
* https://github.com/kubernetes/community/blob/master/contributors/devel/container-runtime-interface.md
* https://varlink.org/
* https://github.com/containers/libpod/blob/master/transfer.md
* https://etherpad.openstack.org/p/tripleo-standalone-kubelet-poc

diff --git a/specs/stein/safe-side-containers.rst b/specs/stein/safe-side-containers.rst
deleted file mode 100644
index e1c3af5b..00000000
--- a/specs/stein/safe-side-containers.rst
+++ /dev/null
@@ -1,162 +0,0 @@
..
   This work is licensed under a Creative Commons Attribution 3.0 Unported
   License.

   http://creativecommons.org/licenses/by/3.0/legalcode

==============================================================
TripleO - Pattern to safely spawn a container from a container
==============================================================

This spec describes a pattern which can be used as an alternative to
what TripleO does today to allow certain containers (Neutron, etc.)
to -spawn side processes which require special privs like network -namespaces. Specifically it avoids exposing the docker socket or -using Podman nsenter hacks that have recently entered the codebase in Stein. - -Problem Description -=================== - -In Queens TripleO implemented a containerized architecture with the goal of -containerizing all OpenStack services. This architecture was a success but -a few applications had regressions when compared with their baremetal deployed -equivalent. One of these applications was Neutron, which requires the ability -to spawn long lived "side" processes that are launched directly from the -Neutron agents themselves. In the original Queens architecture Neutron -launched these side processes inside of the agent container itself which -caused a service disruption if the neutron agents themselves were restarted. -This was previously not the case on baremetal as these processes would continue -running across an agent restart/upgrade. - -The work around in Rocky was to add "wrapper" scripts for Neutron agents and -to expose the docker socket to each agent container. These wrappers scripts -were bind mounted into the containers so that they overwrote the normal location -of the side process. Using this crude mechanism binaries like 'dnsmasq' and -'haproxy' would instead launch a shell script instead of the normal binary and -these custom shell scripts relied on the an exposed docker socket from the -host to be able to launch a side container with the same arguments supplied -to the script. - -This mechanism functionally solved the issues with our containerization but -exposed some security problems in that we were now exposing the ability to -launch any container to these Neutron agent containers (privileged containers -with access to a docker socket). - -In Stein things changed with our desire to support Podman. Unlike Docker -Podman does not include a daemon on the host. All Podman commands are executed -via a CLI which runs the command on the host directly. We landed -patches which required Podman commands to use nsenter to enter the hosts -namespace and run the commands there directly. Again this mechanism requires -extra privileges to be granted to the Neutron agent containers in order for -them to be able to launch these commands. Furthermore the mechanism is -a bit cryptic to support and debug in the field. - -Proposed Change -=============== - -Overview --------- - -Use systemd on the host to launch the side process containers directly with -support for network namespaces that Neutron agents require. The benefit of -this approach is that we no longer have to give the Neutron containers privs -to launch containers which they shouldn't require. - -The pattern could work like this: - -#. A systemd.path file monitors a know location on the host for changes. - Example (neutron-dhcp-dnsmasq.path): - -.. code-block:: yaml - - [Path] - PathModified=/var/lib/neutron/neutron-dnsmasq-processes-timestamp - PathChanged=/var/lib/neutron/neutron-dnsmasq-processes-timestamp - - [Install] - WantedBy=multi-user.target - -#. When systemd.path notices a change it fires the service for this - path file: - Example (neutron-dhcp-dnsmasq.service): - -.. code-block:: yaml - - [Unit] - Description=neutron dhcp dnsmasq sync service - - [Service] - Type=oneshot - ExecStart=/usr/local/bin/neutron-dhcp-dnsmasq-process-sync - User=root - -#. We use the same "wrapper scripts" used today to write two files. 
   The first file is a dump of the CLI arguments used to launch the process
   on the host. This file can optionally include extra data like network
   namespaces, which are required for some Neutron side processes.
   The second file is a timestamp which is monitored by systemd.path
   on the host for changes and is used as a signal that the host needs to
   process the first file with the arguments.

#. When a change is detected, the systemd.service above executes a script on
   the host to cleanly launch the containerized side processes. When the
   script finishes launching processes it truncates the file to start with a
   clean slate.

#. Both the wrapper scripts and the host scripts use flock to eliminate race
   conditions which could cause issues with relaunched or missed containers.

Alternatives
------------

With Podman, an API like varlink would be an option; however, it would
likely still require exposure to a socket on the host, which would involve
extra privileges like what we have today. It would avoid the nsenter hacks,
however.

An architecture like Kubernetes would give us an API which could be used
to launch containers directly via the COE.

Additionally, an external process manager in Neutron that is "containers
aware" could be written to improve either of the above options. The current
Python in Neutron was written primarily for launching processes on
baremetal, with the assumption that some of the processes it launches are
meant to live across an agent restart. Implementing a class that can launch
side processes via a clean interface rather than overwriting binaries would
be desirable. Classes which support launching containers via Kubernetes
and/or Systemd on the host directly could then be provided.

Security Impact
---------------

This mechanism should allow us to remove some of the container privileges
for Neutron agents which in the past were used to execute containers. It is
a more restrictive, cruder interface that only allows the containers to
launch a specific type of process rather than any container they choose.

Upgrade Impact
--------------

The side process containers should be the same regardless of how they are
launched, so the upgrade impact should be minimal.


Implementation
==============

Assignee(s)
-----------

Primary assignee:
  dan-prince

Other contributors:
  emilienm

Work Items
----------

#. Ansible playbook to create the systemd files and wrappers

#. TripleO Heat template updates to use the new playbooks

#. Remove/deprecate the old docker.socket and nsenter code from
   puppet-tripleo

diff --git a/specs/stein/tripleo-routed-networks-templates.rst b/specs/stein/tripleo-routed-networks-templates.rst
deleted file mode 100644
index c1b29463..00000000
--- a/specs/stein/tripleo-routed-networks-templates.rst
+++ /dev/null
@@ -1,522 +0,0 @@
..
   This work is licensed under a Creative Commons Attribution 3.0 Unported
   License.

   http://creativecommons.org/licenses/by/3.0/legalcode

========================================================
TripleO Routed Networks Deployment (Spine-and-Leaf Clos)
========================================================

https://blueprints.launchpad.net/tripleo/+spec/tripleo-routed-networks-templates

This blueprint is part of the series tripleo-routed-networks-deployment [0]_.

TripleO uses shared L2 networks for all networks except the provisioning
network today. (Support for an L3 provisioning network was added in Queens.)
L3 support on the provisioning network uses network segments, a concept from
Neutron routed networks, to represent more than one subnet per VLAN.
Without network segments, we would be limited to one subnet per VLAN.

For the non-provisioning networks, we have no way to model a true L3 routed
network in TripleO today. When deploying such an architecture, we currently
create custom (Neutron) networks for all the different L2 segments of each
isolated network. While this approach works, it comes with some caveats.

This spec covers refactoring the TripleO Heat Templates to support
deployment onto networks which are segregated into multiple layer 2 domains,
with routers forwarding traffic between layer 2 domains.


Problem Description
===================

The master blueprint for routed networks for deployments breaks the problem
set into multiple parts [0]_. The problems applicable to this blueprint are
presented below.


Problem Descriptions
====================


Problem #1: Deploy systems onto a routed provisioning network.

While we can model a routed provisioning network and deploy systems on top
of that network today, doing so requires additional complex configuration,
such as:

  * Setting up the required static routes to ensure traffic within the L3
    control plane takes the desired path throughout the network.
  * L2 segments use different router addresses.
  * L2 segments may use different subnet masks.
  * Other L2 segment property differences.


This configuration essentially consists of manually passing information into
the templates used to deploy the overcloud; information that was already
provided when deploying the undercloud. While this works, it increases
complexity and the possibility that the user provides incorrect
configuration data.

We should be able to derive as much of this information as possible from
what was provided when deploying the undercloud.

In order to support this model, there are some requirements that have to be
met in Heat and Neutron.

**Alternative approaches to Problem #1:**


Approach 1:

.. NOTE:: This is what we currently do.

Since we control addresses and routes on the host nodes using a
combination of Heat templates and os-net-config, it may be possible to use
static routes to supernets to provide L2 adjacency, rather than relying on
Neutron to generate dynamic lists of routes that would need to be updated
on all hosts.

The end result of this is that each host has a set of IP addresses and
routes that isolate traffic by function. In order for the return traffic to
also be isolated by function, similar routes must exist on both hosts,
pointing to the local gateway on the local subnet for the larger supernet
that contains all Internal API subnets.

The downside of this is that we must require proper supernetting, and this
may lead to larger blocks of IP addresses being used to provide ample space
for scaling growth. For instance, in the example above an entire /16 network
is set aside for up to 255 local subnets for the Internal API network. This
could be changed into a more reasonable space, such as /18, if the number of
local subnets will not exceed 64, etc. This will be less of an issue with
native IPv6 than with IPv4, where scarcity is much more likely.

Approach 2:

Instead of passing parameters such as ControlPlaneCidr,
ControlPlaneDefaultRoute, etc., implement Neutron RFE [5]_ and Heat RFE
[6]_. In tripleo-heat-templates we can then use get_attr to get the data.
And we leave it to Neutron to calculate and provide the routes for the L3
network.

This would require [3]_, which I believe was in quite good shape before it
was abandoned due to the activity policy. (An alternative would be to change
os-net-config to have an option to only change and apply routing
configuration, something like running `ifdown-routes`_ / `ifup-routes`_;
however, [3]_ is likely the better solution.)


------

**Problem #2: Static IP assignment: Choosing static IPs from the correct
subnet**

Some roles, such as Compute, can likely be placed in any subnet, but we will
need to keep certain roles co-located within the same set of L2 domains. For
instance, whatever role is providing Neutron services will need all
controllers in the same L2 domain for VRRP to work properly.

The network interfaces will be configured using templates that create
configuration files for os-net-config. The IP addresses that are written to
each node's configuration will need to be on the correct subnet for each
host. In order for Heat to assign ports from the correct subnets, we will
need to have a host-to-subnets mapping.

Possible Solutions, Ideas or Approaches:

.. NOTE:: We currently use #2, by specifying parameters for each role.

1. The simplest implementation of this would probably be a mapping of
   role/index to a set of subnets, so that it is known to Heat that
   Controller-1 is in subnet set X and Compute-3 is in subnet set Y. The
   node would then have the IP and subnet info for each network chosen from
   the appropriate set of subnets. For other nodes, we would need to
   programmatically determine which subnets are correct for a given node.
2. We could associate particular subnets with roles, and then use one role
   per L2 domain (such as per-rack). This might be achieved with a map of
   roles to subnets, or by specifying parameters for each role such as:
   supernet, subnet (ID and/or ip/netmask), and subnet router.
3. The initial implementation might follow the model for isolated networking
   demonstrated by environments/ips-from-pool-all.yaml. Developing the
   ips-from-pool model first will allow testing various components with
   spine-and-leaf while the templates that use dynamic assignment of IPs
   within specified subnets are developed.
4. The roles and templates should be refactored to allow for dynamic IP
   assignment within subnets associated with the role. We may wish to
   evaluate the possibility of storing the routed subnets in Neutron using
   the routed networks extensions that are still under development. However,
   in this case, this is probably not required to implement separate subnets
   in each rack.
5. A scalable long-term solution is to map which subnet the host is on
   during introspection. If we can identify the correct subnet for each
   interface, then we can correlate that with IP addresses from the correct
   allocation pool. This would have the advantage of not requiring a static
   mapping of role to node to subnet. In order to do this, additional
   integration would be required between Ironic and Neutron (to make Ironic
   aware of multiple subnets per network, and to add the ability to make
   that association during introspection).

We will also need to take into account situations where there are
heterogeneous hardware nodes in the same layer 2 broadcast domain (such as
within a rack).

..
Note:: This can be done either using node groups in NetConfigDataLookup as - implemented in review [4]_ or by using additional custom roles. - ------- - -**Problem #3: Isolated Networking Requires Static Routes to Ensure Correct VLAN -is Used** - -In order to continue using the Isolated Networks model, routes will need to be -in place on each node, to steer traffic to the correct VLAN interfaces. The -routes are written when os-net-config first runs, but may change. We -can't just rely on the specific routes to other subnets, since the number of -subnets will increase or decrease as racks are added or taken away. - -Possible Solutions, Ideas or Approaches: - -1. Require that supernets are used for various network groups. For instance, - all the Internal API subnets would be part of a supernet, for instance - 172.17.0.0/16 could be used, and broken up into many smaller subnets, such - as /24. This would simplify the routes, since only a single route for - 172.17.0.0/16 would be required pointing to the local router on the - 172.17.x.0/24 network. - - Example: - Suppose 2 subnets are provided for the Internal API network: 172.19.1.0/24 - and 172.19.2.0/24. We want all Internal API traffic to traverse the Internal - API VLANs on both the controller and a remote compute node. The Internal API - network uses different VLANs for the two nodes, so we need the routes on the - hosts to point toward the Internal API gateway instead of the default - gateway. This can be provided by a supernet route to 172.19.x.x pointing to - the local gateway on each subnet (e.g. 172.19.1.1 and 172.19.2.1 on the - respective subnets). This could be represented in an os-net-config with the - following:: - - - - type: interface - name: nic3 - addresses: - - - ip_netmask: {get_param: InternalApiXIpSubnet} - routes: - - - ip_netmask: {get_param: InternalApiSupernet} - next_hop: {get_param: InternalApiXDefaultRoute} - - Where InternalApiIpSubnet is the IP address on the local subnet, - InternalApiSupernet is '172.19.0.0/16', and InternalApiRouter is either - 172.19.1.1 or 172.19.2.1 depending on which local subnet the host belongs to. -2. Modify os-net-config so that routes can be updated without bouncing - interfaces, and then run os-net-config on all nodes when scaling occurs. - A review for this functionality is in progress [3]_. -3. Instead of passing parameters to THT about routes (or supernet routes), - implement Neutron RFE [5]_ and Heat RFE [6]_. In tripleo-heat-templates we - can then use get_attr to get the data we currently read from user provided - parameters such as the InternalApiSupernet and InternalApiXDefaultRoute in - the example above. (We might also consider replacing [6]_ with a change - extending the ``network/ports/port.j2`` in tripleo-heat-templates to output - this data.) - -os-net-config configures static routes for each interface. If we can keep the -routing simple (one route per functional network), then we would be able to -isolate traffic onto functional VLANs like we do today. - -It would be a change to the existing workflow to have os-net-config run on -updates as well as deployment, but if this were a non-impacting event (the -interfaces didn't have to be bounced), that would probably be OK. (An -alternative is to add an option to have an option in os-net-config that only -adds new routes. Something like, os-net-config --no-activate + -ifdown-routes/ifup-routes.) 
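For illustration, the supernet-related values referenced in the
os-net-config example above would presumably be supplied per leaf through an
environment file roughly like the following; the parameter names mirror the
example and are placeholders rather than a final interface:

.. code-block:: yaml

   # Hypothetical per-leaf parameters for the supernet route approach;
   # "1"/"2" stand in for the subnet index written as "X" in the example.
   parameter_defaults:
     InternalApiSupernet: 172.19.0.0/16
     InternalApi1DefaultRoute: 172.19.1.1
     InternalApi2DefaultRoute: 172.19.2.1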
- -At a later time, the possibility of using dynamic routing should be considered, -since it reduces the possibility of user error and is better suited to -centralized management. The overcloud nodes might participate in internal -routing protocols. SDN solutions are another way to provide this, or other -approaches may be considered, such as setting up OVS tunnels. - ------- - -**Problem #4: Isolated Networking in TripleO Heat Templates Needs to be -Refactored** - -The current isolated networking templates use parameters in nested stacks to -define the IP information for each network. There is no room in the current -schema to define multiple subnets per network, and no way to configure the -routers for each network. These values are provided by single parameters. - -Possible Solutions, Ideas or Approaches: - -1. We would need to refactor these resources to provide different routers - for each network. -2. We extend the custom and isolated networks in TripleO to add support for - Neutron routed-networks (segments) and multiple subnets. Each subnet will be - mapped to a different L2 segment. We should make the extension backward - compatible and only enable Neutron routed-networks (I.e associate subnets - with segments.) when the templates used define multiple subnets on a - network. To enable this we need some changes to land in Neutron and Heat, - these are the in-progress reviews: - - * Allow setting network-segment on subnet update [7]_ - * Allow updating the segment property of OS::Neutron::Subnet [8]_ - * Add first_segment convenience attr to OS::Neutron::Net [9]_ - - - -Proposed Change -=============== -The proposed changes are discussed below. - -Overview --------- - -In order to provide spine-and-leaf networking for deployments, several changes -will have to be made to TripleO: - -1. Support for DHCP relay in Neutron DHCP servers (in progress), and Ironic - DHCP servers (this is addressed in separate blueprints in the same series). -2. Refactor assignment of Control Plane IPs to support routed networks (that - is addressed by a separate blueprint: tripleo-predictable-ctlplane-ips [2]_. -3. Refactoring of TripleO Heat Templates network isolation to support multiple - subnets per isolated network, as well as per-subnet and supernet routes. -4. Changes to Infra CI to support testing. -5. Documentation updates. - -Alternatives ------------- - -The approach outlined here is very prescriptive, in that the networks must be -known ahead of time, and the IP addresses must be selected from the appropriate -pool. This is due to the reliance on static IP addresses provided by Heat. -Heat will have to model the subnets and associate them with roles (node -groups). - -One alternative approach is to use DHCP servers to assign IP addresses on all -hosts on all interfaces. This would simplify configuration within the Heat -templates and environment files. Unfortunately, this was the original approach -of TripleO, and it was deemed insufficient by end-users, who wanted stability -of IP addresses, and didn't want to have an external dependency on DHCP. - -Another approach is to use the DHCP server functionality in the network switch -infrastructure in order to PXE boot systems, then assign static IP addresses -after the PXE boot is done via DHCP. This approach only solves for part of the -requirement: the net booting. It does not solve the desire to have static IP -addresses on each network. This could be achieved by having static IP addresses -in some sort of per-node map. 
However, this approach is not as scalable as -programatically determining the IPs, since it only applies to a fixed number of -hosts. We want to retain the ability of using Neutron as an IP address -management (IPAM) back-end, ideally. - -Another approach which was considered was simply trunking all networks back -to the Undercloud, so that dnsmasq could respond to DHCP requests directly, -rather than requiring a DHCP relay. Unfortunately, this has already been -identified as being unacceptable by some large operators, who have network -architectures that make heavy use of L2 segregation via routers. This also -won't work well in situations where there is geographical separation between -the VLANs, such as in split-site deployments. - -Security Impact ---------------- - -One of the major differences between spine-and-leaf and standard isolated -networking is that the various subnets are connected by routers, rather than -being completely isolated. This means that without proper ACLs on the routers, -networks which should be private may be opened up to outside traffic. - -This should be addressed in the documentation, and it should be stressed that -ACLs should be in place to prevent unwanted network traffic. For instance, the -Internal API network is sensitive in that the database and message queue -services run on that network. It is supposed to be isolated from outside -connections. This can be achieved fairly easily if supernets are used, so that -if all Internal API subnets are a part of the 172.19.0.0/16 supernet, a simple -ACL rule will allow only traffic between Internal API IPs (this is a simplified -example that would be generally applicable to all Internal API router VLAN -interfaces or for a global ACL):: - - allow traffic from 172.19.0.0/16 to 172.19.0.0/16 - deny traffic from * to 172.19.0.0/16 - -The isolated networks design separates control plane traffic from data plane -traffic, and separates administrative traffic from tenant traffic. In order -to preserve this separatation of traffic, we will use static routes pointing -to supernets. This ensures all traffic to any subnet within a network will exit -via the interface attached to the local subnet in that network. It will be -important for the end user to implement ACLs in a routed network to prevent -remote access to networks that would be completely isolated in a shared L2 -deployment. - -Other End User Impact ---------------------- - -Deploying with spine-and-leaf will require additional parameters to -provide the routing information and multiple subnets required. This will have -to be documented. Furthermore, the validation scripts may need to be updated -to ensure that the configuration is validated, and that there is proper -connectivity between overcloud hosts. - -Performance Impact ------------------- - -Much of the traffic that is today made over layer 2 will be traversing layer -3 routing borders in this design. That adds some minimal latency and overhead, -although in practice the difference may not be noticeable. One important -consideration is that the routers must not be too overcommitted on their -uplinks, and the routers must be monitored to ensure that they are not acting -as a bottleneck, especially if complex access control lists are used. - -Other Deployer Impact ---------------------- - -A spine-and-leaf deployment will be more difficult to troubleshoot than a -deployment that simply uses a set of VLANs. 
The deployer may need to have -more network expertise, or a dedicated network engineer may be needed to -troubleshoot in some cases. - -Developer Impact ----------------- - -Spine-and-leaf is not easily tested in virt environments. This should be -possible, but due to the complexity of setting up libvirt bridges and -routes, we may want to provide a pre-configured quickstart environment -for testing. This may involve building multiple libvirt bridges -and routing between them on the Undercloud, or it may involve using a -DHCP relay on the virt-host as well as routing on the virt-host to simulate -a full routing switch. A plan for development and testing will need to be -developed, since not every developer can be expected to have a routed -environment to work in. It may take some time to develop a routed virtual -environment, so initial work will be done on bare metal. - -A separate blueprint will cover adding routed network support to -tripleo-quickstart. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - - Dan Sneddon - -Other assignees: - - Bob Fournier - - Harald Jensas - - Steven Hardy - - Dan Prince - -Approver(s) ------------ - -Primary approver: - Alex Schultz - -Work Items ----------- - -1. Implement support for DHCP on routed networks using DHCP relay, as - described in Problem #1 above. -2. Add parameters to Isolated Networking model in Heat to support supernet - routes for individual subnets, as described in Problem #3. -3. Modify Isolated Networking model in Heat to support multiple subnets, as - described in Problem #4. -4. Implement support for iptables on the Controller, in order to mitigate - the APIs potentially being reachable via remote routes, as described in - the Security Impact section. Alternatively, document the mitigation - procedure using ACLs on the routers. -5. Document the testing procedures. -6. Modify the documentation in tripleo-docs to cover the spine-and-leaf case. -7. Modify the Ironic-Inspector service to record the host-to-subnet mappings, - perhaps during introspection, to address Problem #2 (long-term). - - -Implementation Details ----------------------- - -Workflow: - -1. Operator configures DHCP networks and IP address ranges -2. Operator imports baremetal instackenv.json -3. When introspection or deployment is run, the DHCP server receives the DHCP - request from the baremetal host via DHCP relay -4. If the node has not been introspected, reply with an IP address from the - introspection pool* and the inspector PXE boot image -5. If the node already has been introspected, then the server assumes this is - a deployment attempt, and replies with the Neutron port IP address and the - overcloud-full deployment image -6. The Heat templates are processed which generate os-net-config templates, and - os-net-config is run to assign static IPs from the correct subnets, as well - as routes to other subnets via the router gateway addresses. - -When using spine-and-leaf, the DHCP server will need to provide an -introspection IP address on the appropriate subnet, depending on the -information contained in the DHCP relay packet that is forwarded by the segment -router. dnsmasq will automatically match the gateway address (GIADDR) of the -router that forwarded the request to the subnet where the DHCP request was -received, and will respond with an IP and gateway appropriate for that subnet. - -The above workflow for the DHCP server should allow for provisioning IPs on -multiple subnets. 
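To give a feel for the direction, a network description that carries
multiple subnets per isolated network might eventually look something like
the sketch below; the key names are illustrative only, and the final schema
depends on the template refactoring described in Problem #4:

.. code-block:: yaml

   # Hypothetical sketch of an isolated network with an additional per-rack
   # subnet; field names are placeholders, not the final schema.
   - name: InternalApi
     name_lower: internal_api
     vip: true
     ip_subnet: '172.19.1.0/24'
     gateway_ip: '172.19.1.1'
     subnets:
       internal_api_leaf1:
         ip_subnet: '172.19.2.0/24'
         gateway_ip: '172.19.2.1'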
- -Dependencies -============ - -There may be a dependency on the Neutron Routed Networks. This won't be clear -until a full evaluation is done on whether we can represent spine-and-leaf -using only multiple subnets per network. - -There will be a dependency on routing switches that perform DHCP relay service -for production spine-and-leaf deployments. - -Testing -======= - -In order to properly test this framework, we will need to establish at least -one CI test that deploys spine-and-leaf. As discussed in this spec, it isn't -necessary to have a full routed bare metal environment in order to test this -functionality, although there is some work to get it working in virtual -environments such as OVB. - -For bare metal testing, it is sufficient to trunk all VLANs back to the -Undercloud, then run DHCP proxy on the Undercloud to receive all the -requests and forward them to br-ctlplane, where dnsmasq listens. This -will provide a substitute for routers running DHCP relay. For Neutron -DHCP, some modifications to the iptables rule may be required to ensure -that all DHCP requests from the overcloud nodes are received by the -DHCP proxy and/or the Neutron dnsmasq process running in the dhcp-agent -namespace. - -Documentation Impact -==================== - -The procedure for setting up a dev environment will need to be documented, -and a work item mentions this requirement. - -The TripleO docs will need to be updated to include detailed instructions -for deploying in a spine-and-leaf environment, including the environment -setup. Covering specific vendor implementations of switch configurations -is outside this scope, but a specific overview of required configuration -options should be included, such as enabling DHCP relay (or "helper-address" -as it is also known) and setting the Undercloud as a server to receive -DHCP requests. - -The updates to TripleO docs will also have to include a detailed discussion -of choices to be made about IP addressing before a deployment. If supernets -are to be used for network isolation, then a good plan for IP addressing will -be required to ensure scalability in the future. - -References -========== - -.. [0] `Blueprint: TripleO Routed Networks for Deployments `_ -.. [2] `Spec: User-specifiable Control Plane IP on TripleO Routed Isolated Networks `_ -.. [3] `Review: Modify os-net-config to make changes without bouncing interface `_ -.. [4] `Review: Add support for node groups in NetConfigDataLookup `_ -.. [5] `[RFE] Create host-routes for routed networks (segments) `_ -.. [6] `[RFE] Extend attributes of Server and Port resource to client interface configuration data `_ -.. [7] `Allow setting network-segment on subnet update `_ -.. [8] `Allow updating the segment property of OS::Neutron::Subnet `_ -.. [9] `Add first_segment convenience attr to OS::Neutron::Net `_ diff --git a/specs/stein/upgrades-with-operating-system.rst b/specs/stein/upgrades-with-operating-system.rst deleted file mode 100644 index e7d17004..00000000 --- a/specs/stein/upgrades-with-operating-system.rst +++ /dev/null @@ -1,747 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================================= -Major Upgrades Including Operating System Upgrade -================================================= - -https://blueprints.launchpad.net/tripleo/+spec/upgrades-with-os - -.. note:: - Abbreviation "OS" in this spec stands for "operating system", not - "OpenStack". 
- -So far all our update and upgrade workflows included doing minor -operating system updates (essentially a ``yum update``) on the -machines managed by TripleO. This will need to change as we can't stay -on a single OS release indefinitely -- we'll need to perform a major -OS upgrade. The intention is for the TripleO tooling to help with the -OS upgrade significantly, rather than leaving this task entirely to -the operator. - - -Problem Description -=================== - -We need to upgrade undercloud and overcloud machines to a new release -of the operating system. - -We would like to provide an upgrade procedure both for environments -where Nova and Ironic are managing the overcloud servers, and -"Deployed Server" environments where we don't have control over -provisioning. - -Further constraints are imposed by Pacemaker clusters: Pacemaker is -non-containerized, so it is upgraded via packages together with the -OS. While Pacemaker would be capable of a rolling upgrade, Corosync -also changes major version, and starts to rely on knet for the link -protocol layer, which is incompatible with previous version of -Corosync. This introduces additional complexity: we can't do OS -upgrade in a rolling fashion naively on machines which belong to the -Pacemaker cluster (controllers). - - -Proposed Change - High Level View -================================= - -The Pacemaker constraints will be addressed by performing a one-by-one -(though not rolling) controller upgrade -- temporarily switching to a -single-controller cluster on the new OS, and gradually upgrading the -rest. This will also require implementation of persistent OpenStack -data transfer from older to newer OS releases (to preserve uptime and -for easier recoverability in case of failure). - -We will also need to ensure that at least 2 ceph-mon services run at -all times, so ceph-mon services will keep running even after we switch -off Pacemaker and OpenStack on the 2 older controllers. - -We should scope two upgrade approaches: full reprovisioning, and -in-place upgrade via an upgrade tool. Each come with different -benefits and drawbacks. The proposed CLI workflows should ideally be -generic enough to allow picking the final preferred approach of -overcloud upgrade late in the release cycle. - -While the overcloud approach is still wide open, undercloud seems to -favor an in-place upgrade due to not having a natural place to persist -the data during reprovisioning (e.g. we can't assume overcloud -contains Swift services), but that could be overcome by making the -procedure somewhat more manual and shifting some tasks onto the -operator. - -The most viable way of achieving an in-place (no reprovisioning) -operating system upgrade currently seems to be `Leapp`_, "an app -modernization framework", which should include in-place upgrade -capabilites. - -Points in favor of in-place upgrade: - -* While some data will need to be persisted and restored regardless of - approach taken (to allow safe one-by-one upgrade), reprovisioning - may also require managing data which would otherwise persist on its - own during an in-place upgrade. - -* In-place upgrade allows using the same approach for Nova+Ironic and - Deployed Server environments. If we go with reprovisioning, on - Deployed Server environments the operator will have to reprovision - using their own tooling. - -* Environments with a single controller will need different DB - mangling procedure. 
Instead of ``system_upgrade_transfer_data`` step - below, their DB data will be included into the persist/restore - operations when reprovisioning the controller. - -Points in favor of reprovisioning: - -* Not having to integrate with external in-place upgrade tool. E.g. in - case of CentOS, there's currently not much info available about - in-place upgrade capabilities. - -* Allows to make changes which wouldn't otherwise be possible, - e.g. changing a filesystem. - -* Reprovisioning brings nodes to a clean state. Machines which are - continuously upgraded without reprovisioining can potentially - accumulate unwanted artifacts, resulting in increased number of - problems/bugs which only appear after an upgrade, but not on fresh - deployments. - - -Proposed Change - Operator Workflow View -======================================== - -The following is an example of expected upgrade workflow in a -deployment with roles: **ControllerOpenstack, Database, Messaging, -Networker, Compute, CephStorage**. It's formulated in a -documentation-like manner so that we can best imagine how this is -going to work from operator's point of view. - - -Upgrading the Undercloud ------------------------- - -The in-place undercloud upgrade using Leapp will likely consist of the -following steps. First, prepare for OS upgrade via Leapp, downloading -the necessary packages:: - - leapp upgrade - -Then reboot, which will upgrade the OS:: - - reboot - -Then run the undercloud upgrade, which will bring back the undercloud -services (using the newer OpenStack release):: - - openstack tripleo container image prepare default \ - --output-env-file containers-prepare-parameter.yaml - openstack undercloud upgrade - -If we wanted or needed to upgrade the undercloud via reprovisioning, -we would use a `backup and restore`_ procedure as currently -documented, with restore perhaps being utilized just partially. - - -Upgrading the Overcloud ------------------------ - -#. **Update the Heat stack**, generate Heat outputs for building - upgrade playbooks:: - - openstack overcloud upgrade prepare - - Notes: - - * Among the ```` should be - ``containers-prepare-parameter.yaml`` bringing in the containers - of newer OpenStack release. - -#. **Prepare an OS upgrade on one machine from each of the - "schema-/cluster-sensitive" roles**:: - - openstack overcloud upgrade run \ - --tags system_upgrade_prepare \ - --limit controller-openstack-0,database-0,messaging-0 - - Notes: - - * This stops all services on the nodes selected. - - * For external installers like Ceph, we'll have a similar - external-upgrade command, which can e.g. remove the nodes from - the Ceph cluster:: - - openstack overcloud external-upgrade run \ - --tags system_upgrade_prepare \ - -e system_upgrade_nodes=controller-openstack-0,database-0,messaging-0 - - * If we use in-place upgrade: - - * This will run the ``leapp upgrade`` command. It should use - newer OS and newer OpenStack repos to download packages, and - leave the node ready to reboot into the upgrade process. - - * Caution: Any reboot after this is done on a particular node - will cause that node to automatically upgrade to newer OS. - - * If we reprovision: - - * This should persist node's important data to the - undercloud. (Only node-specific data. It would not include - e.g. MariaDB database content, which would later be transferred - from one of the other controllers instead.) 
- - * Services can export their ``upgrade_tasks`` to do the - persistence, we should provide an Ansible module or role to - make it DRY. - -#. **Upload new overcloud base image**:: - - openstack overcloud image upload --update-existing \ - --image-path /home/stack/new-images - - Notes: - - * For Nova+Ironic environments only. After this step any new or - reprovisioned nodes will receive the new OS. - -#. **Run an OS upgrade on one node from each of the - "schema-/cluster-sensitive" roles** or **reprovision those nodes**. - - Only if we do reprovisioning:: - - openstack server rebuild controller-openstack-0 - openstack server rebuild database-0 - openstack server rebuild messaging-0 - - openstack overcloud admin authorize \ - --overcloud-ssh-user \ - --overcloud-ssh-key \ - --overcloud-ssh-network \ - --limit controller-openstack-0,database-0,messaging-0 - - Both reprovisioning and in-place:: - - openstack overcloud upgrade run \ - --tags system_upgrade_run \ - --limit controller-openstack-0,database-0,messaging-0 - - Notes: - - * This step either performs a reboot of the nodes and lets Leapp - upgrade them to newer OS, or reimages the nodes with a fresh new - OS image. After they come up, they'll have newer OS but no - services running. The nodes can be checked before continuing. - - * In case of reprovisioning: - - * The ``overcloud admin authorize`` will ensure existence of - ``tripleo-admin`` user and authorize Mistral's ssh keys for - connection to the newly provisioned nodes. The - ``--overcloud-ssh-*`` work the same as for ``overcloud - deploy``. - - * The ``--tags system_upgrade_run`` is still necessary because it - will restore the node-specific data from the undercloud. - - * Services can export their ``upgrade_tasks`` to do the - restoration, we should provide an Ansible module or role to - make it DRY. - - * Ceph-mon count is reduced by 1 (from 3 to 2 in most - environments). - - * Caution: This will have bad consequences if run by accident on - unintended nodes, e.g. on all nodes in a single role. If - possible, it should refuse to run if --limit is not specified. If - possible further, it should refuse to run if a full role is - included, rather than individual nodes. - -#. **Stop services on older OS and transfer data to newer OS**:: - - openstack overcloud external-upgrade run \ - --tags system_upgrade_transfer_data \ - --limit ControllerOpenstack,Database,Messaging - - Notes: - - * **This is where control plane downtime starts.** - - * Here we should: - - * Detect which nodes are on older OS and which are on newer OS. - - * Fail if we don't find *at least one* older OS and *exactly - one* newer OS node in each role. - - * On older OS nodes, stop all services except ceph-mon. (On newer - node, no services are running yet.) - - * Transfer data from *an* older OS node (simply the first one in - the list we detect, or do we need to be more specific?) to - *the* newer OS node in a role. This is probably only going to - do anything on the Database role which includes DBs, and will - be a no-op for others. - - * Services can export their ``external_upgrade_tasks`` for the - persist/restore operations, we'll provide an Ansible module or - role to make it DRY. The transfer will likely go via undercloud - initially, but it would be nice to make it direct in order to - speed it up. - -#. 
**Run the usual upgrade tasks on the newer OS nodes**:: - - openstack overcloud upgrade run \ - --limit controller-openstack-0,database-0,messaging-0 - - Notes: - - * **Control plane downtime stops at the end of this step.** This - means the control plane downtime spans two commands. We should - *not* make it one command because the commands use different - parts of upgrade framework underneath, and the separation will - mean easier re-running of individual parts, should they fail. - - * Here we start pcmk cluster and all services on the newer OS - nodes, using the data previously transferred from the older OS - nodes. - - * Likely we won't need any special per-service upgrade tasks, - unless we discover we need some data conversions or - adjustments. The node will be with all services stopped after - upgrade to newer OS, so likely we'll be effectively "setting up a - fresh cloud on pre-existing data". - - * Caution: At this point the newer OS nodes became the authority on - data state. Do not re-run the previous data transfer step after - services have started on newer OS nodes. - - * (Currently ``upgrade run`` has ``--nodes`` and ``--roles`` which - both function the same, as Ansible ``--limit``. Notably, nothing - stops you from passing role names to ``--nodes`` and vice - versa. Maybe it's time to retire those two and implement - ``--limit`` to match the concept from Ansible closely.) - -#. **Perform any service-specific && node-specific external upgrades, - most importantly Ceph**:: - - openstack overcloud external-upgrade run \ - --tags system_upgrade_run \ - -e system_upgrade_nodes=controller-openstack-0,database-0,messaging-0 - - Notes: - - * Ceph-ansible here runs on a single node and spawns a new version - of ceph-mon. Per-node run capability will need to be added to - ceph-ansible. - - * Ceph-mon count is restored here (in most environments, it means - going from 2 to 3). - -#. **Upgrade the remaining control plane nodes**. Perform all the - previous control plane upgrade steps for the remaining controllers - too. Two important notes here: - - * **Do not run the ``system_upgrade_transfer_data`` step anymore.** - The remaining controllers are expected to join the cluster and - sync the database data from the primary controller via DB - replication mechanism, no explicit data transfer should be - necessary. - - * To have the necessary number of ceph-mons running at any given - time (often that means 2 out of 3), the controllers (ceph-mon - nodes) should be upgraded one-by-one. - - After this step is finished, all of the nodes which are sensitive - to Pacemaker version or DB schema version should be upgraded to - newer OS, newer OpenStack, and newer ceph-mons. - -#. **Upgrade the rest of the overcloud nodes** (Compute, Networker, - CephStorage), **either one-by-one or in batches**, depending on - uptime requirements of particular nodes. E.g. 
for computes this - would mean evacuating and then also running:: - - openstack overcloud upgrade run \ - --tags system_upgrade_prepare \ - --limit novacompute-0 - - openstack overcloud upgrade run \ - --tags system_upgrade_run \ - --limit novacompute-0 - - openstack overcloud upgrade run \ - --limit novacompute-0 - - - Notes: - - * Ceph OSDs can be removed by the ``external-upgrade run --tags - system_upgrade_prepare`` step before reprovisioning, and after - ``upgrade run`` command, ceph-ansible can recreate the OSD via - the ``external-upgrade run --tags system_upgrade_run`` step, - always limited to the OSD being upgraded:: - - # Remove OSD - openstack overcloud external-upgrade run \ - --tags system_upgrade_prepare \ - -e system_upgrade_nodes=novacompute-0 - - # <> - - # Re-deploy OSD - openstack overcloud external-upgrade run \ - --tags system_upgrade_run \ - -e system_upgrade_nodes=novacompute-0 - -#. **Perform online upgrade** (online data migrations) after all nodes - have been upgraded:: - - openstack overcloud external-upgrade run \ - --tags online_upgrade - -#. **Perfrom upgrade converge** to re-assert the overcloud state:: - - openstack overcloud upgrade converge - -#. **Clean up upgrade data persisted on undercloud**:: - - openstack overcloud external-upgrade run \ - --tags system_upgrade_cleanup - - -Additional notes on data persist/restore ----------------------------------------- - -* There are two different use cases: - - * Persistence for things that need to survive reprovisioning (for - each node) - - * Transfer of DB data from node to node (just once to bootstrap the - first new OS node in a role) - -* The `synchronize Ansible module`_ shipped with Ansible seems - fitting, we could wrap it in a role to handle common logic, and - execute the role via ``include_role`` from - ``upgrade_tasks``. - -* We would persist the temporary data on the undercloud under a - directory accessible only by the user which runs the upgrade - playbooks (``mistral`` user). The root dir could be - ``/var/lib/tripleo-upgrade`` and underneath would be subdirs for - individual nodes, and one more subdir level for services. - - * (Undercloud's Swift also comes to mind as a potential place for - storage. However, it would probably add more complexity than - benefit.) - -* **The data persist/restore operations within the upgrade do not - supplement or replace backup/restore procedures which should be - performed by the operator, especially before upgrading.** The - automated data persistence is solely for upgrade purposes, not for - disaster recovery. - - -Alternatives ------------- - -* **Parallel cloud migration.** We could declare the in-place upgrade - of operating system + OpenStack as too risky and complex and time - consuming, and recommend standing up a new cloud and transferring - content to it. However, this brings its own set of challenges. - - This option is already available for anyone whose environment is - constrained such that normal upgrade procedure is not realistic, - e.g. in case of extreme uptime requirements or extreme risk-aversion - environments. - - Implementing parallel cloud migration is probably best handled on a - per-environment basis, and TripleO doesn't provide any automation in - this area. - -* **Upgrading the operating system separately from OpenStack.** This - would simplify things on several fronts, but separating the - operating system upgrade while preserving uptime (i.e. 
upgrading the - OS in a rolling fashion node-by-node) currently seems not realistic - due to: - - * The pacemaker cluster (corosync) limitations mentioned earlier. We - would have to containerize Pacemaker (even if just ad-hoc - non-productized image). - - * Either we'd have to make OpenStack (and dependencies) compatible - with OS releases in a way we currently do not intend, or at least - ensure such compatibility when running containerized. E.g. for - data transfer, we could then probably use Galera native - replication. - - * OS release differences might be too large. E.g. in case of - differing container runtimes, we might have to make t-h-t be able - to deploy on two runtimes within one deployment. - -* **Upgrading all control plane nodes at the same time as we've been - doing so far.** This is not entirely impossible, but rebooting all - controllers at the same time to do the upgrade could mean total - ceph-mon unavailability. Also given that the upgraded nodes are - unreachable via ssh for some time, should something go wrong and the - nodes got stuck in that state, it could be difficult to recover back - into a working cloud. - - This is probably not realistic, mainly due to concerns around Ceph - mon availability and risk of bricking the cloud. - - -Security Impact ---------------- - -* How we transfer data from older OS machines to newer OS machines is - a potential security concern. - -* The same security concern applies for per-node data persist/restore - procedure in case we go with reprovisioning. - -* The stored data may include overcloud node's secrets and should be - cleaned up from the undercloud when no longer needed. - -* In case of using the `synchronize Ansible module`_: it uses rsync - over ssh, and we would store any data on undercloud in a directory - only accessible by the same user which runs the upgrade playbooks - (``mistral``). This undercloud user has full control over overcloud - already, via ssh keys authorized for all management operations, so - this should not constitute a significant expansion of ``mistral`` - user's knowledge/capabilities. - - -Upgrade Impact --------------- - -* The upgrade procedure is riskier and more complex. - - * More things can potentially go wrong. - - * It will take more time to complete, both manually and - automatically. - -* Given that we upgrade one of the controllers while the other two are - still running, the control plane services downtime could be slightly - shorter than before. - -* When control plane services are stopped on older OS machines and - running on newer OS machine, we create a window without high - availability. - -* Upgrade framework might need some tweaks but on high level it seems - we'll be able to fit the workflow into it. - -* All the upgrade steps should be idempotent, rerunnable and - recoverable as much as we can make them so. - - -Other End User Impact ---------------------- - -* Floating IP availability could be affected. Neutron upgrade - procedure typically doesn't immediately restart sidecar containers - of L3 agent. Restarting will be a must if we upgrade the OS. - - -Performance Impact ------------------- - -* When control plane services are stopped on older OS machines and - running on newer OS machine, only one controller is available to - serve all control plane requests. - -* Depending on role/service composition of the overcloud, the reduced - throughput could also affect tenant traffic, not just control plane - APIs. 
- - -Other Deployer Impact ---------------------- - -* Automating such procedure introduces some code which had better not - be executed by accident. The external upgrade tasks which are tagged - ``system_upgrade_*`` should also be tagged ``never``, so that they - only run when explicitly requested. - -* For the data transfer step specifically, we may also introduce a - safety "flag file" on the target overcloud node, which would prevent - re-running of the data transfer until the file is manually removed. - - -Developer Impact ----------------- - -Developers who work on specific composable services in TripleO will -need to get familiar with the new upgrade workflow. - - -Main Risks ----------- - -* Leapp has been somewhat explored but its viability/readiness for our - purpose is still not 100% certain. - -* CI testing will be difficult, if we go with Leapp it might be - impossible (more below). - -* Time required to implement everything may not fit within the release - cycle. - -* We have some idea how to do the data persist/restore/transfer parts, - but some prototyping needs to be done there to gain confidence. - -* We don't know exactly what data needs to be persisted during - reprovisioning. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignees:: - | jistr, chem, jfrancoa - -Other contributors:: - | fultonj for Ceph - - -Work Items ----------- - -With aditional info in format: (how much do we know about this task, -estimate of implementation difficulty). - -* (semi-known, est. as medium) Change tripleo-heat-templates + - puppet-tripleo to be able to set up a cluster on just one controller - (with newer OS) while the Heat stack knows about all - controllers. This is currently not possible. - -* (semi-known, est. as medium) Amend upgrade_tasks to work for - Rocky->Stein with OS upgrade. - -* ``system_upgrade_transfer_data``: - - * (unknown, est. as easy) Detect upgraded vs. unupgraded machines to - transfer data to/from. - - * (known, est. as easy) Stop all services on the unupgraded machines - transfer data to/from. (Needs to be done via external upgrade - tasks which is new, but likely not much different from what we've - been doing.) - - * (semi-known, est. as medium/hard) Implement an Ansible role for - transferring data from one node to another via undercloud. - - * (unknown, est. as medium) Figure out which data needs transferring - from old controller to new, implement it using the above Ansible - role -- we expect only MariaDB to require this, any special - services should probably be tackled by service squads. - -* (semi-known, est. as medium/hard) Implement Ceph specifics, mainly - how to upgrade one node (mon, OSD, ...) at a time. - -* (unknown, either easy or hacky or impossible :) ) Implement - ``--limit`` for ``external-upgrade run``. (As external upgrade runs - on undercloud by default, we'll need to use ``delegate_to`` or - nested Ansible for overcloud nodes. I'm not sure how well --limit - will play with this.) - -* (known, est. as easy) Change update/upgrade CLI from ``--nodes`` - and ``--roles`` to ``--limit``. - -* (semi-known, est. as easy/medium) Add ``-e`` variable pass-through - support to ``external-upgrade run``. - -* (unknown, unknown) Test as much as we can in CI -- integrate with - tripleo-upgrade and OOOQ. - -* For reprovisioning: - - * (semi-known, est. as medium) Implement ``openstack overcloud admin - authorize``. Should take ``--stack``, ``--limit``, - ``--overcloud-ssh-*`` params. - - * (semi-known, est. 
as medium/hard) Implement an Ansible role for - temporarily persisting overcloud nodes' data on the undercloud and - restoring it. - - * (known, est. as easy) Implement ``external-upgrade run --tags - system_upgrade_cleanup``. - - * (unknown, est. as hard in total, but should probably be tackled by - service squads) Figure out which data needs persisting for - particular services and implement the persistence using the above - Ansible role. - -* For in-place: - - * (semi-known, est. as easy) Calls to Leapp in - ``system_upgrade_prepare``, ``system_upgrade_run``. - - * (semi-known, est. as medium) Implement a Leapp actor to set up or - use the repositories we need. - -Dependencies -============ - -* For in-place: Leapp tool being ready to upgrade the OS. - -* Changes to ceph-ansible might be necessary to make it possible to - run it on a single node (for upgrading mons and OSDs node-by-node). - - -Testing -======= - -Testing is one of the main estimated pain areas. This is a traditional -problem with upgrades, but it's even more pronounced for OS upgrades. - -* Since we do all the OpenStack infra cloud testing of TripleO on - CentOS 7 currently, it would make sense to test an upgrade to - CentOS 8. However, CentOS 8 is nonexistent at the time of writing. - -* It is unclear when Leapp will be ready for testing an upgrade from - CentOS 7, and it's probably the only thing we'd be able to execute - in CI. The ``openstack server rebuild`` alternative is probably not - easily executable in CI, at least not in OpenStack infra clouds. We - might be able to emulate reprovisioning by wiping data. - -* Even if we find a way to execute the upgrade in CI, it might still - take too long to make the testing plausible for validating patches. - - -Documentation Impact -==================== - -Upgrade docs will need to be amended, the above spec is written mainly -from the perspective of expected operator workflow, so it should be a -good starting point. - - -References -========== - -* `Leapp`_ - -* `Leapp actors`_ - -* `Leapp architecture`_ - -* `Stein PTG etherpad`_ - -* `backup and restore`_ - -* `synchronize Ansible module`_ - -.. _Leapp: https://leapp-to.github.io/ -.. _Leapp actors: https://leapp-to.github.io/actors -.. _Leapp architecture: https://leapp-to.github.io/architecture -.. _Stein PTG etherpad: https://etherpad.openstack.org/p/tripleo-ptg-stein -.. _backup and restore: http://tripleo.org/install/controlplane_backup_restore/00_index.html -.. _synchronize Ansible module: https://docs.ansible.com/ansible/latest/modules/synchronize_module.html diff --git a/specs/stein/validation-framework.rst b/specs/stein/validation-framework.rst deleted file mode 100644 index 8f8770ce..00000000 --- a/specs/stein/validation-framework.rst +++ /dev/null @@ -1,279 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================================================= -Provide a common Validation Framework inside python-tripleoclient -================================================================= - -https://blueprints.launchpad.net/tripleo/+spec/validation-framework - -Currently, we're lacking a common validation framework in tripleoclient. This -framework should provide an easy way to validate environment prior deploy and -prior update/upgrade, on both undercloud and overcloud. 
- -Problem Description -=================== - -Currently, we have two types of validations: - -* Those launched prior the undercloud deploy, embedded into the deploy itself - -* Those launched at will via a Mistral Workflow - -There isn't any unified way to call any validations by itself in an easy way, -and we lack the capacity to easily add new validations for the undercloud -preflight checks. - -The current situation is not optimal, as the operator must go in the UI in order -to run validations - there is a way to run them from the CLI, using the exact -same workflows as the UI. This can't be used in order to get proper preflight -validations, especially when we don't get a working Mistral (prior the -undercloud deploy, or with all-on-one/standalone). - -Moreover, there is a need to make the CLI and UI converge. The latter already -uses the full list of validations. Adding the full support of -tripleo-validations to the CLI will improve the overall quality, usability and -maintenance of the validations. - -Finally, a third type should be added: service validations called during the -deploy itself. This doesn't directly affect the tripleoclient codebase, but -tripleo-heat-templates. - -Proposed Change -=============== - -Overview --------- - -In order to improve the current situation, we propose to create a new -"branching" in the tripleoclient commands: `openstack tripleo validator` - -This new subcommand will allow to list and run validations in an independent -way. - -Doing so will allow to get a clear and clean view on the validations we can run -depending on the stage we're in. - -(Note: the subcommand has yet to be defined - this is only a "mock-up".) - -The following subcommands should be supported: - -* ``openstack tripleo validator list``: will display all the available - validations with a small description, like "validate network capabilities on - undercloud" - -* ``openstack tripleo validator run``: will run the validations. Should take - options, like: - - * ``--validation-name``: run only the passed validation. - * ``--undercloud``: runs all undercloud-related validations - * ``--overcloud``: runs all overcloud-related validations - * ``--use-mistral``: runs validations through Mistral - * ``--use-ansible``: runs validations directly via Ansible - * ``--plan``: allows to run validations against specific plan. Defaults to - $TRIPLEO_PLAN_NAME or "overcloud" - -* in addition, common options for all the subcommands: - - * ``--extra-roles``: path to a local directory containing validation - roles maintained by the operator, or swift directory containing extra - validation roles. - * ``--output``: points to a valid Ansible output_callback, such as the native - *json*, or custom *validation_output*. The default one should be the latter - as it renders a "human readable" output. More callbacks can be added later. - -The ``--extra-roles`` must support both local path and remote swift -container, since the custom validation support will push any validation to a -dedicated swift directory. - -The default engine will be determined by the presence of Mistral: if Mistral is -present and accepting requests (meaning the Undercloud is most probably -deployed), the validator has to use it by default. If no Mistral is present, it -must fallback on the ansible-playbook. - -The validations should be in the form of Ansible roles, in order to be -easily accessed from Mistral as well (as it is currently the case). 
It will -also allow to get a proper documentation, canvas and gives the possibility to -validate the role before running it (ensuring there are metadata, output, -and so on). - -We might also create some dedicated roles in order to make a kind of -"self validation", ensuring we actually can run the validations (network, -resources, and so on). - -The UI uses Mistral workflows in order to run the validations - the CLI must -be able to use those same workflows of course, but also run at least some -validations directly via ansible, especially when we want to validate the -undercloud environment before we even deploy it. - -Also, in order to avoid Mistral modification, playbooks including validation -roles will be created. - -In the end, all the default validation roles should be in one and only one -location: tripleo-validations. The support for "custom validations" being added, -such custom validation should also be supported (see references for details). - -In order to get a proper way to "aim" the validations, proper validation groups -must be created and documented. Of course, one validation can be part of -multiple groups. - -In addition, a proper documentation with examples describing the Good Practices -regarding the roles content, format and outputs should be created. - -For instance, a role should contain a description, a "human readable error -output", and if applicable a possible solution. - -Proper testing for the default validations (i.e. those in tripleo-validations) -might be added as well in order to ensure a new validation follows the Good -Practices. - -We might want to add support for "nagios-compatible outputs" and exit codes, -but it is not sure running those validations through any monitoring tool is a -good idea due to the possible load it might create. This has to be discussed -later, once we get the framework in place. - -Alternatives ------------- - -No real alternatives in fact. Currently, we have many ways to validate, but -they are all unrelated, not concerted. If we don't provide a unified framework, -we will get more and more "side validations ways" and it won't be maintainable. - -Security Impact ---------------- - -Rights might be needed for some validations - they should be added accordingly -in the system sudoers, in a way that limits unwanted privilege escalations. - - -Other End User Impact ---------------------- - -The end user will get a proper way to validate the environment prior to any -action. -This will give more confidence in the final product, and ease the update and -upgrade processes. - -It will also provide a good way to collect information about the systems in -case of failures. - -If a "nagios-compatible output" is to be created (mix of ansible JSON output, -parsing and compatibility stuff), it might provide a way to get a daily report -about the health of the stack - this might be a nice feature, but not in the -current scope (will need a new stdout_callback for instance). - -Performance Impact ------------------- - -The more validations we get, the more time it might take IF we decide to run -them by default prior any action. - -The current way to disable them, either with a configuration file or a CLI -option will stay. - -In addition, we can make a great use of "groups" in order to filter out greedy -validations. - - -Other Deployer Impact ---------------------- - -Providing a CLI subcommand for validation will make the deployment easier. 
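
For instance, whichever engine runs them, the subcommand would end up
executing small playbooks that wrap tripleo-validations roles together with
their group metadata. A rough sketch follows; the role name, group names and
metadata layout are illustrative assumptions:

.. code-block:: yaml

   # Hypothetical validation playbook shipped in tripleo-validations.
   - hosts: undercloud
     vars:
       metadata:
         name: Undercloud disk space
         description: Verify the undercloud has enough free disk space.
         groups:
           - prep
           - pre-upgrade
     roles:
       - undercloud-disk-space
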
- -Providing a unified framework will allow an operator to run the validations -either from the UI, or from the CLI, without any surprise regarding the -validation list. - -Developer Impact ----------------- - -A refactoring will be needed in python-tripleoclient and probably in -tripleo-common in order to get a proper subcommand and options. - -A correct way to call Ansible from Python is to be decided (ansible-runner?). - -A correct way to call Mistral workflows from the CLI is to be created if it -does not already exist. - -In the end, the framework will allow other Openstack projects to push their own -validations, since they are the ones knowing how and what to validate in the -different services making Openstack. - -All validations will be centralized in the tripleo-validations repository. -This means we might want to create a proper tree in order to avoid having -100+ validations in the same directory. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - cjeanner - -Other contributors: - akrivoka - ccamacho - dpeacock - florianf - - -Work Items ----------- - -* List current existing validations in both undercloud_preflight.py and - openstack-tripleo-validations. - -* Decide if we integrate ansible-runner as a dependency (needs to be packaged). - -* Implement the undercloud_preflight validations as Ansible roles. - -* Implement a proper way to call Ansible from the tripleoclient code. - -* Implement support for a configuration file dedicated for the validations. - -* Implement the new subcommand tree in tripleoclient. - -* Validate, Validate, Validate. - - -Dependencies -============ - -* Ansible-runner: https://github.com/ansible/ansible-runner - -* Openstack-tripleo-validations: https://github.com/openstack/tripleo-validations - - - -Testing -======= - -The CI can't possibly provide the "right" environment with all the requirements. -The code has to implement a way to configure the validations so that the CI -can override the *productive* values we will set in the validations. - - -Documentation Impact -==================== - -A new entry in the documentation must be created in order to describe this new -framework (for the devs) and new subcommand (for the operators). - -References -========== - -* http://lists.openstack.org/pipermail/openstack-dev/2018-July/132263.html - -* https://bugzilla.redhat.com/show_bug.cgi?id=1599829 - -* https://bugzilla.redhat.com/show_bug.cgi?id=1601739 - -* https://review.openstack.org/569513 (custom validation support) - -* https://docs.openstack.org/tripleo-docs/latest/install/validations/validations.html diff --git a/specs/stein/zero-footprint-installer.rst b/specs/stein/zero-footprint-installer.rst deleted file mode 100644 index 423d5620..00000000 --- a/specs/stein/zero-footprint-installer.rst +++ /dev/null @@ -1,127 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================ -TripleO Zero Footprint Installer -================================ - -https://blueprints.launchpad.net/tripleo/+spec/zero-footprint - -This spec introduces support for an installer mode which has zero -(or at least much less) dependencies than we do today. It is meant -to be an iteration of the Undercloud and All-In-One (standalone) -installers that allows you to end up with the same result without -having to install all of the TripleO dependencies on your host machine. 
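
As a rough sketch of the intended workflow (the container image name, mounts
and client arguments below are assumptions, not a defined interface),
playbook generation and optional local execution could look like this:

.. code-block:: yaml

   - hosts: localhost
     gather_facts: false
     tasks:
       # Render the deployment playbooks inside a throwaway client container;
       # nothing TripleO-specific needs to be installed on the host.
       - name: Generate playbooks with a containerized tripleoclient
         command: >
           podman run --rm
           -v /home/stack/templates:/templates:z
           -v /home/stack/output:/output:z
           tripleoclient:latest
           openstack tripleo deploy --standalone
           --templates /templates --output-dir /output

       # Optionally run the generated playbooks with plain Ansible on the host.
       - name: Apply the generated playbooks locally
         command: >
           ansible-playbook -i /home/stack/output/inventory.yaml
           /home/stack/output/deploy_steps_playbook.yaml
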
- -Problem Description -=================== - -Installing python-tripleoclient on a host machine currently installs -a lot of dependencies many of which may be optional for smaller -standalone type installations. Users of smaller standalone installations -can have a hard time understanding the differences between what TripleO -dependencies get installed vs which services TripleO installs. - -Additionally, some developers would like a fast-track way to develop and -run playbooks without requiring local installation of an Undercloud which -in many cases is done inside a virtual machine to encapsulate the dependencies -that get installed. - -Proposed Change -=============== - -A new zero footprint installer can help drive OpenStack Tripleoclient -commands running within a container. Using this approach you can: - -1. Generate Ansible playbooks from a set of Heat templates - (tripleo-heat-templates), Heat environments, and Heat parameters - exactly like we do today using a Container. No local dependencies - would be required to generate the playbooks. - -2. (optionally) Execute the playbooks locally on the host machine. This would - require some Ansible modules to be installed that TripleO depends on but - is a much smaller footprint than what we require elsewhere today. - -Alternatives ------------- - -Create a subpackage of python-tripleoclient which installs less dependencies. -The general footprint of required packages would still be quite high (lots -of OpenStack packages will still be installed for the client tooling). - -Or do nothing and continue to use VMs to encapsulate the dependencies for -an Undercloud/All-In-One installer and generate Ansible playbooks. Setting -up a local VM requires more initial setup and dependencies however and is -heavier than just using a local container to generate the same playbooks. - -Security Impact ---------------- - -As a container will be used to generate Ansible playbooks the user may -need to expose some local data/files to the installer container. This is -likely a minimal concern as we already require this data to be exposed to -the Undercloud and All-In-One installers. - -Other End User Impact ---------------------- - -None - -Performance Impact ------------------- - -Faster deployment and testing of local All-On-One setups. - -Other Deployer Impact ---------------------- - -None - - -Developer Impact ----------------- - -Faster deployment and testing of local All-On-One setups. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - dprince - -Work Items ----------- - -* A new 'tripleoclient' container -* New project to drive the installation (Talon?) -* Continue to work on refining the Ansible playbook modules to provide a - cleaner set of playbook dependencies. Specifically those that depend on - the any of the traditional TripleO/Heat agent hooks and scripts. -* documentation updates - -Dependencies -============ - -None. - -Testing -======= - -This new installer can likely suppliment or replace some of the testing we -are doing for All-In-One (standalone) deployments in upstream CI. - -Documentation Impact -==================== - -Docs will need to be updated. - -References -========== - -None diff --git a/specs/template.rst b/specs/template.rst deleted file mode 100644 index 5a403f75..00000000 --- a/specs/template.rst +++ /dev/null @@ -1,226 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -========================================== -Example Spec - The title of your blueprint -========================================== - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo - -Introduction paragraph -- why are we doing anything? A single paragraph of -prose that operators can understand. - -Some notes about using this template: - -* Your spec should be in ReSTructured text, like this template. - -* Please wrap text at 80 columns. - -* The filename in the git repository should match the launchpad URL, for - example a URL of: https://blueprints.launchpad.net/tripleo/+spec/awesome-thing - should be named awesome-thing.rst - -* Please do not delete any of the sections in this template. If you have - nothing to say for a whole section, just write: None - -* For help with syntax, see http://sphinx-doc.org/rest.html - -* To test out your formatting, build the docs using tox, or see: - http://rst.ninjs.org - - -Problem Description -=================== - -A detailed description of the problem: - -* For a new feature this might be use cases. Ensure you are clear about the - actors in each use case: End User vs Deployer - -* For a major reworking of something existing it would describe the - problems in that feature that are being addressed. - - -Proposed Change -=============== - -Overview --------- - -Here is where you cover the change you propose to make in detail. How do you -propose to solve this problem? - -If this is one part of a larger effort make it clear where this piece ends. In -other words, what's the scope of this effort? - -Alternatives ------------- - -What other ways could we do this thing? Why aren't we using those? This doesn't -have to be a full literature review, but it should demonstrate that thought has -been put into why the proposed solution is an appropriate one. - -Security Impact ---------------- - -Describe any potential security impact on the system. Some of the items to -consider include: - -* Does this change touch sensitive data such as tokens, keys, or user data? - -* Does this change involve cryptography or hashing? - -* Does this change require the use of sudo or any elevated privileges? - -* Does this change involve using or parsing user-provided data? This could - be directly at the API level or indirectly such as changes to a cache layer. - -* Can this change enable a resource exhaustion attack, such as allowing a - single API interaction to consume significant server resources? Some examples - of this include launching subprocesses for each connection, or entity - expansion attacks in XML. - -For more detailed guidance, please see the OpenStack Security Guidelines as -a reference (https://wiki.openstack.org/wiki/Security/Guidelines). These -guidelines are a work in progress and are designed to help you identify -security best practices. For further information, feel free to reach out -to the OpenStack Security Group at openstack-security@lists.openstack.org. - -Upgrade Impact --------------- - -Describe potential upgrade impact on the system. - -* Is this change meant to become the default for deployments at some - point in the future? How do we migrate existing deployments to that - feature? - -* Can the system be upgraded to this feature using the upgrade hooks - provided by the composable services framework? - -* Describe any plans to deprecate configuration values or - features. 
(For example, if we change the directory name that - instances are stored in, how do we handle instance directories - created before the change landed? Do we move them? Do we have a - special case in the code? Do we assume that the operator will - recreate all the instances in their cloud?) - -* Please state anything that operators upgrading from the previous - release need to be aware of. Do they need to perform extra manual - operations? - -Other End User Impact ---------------------- - -Are there ways a user will interact with this feature? - -Performance Impact ------------------- - -Describe any potential performance impact on the system, for example -how often will new code be called, and is there a major change to the calling -pattern of existing code. - -Examples of things to consider here include: - -* A small change in a utility function or a commonly used decorator can have a - large impacts on performance. - -Other Deployer Impact ---------------------- - -Discuss things that will affect how you deploy and configure OpenStack -that have not already been mentioned, such as: - -* What config options are being added? Should they be more generic than - proposed (for example a flag that other hypervisor drivers might want to - implement as well)? Are the default values ones which will work well in - real deployments? - -* Is this a change that takes immediate effect after its merged, or is it - something that has to be explicitly enabled? - -Developer Impact ----------------- - -Discuss things that will affect other developers working on OpenStack. - - -Implementation -============== - -Assignee(s) ------------ - -Who is leading the writing of the code? Or is this a blueprint where you're -throwing it out there to see who picks it up? - -If more than one person is working on the implementation, please designate the -primary author and contact. - -Primary assignee: - - -Other contributors: - - -Work Items ----------- - -Work items or tasks -- break the feature up into the things that need to be -done to implement it. Those parts might end up being done by different people, -but we're mostly trying to understand the timeline for implementation. - - -Dependencies -============ - -* Include specific references to specs and/or blueprints in tripleo, or in other - projects, that this one either depends on or is related to. - -* If this requires functionality of another project that is not currently used - by Tripleo (such as the glance v2 API when we previously only required v1), - document that fact. - -* Does this feature require any new library dependencies or code otherwise not - included in OpenStack? Or does it depend on a specific version of library? - - -Testing -======= - -Please discuss how the change will be tested. - -Is this untestable in CI given current limitations (specific hardware / -software configurations available)? If so, are there mitigation plans (3rd -party testing, gate enhancements, etc). - - -Documentation Impact -==================== - -What is the impact on the docs? Don't repeat details discussed above, but -please reference them here. - - -References -========== - -Please add any useful references here. You are not required to have any -reference. Moreover, this specification should still make sense when your -references are unavailable. 
Examples of what you could include are: - -* Links to mailing list or IRC discussions - -* Links to notes from a summit session - -* Links to relevant research, if appropriate - -* Related specifications as appropriate (e.g. if it's an EC2 thing, link the EC2 docs) - -* Anything else you feel it is worthwhile to refer to diff --git a/specs/train/certificate-management.rst b/specs/train/certificate-management.rst deleted file mode 100644 index a973b791..00000000 --- a/specs/train/certificate-management.rst +++ /dev/null @@ -1,197 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -===================================================== -Move certificate management in tripleo-heat-templates -===================================================== - -Launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/ansible-certmonger - -Problem Description -=================== - -There are multiple issues with the current way certificates are managed with -Puppet and Certmonger, especially in a containerized environment: - -* Multiple containers are using the same certificate -* There isn't any easy way to find out which container needs to be restarted - upon certificate renewal -* Shared certificates are bad - -The main issue now is the use of "pkill", especially for httpd services. Since -Certmonger has no knowledge of what container has an httpd service running, -it uses a wide fly swatter in the hope all related services will effectively -be reloaded with the new certificate. - -The usage of "pkill" by Certmonger is prevented on a SELinux enforcing host. - -Proposed Change -=============== - -Introduction ------------- - -While the use of certmonger isn't in question, the way we're using it is. - -The goal of this document is to describe how we could change that usage, -allowing to provide a better security, while allowing Certmonger to restart -only the needed containers in an easy fashion. - -Implement certmonger in Ansible -------------------------------- - -A first step will be to implement a certmonger "thing" in Ansible. There are -two ways to do that: - -* Reusable role -* Native Ansible module - -While the first one is faster to implement, the second would be better, since -it will allow to provide a clean way to manage the certificates. - -Move certificate management to tripleo-heat-templates ------------------------------------------------------ - -Once we have a way to manage Certmonger within Ansible, we will be able to move -calls directly in relevant tripleo-heat-templates files, allowing to generate -per-container certificate. - -Doing so will also allow Certmonger to know exactly which container to -restart upon certificate renewal, using a simple "container_cli kill" command. - -Alternatives -============ - -One alternative is proposed - -Maintain a list ---------------- - -We could maintain the code as-is, and just add a list for the containers -needing a restart/reload. Certmonger would loop on that list, and do its -job upon certificate renewal. - -This isn't a good solution, since the list will eventually lack updates, and -this will create new issues instead of solving the current ones. - -Also, it doesn't allow to get per-container certificate, which is bad. 
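
To make the per-container handling more concrete, the Ansible wrapper could be
invoked from a service template roughly as follows; the role interface,
variable names and restart command are illustrative assumptions:

.. code-block:: yaml

   # Hypothetical use of a certmonger wrapper role for a single container.
   - name: Request a dedicated certificate for the haproxy container
     include_role:
       name: tripleo_certmonger
     vars:
       certmonger_ca: ipa
       certmonger_cert: /etc/pki/tls/certs/haproxy.crt
       certmonger_key: /etc/pki/tls/private/haproxy.key
       certmonger_dns:
         - "{{ fqdn_internal_api }}"
       # On renewal, restart only the container that consumes this
       # certificate instead of a blanket "pkill".
       certmonger_postsave_cmd: "{{ container_cli }} kill --signal=HUP haproxy"
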
- -Proposed roadmap -================ - -In Stein: - -* Create "tripleo-certmonger" Ansible reusable role in tripleo-common - -In Train: - -* Move certificate management/generation within tripleo-heat-templates. -* Evaluate the benefices of moving to a proper Ansible module for Certmonger. -* If evaluation is good and we have time, implement it and update current code. - -In "U" release: - -* Check if anything relies on puppet-certmonger, and if not, drop this module. - -Security Impact -=============== - -We will provide a better security level by avoiding shared x509 keypairs. - -Upgrade Impact -============== - -Every container using the shared certificate will be restarted in order to -load the new, dedicated one. - -We will have to ensure the nova metadata are properly updated in order to -let novajoin create the services in FreeIPA, allowing to request per-service -certificates. - -Tests should also be made regarding novajoin update/upgrade in order to ensure -all is working as expected. - -If the containers are already using dedicated certificates, no other impact is -expected. - -End User Impact -=============== - -During the upgrade, a standard short downtime is to be expected, unless -the deployment is done using HA. - -Performance Impact -================== - -No major performance impact is expected. - -Deployer Impact -=============== - -No major deployer impact is expected. - -Developer Impact -================ - -People adding new services requiring a certificate will need to call the -Certmonger module/role in the new tripleo-heat-templates file. - -They will also need to ensure new metadata is properly generated in order to -let novajoin create the related service in FreeIPA. - -Implementation -============== - -Contributors ------------- - -* Cédric Jeanneret -* Grzegorz Grasza -* Nathan Kinder - -Work Items ----------- - -* Implement reusable role for Certmonger -* Move certificate management to tripleo-heat-templates -* Remove certmonger parts from Puppet -* Update/create needed documentations about the certificate management - -Later: -* Implement a proper Ansible Module -* Update the role in order to wrap module calls - - -Dependencies -============ - -None - currently, no Certmonger module for Ansible exists. - -Testing -======= - -We have to ensure the dedicated certificate is generated with the right -content, and ensure it's served by the right container. - -We can do that using openssl CLI, maybe adding a new check in the CI via -a new role in tripleo-quickstart-extras. - -This is also deeply linked to novajoin, thus we have to ensure it works as -expected. - -Documentation Impact -==================== - -We will need to document how the certificate are managed. - -References -========== - -* `Example of existing certificate management in Ansible `_ -* `Skeleton certmonger_getcert `_ -* `Existing reusable roles in TripleO `_ diff --git a/specs/train/undercloud-minion.rst b/specs/train/undercloud-minion.rst deleted file mode 100644 index a2b6a021..00000000 --- a/specs/train/undercloud-minion.rst +++ /dev/null @@ -1,167 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
-
-   http://creativecommons.org/licenses/by/3.0/legalcode
-
-==============================
-Scale Undercloud with a Minion
-==============================
-
-https://blueprints.launchpad.net/tripleo/undercloud-minion
-
-In order to improve our scale, we have identified heat-engine and possibly
-ironic-conductor as services that we can add on to an existing undercloud
-deployment. Adding heat-engine allows for additional processing capacity
-when creating and updating stacks for deployment. By adding a new
-lightweight minion node, we can scale the Heat capacity horizontally.
-
-Additionally, since these nodes could be more remote, we could add an
-ironic-conductor instance to be able to manage hosts in a remote region
-while still having a central undercloud for the main management.
-
-
-Problem Description
-===================
-
-Currently we use a single heat-engine on the undercloud for the deployment.
-According to the Heat folks, it can be beneficial for processing to have
-additional heat-engine instances for scale. The recommended scaling is out
-rather than up. Additionally, by being able to deploy a secondary host, we
-can increase our capacity for the undercloud when additional scale capacity
-is required.
-
-
-Proposed Change
-===============
-
-Overview
---------
-
-We are proposing to add a new undercloud "minion" configuration that can be
-used by operators to configure additional instances of heat-engine and
-ironic-conductor when they need more processing capacity. We would also
-allow the operator to disable heat-engine on the main undercloud to reduce
-the resource usage of the undercloud. By removing the heat-engine from the
-regular undercloud, the operator could possibly avoid timeouts on other
-services like keystone and neutron that can occur when the system is under
-load.
-
-Alternatives
-------------
-
-An alternative would be to make the undercloud deployable in a traditional
-HA capacity where we share the services across multiple nodes. This would
-increase the overall capacity but adds additional complexity to the
-undercloud. Additionally, this does not let us target specific services
-that are resource heavy.
-
-Security Impact
----------------
-
-The new node would need to have access to the main undercloud's keystone,
-database and messaging services.
-
-Upgrade Impact
---------------
-
-The new minion role would need to be able to be upgraded by the user.
-
-Other End User Impact
----------------------
-
-None.
-
-Performance Impact
-------------------
-
-* This additional minion role may improve heat processing due to the
-  additional resource capacity being provided.
-
-* Locating an ironic-conductor closer to the nodes being managed can improve
-  performance by being closer to the systems (less latency, etc).
-
-
-Other Deployer Impact
----------------------
-
-An additional undercloud role and a new undercloud-minion.conf configuration
-file will be created. Additionally, a new option may be added to the
-undercloud.conf to manage heat-engine installation.
-
-Developer Impact
-----------------
-
-None.
-
-
-Implementation
-==============
-
-Assignee(s)
------------
-
-Primary assignee:
-  mwhahaha
-
-Other contributors:
-  slagle
-  EmilienM
-
-Work Items
-----------
-
-Work items or tasks -- break the feature up into the things that need to be
-done to implement it. Those parts might end up being done by different
-people, but we're mostly trying to understand the timeline for
-implementation.
- -python-tripleoclient -~~~~~~~~~~~~~~~~~~~~ - -* New 'openstack undercloud minion deploy' command for installation - -* New 'openstack undercloud minion upgrade' command for upgrades - -* New configuration file 'undercloud-minion.conf' to drive the installation - and upgrades. - -* New configuration option in 'undercloud.conf' to provide ability to disable - the heat-engine on the undercloud. - -tripleo-heat-templates -~~~~~~~~~~~~~~~~~~~~~~ - -* New 'UndercloudMinion' role file - -* New environment file for the undercloud minion deployment - -* Additional environment files to enable or disable heat-engine and - ironic-conductor. - -Dependencies -============ - -None. - -Testing -======= - -We would add a new CI job to test the deployment of the minion node. This job -will likely be a new multinode job. - - - -Documentation Impact -==================== - -We will need to document the usage of the undercloud minion installation and -the specific use cases where this can be beneficial. - - -References -========== - -See the notes from the Train PTG around Scaling. - -* https://etherpad.openstack.org/p/tripleo-ptg-train - -* https://etherpad.openstack.org/p/DEN-tripleo-forum-scale diff --git a/specs/ussuri/mistral-to-ansible.rst b/specs/ussuri/mistral-to-ansible.rst deleted file mode 100644 index c5fa699c..00000000 --- a/specs/ussuri/mistral-to-ansible.rst +++ /dev/null @@ -1,205 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================ -Replace Mistral with Ansible -============================ - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-mistral-to-ansible - -The goal of this proposal is to replace Mistral in TripleO with Ansible -playbooks. - - -Problem Description -=================== - -Mistral was originally added to take the place of an “API” and provide common -logic for tripleoclient and TripleO UI. After the TripleO UI was removed, the -only consumer of Mistral is tripleoclient. This means that Mistral now adds -unnecessary overhead and complexity. - - -Proposed Change -=============== - -Overview --------- - -Remove Mistral from the TripleO undercloud and convert all Mistral workbooks, -workflows and actions to Ansible playbooks within tripleo-ansible. tripleoclient -will then be updated to execute the Ansible playbooks rather than the Mistral -workflows. - -Alternatives ------------- - -The only other alternative candidate is to keep using Mistral and accept the -complexity and reinvest in the project. - -Security Impact ---------------- - -* As the code will be re-writing Mistral workflows that currently deal with - passwords, tokens and secrets we will need to be careful. However the logic - should be largely the same. - -* With the eventual removal of Mistral and Zaqar two complex systems can be - removed which will reduce the surface area for security issues. - -* The new Ansible playbooks will only use the undercloud OpenStack APIs, - therefore they shouldn't create a new attack vector. - - - -Upgrade Impact --------------- - -* Upgrades will need to remove Mistral services and make sure the Ansible - playbooks are in place. - -* Older versions of tripleoclient will no longer work with the undercloud as - they will expect Mistral to be present. - -* Most of the data in Mistral is ephemeral, but some longer term data is stored - in Mistral environments. 
This data will likely be moved to Swift. - - -Other End User Impact ---------------------- - -The output of CLI commands will change format. For example, the Mistral -workflow ID will no longer be included and other Ansible specific output will -be included. Where possible we will favour streaming Ansible output to the -user, making tripleoclient very light and transparent. - -Some CLI commands, such as introspection will need to fundamentally change -their output. Currently they send real time updates and progress to the client -with Zaqar. Despite moving the execution locally, we are unable to easily get -messages from a Ansible playbook while it is running. This means the user may -need to wait a long time before they get any feedback. - - -Performance Impact ------------------- - -There is no expected performance impact as the internal logic should be largely -the same. However, the Ansible playbooks will be executed where the user runs -the CLI rather than by the Mistral server. This could then be slower or faster -depending on the resources available to the machine and the network connection -to the undercloud. - -The undercloud itself should have more resources available since it wont be -running Mistral or Zaqar. - - -Other Deployer Impact ---------------------- - -If anyone is using the Mistral workflows directly, they will stop working. We -currently don't know of any users doing this and it was never documented. - - -Developer Impact ----------------- - -Developers will need to contribute to Ansible playbooks instead of Mistral -workflows. As the pool of developers that know Ansible is larger than those -that know Mistral this should make development easier. Ansible contributions -will likely expect unit/functional tests. - - -Implementation -============== - -Assignee(s) ------------ - - -Primary assignee: - d0ugal - -Other contributors: - -- apetrich -- ekultails -- sshnaidm -- cloudnull - -Work Items ----------- - -Storyboard is being used to track this work: - https://storyboard.openstack.org/#!/board/208 - -- Migrate the Mistral workflows to Ansible playbooks. - -- Migrate or replace custom Mistral actions to Ansible native components. - -- Remove Mistral and Zaqar. - -- Update documentation specific to Mistral. - -- Extend our auto-documentation plugin to support playbooks within - tripleo-ansible. This will allow us to generate API documentation for all - playbooks committed to tripleo-ansible, which will include our new `cli` - prefixed playbooks. - -Converting Mistral Workflows to Ansible -*************************************** - -For each Mistral workflow the following steps need to be taken to port them -to Ansible. - -- Re-write the Mistral workflow logic in Ansible, reusing the Mistral Python - actions where appropriate. - -- Update python-tripleoclient to use the new Ansible playbooks. It should - prefer showing the native Ansible output rather than attempting to replicate - the previous output. - -- The Workflows and related code should be deleted from tripleo-common. - -A complete example can be seen for the `openstack undercloud backup` command. - -- `Ansible Playbook `_ -- `Updated tripleoclient `_ -- `Removal of all workflow code `_ - - -Dependencies -============ - -None - - -Testing -======= - -Since this change will largely be a re-working of existing code the changes -will be tested by the existing CI coverage. This should be improved and -expanded as is needed. 
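
For illustration, a converted playbook following the ``cli`` prefix
convention could be shaped roughly like this; the file name, hosts pattern
and module choices are assumptions rather than the final implementation:

.. code-block:: yaml

   # cli-undercloud-backup.yaml (hypothetical shape of a converted workflow)
   - name: Backup the undercloud
     hosts: Undercloud
     gather_facts: true
     vars:
       backup_destination: /var/tmp
     tasks:
       - name: Archive key undercloud state
         become: true
         archive:
           path:
             - /home/stack
             - /etc/hosts
           dest: "{{ backup_destination }}/undercloud-backup.tar.gz"
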
- - -Documentation Impact -==================== - -Any references to Mistral will need to be updated to point to the new ansible -playbook. - - -References -========== - -* https://review.opendev.org/#/q/topic:mistral-removal+OR+topic:mistral_to_ansible - -* https://bugs.launchpad.net/tripleo/+bugs?field.tag=mistral-removal - -* http://lists.openstack.org/pipermail/openstack-discuss/2019-October/010384.html - -* https://storyboard.openstack.org/#!/board/208 diff --git a/specs/ussuri/scaling-with-ansible-inventory.rst b/specs/ussuri/scaling-with-ansible-inventory.rst deleted file mode 100644 index c99f8de9..00000000 --- a/specs/ussuri/scaling-with-ansible-inventory.rst +++ /dev/null @@ -1,251 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================== -Scaling with the Ansible Inventory -================================== - -https://blueprints.launchpad.net/tripleo/scaling-with-Ansible-inventory - -Scaling an existing deployment should be possible by adding new host -definitions directly to the Ansible inventory, and not having to increase the -Count parameters. - -Problem Description -=================== - -Currently to scale a deployment, a Heat stack update is required. The stack -update reflects the new desired node count of each role, which is then -represented in the generated Ansible inventory. The inventory file is then used -by the config-download process when ansible-playbook is executed to perform the -software configuration on each node. - -Updating the Heat stack with the new desired node count has posed some -scaling challenges. Heat creates a set of resources associated with each node. -As the number of nodes in a deployment increases, Heat has more and more -resources to manage. - -As the stack size grows, Heat must be tuned with software configurations or -horizontally scaled with additional engine workers. However, horizontal scaling -of Heat workers will only help so much as eventually other service workers -would need to be scaled as well, such as database, messaging, or Keystone -worker process. Having to increasingly scale worker processes results in -additional physical resource consumption. - -Heat performance also begins to degrade as stack size increases. It takes -longer and longer for stack operations to complete as node count increases. The -stack operation time often reaches into taking many hours, which is usually -outside the range of typical maintenance windows. - -It is also hard to predict what changes Heat will make. Often, no changes are -desired other than to scale out to new nodes. However, unintended template -changes or user error around forgetting to pass environment files poses -additional unnecessary risk to the scaling operation. - - -Proposed Change -=============== - -Overview --------- - -The proposed change would allow for users to directly add new node definitions -to the Ansible inventory by way of a new Heat parameter to allow for scaling -services onto those new nodes. No change in the Count parameters would be -required. - -A minimum set of data would be required when adding a new node to the Ansible -inventory. Presently, this includes the TripleO role, and an IP address on each -network that is used by that role. - -Only scaling of already defined roles will be possible with this method. -Defining new roles would still require a full Heat stack update which defined -the new role. 
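
For example, the minimum data for one additional node of an existing role
(its role group plus an IP address on each network that role uses) might look
roughly like this; the hostname, networks and variable names are illustrative
assumptions:

.. code-block:: yaml

   # Hypothetical extra inventory data for a single new Compute node.
   Compute:
     hosts:
       overcloud-novacompute-3:
         ansible_host: 192.168.24.110
         ctlplane_ip: 192.168.24.110
         internal_api_ip: 172.16.2.110
         storage_ip: 172.16.1.110
         tenant_ip: 172.16.0.110
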
- -Once the new node(s) are added to the inventory, ansible-playbook could be -rerun with the config-download directory to scale the software services out -on to the new nodes. - -As increasing the node count in the Heat stack operation won't be necessary -when scaling, if baremetal provisioning is required for the new nodes, then -this work depends on the nova-less-deploy work: - -https://specs.openstack.org/openstack/tripleo-specs/specs/stein/nova-less-deploy.html - -Once baremetal provisioning is migrated out of Heat with the above work, then -new nodes can be provisioned with those new workflows before adding them -directly to the Ansible inventory. - -Since new nodes added directly to the Ansible inventory would still be -consuming IP's from the subnet ranges defined for the overcloud networks, -Neutron needs to be made aware of those assignments so that there are no -overlapping IP addresses. This could be done with a new interface in -tripleo-heat-templates that allows for specifying the extra node inventory -data. The parameter would be called ``ExtraInventoryData``. The templates would -take care of operating on that input and creating the appropriate Neutron ports -to correspond to the IP addresses specified in the data. - -When tripleo-ansible-inventory is used to generate the inventory, it would -query Heat as it does today, but also layer in the extra inventory data as -specified by ``ExtraInventoryData``. The resulting inventory would be a unified -view of all nodes in the deployment. - -``ExtraInventoryData`` may be a list of files that are consumed with Heat's -get_file function so that the deployer can keep their inventory data organized -by file. - -Alternatives ------------- - -This change is primarily targeted at addressing scaling issues around the -Heat stack operation. Alternative methods include using undercloud minions: - -https://docs.openstack.org/project-deploy-guide/tripleo-docs/latest/features/undercloud_minion.html - -Multi-stack/split-controlplane also addresses the issue somewhat by breaking up -the deployment into smaller and more manageable stacks: - -https://docs.openstack.org/project-deploy-guide/tripleo-docs/latest/features/distributed_compute_node.html - -These alternatives are complimentary to the proposed solution here, and all of -these solutions can be used together for the greatest benefits. - -Direct manipulation of inventory data -_____________________________________ - -Another alternative would be to not make use of any new interface in the -templates such as the previously mentioned ``ExtraInventoryData``. Users could just -update the inventory file manually, or drop inventory files in a specified -location (since Ansible can use a directory as an inventory source). - -The drawbacks to this approach are that another tool would be necessary to -create associated ports in Neutron so that there are no overlapping IP -addresses. It could also be a manual step, although that is prone to error. - -The advantages to this approach is that it would completely eliminate the stack -update operation as part of the scaling. Not having any stack operation is -appealing in some regards due to the potential to forget environment files or -other user error (out of date templates, etc). - -Security Impact ---------------- - -IP addresses and hostnames would potentially exist in user managed templates -that have the value for ``ExtraInventoryData``, however this is no different than -what is present today. 
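
As a sketch of what such a user managed environment file could look like (the
parameter shape and file layout are assumptions at this stage):

.. code-block:: yaml

   # scale-out-inventory.yaml (hypothetical environment file)
   parameter_defaults:
     ExtraInventoryData:
       - {get_file: extra-nodes/compute-rack12.yaml}
       - {get_file: extra-nodes/compute-rack14.yaml}
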
- -Upgrade Impact --------------- - -The upgrade process will need to be aware that not all nodes are represented in -the Heat stack, and some will be represented only in the inventory. This should -not be an issue as long as there is a consistent interface to get a single -unified inventory as there exists now. - -Any changes around creating the unified view of the inventory should be made -within the implementation of that interface (tripleo-ansible-inventory) such -that existing tooling continues to use an inventory that contains all nodes for -a deployment. - -Other End User Impact ---------------------- - -Users will potentially have to manage additional environment files for the -extra inventory data. - -Performance Impact ------------------- - -Performance should be improved during scale out operations. - -However, it should be noted that Ansible will face scaling challenges as well. -While this change does not directly introduce those new challenges, it may -expose them more rapidly as it bypasses the Heat scaling challenges. - -For example, it is not expected that simply adding hundreds or thousands of new -nodes directly to the Ansible inventory means that scaling operation would -succeed. It would likely expose new scaling challenges in other tooling, such -as the playbook and role tasks or Ansible itself. - -Other Deployer Impact ---------------------- - -Since this proposal is meant to align with the nova-less-deploy, all nodes -(whether they are known to Heat or not) would be unprovisioned if the -deployment is deleted. - -If using pre-provisioned nodes, then there is no change in behavior in that -deleting the Heat stack does not actually "undeploy" any software. This -proposal does not change that behavior. - -Developer Impact ----------------- - -Developers could more quickly test scaling by bypassing the Heat stack update -completely if desired, or using the ``ExtraInventoryData`` interface. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - James Slagle - -Work Items ----------- - -* Add new parameter ``ExtraInventoryData`` - -* Add Heat processing of ``ExtraInventoryData`` - - * create Neutron ports - - * add stack outputs - -* Update tripleo-ansible-inventory to consume from added stack outputs - -* Update HostsEntry to be generic - -Dependencies -============ - -* Depends on nova-less-deploy work for baremetal provisioning outside of Heat. - If using pre-provisioned nodes, does not depend on nova-less-deploy. - -* All deployment configurations coming out of Heat need to be generic per role. - Most of this work was complete in Train, however this should be reviewed. For - example, the HostsEntry data is still static and Heat is calculating the node - list. This data needs to be moved to an Ansible template. - - -Testing -======= - -Scaling is not currently tested in CI, however perhaps it could be with this -change. - -Manual test plans and other test automation would need to be updated to also -test scaling with ``ExtraInventoryData``. - - -Documentation Impact -==================== - -Documentation needs to be added for ``ExtraInventoryData``. - -The feature should also be fully explained in that users and deployers need to -be made aware of the change of how nodes may or may not be represented in the -Heat stack. 
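
To illustrate that last point, the unified inventory generated by
tripleo-ansible-inventory would simply list Heat-managed and extra nodes side
by side in the same role group; the excerpt below is illustrative only:

.. code-block:: yaml

   Compute:
     hosts:
       overcloud-novacompute-0:   # defined in the Heat stack
         ansible_host: 192.168.24.21
       overcloud-novacompute-1:   # defined in the Heat stack
         ansible_host: 192.168.24.22
       overcloud-novacompute-3:   # added via ExtraInventoryData
         ansible_host: 192.168.24.110
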
- -References -========== - -* https://specs.openstack.org/openstack/tripleo-specs/specs/stein/nova-less-deploy.html -* https://docs.openstack.org/project-deploy-guide/tripleo-docs/latest/features/undercloud_minion.html -* https://docs.openstack.org/project-deploy-guide/tripleo-docs/latest/features/distributed_compute_node.html diff --git a/specs/ussuri/tripleo-operator-ansible.rst b/specs/ussuri/tripleo-operator-ansible.rst deleted file mode 100644 index 64fad99f..00000000 --- a/specs/ussuri/tripleo-operator-ansible.rst +++ /dev/null @@ -1,331 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================================================================= -tripleo-operator-ansible - Ansible roles and modules to interact with TripleO -============================================================================= - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-operator-ansible - -As an operator of a TripleO deployment, I would like to be able to comsume -supported ansible roles and modules that let me perform TripleO related -actions in my automation. - -Problem Description -=================== - -The existing tripleo-ansible_ repository currently contains roles, plugins -and modules that are consumed by TripleO to perform the actual deployments and -configurations. As these are internal implementations to TripleO, we would not -want operators consuming these directly. The tripleo-ansible_ repository is -also branched which means that the contents within the repo and packaging -are specific to a singular release. This spec propose that we create a new -repository targeted for external automation for any supported version. - -Currently Operators do not have a set of official ansible roles and modules -that can be used to deploy and manage TripleO environments. For folks who wish -to manage their TripleO environments in an automated fashion, we have seen -multiple folks implement the same roles to manage TripleO. e.g. -tripleo-quickstart_, tripleo-quickstart-extras_, infrared_, tripleo-lab_. - -* TripleO should provide a set of ansible roles and modules that can be used - by the end user to deploy and manage an Undercloud and Overcloud. - -* TripleO should provide a set of ansible roles and modules that can be used - to perform scaling actions. - -* TripleO should provide a set of ansible roles and modules that can be used - to perform update and upgrade actions. - -.. _tripleo-ansible: https://opendev.org/openstack/tripleo-ansible -.. _infrared: https://github.com/redhat-openstack/infrared -.. _tripleo-quickstart: https://opendev.org/openstack/tripleo-quickstart -.. _tripleo-quickstart-extras: https://opendev.org/openstack/tripleo-quickstart-extras -.. _tripleo-lab: https://github.com/cjeanner/tripleo-lab - -Proposed Change -=============== - -Overview --------- - -TripleO should create a new repository where ansible roles, plugins and -modules that wrap TripleO actions can be stored. This repository should be -branchless so that the roles can be used with any currently supported version -of TripleO. The goal is to only provide automation for TripleO actions and not -necessarily other cloud related actions. The roles in this new repository -should only be targeted to providing an automation interface for the existing -`tripleoclient commands`_. The repository may provide basic setups actions such -as implementing a wrapper around tripleo-repos_. 
The roles contained in this -repository should not implement additional day 2 cloud related operations such -as creating servers, networks or other resources on the deployed Overcloud. - -This new repository should be able to be packaged and distributed via an RPM -as well as being able to be published to `Ansible Galaxy`_. The structure -of this new repository should be Ansible collections_ compatible. - -The target audience of the new repository would be end users (operators, -developers, etc) who want to write automation around TripleO. The new -repository and roles would be our officially supported automation artifacts. -One way to describe this would be like providing Puppet modules for a given -peice of software so that it can be consumed by users who use Puppet. The -existing CLI will continue to function for users who do not want to use -Ansible to automate TripleO deployments or who wish to continue to use the CLI -by hand. The roles are not a replacement for the CLI, but only provide an -official set of roles for people who use Ansible. - -The integration point for Ansible users would be the roles provided via -tripleo-operator-ansible. We would expect users to perform actions by -including our provided roles. - -An example playbook for a user could be: - -.. code-block:: yaml - - - hosts: undercloud - gather_facts: true - tasks: - - include_role: - role: tripleo_undercloud - tasks_from: install - vars: - tripleo_undercloud_configuration: - DEFAULT: - undercloud_debug: True - local_ip: 192.168.50.1/24 - - name: Copy nodes.json - copy: - src: /home/myuser/my-environment-nodes.json - dest: /home/stack/nodes.json - - include_role: - role: tripleo_baremetal - tasks_from: introspection - vars: - tripleo_baremetal_nodes_file: /home/stack/nodes.json - tripleo_baremetal_introspection_provide: True - tripleo_baremetal_introspection_all_managable: True - - include_role: - role: tripleo_overcloud - tasks_from: deploy - vars: - tripleo_overcloud_environment_files: - - network_isolation.yaml - - ceph_storage.yaml - tripleo_overcloud_roles: - - Controller - - Networker - - Compute - - CephStorage - -The internals of these roles could possibly proceed in two different paths: - -* Implement simple wrappers around the invocation of the actual TripleO - commands using execs, shell or commands. This path will likely be the fastest - path to have an initial implementation. - -.. code-block:: yaml - - - name: Install undercloud - command: "openstack undercloud install {{ tripleo_undercloud_install_options }}" - chdir: "{{ tripleo_undercloud_install_directory }}" - - -* Implement a python wrapper to call into the provided tripleoclient classes. - This path may be a longer term goal as we may be able to provide better - testing by using modules. - -.. 
code-block:: python - - #!/usr/bin/python - - # import the python-tripleoclient - # undercloud cli - - from tripleoclient.v1 import undercloud - - import sys - import json - import os - import shlex - - # See the following for details - # https://opendev.org/openstack/python-tripleoclient/src/branch/ - # master/tripleoclient/v1/undercloud.py - - # setup the osc command - - - class Arg: - verbose_level = 4 - - - # instantiate the - u = undercloud.InstallUndercloud('tripleo', Arg()) - - # prog_name = 'openstack undercloud install' - tripleo_args = u.get_parser('openstack undercloud install') - - # read the argument string from the arguments file - args_file = sys.argv[1] - args_data = file(args_file).read() - - # For this module, we're going to do key=value style arguments. - arguments = shlex.split(args_data) - for arg in arguments: - - # ignore any arguments without an equals in it - if "=" in arg: - - (key, value) = arg.split("=") - - # if setting the time, the key 'time' - # will contain the value we want to set the time to - - if key == "dry_run": - if value == "True": - tripleo_args.dry_run = True - else: - tripleo_args.dry_run = False - - tripleo_args.force_stack_validations = False - tripleo_args.no_validations = True - tripleo_args.force_stack_update = False - tripleo_args.inflight = False - - # execute the install via python-tripleoclient - rc = u.take_action(tripleo_args) - - if rc != 0: - print(json.dumps({ - "failed": True, - "msg": "failed tripleo undercloud install" - })) - sys.exit(1) - - print(json.dumps({ - "changed": True, - "msg": "SUCCESS" - })) - sys.exit(0) - -.. code-block:: yaml - - - name: Install undercloud - tripleo_undercloud: - install: true - foo: bar - -These implementations will need to be evaluated to understand which works -best when attempting to support multiple versions of TripleO where options -may or may not be available. The example of this is where we supported one -cli parameter in versions >= Stein but not prior to this. - -The goal is to have a complete set of roles to do basic deployments within -a single cycle. We should be able to itterate on the internals of the roles -once we have established basic set to prove out the concept. More complex -actions or other version support may follow on in later cycles. - -.. _tripleoclient commands: https://docs.openstack.org/python-tripleoclient/latest/index.html -.. _tripleo-repos: https://opendev.org/openstack/tripleo-repos -.. _Ansible Galaxy: https://galaxy.ansible.com/ -.. _collections: https://docs.ansible.com/ansible/latest/dev_guide/developing_collections.html - -Alternatives ------------- - -* Do nothing and continue to have multiple tools re-implement the actions in - ansible roles. - -* Pick a singular implementaion from the existing set and merge them together - within this existing tool. This however may include additional actions that - are outside of the scope of the TripleO management. This may also limit the - integration by others if established interfaces are too opinionated. - -Security Impact ---------------- - -None. - -Upgrade Impact --------------- - -There should be no upgrade impact other than pulling in the upgrade related -actions into this repository. - -Other End User Impact ---------------------- - -None. - -Performance Impact ------------------- - -None. - -Other Deployer Impact ---------------------- - -None. 
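To illustrate the multi-version concern raised earlier in this spec (a CLI parameter that exists only in newer releases), a role task could guard the option behind a variable. This is a sketch only, assuming hypothetical names: ``--some-new-option`` is a placeholder rather than a real flag, and ``tripleo_supports_new_option`` is not part of any agreed role interface.

.. code-block:: yaml

    # Sketch: the flag and variable names below are placeholders.
    - name: Install undercloud, passing a flag only where the release supports it
      command: >-
        openstack undercloud install
        {{ '--some-new-option' if tripleo_supports_new_option | default(false) else '' }}
      changed_when: true

Whichever internal implementation is chosen, the role variables would remain the stable interface so callers do not need to know which release introduced a given option.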
- -Developer Impact ----------------- - -Developers will need to ensure the supported roles are updated if the cli -or other actions are updated with new options or patterns. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - mwhahaha - -Other contributors: - weshay - emilienm - cloudnull - -Work Items ----------- - -The existing roles should be evaulated to see if they can be reused and pulled -into the new repository. - -* Create new tripleo-operator-ansible -* Establish CI and testing framework for the new repository -* Evaulate and pull in existing roles if possible -* Initial implementation may only be a basic wrapper over the cli -* Update tripleo-quickstart to leverage the newly provided roles and remove - previously roles. - -Dependencies -============ - -If there are OpenStack service related actions that need to occur, we may need -to investigate the inclusion of OpenStackSDK, shade or other upstream related -tools. - -Testing -======= - -The new repository should have molecule testing for any new role created. -Additionally once tripleo-quickstart begins to consume the roles we will need -to ensure that other deployment related CI jobs are included in the testing -matrix. - -Documentation Impact -==================== - -The roles should be documented (perferrably automated) for the operators to -be able to consume these new roles. - -References -========== - -None. diff --git a/specs/victoria/simple-container-generation.rst b/specs/victoria/simple-container-generation.rst deleted file mode 100644 index c8544883..00000000 --- a/specs/victoria/simple-container-generation.rst +++ /dev/null @@ -1,427 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - - -=========================== -Simple Container Generation -=========================== - -Simple container generation is an initiative to reduce complexity in the -TripleO container build, deployment, and distribution process by reducing the -size and scope of the TripleO container build tools. - -The primary objective of this initiative is to replace Kolla, and our -associated Kolla customization tools, as the selected container generation -tool-kit. The TripleO community has long desired an easier solution for -deployers and integrators alike and this initiative is making that desire a -reality. - -The Simple container generation initiative is wanting to pivot from a -tool-chain mired between a foundational component of Kolla-Ansible and a -general purpose container build system, to a vertically integrated solution -that is only constructing what TripleO needs, in a minimally invasive, and -simple to understand way. - -[#f3]_ - - -Problem Description -=================== - -TripleO currently leverages Kolla to produce container images. These images are -built for Kolla-Ansible using an opinionated build process which has general -purpose features. While our current images work, they're large and not well -suited for the TripleO use-case, especially in distributed data-centers. The -issue of container complexity and size impacts three major groups, deployers, -third party integrators, and maintainers. As the project is aiming to simplify -interactions across the stack, the container life cycle and build process has -been identified as something that needs to evolve. 
The TripleO project needs -something vertically integrated which produces smaller images, that are easier -to maintain, with far fewer gyrations required to tailor images to our needs. - - -Proposed Change -=============== - -Overview --------- - -Implement a container file generation role, and a set of statically defined -override variable files which are used to generate our required -container files. [#f2]_ - -Layering -^^^^^^^^ - -.. code-block:: text - - tripleo-base+---+ - | - | - +---+-openstack-${SERVICE}-1-common-+-->openstack-${SERVICE}-1-a - | | - | +-->openstack-${SERVICE}-1-b - | | - | +-->openstack-${SERVICE}-1-c - +-->openstack-${SERVICE}-2 - | - +-->ancillary-${SERVICE}-1 - | - +-->ancillary-${SERVICE}-2 - - -User Experience -^^^^^^^^^^^^^^^ - -Building the standard set of images will be done through a simple command line -interface using the TripleO python client. - -.. code-block:: shell - - $ openstack tripleo container image build [opts] - - -This simple sub-command will provide users the ability to construct images as -needed, generate container files, and debug runtime issues. - - -CLI Options -^^^^^^^^^^^ - -The python TripleO client options for the new container image build entry point. - -=========== =============================== ================================================================= -Option Default Description -=========== =============================== ================================================================= -config-file $PATH/overcloud_containers.yaml Configuration file setting the list of containers to build. -exclude [] Container type exclude. Can be specified multiple times. -work-dir /tmp/container-builds Container builds directory, storing the container files and - logs for each image and its dependencies. -skip-push False Skip pushing images to the registry -skip-build False Only generates container files without producing a local build. -base centos Base image name. -type binary Image type. -tag latest Image tag. -registry localhost Container registry URL. -namespace tripleomaster Container registry namespace. -volume [] Container bind mount used when building the image. Should be - specified multiple times if multiple volumes. -=========== =============================== ================================================================= - - -Container Image Build Tools -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Container images will be built using Buildah_, The required Buildah -functionality will leverage `BuildahBuilder` via `python-tripleoclient` -integration and be exposed though CLI options. - -.. _Buildah: https://buildah.io - - -Image layout -^^^^^^^^^^^^ - -Each image will have its own YAML file which has access to the following -parameters. Each YAML file will have one required parameter (tcib_from for the -source image to build from) and optional parameters. - -================= ============================= ==================== ======== =================================================== -Option Default Type Required Description -================= ============================= ==================== ======== =================================================== -tcib_path `{{ lookup('env', 'HOME') }}` String Path to generated the container file(s) for a given - image. -tcib_args Dict[str, str] Single level `key:value` pairs. Implements arg_. -tcib_from `centos:8` Str True Container image to deploy from. Implements from_. -tcib_labels Dict[str, str] Single level `key:value` pairs. Implements label_. 
-tcib_envs Dict[str, str] Single level `key:value` pairs. Implements env_. -tcib_onbuilds List[str] =String. Implements onbuild_. -tcib_volumes List[str] =String. Implements volume_. -tcib_workdir Str Implements workdir_. -tcib_adds List[str] =String. Implements add_. -tcib_copies List[str] =String. Implements copy_. -tcib_exposes List[str] =String. Implements expose_. -tcib_user Str Implements user_. -tcib_shell Str Implements shell_. -tcib_runs List[str] =String. Implements run_. -tcib_healthcheck Str Implements healthcheck_. -tcib_stopsignal Str Implements stopsignal_. -tcib_entrypoint Str Implements entrypoint_. -tcib_cmd Str Implements cmd_. -tcib_actions List[Dict[str, str]] Each item is a Single level Dictionary `key:value` - pairs. Allows for arbitrary verbs which maintains - ordering. -tcib_gather_files List[str] Each item is a String. Collects files from the - host and stores them in the build directory. -================= ============================= ==================== ======== =================================================== - -.. _arg: https://docs.docker.com/engine/reference/builder/#arg -.. _from: https://docs.docker.com/engine/reference/builder/#from -.. _label: https://docs.docker.com/engine/reference/builder/#label -.. _env: https://docs.docker.com/engine/reference/builder/#env -.. _onbuild: https://docs.docker.com/engine/reference/builder/#onbuild -.. _volume: https://docs.docker.com/engine/reference/builder/#volume -.. _workdir: https://docs.docker.com/engine/reference/builder/#workdir -.. _add: https://docs.docker.com/engine/reference/builder/#add -.. _copy: https://docs.docker.com/engine/reference/builder/#copy -.. _expose: https://docs.docker.com/engine/reference/builder/#expose -.. _user: https://docs.docker.com/engine/reference/builder/#user -.. _shell: https://docs.docker.com/engine/reference/builder/#shell -.. _run: https://docs.docker.com/engine/reference/builder/#run -.. _healthcheck: https://docs.docker.com/engine/reference/builder/#healthcheck -.. _stopsignal: https://docs.docker.com/engine/reference/builder/#stopsignal -.. _entrypoint: https://docs.docker.com/engine/reference/builder/#entrypoint -.. _cmd: https://docs.docker.com/engine/reference/builder/#cmd - - - Application packages are sorted within each container configuration file. - This provides a programmatic interface to derive package sets, allows - overrides, and is easily visualized. While the package option is not - processes by the `tripleo_container_image_build` role, it will serve as a - standard within our templates. - - ================ ==================================================== - Option Description - ================ ==================================================== - tcib_packages Dictionary of packages to install. - - .. code-block:: yaml - - common: - - openstack-${SERVICE}-common - distro-1: - common: - - openstack-${SERVICE}-proprietary - x86_64: - - $dep-x86_64 - power: - - $dep-power - distro-2: - common: - - openstack-${SERVICE} - - $dep - ================ ==================================================== - - This option is then captured and processed by a simple `RUN` action. - - .. code-block:: yaml - - tcib_actions: - - run: "dnf install -y {{ tcib_packages['common'] }} {{ tcib_packages[ansible_distribution][ansible_architecture] }}" - - -Example Container Variable File -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. 
code-block:: yaml - - tcib_from: ubi8 - tcib_path: "{{ lookup('env', 'HOME') }}/example-image" - tcib_labels: - maintainer: MaintainerX - tcib_entrypoint: dumb-init --single-child -- - tcib_stopsignal: SIGTERM - tcib_envs: - LANG: en_US.UTF-8 - tcib_runs: - - mkdir -p /etc/ssh && touch /etc/ssh/ssh_known_host - tcib_copies: - - /etc/hosts /opt/hosts - tcib_gather_files: - - /etc - tcib_packages: - common: - - curl - centos: - x86_64: - - wget - tcib_actions: - - run: "dnf install -y {{ tcib_packages['common'] }} {{ tcib_packages[ansible_distribution][ansible_architecture] }}" - - copy: /etc/resolv.conf /resolv.conf - - run: ["/bin/bash", "-c", "echo hello world"] - - -Container File Structure -^^^^^^^^^^^^^^^^^^^^^^^^ - -The generated container file(s) will follow a simple directory structure -which provide an easy way to view, and understand, build relationships and -dependencies throughout the stack. - -.. code-block:: shell - - tripleo-base/${CONTAINERFILE} - tripleo-base/ancillary-${SERVICE}-1/${CONTAINERFILE} - tripleo-base/ancillary-${SERVICE}-2/${CONTAINERFILE} - tripleo-base/openstack-${SERVICE}-1-common/${CONTAINERFILE} - tripleo-base/openstack-${SERVICE}-1-common/openstack-${SERVICE}-1-a/${CONTAINERFILE} - tripleo-base/openstack-${SERVICE}-1-common/openstack-${SERVICE}-1-b/${CONTAINERFILE} - tripleo-base/openstack-${SERVICE}-1-common/openstack-${SERVICE}-1-c/${CONTAINERFILE} - tripleo-base/openstack-${SERVICE}-2/${CONTAINERFILE} - - -Alternatives ------------- - -* Use Ansible Bender - -Ansible Bender was evaluated as a tool which could help to build the container -images. However it has not been productized downstream; which would make it -difficult to consume. It doesn't generate Dockerfiles and there is a strong -dependency on Bender tool; the container image build process would therefore be -more difficult to do in a standalone environment where Bender isn't available. -[#f1]_ - -* Leave the container image build process untouched. - -We could leave the container image generate process untouched. This keeps us a -consumer of Kolla and requires we maintain our complex ancillary tooling to -ensure Kolla containers work for TripleO. - - -Security Impact ---------------- - -While security is not a primary virtue in the simple container generation -initiative, security will be improved by moving to simplified containers. If -the simple container generation initiative is ratified, all containers used -within TripleO will be vertically integrated into the stack, making it possible -to easily audit the build tools and all applications, services, and files -installed into our containerized runtimes. With simplification we'll improve -the ease of understanding and transparency which makes our project more -sustainable, thereby more secure. The proposed solution must provide layers -where we know what command has been run exactly; so we can quickly figure out -how an image was built. - - -Upgrade Impact --------------- - -There is no upgrade impact because the new container images will provide -feature parity with the previous ones; they will have the same or similar -injected scripts that are used when the containers start. - - -Other End User Impact ---------------------- - -None - - -Performance Impact ------------------- - -We should expect better performance out of our containers, as they will be -smaller. While the runtime will act the same, the software delivery will be -faster as the size of each container will smaller, with better constructed -layers. 
Smaller containers will decrease the mean time to ready which will have -a positive performance impact and generally improve the user experience. - - -Other Deployer Impact ---------------------- - -The simplified container generation initiative will massively help third party -integrators. With simplified container build tools we will be able to easily -articulate requirements to folks looking to build on-top of TripleO. Our -tool-chain will be capable of bootstrapping applications where required, and -simple enough to integrate with a wide variety of custom applications -constructed in bespoke formats. - - -Developer Impact ----------------- - -In the first phase, there won't be any developer impact because the produced -images will be providing the same base layers as before. For example, they will -contain all the Kolla scripts that are required to merge configuration files or -initialize the container at startup. - -These scripts will be injected in the container images for backward -compatibility: - -* kolla_extend_start -* set_configs.py -* start.sh -* copy_cacerts.sh -* httpd_setup.sh - -In a second phase, we will simplify these scripts to remove what isn't needed -by TripleO. The interface in the composable services will likely evolve over -time. For example kolla_config will become container_config. There is no plan -at this time to rewrite the configuration file merge logic. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - * Cloudnull - * EmilienM - - -Work Items ----------- - -First phase -^^^^^^^^^^^ - -* Ansible role to generate container file(s) - https://review.opendev.org/#/c/722557 -* Container images layouts - https://review.opendev.org/#/c/722486 -* Deprecate "openstack overcloud container image build" -* Implement "openstack tripleo container image build" which will reuse the - `BuildahBuilder` and the same logic as the deprecated command but without Kolla. -* Build new images and publish them. -* Switch the upstream CI to use the new images. - -Second phase: - -* Simplifying the injected scripts to only do what we need in TripleO. -* Rename the configuration interfaces in TripleO Heat Templates. - - -Dependencies -============ - -The tooling will be in existing repositories so there is no new dependency. It -will mainly be in tripleo-ansible, tripleo-common, python-tripleoclient and -tripleo-heat-templates. Like before, Buildah will be required to build the -images. - - -Testing -======= - -* The tripleo-build-containers-centos-8 job will be switched to be using - the new "openstack tripleo container image build" command. - -* A molecule job will exercise the container image build process using - the new role. - -* Some end-to-end job will also be investigated to build and deploy - a container into a running deployment. - - -Documentation Impact -==================== - -Much of the documentation impact will be focused on cleanup of the existing -documentation which references Kolla, and the creation of documentation that -highlights the use of the vertically integrated stack. - -Since the changes should be transparent for the end-users who just pull images -without rebuilding it, the manuals will still be updated with the new command -and options if anyone wants to build the images themselves. - -References -========== - -.. [#f1] https://review.opendev.org/#/c/722136/ -.. [#f2] https://review.opendev.org/#/c/722557/ -.. 
[#f3] https://blueprints.launchpad.net/tripleo/+spec/simplified-containers diff --git a/specs/victoria/tripleo-powerflex-integration.rst b/specs/victoria/tripleo-powerflex-integration.rst deleted file mode 100644 index 18ec35b7..00000000 --- a/specs/victoria/tripleo-powerflex-integration.rst +++ /dev/null @@ -1,262 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================================================================ -Enable TripleO to deploy Dell EMC PowerFlex software defined storage via Ansible -================================================================================ - -Problem description -=================== - -There is currently no automated way to deploy VxFlexOS from within TripleO. -Goal is to provide an ease of use at the time of deployment as well as during -lifecycle operations. - -Proposed changes -================ - -Overview --------- -VxFlexOS has been rebranded to PowerFlex. - -The deployer experience to stand up PowerFlex with TripleO should be the -following: - -The deployer chooses to deploy a role containing any of the PowerFlex services: -PowerflexMDM, PowerflexLIA, PowerflexSDS and PowerflexSDC. - -At least three new Overcloud roles should be defined such as: -- Controller with PowerFlex -- Compute with PowerFlex -- Storage with PowerFlex - -Custom roles definition are used to define which service will run on which -type of nodes. We'll use this custom roles_data.yaml to deploy the overcloud. - -PowerFlex support for HCI, which combines compute and storage into a single -node, has been considered but will not be part of the first drop. - -The deployer provides the PowerFlex parameters as offered today in a Heat env -file. - -The deployer starts the deployment and gets an overcloud with PowerFlex and -appropriate services deployed on each node per its role. -Current code is available here. Still WIP. - -https://github.com/dell/tripleo-powerflex - -The following files are created in -/usr/share/openstack-tripleo-heat-templates/deployment/powerflex-ansible : -- powerflex-base.yaml -- powerflex-lia.yaml -- powerflex-mdm.yaml -- powerflex-sdc.yaml -- powerflex-sds.yaml -All of these files are responsible of the configuration of each sevice. Each -service is based upon the powerflex-base.yaml template which calls the Ansible -playbook and triggers the deployment. - -The directory /usr/share/powerflex-ansible holds the Ansible playbook which -installs and configure PowerFlex. - -A new tripleo-ansible role is created in /usr/share/ansible/roles called -tripleo-powerflex-run-ansible which prepares the variables and triggers the -execution of the PowerFlex Ansible playbook. - -An environment name powerflex-ansible.yaml file is created in -/usr/share/openstack-tripleo-heat-emplates/environments/powerflex-ansible -and defines the resource registry mapping and additional parameters required by -the PowerFlex Ansible playbook. - -Ports which have to be opened are managed by TripleO. - -PowerFlex deployment with TripleO Ansible ------------------------------------------ -Proposal to create a TripleO Ansible playbook to deploy a PowerFlex system. - -We refer to a PowerFlex system as a set of services deployed on nodes on a -per-role basis. 
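As a rough sketch of the environment file described above, the resource registry entries could map each PowerFlex service to its template under ``deployment/powerflex-ansible``. The exact resource names, relative paths and any extra parameters are assumptions here, since the upstream code is still work in progress.

.. code-block:: yaml

    # Sketch only; resource names, relative paths and parameters are assumptions.
    resource_registry:
      OS::TripleO::Services::PowerflexMDM: ../../deployment/powerflex-ansible/powerflex-mdm.yaml
      OS::TripleO::Services::PowerflexLIA: ../../deployment/powerflex-ansible/powerflex-lia.yaml
      OS::TripleO::Services::PowerflexSDS: ../../deployment/powerflex-ansible/powerflex-sds.yaml
      OS::TripleO::Services::PowerflexSDC: ../../deployment/powerflex-ansible/powerflex-sdc.yaml

    parameter_defaults:
      # Hypothetical parameter; the variables actually consumed are defined by
      # the powerflex-ansible playbook, not by this sketch.
      PowerflexAnsiblePlaybook: /usr/share/powerflex-ansible/site.yml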
- -The playbook described here assumes the following: - -A deployer chooses to deploy PowerFlex and includes the following Overcloud -roles which installs the PowerFlex services based upon the mapping found in -THT's roles_data.yaml: - -| Role | Associated PowerFlex service | -| ---------- | ---------------------------------------- | -| Controller | PowerflexMDM, PowerflexLIA, PowerflexSDC | -| Compute | PowerflexLIA, PowerflexSDC | -| Storage | PowerflexLIA, PowerflexSDS | - -The deployer chooses to include new Heat environment files which will be in THT -when this spec is implemented. An environment file will change the -implementation of any of the four services from the previous step. - -A new Ansible playbook is called during the deployment which triggers the -execution of the appropriate PowerFlex Ansible playbook. - -This can be identified as an cascading-ansible deployment. - -A separate Ansible playbook will be created for each goal described below: - -- Initial deployment of OpenStack and PowerFlex -- Update and upgrade PowerFlex SW -- Scaling up or down DayN operations - -This proposal only refers to a single PowerFlex system deployment. - -RPMS/Kernel dependencies ------------------------- - -Virt-Customize will be used to inject the rpms into the overcloud-full-image for -new installations. - - -Version dependencies --------------------- - -Version control is handled outside current proposal. The staging area has the -PowerFlex packages specific to the OS version of overcloud image. - -Ansible playbook -================= - -Initial deployment of OpenStack and PowerFlex ---------------------------------------------- - -The sequence of events for this new Ansible playbook to be triggered during -initial deployment with TripleO follows: - -1. Define the Overcloud on the Undercloud in Heat. This includes the Heat -parameters that are related to PowerFlex which will later be passed to -powerflex-ansible via TripleO Ansible playbook. - -2. Run `openstack overcloud deploy` with default PowerFlex options and include -a new Heat environment file to make the implementation of the service -deployment use powerflex-ansible. - -3. The undercloud assembles and uploads the deployment plan to the undercloud -Swift. - -4. TripleO starts to deploy the Overcloud and interfaces with Heat accordingly. - -5. A point in the deployment is reached where the Overcloud nodes are imaged, -booted, and networked. At that point the undercloud has access to the -provisioning or management IPs of the Overcloud nodes. - -6. The TripleO Ansible playbook responsible to Deploy PowerFlex with any of -the four PowerFlex services, including PowerflexMDM, PowerflexLIA, PowerflexSDS -and PowerflexSDC. - -7. The servers which host PowerFlex services have their relevant firewall ports -opened according to the needs of their service, e.g. the PowerflexMDM are -configured to accept traffic on TCP port 9011 and 6611. - -8. A new Heat environment file which defines additional parameters that we want -to override is passed to the TripleO Ansible playbook. - -9. The TripleO Ansible playbook translates these parameters so that they match -the parameters that powerflex-ansible expects. The translation entails building -an argument list that may be passed to the playbook by calling -`ansible-playbook --extra-vars`. An alternative location for the -/usr/share/powerflex-ansible playbook is possible via an argument. No -playbooks are run yet at this stage. - -10. 
The TripleO Ansible playbook is called and passed the list -of parameters as described earlier. A dynamic Ansible inventory is used with the -`-i` option. In order for powerflex-ansible to work there must be a group called -`[mdms]`, '[tbs]', '[sdss]' and '[sdcs]' in the inventory. - -11. The TripleO Ansible playbook starts the PowerFlex install using the -powerflex-ansible set of playbooks - -Update/Upgrade PowerFlex SW ---------------------------- - -TBD - -Scaling up/down ---------------- - -This implementation supports the add or remove of SDS and/or SDC at any moment -(Day+N operations) using the same deployment method. - -1. The deployer chooses which type of node he wants to add or remove from the -Powerflex system. - -2. The deployer launches an update on the Overcloud which will bring up or down -the nodes to add/remove. - -3. The nodes will be added or removed from the Overcloud. - -4. The SDS and SDC SW will be added or removed from the PowerFlex system. - -5. Storage capacity will be updated consequently. -For Scaling down operation, it will succeed only if: -- the minimum of 3 SDS nodes remains -- the free storage capacity available is enough for rebalancing the data - -PowerFlex services breakdown -============================ - -The PowerFlex system is broken down into multiple components, each of these have -to be installed on specific node types. - -Non HCI model -------------- - -- Controllers will host the PowerflexLIA, PowerflexMDM and PowerflexSDC (Glance) - components. A minimum of 3 MDMs is required. - -- Computes will host the PowerflexLIA and PowerflexSDC as they will be - responsible for accessing volumes. There is no minimum. - -- Storage will host the PowerflexLIA and PowerflexSDS as disks will be presented - as backend. A minimum of 3 SDS is required. A minimum of 1 disk per SDS is - also required to connect the SDS. - -HCI model ---------- - -- Controllers will host the PowerflexLIA, PowerflexMDM and PowerflexSDC (Glance) - components. A minimum of 3 MDMs is required. - -- Compute HCI will host the PowerflexLIA and PowerflexSDC as they will be - responsible for accessing volumes and the PowerflexSDS as disks will be - presented as backend. A minimum of 3 SDS is required. A minimum of 1 disk per - SDS is also required to connect the SDS. - -Security impact -=============== - -- A new SSH key pair will be created on the undercloud. - The public key of this pair will be installed in the heat-admin user's - authorized_keys file on all Overcloud nodes which will be MDMs, SDSs, or SDCs. - This process will follow the same pattern used to create the SSH keys used for - TripleO validations so nothing new would happen in that respect; just another - instance on the same type of process. - -- Additional firewall configuration need to include all TCP/UDP ports needed by - Powerflex services according to the following: - | Overcloud role | PowerFlex Service | Ports | - | -------------- | ----------------- | ---------------------- | - | Controller | LIA, SDC, SDS | 9099, 7072, 6611, 9011 | - | Compute | LIA, SDC | 9099 | - | Storage | LIA, SDS | 9099, 7072 | - -- Kernel modules package like scini.ko will be installed depending of the - version of the operating system of the overcloud node. - -- Question: Will there be any SELinux change needed for IP ports that vxflexOS - is using? 
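To make the inventory requirement from step 10 above more concrete, the groups expected by powerflex-ansible could be rendered roughly as follows in YAML inventory form. The host names, and the split of MDM managers versus the tie-breaker, are assumptions for illustration; in practice the groups would come from the dynamic inventory.

.. code-block:: yaml

    # Illustrative static rendering of the dynamic inventory; hostnames are examples.
    all:
      children:
        mdms:
          hosts:
            overcloud-controller-0:
            overcloud-controller-1:
        tbs:
          hosts:
            overcloud-controller-2:
        sdss:
          hosts:
            overcloud-storage-0:
            overcloud-storage-1:
            overcloud-storage-2:
        sdcs:
          hosts:
            overcloud-novacompute-0: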
- -Performance Impact -================== -The following applies to the undercloud: - -- TripleO Ansible will need to run an additional playbook - diff --git a/specs/wallaby/ephemeral-heat-overcloud.rst b/specs/wallaby/ephemeral-heat-overcloud.rst deleted file mode 100644 index 27100281..00000000 --- a/specs/wallaby/ephemeral-heat-overcloud.rst +++ /dev/null @@ -1,248 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -======================================== -Ephemeral Heat Stack for all deployments -======================================== - -https://blueprints.launchpad.net/tripleo/+spec/ephemeral-heat-overcloud - -This spec proposes using the ephemeral Heat stack model for all deployments -types, including the overcloud. Using ephemeral Heat is already done for -standalone deployments with the "tripleo deploy" command, and for the -undercloud install as well. Expanding its use to overcloud deployments will -align the different deployment methods into just a single method. It will also -make the installation process more stateless and with better predictability -since there is no Heat stack to get corrupted or possibly have bad state or -configuration. - - -Problem Description -=================== - -* Maintaining the Heat stack can be problematic due to corruption via either - user or software error. Backups are often not available, and even when they - exist, they are no guarantee to recover the stack. Corruption or loss of the - Heat stack, such as accidental deletion, requires custom recovery procedures - or re-deployments. - -* The Heat deployment itself must be maintained, updated, and upgraded. These - tasks are not large efforts, but they are areas of maintenance that would be - eliminated when using ephemeral Heat instead. - -* Relying on the long lived Heat process makes the deployment less portable in - that there are many assumptions in TripleO that all commands are run - directly from the undercloud. Using ephemeral Heat would at least allow for - the stack operation and config-download generation to be entirely portable - such that it could be run from any node with python-tripleoclient installed. - -* There are large unknowns in the state of each Heat stack that exists for all - current deployments. These unknowns can cause issues during update/upgrade as - we can't possibly account for all of these items, such as out of date - parameter usage or old/incorrect resource registry mappings. Having each - stack operation create a new stack will eliminate those issues. - - -Proposed Change -=============== - -Overview --------- - -The ephemeral Heat stack model involves starting a short lived heat process -using a database engine for the purposes of creating the stack. The initial -proposal assumes using the MySQL instance already present on the undercloud as -the database engine. To maintain compatibility with the already implemented -"tripleo deploy" code path, SQLite will also be supported for single node -deployments. SQLite may also be supported for other deployments of -sufficiently small size so as that SQLite is not a bottleneck. - -After the stack is created, the config-download workflow is run to download and -render the ansible project directory to complete the deployment. The short -lived heat process is killed and the database is deleted, however, enough -artifacts are saved to reproduce the Heat stack if necessary including the -database dump. 
The undercloud backup and restore procedure will be modified to -account for the removal of the Heat database. - -This model is already used by the "tripleo deploy" command for the standalone -and undercloud installations and is well proven for those use cases. Switching -the overcloud deployment to also use ephemeral Heat aligns all of the different -deployments to use Heat the same way. - -We can scale the ephemeral Heat processes by using a podman pod that -encapsulates containers for heat-api, heat-engine, and any other process we -needed. Running separate Heat processes containerized instead of a single -heat-all process will allow starting multiple engine workers to allow for -scale. Management and configuration of the heat pod will be fairly prescriptive -and it will use default podman networking as we do not need the Heat processes -to scale beyond a single host. Moving forward, undercloud minions will no -longer install heat-engine process as a means for scale. - -As part of this change, we will also add the ability to run Heat commands -against the saved database from a given deployment. This will give -operators a way to inspect the Heat stack that was created for debugging -purposes. - -Managing the templates used during the deployment becomes even more important -with this change, as the templates and environments passed to the "overcloud -deploy" command are the entire source of truth to recreate the deployment. We -may consider further management around the templates, such as a git repository -but that is outside the scope of this spec. - -There are some cases where the saved state in the stack is inspected before a -deployment operation. Two examples are comparing the Ceph fsid's between the -input and what exists in the stack, as well as checking for a missing -network-isolation.yaml environment. - -In cases such as these, we need a way to perform these checks outside of -inspecting the Heat stack itself. A straightforward way to do these types of -checks would be to add ansible tasks that check the existing deployed overcloud -(instead of the stack) and then cause an error that will stop the deployment if -an invalid change is detected. - -Alternatives ------------- - -The alternative is to make no changes and continue to use Heat as we do today -for the overcloud deployment. With the work that has already been done to -decouple Heat from Nova, Ironic, and now Neutron, it instead seems like the -next iterative step is to use ephemeral Heat for all of our deployment types. - -Security Impact ---------------- - -The short lived ephemeral heat process uses no authentication. This is in -contrast to the Heat process we have on the undercloud today that uses Keystone -for authentication. In reality, this change has little effect on security as -all of the sensitive data is actually passed into Heat from the templates. We -should however make sure that the generated artifacts are secured -appropriately. - -Since the Heat process is ephemeral, no change related to SRBAC (Secure RBAC) -is needed. - -Upgrade Impact --------------- - -When users upgrade to Wallaby, the Heat processes will be shutdown on the -undercloud, and further stack operations will use ephemeral Heat. - -Upgrade operations for the overcloud will work as expected as all of the update -and upgrade tasks are entirely generated with config-download on each stack -operation. We will however need to ensure proper upgrade testing to be sure -that all services can be upgraded appropriately using ephemeral Heat. 
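As a very rough illustration of the pod-based approach described in the overview above, the ephemeral Heat processes could be managed with the ``containers.podman`` collection along the following lines. The pod layout, container names and image variables are assumptions for this sketch, not the actual tripleoclient implementation.

.. code-block:: yaml

    # Sketch only: pod/container names and image variables are assumptions.
    - name: Start an ephemeral Heat pod (illustration)
      hosts: undercloud
      tasks:
        - name: Create the pod using default podman networking
          containers.podman.podman_pod:
            name: ephemeral-heat
            state: started

        - name: Run heat-api inside the pod
          containers.podman.podman_container:
            name: ephemeral-heat-api
            pod: ephemeral-heat
            image: "{{ heat_api_image }}"        # assumed variable
            state: started

        - name: Run heat-engine workers inside the pod
          containers.podman.podman_container:
            name: ephemeral-heat-engine
            pod: ephemeral-heat
            image: "{{ heat_engine_image }}"     # assumed variable
            state: started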
- -Other End User Impact ---------------------- - -End users will no longer have a running instance of Heat to interact with or -run heat client commands against. However, we will add management around -starting an ephemeral Heat process with the previously used database for -debugging inspection purposes (stack resource list/show, etc). - -Performance Impact ------------------- - -The ephemeral Heat process is presently single threaded. Addressing this -limitation by using a podman pod for the Heat processes will allow the -deployment to scale to meet overcloud deployment needs, while keeping the -process ephemeral and easy to manage with just a few commands. - -Using the MySQL database instead of SQLite as the database engine should -alleviate any impact around the database being a bottleneck. After the -database is backed up after a deployment operation, it would be wiped from -MySQL so that no state is saved outside of the produced artifacts from the -deployment. - -Alternatively, we can finish the work started in `Scaling with the Ansible -inventory`_. That work will enable deploying the Heat stack with a count of 1 -for each role. With that change, the Heat stack operation times will scale with -the number of roles in the deployment, and not the number of nodes, which will -allow for similar performance as currently exists. Even while using the -inventory to scale, we are still likely to have worse performance with a single -heat-all process than we do today. With just a few roles, using just heat-all -becomes a bottleneck. - -Other Deployer Impact ---------------------- - -Initially, deployers will have the option to enable using the ephemeral Heat -model for overcloud deployments, until it becomes the default. - -Developer Impact ----------------- - -Developers will need to be aware of the new commands that will be added to -enable inspecting the Heat stack for debugging purposes. - -In some cases, some service template updates may be required where there are -instances that those templates rely on saved state in the Heat stack. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - james-slagle - -Work Items ----------- - -The plan is to start prototyping this effort and have the option in place to -use it for a default overcloud deployment in Wallaby. There may be additional -fine tunings that we can finish in the X release, with a plan to backport to -Wallaby. Ideally, we would like to make this the default behavior in Wallaby. -To the extent that is possible will be determined by the prototype work. - -* Add management of Heat podman pod to tripleoclient -* Add option to "overcloud deploy" to use ephemeral Heat -* Use code from "tripleo deploy" for management of ephemeral Heat -* Ensure artifacts from the deployment are saved in known locations and - reusable as needed -* Update undercloud backup/restore to account for changes related to Heat - database. -* Add commands to enable running Heat commands with a previously used - database -* Modify undercloud minion installer to no longer install heat-engine -* Switch some CI jobs over to use the optional ephemeral Heat -* Eventually make using ephemeral Heat the default in "overcloud deploy" -* Align the functionality from "tripleo deploy" into the "overcloud deploy" - command and eventually deprecate "tripleo deploy". 
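As one possible shape for the artifact handling in the work items above, the deployment could dump the ephemeral Heat database and keep it alongside the generated ansible project directory. The paths, database name and credentials handling are all assumptions in this sketch.

.. code-block:: yaml

    # Sketch only: paths and the database name are assumptions; credentials omitted.
    - name: Save ephemeral Heat artifacts after a deployment (illustration)
      hosts: undercloud
      vars:
        artifact_dir: /var/lib/tripleo-heat-artifacts/overcloud   # assumed location
      tasks:
        - name: Ensure the artifact directory exists
          ansible.builtin.file:
            path: "{{ artifact_dir }}"
            state: directory
            mode: "0750"

        - name: Dump the ephemeral Heat database before it is wiped from MySQL
          ansible.builtin.command: >-
            mysqldump --result-file={{ artifact_dir }}/heat-db.sql heat
          changed_when: true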
- -Dependencies -============ - -This work depends on other ongoing work to decouple Heat from management of -other OpenStack API resources, particularly the composable networks v2 work. - -* Network Data v2 Blueprint - https://blueprints.launchpad.net/tripleo/+spec/network-data-v2-ports - -Testing -======= - -Initially, the change will be optional within the "overcloud deploy" command. -We can choose some CI jobs to switch over to opt-in. Eventually, it will become -the default behavior and all CI jobs would then be affected. - -Documentation Impact -==================== - -Documentation updates will be necessary to detail the changes around using -ephemeral Heat. Specifically: - -* User Interface changes -* How to run Heat commands to inspect the stack -* Where artifacts from the deployment were saved and how to use them - -References -========== - -* `Scaling with the Ansible inventory`_ specification - - -.. _Scaling with the Ansible inventory: https://specs.openstack.org/openstack/tripleo-specs/specs/ussuri/scaling-with-ansible-inventory.html diff --git a/specs/wallaby/excise-swift.rst b/specs/wallaby/excise-swift.rst deleted file mode 100644 index 511c7d05..00000000 --- a/specs/wallaby/excise-swift.rst +++ /dev/null @@ -1,188 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================= -Disable Swift from the Undercloud -================================= - -The goal of this proposal is to introduce the community to the idea of -disabling Swift on the TripleO Undercloud. Within this propose we intend -to provide a high-level overview of how we can accomplish this goal. - - -Problem Description -=================== - -Swift is being used to store objects related to the deployment which are -managed entirely on the Undercloud. In the past, there was an API / UI to -interact with the deployment tooling; however, with the deprecation of the UI -and the removal of Mistral this is no longer the case. The Undercloud is -assumed to be a single node which is used to deploy OpenStack clouds, and -requires the user to login to the node to run commands. Because we're no longer -attempting to make the Undercloud a distributed system there's no need for an -API'able distributed storage service. Swift, in it's current state, is -under-utilized and carries unnecessary operational and resource overhead. - - -Proposed Change -=============== - -Overview --------- - -Decommission Swift from the Undercloud. - -To decommission Swift, we'll start by removing all of the `tripleoclient` Swift -interactions. These interactions are largely storing and retrieving YAML files -which provide context to the user for current deployment status. To ensure -we're not breaking deployment expectations, we'll push everything to the local -file system and retain all of the file properties wherever possible. We will -need coordinate with tripleo-ansible to ensure we're making all direct Swift -client and module interactions optional. - -Once we're able to remove the `tripleoclient` Swift interactions, we'll move to -disable Swift interactions from tripleo-common. These interactions are similar -to the ones found within the `tripleoclient`, though tripleo-common has some -complexity; we'll need to ensure we're not breaking expectations we've created -with our puppet deployment methodologies which have some Swift assumptions. - - -Alternatives ------------- - -We keep everything as-is. 
- - -Security Impact ---------------- - -There should be no significant security implications when disabling Swift. -It could be argued that disabling Swift might make the deployment more secure, -it will lessen the attack surface; however, given the fact that Swift on the -Undercloud is only used by director I would consider any benefit insignificant. - - -Upgrade Impact --------------- - -There will be no upgrade impact; this change will be transparent to the -end-user. - - -Other End User Impact ---------------------- - -None. - - -Performance Impact ------------------- - -Disabling Swift could make some client interactions faster; however, the -benefit should be negligible. That said, disabling Swift would remove a -service on the Undercloud, which would make setup faster and reduce the -resources required to run the Undercloud. - - -Other Deployer Impact ---------------------- - -Operationally we should see an improvement as it will no longer be required to -explore a Swift container, and download files to debug different parts of the -deployment. All deployment related file artifacts housed within Swift will -exist on the Undercloud using the local file system, and should be easily -interacted with. - - -Developer Impact ----------------- - -None, if anything disabling Swift should make the life of a TripleO developer -easier. - - -Implementation -============== - -Excising Swift client interactions will be handled directly in as few reviews -as possible; hopefully allowing us to backport this change, should it be deemed -valuable to stable releases. - -All of the objects stored within Swift will be stored in -`/var/lib/tripleo/{named_artifact_directories}`. This will allow us to -implement all of the same core logic in our various libraries just without the -use of the API call to store the object. - -In terms of enabling us to eliminate swift without having a significant impact -on the internal API we'll first start by trying to replace the swift object -functions within tripleo-common with local file system calls. By using the -existing functions and replacing the backend we'll ensure API compatibility and -lessen the likely hood of creating regressions. - -.. note:: - - We'll need to collaborate with various groups to ensure we're porting assumed - functionality correctly. While this spec will not go into the specifics - implementation details for porting assumed functionality, it should be known - that we will be accountable for ensuring existing functionality is ported - appropriately. - - -Assignee(s) ------------ - -Primary assignee: - cloudnull - -Other contributors: - -- emilien -- ekultails - -Work Items ----------- - -The work items listed here are high level, and not meant to provide specific -implementation details or timelines. - -* Enumerate all of the Swift interactions -* Create a space on the Undercloud to house the files -* This location will be on the local file system and will be created into a - git archive; git is used for easier debug, rapid rollback, and will - provide simple versioning. -* Create an option to disable Swift on the Undercloud. -* Convert client interactions to using the local file system -* Ensure all tripleo-ansible Swift client calls are made optional -* Convert tripleo-common Swift interactions to using the local file system -* Disable Swift on the Undercloud - - -Dependencies -============ - -Before Swift can be disabled on the Undercloud we will need ensure the -deployment methodology has been changed to Metalsmith. 
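A minimal sketch of the local storage approach described above, assuming the ``/var/lib/tripleo`` location and the git-based versioning called out in the work items; the directory name, the artifact file and the commit message are placeholders.

.. code-block:: yaml

    # Sketch only: directory, file and commit message are placeholders.
    - name: Store a deployment artifact locally instead of in Swift (illustration)
      hosts: undercloud
      vars:
        tripleo_artifact_dir: /var/lib/tripleo/overcloud   # one of the named artifact directories
      tasks:
        - name: Ensure the artifact directory is a git repository
          ansible.builtin.command: git init {{ tripleo_artifact_dir }}
          args:
            creates: "{{ tripleo_artifact_dir }}/.git"

        - name: Write the artifact that would previously have been uploaded to Swift
          ansible.builtin.copy:
            src: plan-environment.yaml
            dest: "{{ tripleo_artifact_dir }}/plan-environment.yaml"
            mode: "0600"

        - name: Record the change for easy debug and rapid rollback
          ansible.builtin.shell: |
            git -C {{ tripleo_artifact_dir }} add -A
            git -C {{ tripleo_artifact_dir }} commit -m "Update deployment artifacts" || true
          changed_when: true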
- - -Testing -======= - -The Swift tests will need to be updated to use the local file system, however -the existing tests and test structure will be reused. - - -Documentation Impact -==================== - -There are several references to Swift in our documentation which we will need to -update. - - -References -========== - -* https://etherpad.opendev.org/p/tripleo-heat-swift-removal-undercloud -* http://paste.openstack.org/show/798208 diff --git a/specs/wallaby/mixed-operating-system-versions.rst b/specs/wallaby/mixed-operating-system-versions.rst deleted file mode 100644 index 0345a24f..00000000 --- a/specs/wallaby/mixed-operating-system-versions.rst +++ /dev/null @@ -1,267 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=============================== -Mixed Operating System Versions -=============================== - -https://blueprints.launchpad.net/tripleo/+spec/mixed-operating-system-versions - -This spec proposes that a single TripleO release supports multiple operating -system versions. - -Problem Description -=================== - -Historically a single branch or release of TripleO has supported only a single -version of an operating system at a time. In the past, this has been specific -versions of Ubuntu or Fedora in the very early days, and now has standardized -on specific versions of CentOS Stream. - -In order to upgrade to a later version of OpenStack, it involves first -upgrading the TripleO undercloud, and then upgrading the TripleO overcloud to -the later version of OpenStack. The problem with supporting only a single -operating system version at a time is that such an OpenStack upgrade typically -implies an upgrade of the operating system at the same time. Combining the -OpenStack upgrade with a simultaneous operating system upgrade is problematic -due to: - -1. Upgrade complexity -2. Upgrade time resulting in extended maintenance windows -3. Operating system incompatibilities with running workloads (kernel, libvirt, - KVM, qemu, OVS/OVN, etc). -4. User impact of operating system changes (docker vs. podman, network-scripts - vs. NetworkManager, etc). - -Proposed Change -=============== - -Overview --------- - -This spec proposes that a release of TripleO support 2 major versions of an -operating system, particularly CentOS Stream. A single release of TripleO -supporting two major versions of CentOS Stream will allow for an OpenStack -upgrade while remaining on the same operating version. - -There are multiple software versions in play during an OpenStack upgrade: - -:TripleO: - The TripleO version is the version of the TripleO related packages installed - on the undercloud. While some other OpenStack software versions are used here - (Ironic, Neutron, etc), for the purposes of this spec, all TripleO and - OpenStack software on the undercloud will be referred to as the TripleO - version. The TripleO version corresponds to an OpenStack version. - Examples: Train, Wallaby, Zed. - -:OpenStack: - The OpenStack version is the version of OpenStack on the overcloud that is - being managed by the TripleO undercloud. - Examples: Train, Wallaby, Zed. - -:Operating System: - The operating system version is the version of CentOS Stream. Both the - undercloud and overcloud have operating system versions. The undercloud and - the overcloud may not have the same operating system version, and all nodes - in the overcloud may not have the same operating system version. 
- Examples: CentOS Stream 8, 9, 10 - -:Container Image: - The container image version is the version of the base container image used - by tcib. This is a version of the Red Hat universal base image (UBI). - Examples: UBI 8, 9, 10 - -For the purposes of this spec, the operating system versions being discussed -will be CentOS Stream 8 and 9, while the OpenStack versions will be Train and -Wallaby. However, the expectation is that TripleO continues to support 2 -operating system versions with each release going forward. Subsequently, the -Zed. release of TripleO would support CentOS Stream 9 and 10. - -With the above version definitions and considerations in mind, a TripleO -managed upgrade from Train to Wallaby would be described as the following: - -#. Upgrade the undercloud operating system version from CentOS Stream 8 to 9. -#. Upgrade the undercloud TripleO version from Train to Wallaby. - - #. The Wallaby version of the TripleO undercloud will only run on CentOS Stream - 9. - #. Implies upgrading all TripleO and OpenStack software on the undercloud to - Wallaby. - -#. Upgrade the OpenStack version on the overcloud from Train to Wallaby - - #. Does not imply upgrading the operating system version from CentOS Stream 8 - to 9. - #. Implies upgrading to new container image versions that are the images for - OpenStack Wallaby. These container image versions will likely be service - dependent. Some services may use UBI version 9, while some may remain on UBI - version 8. - -#. Upgrade the operating system version on the overcloud nodes from CentOS - Stream 8 to 9. - - #. Can happen node by node, with given constraints that might include all - control plane nodes need to be upgraded at the same time. - #. Data plane nodes could be selectively upgraded. - -The default behavior will be that users and operators can choose to upgrade to -CentOS Stream 9 separately from the OpenStack upgrade. For those operators who -want a combined OpenStack and operating system upgrade to match previous FFU -behavior, they can perform both upgrades back to back. The OpenStack and -operating system upgrades will be separate processes. There may be UX around -making the processes appear as one, but that is not prescribed by this spec. - -New TripleO deployments can choose either CentOS Stream 8 or 9 for their -Overcloud operating system version. - -The implication with such a change is that the TripleO software needs to know -how to manage OpenStack on different operating system versions. Ansible roles, -puppet modules, shell scripts, etc, all need to remove any assumptions about a -given operating system and be developed to manage both CentOS Stream 8 and 9. -This includes operating system utilities that may function quite differently -depending on the underlying version, such as podman and container-tools. - -CentOS Stream 8 support could not be dropped until the Zed. release of TripleO, -at which time, support would be needed for CentOS Stream 9 and 10. - -Alternatives ------------- - -:Alternative 1: - The TripleO undercloud Wallaby version could support running on both CentOS - Stream 8 and 9. There does not seem to be much benefit in supporting both. - Some users may refuse to introduce 9 into their environments at all, but - TripleO has not encountered similar resistance in the past. - -:Alternative 2: - When upgrading the overcloud to the OpenStack Wallaby version, it could be - required that all control plane nodes go through an operating system upgrade - as well. 
Superficially, this appears to reduce the complexity of the - development and test matrix. However, given the nature of composable roles, - this requirement would really need to be prescribed per-service, and not - per-role. Enforcing such a requirement would be problematic given the - flexibility of running any service on any role. It would instead be better - that TripleO document what roles need to be upgraded to a newer operating - system version at the same time, by documenting a set of already provided - roles or services. E.g., all nodes running a pacemaker managed service need - to be upgraded to the same operating system version at the same time. - -:Alternative 3: - A single container image version could be used for all of OpenStack Wallaby. In - order to support running those containers on both CentOS Stream 8 and 9, the - single UBI container image would likely need to be 8, as anticipated support - statements may preclude support for running UBI 9 images on 8. - -:Alternative 4: - New deployments could be forced to use CentOS Stream 9 only for their - overcloud operating system version. However, some users may have workloads - that have technical or certification requirements that could require CentOS - Stream 8. - -Security Impact ---------------- - -None. - -Upgrade Impact --------------- - -This proposal is meant to improve the FFU process by separating the OpenStack -and operating system upgrades. - -Most users and operators will welcome this change. Some may prefer the old -method which offered a more simultaneous and intertwined upgrade. While the new -process could be implemented in such a way to offer a similar simultaneous -experience, it will still be different and likely appear as 2 distinct steps. - -Distinct steps should result in an overall simplification of the upgrade -process. - -Other End User Impact ---------------------- - -None. - -Performance Impact ------------------- - -The previous implementations of FFU had the OpenStack and operating system -upgrades intertwined in the way that they were performed. With the separation -of the upgrade processes, the overall upgrade of both OpenStack and the -operating system may take a longer amount of time overall. Operators would need -to plan for longer maintenance windows in the cases where they still want to -upgrade both during the same windows. - -Otherwise, operators can choose to upgrade just OpenStack first, and then the -operating system at a later date, resulting in multiple, but shorter, -maintenance windows. - -Other Deployer Impact ---------------------- - -None. - -Developer Impact ----------------- - -TripleO developers will need support managing OpenStack software across -multiple operating system versions. - -Service developers responsible for TripleO integrations, will need to decide -upgrade requirements around their individual services when it comes to -container image versions and supporting different operating system versions. - -Given that the roll out of CentOS Stream 9 support in TripleO has happened in a -way that overlaps with supporting 8, it is largely true today that TripleO -Wallaby already supports both 8 and 9. CI jobs exist that test Wallaby on both -8 and 9. Going forward, that needs to remain true. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - - -Other contributors: - - -Work Items ----------- - -1. tripleo-ansible - CentOS Stream 8 and 9 support -2. tripleo-heat-templates - CentOS Stream 8 and 9 support -3. 
puppet-tripleo - CentOS Stream 8 and 9 support -4. puppet-* - CentOS Stream 8 and 9 support -5. tcib - build right container image versions per service - - -Dependencies -============ - -* CentOS Stream 9 builds will be required to fully test and develop - -Testing -======= - -FFU is not typically tested in upstream CI. However, CI will be needed that -tests deploying OpenStack Wallaby on both CentOS Stream 8 -and 9 in order to verify that TripleO Wallaby is compatible with both operating -system versions. - - -Documentation Impact -==================== - -The matrix of supported versions will need to be documented within -tripleo-docs. - -References -========== - -None. diff --git a/specs/wallaby/tripleo-ceph-client.rst b/specs/wallaby/tripleo-ceph-client.rst deleted file mode 100644 index 456b09a6..00000000 --- a/specs/wallaby/tripleo-ceph-client.rst +++ /dev/null @@ -1,210 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=================== -TripleO Ceph Client -=================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-ceph-client - -Native Ansible roles for TripleO integration with Ceph clusters. - - -Problem Description -=================== - -Starting in the Octopus release, Ceph has its own day1 tool called -cephadm [1]_ and it's own day2 tool called orchestrator [2]_ which -will replace ceph-ansible [3]_. While ceph-ansible had the necessary -features to configure Ceph clients, distributing for example config file -and keyrings as necessary on nodes which aren't members of the Ceph cluster, -neither cephadm or the orchestrator will manage Ceph clients configuration. - -Goal is to create some new ansible roles in TripleO to perform the -Ceph clients (Nova, Cinder, Glance, Manila) configuration, which is of special -importance in TripleO to support deployment scenarios where the Ceph cluster -is externally managed, not controlled by the undercloud, yet the OpenStack -services configuration remains a responsibility of TripleO. - - -.. _proposed-change: - -Proposed Change -=============== - -Overview --------- - -Introduce a new role into tripleo-ansible for Ceph client configuration. - -The new role will: - -- Configure OpenStack services as clients of an external Ceph cluster - (in the case of collocation, the ceph cluster is still logically - external) -- Provide Ceph configuration files and cephx keys for OpenStack - clients of RBD and CephFS (Nova, Cinder, Glance, Manila) -- Full multiclient support, e.g. one OpenStack deployment may use - multiple Ceph clusters, e.g. multibackend Glance -- Configure clients quickly, e.g. generate the key in one place - and copy it efficiently -- This is a standalone role which is reusable to configure OpenStack - against an externally managed Ceph cluster -- Not break existing support for CephExternalMultiConfig which is used - for configuring OpenStack to work with more than one Ceph cluster - when deploying Ceph in DCN environments (Deployment of dashboard on - DCN sites is not in scope with this proposal). 
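-
-As an illustration only, the tasks in such a role could look roughly
-like the following sketch; the variable names used here
-(e.g. `tripleo_ceph_client_*`) and the template name are placeholders
-rather than a committed interface::
-
-  ---
-  # Illustrative tasks: distribute the Ceph config file and a cephx
-  # keyring to an OpenStack client node.
-  - name: Ensure /etc/ceph exists on the client node
-    become: true
-    ansible.builtin.file:
-      path: /etc/ceph
-      state: directory
-      mode: '0755'
-
-  - name: Render the Ceph configuration file for the clients
-    become: true
-    ansible.builtin.template:
-      src: ceph.conf.j2
-      dest: "/etc/ceph/{{ tripleo_ceph_client_cluster | default('ceph') }}.conf"
-      mode: '0644'
-
-  - name: Install the cephx keyring used by OpenStack services
-    become: true
-    ansible.builtin.copy:
-      content: |
-        [client.openstack]
-        key = {{ tripleo_ceph_client_openstack_key }}
-      dest: /etc/ceph/ceph.client.openstack.keyring
-      mode: '0600'
-
-The real interface, including how keys and pools are passed in, is
-expected to reuse the existing Heat parameters as discussed below.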
-
-
-Alternatives
-------------
-
-Support for client configuration might be added in future versions
-of cephadm, yet there are several reasons why we won't be able to use
-this feature as-is even if it were available today:
-
-- it requires the cephadm tool to be configured with admin privileges
-  for the external Ceph cluster, which we don't have when Ceph is not
-  managed by TripleO;
-- it also assumes that each and every client node has been provisioned into
-  the external Ceph orchestrator inventory so that every Ceph MON is able to
-  log into the client node (overcloud nodes) via SSH;
-- while offering the necessary functionality to copy the config
-  files and cephx keyrings over to remote client nodes, it won't be able to
-  configure, for example, Nova with the libvirtd secret for qemu-kvm, which
-  is a task only relevant when the client is OpenStack;
-
-Security Impact
----------------
-
-None derived directly from the decision to create new Ansible roles. The
-distribution of the cephx keyrings itself, though, should be implemented
-using a TripleO service, like the existing CephClient service, so that
-keyrings are only deployed on those nodes which actually need them.
-
-Upgrade Impact
---------------
-
-The goal is to preserve and reuse any existing Heat parameter which is
-currently consumed to drive ceph-ansible; from the operators' perspective
-the problem of configuring a Ceph client is unchanged and there shouldn't
-be a need to change the existing parameters; it is just the implementation
-which will change.
-
-Performance Impact
-------------------
-
-As described in the :ref:`proposed-change` section, the purpose of this
-role is to properly configure clients so that OpenStack services can
-connect to an internal or external Ceph cluster, as well as to multiple
-Ceph clusters in a DCN context.
-Since both config files and keys are necessary for many OpenStack services
-(Nova, Cinder, Glance, Manila) to interact properly with the Ceph cluster,
-at least two actions should be performed:
-
-- generate keys in one place
-- copy the generated keys efficiently
-
-The `ceph_client` role should be very small, and a first performance
-improvement comes from key generation, since keys are created in one
-centralized place.
-The generated keys, as well as the Ceph cluster config file, then just
-need to be distributed across the nodes of the Ceph cluster.
-Adding this role to tripleo-ansible avoids extra calls from a pure
-deployment perspective; in fact, no additional Ansible playbooks will be
-triggered and we expect performance to improve since no additional layers
-are involved here.
-
-Developer Impact
-----------------
-
-How Ceph is deployed could change for anyone maintaining TripleO code
-for OpenStack services which use Ceph. In theory there should be no
-change as the CephClient service will still configure the Ceph
-configuration and Ceph key files in the same locations. Those
-developers will just need to switch to the new templates when they are
-stable.
-
-
-Implementation
-==============
-
-The new role should be enabled by a TripleO service, as happens
-today with the CephClient service.
-Depending on the environment file chosen at deployment time, the
-actual implementation of such a service could either be based on
-ceph-ansible or on the new role.
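-
-As a rough illustration of that switch, an environment file along the
-lines of `environments/ceph-client.yaml` could simply remap the
-CephClient composable service; the template path below is a
-placeholder, not a final location::
-
-  # Hypothetical environments/ceph-client.yaml
-  resource_registry:
-    # Point the existing CephClient service at a deployment template
-    # which triggers the new tripleo-ansible ceph_client role instead
-    # of ceph-ansible.
-    OS::TripleO::Services::CephClient: ../deployment/cephadm/ceph-client.yaml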
- -When the Ceph cluster is not external, the role will also create -pools and the cephx keyrings into the Ceph cluster; these steps -will be skipped instead when Ceph is external precisely because we won't -have admin privileges to change the cluster configuration in that case. - -TripleO Heat Templates ----------------------- - -The existing implementation which depends on ceph-ansible will remain -in-tree for at least 1 deprecation cycle. By reusing the existing Heat -input parameters we should be able to transparently make the clients -configuration happen with ceph-ansible or the new role just by -switching the environment file used at deployment time. -TripleO users who currently use -`environments/ceph-ansible/ceph-ansible-external.yaml` in order to -have their Overcloud use an existing Ceph cluster, should be able to -apply the same templates to the new template for configuring Ceph -clients, e.g. `environments/ceph-client.yaml`. This will result in -the new tripleo-ansible/roles/ceph_client role being executed. - -Assignee(s) ------------ - -- fmount -- fultonj -- gfidente -- jmolmo - -Work Items ----------- - -Proposed Schedule ------------------ - -- OpenStack W: start tripleo-ansible/roles/ceph_client as experimental - and then set it as default in scenarios 001/004. We expect to to - become stable during the W cycle. - -Dependencies -============ - -The `ceph_client` role will be added in tripleo-ansible and allow -configuring the OpenStack services as clients of an external or TripleO -managed Ceph cluster; no new dependencies are added for tripleo-ansible -project. The `ceph_client` role will work with External Ceph, Internal -Ceph deployed by ceph-ansible, and the Ceph deployment described in -[4]_. - -Testing -======= - -It should be possible to reconfigure one of the existing CI scenarios -already deploying with Ceph to use the newer `ceph_client` role, -making it non-voting until the code is stable. Then switch the other -existing CI scenario to it. - - -Documentation Impact -==================== - -No doc changes should be needed. - - -References -========== - -.. [1] `cephadm `_ -.. [2] `orchestrator `_ -.. [3] `ceph-ansible `_ -.. [4] `tripleo-ceph `_ diff --git a/specs/wallaby/tripleo-ceph-ganesha.rst b/specs/wallaby/tripleo-ceph-ganesha.rst deleted file mode 100644 index 19e164bd..00000000 --- a/specs/wallaby/tripleo-ceph-ganesha.rst +++ /dev/null @@ -1,158 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================================== -TripleO Ceph Ganesha Integration for Manila -=========================================== - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-ceph-ganesha - -Starting in the Octopus release, Ceph has its own day1 tool called cephadm and -its own day2 tool called orchestrator which will replace ceph-ansible. - -During the Wallaby cycle TripleO will no longer use ceph-ansible for Ceph -deployment and instead use cephadm [2]_ as described in [1]_. Ganesha deserves -special attention because for its deployment we will use special functionalities -in cephadm [2]_ meant to deploy the Ganesha service standalone when the Ceph -cluster is external. - -Problem Description -=================== - -In TripleO we support deployment of Ganesha both when the Ceph cluster is itself -managed by TripleO and when the Ceph cluster is itself not managed by TripleO. 
- -When the Ceph cluster is *not* managed by Tripleo, the Ganesha service must be -deployed standalone; that is, without any additional core Ceph daemon and it -should instead be configured to use the external Ceph MON and MDS daemons. - -Proposed Change -=============== - -Overview --------- - -An ansible task will trigger cephadm [2]_ with special arguments for it to stand -up a standalone Ganesha container and to it we will provide: - -- the Ceph cluster config file, generated using tripleo-ceph-client [3]_ role -- the Ceph cluster keyring to interact with MDS -- the Ganesha config file with pointers to the Ceph config/keyring to use - -The container will then be controlled by pacemaker, as it is today and reusing -the same code which today manages the ceph-nfs systemd service created by -ceph-ansible. - -Alternatives ------------- - -Forking and reusing the existing ceph-ansible role for ceph-nfs has been -discussed but ultimately discarded as that would have moved ownership of the -Ganesha deployment tasks in TripleO, while our goal remaing to keep ownership -where subject expertise is, in the Ceph deployment tool. - -Security Impact ---------------- - -None, the same code which TripleO would already use for the generation of the -Ceph cluster config and keyrings will be consumed. - -Upgrade Impact --------------- - -Some upgrade tasks which stop and remove the pre-existing ceph-nfs container -and systemd unit will be added to clean up the system from the ceph-ansible -managed resources. - -Other End User Impact ---------------------- - -None, the existing input parameters will be reused to drive the newer deployment -tool. - -Performance Impact ------------------- - -No changes. - -Other Deployer Impact ---------------------- - -No impact on users. - -Developer Impact ----------------- - -The Ganesha config file will be generated using a specific tripleo-ceph task -while previously, with ceph-ansible, this was created by ceph-ansible itself. - -Implementation -============== - -The existing implementation which depends on ceph-ansible will remain -in-tree for at least 1 deprecation cycle. By reusing the existing Heat -input parameters we should be able to transparently make the Ganesha -deployment happen with ceph-ansible or the new role just by switching -the environment file used at deployment time. - -Deployment Flow ---------------- - -The deployment and configuration described in this spec will -happen before `openstack overcloud deploy`, as described in -[1]_. This is consistent with how ceph-ansible used to run during -step2 to configure these services. However, parts of the Manila -configuration which use Ganesha will still happen when `openstack -overcloud deploy` is run. This is because some of the configuration -for Ganesha and Manila needs to happen during step 5. Thus, files like -`environments/manila-cephfsganesha-config.yaml` will be updated to -trigger the new required actions. - -Assignee(s) ------------ - -- fmount -- fultonj -- gfidente - -Work Items ----------- - -- Create a set of tasks to deploy on overcloud nodes the Ganesha config file -- Create a set of tasks to trigger cephadm with special arguments - -Dependencies -============ - -- The tripleo-ceph spec [1]_ - -Testing -======= - -Testing is currently impossible as we only have one network while for Ganesha -we require at least two, one which connects it to the Ceph public network and -another where the NFS proxy service is exposed to tenants. 
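-
-For reference, the second network would look like any other composable
-network; a hypothetical `network_data` entry (the name, VLAN and subnet
-are examples only, and the format may differ with network data v2)
-might be::
-
-  - name: StorageNFS
-    name_lower: storage_nfs
-    vip: true
-    vlan: 70
-    ip_subnet: '172.17.5.0/24'
-    allocation_pools: [{'start': '172.17.5.10', 'end': '172.17.5.250'}]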
- -This is a design decision, one of the values added by the use of an NFS proxy -for CephFS is to implement network isolation in between the tenant guests and -the actual Ceph cluster. - -Such a limitation does not come from the migration to cephadm [2]_ but it has -always existed; the code which enforces the use of two isolated networks is in -fact in TripleO, not in the Ceph tool itself. We might revisit this in the -future but it is not a goal of this spec to change this. - -Documentation Impact -==================== - -No changes should be necessary to the TripleO documentation. - -References -========== - -.. [1] `tripleo-ceph `_ -.. [2] `cephadm `_ -.. [3] `tripleo-ceph-client `_ diff --git a/specs/wallaby/tripleo-ceph.rst b/specs/wallaby/tripleo-ceph.rst deleted file mode 100644 index c7828c01..00000000 --- a/specs/wallaby/tripleo-ceph.rst +++ /dev/null @@ -1,832 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============ -TripleO Ceph -============ - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-ceph - -A light Ansible framework for TripleO integration with Ceph clusters -deployed with cephadm_ and managed with Ceph orchestrator_. - - -Problem Description -=================== - -Starting in the Octopus release, Ceph has its own day1 tool called -cephadm_ and it's own day2 tool called orchestrator_ which will -replace ceph-ansible_. What should TripleO's Ceph integration -do about this? We currently provide the following user experience: - - Describe an OpenStack deployment, which includes Ceph, and TripleO - will "make it so" - -The above has been true for TripleO since Kilo and should -continue. TripleO should also continue hyper-converged support -(collocation of OpenStack and Ceph containers). There is sufficient -value in both of these (one tool and hyper-convergence) to justify -this project. At the same time we want to deploy Ceph in a way -consistent with the way the Ceph project is moving and decouple the -complexity of day2 management of Ceph from TripleO. - - -Proposed Change -=============== - -Overview --------- - -Modify tripleo-ansible, tripleo-heat-templates, and -python-tripleoclient in support of the following goals: - -- Provide Ansible roles which deploy Ceph by calling cephadm_ and Ceph - orchestrator -- Focus on the day1 problem for Ceph RBD, RGW, CephFS, and Dashboard - deployment by leveraging `cephadm bootstrap --apply-spec` as - described in Ceph issue 44873_ -- By default, day2 Ceph operations should be done directly with Ceph - orchestrator_ or Ceph Dashboard and not by running `openstack - overcloud deploy` -- TripleO stack updates do not trigger the new Ansible roles - introduced by this spec. -- Provide an opinionated Ceph installation based on parameters from - TripleO (including hardware details from Ironic) -- Configure cephx keyrings and pools for OpenStack on a deployed Ceph - cluster -- Support collocation (hyperconvergence) of OpenStack/Ceph containers - on same host - - cephadm_ reconciliation loop must not break OpenStack configuration - - TripleO configuration updates must not break Ceph configuration -- Provide Ceph integration but maximize orthogonality between - OpenStack and Ceph - -The implementation of the TripleO CephClient service during the W -cycle is covered in a different spec in review 757644_. 
This work will -be merged before the work described in this spec as it will be -compatible with the current Ceph deployment methods. It will also be -compatible with the future deployment methods described in this spec. - -Integration Points ------------------- - -The default deployment method of OpenStack/Ceph for TripleO Victoria -is the following 2-step-process: - -1. Deploy nodes with metalsmith_ -2. Deploy OpenStack and Ceph with `openstack overcloud deploy` - -The Ceph portion of item 2 uses external_deploy_steps_tasks to call -ceph-ansible by using the tripleo-ansible roles: tripleo_ceph_common, -tripleo_ceph_uuid, tripleo_ceph_work_dir, tripleo_ceph_run_ansible. - -The ultimate goal for this spec is to support the following -4-step-process: - -1. Deploy the hardware with metalsmith_ -2. Configure networking (including storage networks) -3. Deploy Ceph with the roles and interface provided by tripleo-ansible/python-tripleoclient -4. Deploy OpenStack with `openstack overcloud deploy` - -Item 2 above depends on the spec for network data v2 format described -in review 752437_ and a subsequent network-related feature which moves -port management out of Heat, and supports applying network -configuration prior to Heat stack deployment described in review -760536_. - -Item 3 above is the focus of this spec but it is not necessarily -the only integration point. If it is not possible to configure the -storage networks prior to deploying OpenStack, then the new method -of Ceph deployment will still happen via external_deploy_steps_tasks -as it currently does in Victoria via the 2-step-process. Another way -to say this is that Ceph may be deployed *during* the overcloud -deployment in the 2-step process or Ceph may be deployed *before* the -overcloud during the 4-step process; in either case we will change how -Ceph is deployed. - -The benefit of deploying Ceph before deploying the overcloud is that -the complexity of the Ceph deployment is decoupled from the complexity -of the OpenStack deployment. Even if Ceph is deployed before the -overcloud, its deployment remains a part of TripleO the same way that -the bare metal deployment remains a part of TripleO; even though a -separate tool, e.g. metalsmith_ or cephadm_ is used to deploy the -resources which are not deployed when `openstack overcloud deploy` -is run. - -Additional details on how Ceph is deployed before vs during the -overcloud deployment are covered in the implementation section. - -Alternatives ------------- - -We could ask deployers to do this: - -- Deploy hardware and configure networking -- Use cephadm_ and orchestrator_ directly to configure that hardware - with Ceph and create OpenStack pools accessible by CephX clients -- Use TripleO to configure OpenStack - -We have completed a POC of the above using Ussuri and config-download -tags to only run certain steps but would prefer to offer an option to -automate the Ceph deployment. The TripleO project has already ensured -that the move from one to three is automated and requires only two -commands because the tripleo python client now has an option to call -metalsmith_. The alternative is to not automate step two, but that is -user unfriendly. - -Another alternative is to continue using ceph-ansible_ as we do today. 
-However, even though ceph-ansible_ can deploy Octopus today and will -continue to support deployment of Luminous and Nautilus, the project -has a cephadm-adopt_ playbook for converting Ceph clusters that it has -deployed to mangement by cephadm_ orchestrator_ so seems to be moving -away from true Octopus support. ceph-ansible_ has lot of code and day2 -support; porting ceph-ansible itself to cephadm_ or orchestrator_ is -more work than completing this project with a smaller scope and looser -coupling. - -Security Impact ---------------- - -The cephadm_ tool is imperative and requires SSH access to the Ceph -cluster nodes in order to execute remote commands and deploy the -specified services. This command will need to be installed on one of -the overcloud nodes which will host the composable CephMon service. -From the cephadm_ point of view, that node will be a bootstrap node -on which the Ceph cluster is created. - -For this reason the Ceph cluster nodes must be SSH accessible and -provide a user with root privileges to perform some tasks. For -example, the standard way to add a new host when using cephadm_ is to -run the following: - -- `ssh-copy-id -f -i /etc/ceph/ceph.pub root@**` -- `ceph orch host add **` - -The TripleO deployment flow, and in particular config-download, -already provides the key elements to properly configure and run -the two actions described above, hence the impact from a security -point of view is unchanged compared to the previous deployment model. - -We will create a user like ceph-admin using the same process -config-download uses to create the tripleo-admin user and then -cephadm_ will use this user when it runs commands to add other -hosts. - -Upgrade Impact --------------- - -Ceph Nautilus clusters are still managed by ceph-ansible, and cephadm_ -can be enabled, as the new, default backend, once the Octopus release -is reached. Therefore, starting from Nautilus, two main steps are -identified in the upgrade process: - -- Upgrade the cluster using ceph-ansible_ `rolling_update`: - ceph-ansible_ should provide, as already done in the past, a rolling - update playbook that can be executed to upgrade all the services to - the Octopus release -- Migrate the existing cluster to cephadm/orchestrator: when all the - services are updated to Octopus cephadm-adopt_ will be executed as - an additional step - -New Ceph Octopus deployed clusters will use cephadm_ and ceph -orchestrator_ by default, and the future upgrade path will be provided -by cephadm_upgrade_, which will be able to run, stop and resume all -the Ceph upgrade phases. At that point day2 ceph operations will need -to be carried out directly with ceph orchestrator. Thus, it will no -longer be necessary to include the -`tripleo-heat-templates/environments/ceph-ansible/*` files in the -`openstack overcloud deploy` command with the exception of the Ceph -client configuration as described in review 757644_, which will have a -new environment file. - -.. note:: - - The Upgrade process for future releases can be subject of slight - modifications according to the OpenStack requirements. - - -Other End User Impact ---------------------- - -The main benefit from the operator perspective is the ability to take -advantage of the clear separation between the deployment phase and -day2 operations as well as the separation between the Ceph deployment -and the OpenStack deployment. 
At the same time TripleO can still -address all the deployment phase operations with a single tool but -leave and rely on orchestrator_ for what concerns day2 tasks. - -Many common tasks can now be performed the same way regardless of if -the Ceph cluster is internal (deployed by) or external to TripleO. -The operator can use the cephadm_ and orchestrator_ tools which will -be accessible from one of the Ceph cluster monitor nodes. - -For instance, since cephadm_ maintains the status of the cluster, the -operator is now able to perform the following tasks without interacting -with TripleO at all: - -1. Monitor replacement -2. OSD replacement (if a hardware change is necessary then Ironic - might be involved) - -.. note:: - - Even though cephadm_ standalone, when combined with Ceph - orchestrator_, should support all the commands required to the - carry out day2 operations, our plan is for tripleo-ceph to - continue to manage and orchestrate other actions that can - be taken by an operator when TripleO should be involved. E.g. - a CephStorage node is added as a scale-up operation, then - the tripleo-ceph Ansible roles should make calls to add the OSDs. - -Performance Impact ------------------- - -Stack updates will not trigger Ceph tools so "OpenStack only" changes -won't be delayed by Ceph operations. Ceph client configuration will -take less time though this benefit is covered in review 757644_. - -Other Deployer Impact ---------------------- - -Like ceph-ansible, cephadm_ is distributed as an RPM and can be -installed from Ceph repositories. However, since the deployment -approach is changed and cephadm_ requires a Ceph monitor node to -bootstrap a minimal cluster, we would like to install the cephadm_ -RPM on the overcloud image. As of today this RPM is approximately 46K -and we expect this to simplify the installation process. When cephadm_ -bootstraps the first Ceph monitor (on the first Controller node by -default) it will download the necessary Ceph containers. To contrast -this proposal with the current Ceph integration, ceph-ansible_ needs -to be installed on the undercloud and it then manages the download of -Ceph containers to overcloud nodes. In the case of both cephadm_ and -ceph-ansible, no other package changes are needed for the overcloud -nodes as both tools run Ceph in containers. - -This change affects all TripleO users who deploy an Overcloud which -interfaces with Ceph. Any TripleO users who does not interface with -Ceph will not be directly impacted by this project. - -TripleO users who currently use -`environments/ceph-ansible/ceph-ansible.yaml` in order to have their -overcloud deploy an internal Ceph cluster will need to migrate to the -new method when deploying W. This file and others will deprecated as -described in more detail below. - -The proposed changes do not take immediate effect after they are -merged because both the ceph-ansible_ and cephadm_ interfaces will -exist intree concurrently. - -Developer Impact ----------------- - -How Ceph is deployed could change for anyone maintaining TripleO code -for OpenStack services which use Ceph. In theory there should be no -change as the CephClient service will still configure the Ceph -configuration and Ceph key files in the same locations. Those -developers will just need to switch to the new interfaces when they -are stable. 
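-
-As an illustration of the day2 operations mentioned above, a scale-up
-could be driven either by an operator from a cephadm shell or, where
-TripleO should be involved, by a small Ansible task; the host and
-device names below are examples only::
-
-  # Illustrative only: enroll a new host and create an OSD on it with
-  # Ceph orchestrator, wrapped in Ansible for the TripleO-driven case.
-  - name: Add the new host to the Ceph orchestrator inventory
-    become: true
-    ansible.builtin.command: ceph orch host add ceph-storage-3
-
-  - name: Create an OSD on a device of the new host
-    become: true
-    ansible.builtin.command: ceph orch daemon add osd ceph-storage-3:/dev/vdb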
-
-Implementation
-==============
-
-How configuration data is passed to the new tooling when Ceph is
-deployed *before* or *during* the overcloud deployment, as described
-in the Integration Points section at the beginning of this spec, will
-be covered in more detail in this section.
-
-Deprecations
-------------
-
-Files in `tripleo-heat-templates/environments/ceph-ansible/*` and
-`tripleo-heat-templates/deployment/ceph-ansible/*` will be deprecated
-in W and removed in X. They will be obsoleted by the new THT
-parameters covered in the next section, with the exception of
-`ceph-ansible/ceph-ansible-external.yaml` which will be replaced by
-`environments/ceph-client.yaml` as described in review 757644_.
-
-The following tripleo-ansible roles will be deprecated at the start
-of W: tripleo_ceph_common, tripleo_ceph_uuid, tripleo_ceph_work_dir,
-and tripleo_ceph_run_ansible. The ceph_client role will not be
-deprecated but it will be re-implemented as described in review
-757644_. New roles will be introduced to tripleo-ansible to replace
-them.
-
-Until the project described here is complete during X, we will
-continue to maintain the deprecated ceph-ansible_ roles and
-Heat templates for the duration of W, so it is likely that during
-one release we will have in-tree support for both ceph-ansible_ and
-cephadm_.
-
-New THT Templates
------------------
-
-Not all THT configuration for Ceph can be removed. The firewall is
-still configured based on THT as described in the next section, and THT
-also controls which composable service is deployed and where. The
-following new files will be created in
-`tripleo-heat-templates/environments/`:
-
-- cephadm.yaml: triggers new cephadm Ansible roles until `openstack
-  overcloud ceph ...` makes it unnecessary. Contains the paths to the
-  files described in the Ceph End State Definition YAML Input section.
-- ceph-rbd.yaml: RBD firewall ports, pools and cephx key defaults
-- ceph-rgw.yaml: RGW firewall ports, pools and cephx key defaults
-- ceph-mds.yaml: MDS firewall ports, pools and cephx key defaults
-- ceph-dashboard.yaml: defaults for Ceph Dashboard firewall ports
-
-All of the above (except cephadm.yaml) will result in the appropriate
-firewall ports being opened, as well as a new idempotent Ansible role
-connecting to the Ceph cluster in order to create the Ceph pools and
-cephx keys to access those pools. Which ports, pools and keys are
-created will depend on which files are included. E.g. if the deployer
-ran `openstack overcloud deploy ... -e ceph-rbd.yaml -e ceph-rgw.yaml`
-then the ports, pools and cephx keys would be configured for Nova,
-Cinder, and Glance to use Ceph RBD, and RGW would be configured with
-Keystone, but no firewall ports, pools and keys for the MDS service
-would be created and the firewall would not be opened for the Ceph
-dashboard.
-
-None of the above files, except cephadm.yaml, will result in Ceph
-itself being deployed, and none of the parameters needed to deploy Ceph
-itself will be in the above files. E.g. PG numbers and OSD devices
-will not be defined in THT anymore. Instead the parameters which are
-needed to deploy Ceph itself will be in tripleo_ceph_config.yaml as
-described in the Ceph End State Definition YAML Input section, and
-cephadm.yaml will only contain references to those files.
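-
-For illustration only, a cephadm.yaml file along these lines might do
-little more than point at those input files; the parameter names and
-template path below are placeholders rather than agreed THT
-parameters::
-
-  # Hypothetical environments/cephadm.yaml
-  resource_registry:
-    # A deployment template which runs the new cephadm roles via
-    # external_deploy_steps_tasks (exact path to be decided).
-    OS::TripleO::Services::CephMon: ../deployment/cephadm/cephadm.yaml
-  parameter_defaults:
-    # References to the Ceph End State Definition YAML Input files
-    CephSpecPath: /home/stack/tripleo_ceph_config.yaml
-    CephOsdSpecPath: /home/stack/osd_spec.yml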
- -The cephx keys and pools, created as described above, will result in -output data which looks like the following:: - - pools: - - volumes - - vms - - images - - backups - openstack_keys: - - caps: - mgr: allow * - mon: profile rbd - osd: 'osd: profile rbd pool=volumes, profile rbd pool=backups, - profile rbd pool=vms, profile rbd pool=images' - key: AQCwmeRcAAAAABAA6SQU/bGqFjlfLro5KxrB1Q== - mode: '0600' - name: client.openstack - -The above can be written to a file, e.g. ceph_client.yaml, and passed -as input to the the new ceph client role described in review 757644_ -(along with the ceph_data.yaml file produced as output as described in -Ceph End State Definition YAML Output). - -In DCN deployments this type of information is extracted from the Heat -stack with `overcloud export ceph`. When the new method of deployment -is used this information can come directly from each genereated yaml -file (e.g. ceph_data.yaml and ceph_client.yaml) per Ceph cluster. - -Firewall --------- - -Today the firewall is not configured by ceph-ansible_ and it won't be -configured by cephadm_ as its `--skip-firewalld` will be used. We -expect the default overcloud to not have firewall rules until -`openstack overcloud deploy` introduces them. The THT parameters -described in the previous section will have the same firewall ports as -the ones they will deprecate (`environments/ceph-ansible/*`) so that -the appropriate ports per service and based on composable roles will -be opened in the firewall as they are today. - -OSD Devices ------------ - -The current defaults will always be wrong for someone because the -`devices` list of available disks will always vary based on hardware. -The new default will use all available devices when creating OSDs by -running `ceph orch apply osd --all-available-devices`. It will still -be possible to override this default though the ceph-ansible_ syntax of -the `devices` list will be deprecated. In its place the OSD Service -Specification defined by cephadm_ drivegroups will be used and the tool -will apply it by running `ceph orch apply osd -i osd_spec.yml`. More -information on the `osd_spec.yaml` is covered in the Ceph End State -Definition YAML Input section. - -Ceph Placement Group Parameters -------------------------------- - -The new tool will deploy Ceph with the pg autotuner feature enabled. -Parameters to set the placement groups will be deprecated. Those who -wish to disable the pg autotuner may do so using Ceph CLI tools after -Ceph is deployed. - -Ceph End State Definition YAML Input ------------------------------------- - -Regardless of if Ceph is deployed *before* or *during* overcloud -deployment, a new playbook which deploys Ceph using cephadm_ will be -created and it will accept the following files as input: - -- deployed-metal.yaml: this file is generated by running a command - like `openstack overcloud node provision ... --output - deployed-metal.yaml` when using metalsmith_. - -- (Optional) "deployed-network-env": the file that is generated by - `openstack network provision` as described in review 752437_. This - file is used when deploying Ceph before the overcloud to identify - the storage networks. This will not be necessary when deploying Ceph - during overcloud deployment so it is optional and the storage - network will be identified instead as it is today. 
- -- (Optional) Any valid cephadm_ config.yml spec file as described in - Ceph issue 44205_ may be directly passed to the cephadm_ execution - and where applicable will override all relevant settings in the file - described at the end of this list. - -- (Optional) Any valid drivegroup_ YAML file (e.g. osd_spec.yml) may - be passed and the tooling will apply it with `ceph orch apply osd -i - osd_spec.yml`. This setting will override all relevant settings in - the file described at the end of this list. - -- tripleo_ceph_config.yaml: This file will contain configuration data - compatible with nearly all Ceph options supported today by TripleO - Heat Templates with the exception of the firewall, ceph pools and - cephx keys. A template of this file will be provided in as a default - in one of the new tripleo-ansible roles (e.g. tripleo_cephadm_common) - -Another source of data which is input into the new playbook is the -inventory which is covered next section. - -Ansible Inventory and Ansible User ----------------------------------- - -The current Ceph implementation uses the Ansible user tripleo-admin. -That user and the corresponding SSH keys are created by the -tripleo-ansible role tripleo_create_admin. This role uses the -heat-admin account which is the default account if `openstack -overcloud node provision` is not passed the `--overcloud-ssh-user` -option. The current implementation also uses the inventory generated -by tripleo-ansible-inventory. These resources will not be available -if Ceph is deployed *before* the overcloud and there's no reason they -are needed if Ceph is deployed *during* the overcloud deployment. - -Regardless if Ceph is deployed *before* or *during* overcloud, prior -to deploying Ceph, `openstack overcloud admin authorize` should be run -and it should pass options to enable a ceph-admin user which can be -used by cephadm_ and to allow SSH access for the ansible roles -described in this spec. - -A new command, `openstack overcloud ceph inventory` will be -implemented which creates an Ansible inventory for the new playbook -and roles described in this spec. This command will require the -following input: - -- deployed-metal.yaml: this file is generated by running a command - like `openstack overcloud node provision ... --output - deployed-metal.yaml` when using metalsmith_. - -- (Optional) roles.yaml: If this file is not passed then - /usr/share/openstack-tripleo-heat-templates/roles_data.yaml will be - used in its place. If the roles in deployed-metal.yaml do not have a - definition found in roles.yaml, then an error is thrown that a role - being used is undefined. By using this file, the TripleO composable - roles will continue to work as they to today. The services matching - "OS::TripleO::Services::Ceph*" will correspond to a new Ansible - inventory group and the hosts in that group will correspond to the - hosts found in deployed-metal.yaml. - -- (Options) `-u --ssh-user `: this is not a file but an option - which defaults to "ceph-admin". This represents the user which was - created created on all overcloud nodes by `openstack overcloud admin - authorize`. - -- (Options) `-i --inventory `: this is not a file but an option - which defaults to "/home/stack/inventory.yaml". This represents the - inventory which will be created. - -If Ceph is deployed before the overcloud, users will need to run this -command to generate an Ansible inventory file. 
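-
-A generated inventory might look roughly like the following; the group
-layout, host names and addresses are illustrative assumptions::
-
-  # Hypothetical /home/stack/inventory.yaml
-  ceph_mon:
-    hosts:
-      controller-0:
-        ansible_host: 192.168.24.10
-        ansible_user: ceph-admin
-  ceph_osd:
-    hosts:
-      ceph-storage-0:
-        ansible_host: 192.168.24.20
-        ansible_user: ceph-admin
-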
They will also need to -pass the path to the generated inventory file to `openstack overcloud -ceph provision` as input. - -If Ceph is deployed *during* overcloud deployment, users do not need -to know about this command as external_deploy_steps_tasks will run -this command directly to generate the inventory before running the new -tripleo ceph playbook with this inventory. - -Ceph End State Definition YAML Output -------------------------------------- - -The new playbook will write output data to one yaml file which -contains information about the Ceph cluster and may be used as -input to other processes. - -In the case that Ceph is deployed before the overcloud, if `openstack -overcloud ceph provision --output ceph_data.yaml` were run, then -`ceph_data.yaml` would then be passed to `openstack overcloud deploy -... -e ceph_data.yaml`. The `ceph_data.yaml` file will contain -key/value pairs such as the Ceph FSID, Name, and the Ceph monitor IPs. - -In the case that Ceph is deployed with the overcloud, if -external_deploy_steps_tasks calls the new playbook, then the same file -will be written to it's default location (/home/stack/ceph_data.yaml) -and the new client role will directly read the parameters from this file. - -An example of what this file, e.g. `ceph_data.yaml`, looks like is:: - - cluster: ceph - fsid: af25554b-42f6-4d2b-9b9b-d08a1132d3e899 - ceph_mon_ips: - - 172.18.0.5 - - 172.18.0.6 - - 172.18.0.7 - -In DCN deployments this type of information is extracted from the Heat -stack with `overcloud export ceph`. When the new method of deployment -is used this information can come directly from the `ceph_data.yaml` -file per Ceph cluster. This file will be passed as input to the new -ceph client role described in review 757644_. - -Requirements for deploying Ceph during Overcloud deployment ------------------------------------------------------------ - -If Ceph is deployed *during* the overcloud deployment, the following -should be the case: - -- The external_deploy_steps_tasks playbook will execute the new - Ansible roles after `openstack overcloud deploy` is executed. -- If `openstack overcloud node provision .. --output - deployed-metal.yaml` were run, then `deployed-metal.yaml` would be - input to `openstack overcloud deploy`. This is the current behavior - we have in V. -- Node scale up operations for day2 Ceph should be done by running - `openstack overcloud node provision` and then `openstack overcloud - deploy`. This will include reasserting the configuration of - OpenStack services unless those operations are specifically set to - "noop". -- Creates its own Ansible inventory and user -- The path to the "Ceph End State Definition YAML Input" is referenced - via a THT parameter so that when external_deploy_steps_tasks runs it - will pass this file to the new playbook. - -Requirements for deploying Ceph before Overcloud deployment ------------------------------------------------------------ - -If Ceph is deployed *before* the overcloud deployment, the following -should be the case: - -- The new Ansible roles will be triggered when the user runs a command - like `openstack overcloud ceph ...`; this command is meant - to be run after running `openstack overcloud node provision` to - trigger metalsmith_ but before running `openstack overcloud deploy`. -- If `openstack overcloud node provision .. --output - deployed-metal.yaml` were run, then `deployed-metal.yaml` would be - input to `openstack overcloud ceph provision`. 
-- Node scale up operations for day2 Ceph should be done by running - `openstack overcloud node provision`, `openstack overcloud network - provision`, and `openstack overcloud admin authorize` to enable a - ceph-admin user. However it isn't necessary to run `openstack - overcloud ceph ...` because the operator should connect to the Ceph - cluster itself to add the extra resources, e.g. use a cephadm shell - to add the new hardware as OSDs or other Ceph resource. If the - operation includes adding hyperconverged node with both Ceph and - OpenStack services then the third step will be to run `openstack - overcloud deploy`. -- Requires the user to create an inventory (and user) before running - using new Ceph deployment tools. -- "Ceph End State Definition YAML Input" is directly passed. - -Container Registry Support --------------------------- - -It is already supported to host a container registry on the -undercloud. This registry contains Ceph and OpenStack containers -and it may be populated before deployment or during deployment. -When deploying ceph before overcloud deployment it will need to be -populated before deployment. The new integration described in this -spec will direct cephadm_ to pull the Ceph containers from the same -source identified by `ContainerCephDaemonImage`. For example:: - - ContainerCephDaemonImage: undercloud.ctlplane.mydomain.tld:8787/ceph-ci/daemon:v4.0.13-stable-4.0-nautilus-centos-7-x86_64 - -Network Requirements for Ceph to be deployed before the Overcloud ------------------------------------------------------------------ - -The deployment will be completed by running the following commands: - -- `openstack overcloud node provision ...` -- `openstack overcloud network provision ...` (see review 751875_) -- `openstack overcloud ceph ...` (triggers cephadm/orchestrator) -- `openstack overcloud deploy ...` - -In the past stack updates did everything, but the split for -metalsmith_ established a new pattern. As per review 752437_ and a -follow up spec to move port management out of Heat, and apply network -configuration prior to the Heat stack deployment, it will eventually -be possible for the network to be configured before `openstack -overcloud deploy` is run. This creates an opening for the larger goal -of this spec which is a looser coupling between Ceph and OpenStack -deployment while retaining full integration. After the storage and -storage management networks are configured, then Ceph can be deployed -before any OpenStack services are configured. This should be possible -regardless of if the same node hosts both Ceph and OpenStack -containers. - -Development work on for deploying Ceph before overcloud deployment -can begin before the work described in reviews 752437_ and 760536_ -is completed by either of the following methods: - -Option 1: -- `openstack overcloud deploy --skip-tags step2,step3,step4,step5` -- use tripleo-ceph development code to stand up Ceph -- `openstack overcloud deploy --tags step2,step3,step4,step5` - -The last step will also configure the ceph clients. This sequence has -been verified to work in a proof of concept of this proposal. 
-
-Option 2:
-- Create the storage and storage management networks from the undercloud (using review 751875_)
-- Create the Ironic ports for each node as per review 760536_
-- Use the instances' nics properties to pass a list of dicts so that the node is provisioned not just on the ctlplane network but also on the storage and storage-management networks when it is provisioned with metalsmith_
-- Metalsmith/Ironic should attach the VIFs so that the nodes are connected to the Storage and Storage Management networks and Ceph can then be deployed.
-
-PID1 services used by Ceph
---------------------------
-
-During the W cycle we will not be able to fully deploy an HA Dashboard
-and HA RGW service before the overcloud is deployed. Thus, we will
-deploy these services as we do today, by using a Ceph tool, though
-we'll use cephadm_ in place of ceph-ansible_, and then complete the
-configuration of these services during overcloud deployment. Though
-the work to deploy the service itself will be done before overcloud
-deployment, the service won't be accessible in HA until after the
-overcloud deployment.
-
-Why can't we fully deploy the HA RGW service before the overcloud?
-Though cephadm_ can deploy an HA RGW service without TripleO, its
-implementation uses keepalived, which cannot be collocated with
-pacemaker, which is required on controller nodes. Thus, during the
-W cycle we will keep using the RGW service with haproxy and revisit
-making it a separate deployment in collaboration with the PID1 team
-in a future cycle.
-
-Why can't we fully deploy the HA Dashboard service before the
-overcloud? cephadm_ does not currently have a builtin HA model for
-its dashboard and the HA Dashboard is only available today when it
-is deployed by TripleO (unless it's configured manually).
-
-Ceph services which need VIPs (Dashboard and RGW) need to know what the
-VIPs will be in advance, but the VIPs do not need to be pingable before
-those Ceph services are deployed. Instead we will be able to know what
-the VIPs are before deploying Ceph per the work related to reviews
-751875_ and 760536_. We will pass these VIPs as input to cephadm_.
-
-For example, if we know the Dashboard VIP in advance, we can run the
-following::
-
-  ceph --cluster {{ cluster }} dashboard set-grafana-api-url {{ dashboard_protocol }}://{{ VIP }}:{{ grafana_port }}
-
-The new automation could then save the VIP parameter in the ceph mgr
-global config. A deployer could then wait for haproxy to become
-available from the overcloud deploy so that an HA dashboard similar to
-the one Victoria deploys is available.
-
-It would be simpler if we could address the above issues before
-overcloud deployment but doing so is out of the scope of this spec.
-However, we can aim to offer the dashboard in HA with the new tooling
-around the time of the X cycle and we hope to do so through
-collaboration with the Ceph orchestrator community.
-
-TripleO today also supports deploying the Ceph dashboard on any
-composed network. If the work included in review 760536_ allows us to
-compose and deploy the overcloud networks in advance, then we plan to
-pass parameters to cephadm to continue support of the dashboard on its
-own private network.
-
-TLS-Everywhere
---------------
-
-If Ceph is provisioned before the overcloud, then we will not have
-the certificates and keys generated by certmonger via TripleO's
-tls-everywhere framework. We expect cephadm to be able to deploy the
-Ceph Dashboard (with Grafana) and RGW (with HA via haproxy) with TLS
-enabled.
For the sake of orthogonality we could require that the -certificates and keys for RGW and Dashboard be generated outside of -TripleO so that these services could be fully deployed without the -overcloud. However, because we still need to use PID1 services as -described in the previous section, we will continue to use TripleO's -TLS-e framework. - -Assignee(s) ------------ - -- fmount -- fultonj -- gfidente -- jmolmo - -Work Items ----------- - -- Create a set of roles matching tripleo_ansible/roles/tripleo_cephadm_* - which can coexist with the current tripleo_ceph_common, - tripleo_ceph_uuid, tripleo_ceph_work_dir, tripleo_ceph_run_ansible, - roles. -- Patch the python tripleo client to support the new command options -- Create a new external_deploy_steps_tasks interface for deploying - Ceph using the new method during overcloud deployment -- Update THT scenario001/004 to use new method of ceph deployment - -Proposed Schedule ------------------ - -- OpenStack W: merge tripleo-ansible/roles/ceph_client descrbed in - review 757644_ early as it will work with ceph-ansible_ internal - ceph deployments too. Create tripleo-ansible/roles/cephadm_* roles - and tripleo client work to deploy Octopus as experimental and then - default (only if stable). If new tripleo-ceph is not yet stable, - then Wallaby will release with Nautilus support as deployed by - ceph-ansible_ just like Victoria. Either way Nautilus support via - current THT and tripleo-ansible triggering ceph-ansible_ will be - deprecated. - -- OpenStack X: tripleo-ansible/roles/cephadm_* become the default, - tripleo-ansible/roles/ceph_* are removed except the new ceph_client, - tripleo-heat-templates/environments/ceph-ansible/* removed. Migrate - to Ceph Pacific which GAs upstream in March 2021. - -Dependencies -============ - -- The spec for tripleo-ceph-client described in review 757644_ -- The spec for network data v2 format described in review 752437_ -- The spec for node ports described in review 760536_ - -The last two items above are not required if we deploy Ceph during -overcloud deployment. - -Testing -======= - -This project will be tested against at least two different scenarios. -This will ensure enough coverage on different use cases and cluster -configurations, which is pretty similar to the status of the job -definition currently present in the TripleO CI. -The defined scenarios will test different features that can be enabled -at day1. -As part of the implementation plan, the definition of the -tripleo-heat-templates environment CI files, which contain the testing job -parameters, is one of the goals of this project, and we should make sure -to have: - -- a basic scenario that covers the ceph cluster deployment using cephadm_; - we will gate the tripleo-ceph project against this scenario, as well - as the related tripleo heat templates deployment flow; - -- a more advanced use case with the purpose of testing the configuration - that can be applied to the ceph cluster and are orchestrated by the - tripleo-ceph project. - -The two items described above are pretty similar to the test suite that -today is maintained in the TripleO CI, and they can be implemented -reworking the existing scenarios, adding the proper support to the -cephadm_ deployment model. -A WIP patch can be created and submitted with the purpose of testing -and gating the tripleo-ceph project, and, when it becomes stable -enough, the scenario001 will be able to be officially merged. 
-The same approach can be applied to the existing scenario004, which -can be seen as an improvement of the first testing job. -This is mostly used to test the Rados Gateway service deployment and -the manila pools and key configuration. -An important aspect of the job definition process is related to -standalone vs multinode. -As seen in the past, multinode can help catching issues that are not -visible in a standalone environment, but of course the job -configuration can be improved in the next cycles, and we can start -with standalone testing, which is what is present today in CI. -Maintaining the CI jobs green will be always one of the goals of the -ceph integration project, providing a smooth path and a good experience -moving from ceph-ansible_ to cephadm_, continuously improving the testing -area to ensure enough coverage of the implemented features. - -Documentation Impact -==================== - -tripleo-docs will be updated to cover Ceph integration with the new tool. - - -.. Indirect Hyperlink Targets - -.. _cephadm: https://docs.ceph.com/en/latest/cephadm/ -.. _orchestrator: https://docs.ceph.com/en/latest/mgr/orchestrator/ -.. _ceph-ansible: https://github.com/ceph/ceph-ansible -.. _metalsmith: https://docs.openstack.org/project-deploy-guide/tripleo-docs/latest/provisioning/baremetal_provision.html -.. _cephadm-adopt: https://github.com/ceph/ceph-ansible/blob/master/infrastructure-playbooks/cephadm-adopt.yml -.. _drivegroup: https://docs.ceph.com/en/latest/cephadm/drivegroups -.. _cephadm_upgrade: https://docs.ceph.com/docs/master/cephadm/upgrade -.. _44205: https://tracker.ceph.com/issues/44205 -.. _44873: https://tracker.ceph.com/issues/44873 -.. _757644: https://review.opendev.org/#/c/757644 -.. _752437: https://review.opendev.org/#/c/752437 -.. _751875: https://review.opendev.org/#/c/751875 -.. _757644: https://review.opendev.org/#/c/757644 -.. _760536: https://review.opendev.org/#/c/760536 diff --git a/specs/wallaby/triplo-bgp-frrouter.rst b/specs/wallaby/triplo-bgp-frrouter.rst deleted file mode 100644 index 50ed079d..00000000 --- a/specs/wallaby/triplo-bgp-frrouter.rst +++ /dev/null @@ -1,245 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -============================== -Install and Configure FRRouter -============================== - -The goal of this spec is to design and plan requirements for adding support to -TripleO to install and provide a basic configuration of Free Range Router (FRR) -on overcloud nodes in order to support BGP dynamic routing. There are multiple -reasons why an administrator might want to run FRR, including to obtain -multiple routes on multiple uplinks to northbound switches, or to advertise -routes to networks or IP addresses via dynamic routing protocols. - -Problem description -=================== - -There are several use cases for using BGP, and in fact there are separate -efforts underway to utilize BGP for the control plane and data plane. - -BGP may be used for equal-cost multipath (ECMP) load balancing of outbound -links, and bi-directional forwarding detection (BFD) for resiliency to ensure -that a path provides connectivity. For outbound connectivity BGP will learn -routes from BGP peers. - -BGP may be used for advertising routes to API endpoints. In this model HAProxy -will listen on an IP address and FRR will advertise routes to that IP to BGP -peers. 
High availability for HAProxy is provided via other means such as -Pacemaker, and FRR will simply advertise the virtual IP address when it is -active on an API controller. - -BGP may also be used for routing inbound traffic to provider network IPs or -floating IPs for instance connectivity. The Compute nodes will run FRR to -advertise routes to the local VM IPs or floating IPs hosted on the node. FRR -has a daemon named Zebra that is responsible for exchanging routes between -routing daemons such as BGP and the kernel. The *redistribute connected* -statement in the FRR configuration will cause local IP addresses on the host -to be advertised via BGP. Floating IP addresses are attached to a loopback -interface in a namespace, so they will be redistributed using this method. -Changes to OVN will be required to ensure provider network IPs assigned to VMs -will be assigned to a loopback interface in a namespace in a similar fashion. - -Proposed Change -=============== - -Overview --------- - -Create a container with FRR. The container will run the BGP daemon, BFD -daemon, and Zebra daemon (which copies routes to/from the kernel). Provide a -basic configuration that would allow BGP peering with multiple peers. In the -control plane use case the FRR container needs to be started along with the HA -components, but in the data plane use case the container will be a sidecar -container supporting Neutron. The container is defined in a change proposed -here: [1]_ - -The container will be deployed using a TripleO Deployment Service. The service -will use Ansible to template the FRR configuration file, and a simple -implementation exists in a proposed change here: [2]_ - -The current FRR Ansible module is insufficient to configure BGP parameters and -would need to be extended. At this time the Ansible Networking development -team is not interested in extending the FRR module, so the configuration will -be provided using TripleO templates for the FRR main configuration file and -daemon configuration file. Those templates are defined in a change proposed -here: [3]_ - -A user-modifiable environment file will need to be provided so the installer -can provide the configuration data needed for FRR (see User Experience below). - -OVN will need to be modified to enable the Compute node to assign VM provider -network IPs to a loopback interface inside a namespace. These IP address will -not be used for sending or receiving traffic, only for redistributing routes -to the IPs to BGP peers. Traffic which is sent to those IP addresses will be -forwarded to the VM using OVS flows on the hypervisor. An example agent for -OVN has been written to demonstrate how to monitor the southbound OVN DB and -create loopback IP addresses when a VM is started on a Compute node. The OVN -changes will be detailed in a separate OVN spec. Demonstration code is -available on Github: [4]_ - -User Experience -^^^^^^^^^^^^^^^ - -The installer will need to provide some basic information for the FRR -configuration, including whether to enable BFD, BGP IPv4, BGP IPv6, -and other settings. See the Example Configuration Data section below. - -Additional user-provided data may include inbound or outbound filter prefixes. -The default filter prefixes will accept only default routes via BGP, and will -export only loopback IPs, which have a /32 subnet mask for IPv4 or /128 subnet -mask for IPv6. - -Example Configuration Data -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. 
code-block:: yaml - - tripleo_frr_bfd: false - tripleo_frr_bgp: false - tripleo_frr_bgp_ipv4: true - tripleo_frr_bgp_ipv4_allowas_in: false - tripleo_frr_bgp_ipv6: true - tripleo_frr_bgp_ipv6_allowas_in: false - tripleo_frr_config_basedir: "/var/lib/config-data/ansible-generated/frr" - tripleo_frr_hostname: "{{ ansible_hostname }}" - tripleo_frr_log_level: informational - tripleo_frr_watchfrr: true - tripleo_frr_zebra: false - -Alternatives -============ - -1. Routing outbound traffic via multiple uplinks - - Fault-tolerance and load-balancing for outbound traffic is typically - provided by bonding Ethernet interfaces. This works for most cases, but - is susceptible to unidirectional interface failure, a situation where - traffic works in only one direction. The LACP protocol for bonding does - provide some protection against unidirectional traffic failures, but is not - as robust as bi-directional forwarding detection (BFD) provided by FRR. - -2. Routing inbound traffic to highly-available API endpoints - - The most common method currently used to provide HA for API endpoints is - to use a virtual IP that fails over from active to standby nodes using a - shared Ethernet MAC address. The drawback to this method is that all - standby API controllers must reside on the same layer 2 segment (VLAN) as - the active controller. This presents a challenge if the operator wishes - to place API controllers in different failure domains for power and/or - networking. A BGP daemon avoids this limitation by advertising a route - to the shared IP address directly to the BGP peering router over a routed - layer 3 link. - - -3. Routing to Neutron IP addresses - - Data plane traffic is usually delivered to provider network or floating - IP addresses via the Ethernet MAC address associated with the IP and - determined via ARP requests on a shared VLAN. This requires that every - Compute node which may host a provider network IP or floating IP has - the appropriate VLAN trunked to a provider bridge attached to an interface - or bond. This makes it impossible to migrate VMs or floating IPs across - layer 3 boundaries in edge computing topologies or in a fully layer 3 - routed datacenter. - - -Security Impact -=============== - -There have been no direct security impacts identified with this approach. The -installer should ensure that security policy on the network as whole prevents -IP spoofing which could divert legitimate traffic to an unintended host. This -is a concern whether or not the OpenStack nodes are using BGP themselves, and -may be an issue in environments using traditional routing architecture or -static routes. - - -Upgrade Impact -============== - -When (if) we remove the capability to manage network resources in the -overcloud heat stack, we will need to evaluate whether we want to continue -to provide BGP configuration as a part of the overcloud configuration. - -If an operator wishes to begin using BGP routing at the same time as -upgrading the version of OpenStack used they will need to provide the -required configuration parameters if they differ from the defaults provided -in the TripleO deployment service. - - -Performance Impact -================== - -No performance impacts are expected, either positive or negative by using -this approach. Attempts have been made to minimize memory and CPU usage by -using conservative defaults in the configuration. 
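To illustrate how a deployment service might consume the example configuration data shown earlier, the following is a minimal Ansible sketch that writes an FRR daemons file from those variables. The play targets, file paths and daemons-file layout are assumptions for illustration only; this is not the proposed TripleO deployment service or the final role interface.

.. code-block:: yaml

   # Illustrative sketch only: play targets, paths and the daemons file
   # layout are assumptions, not the proposed TripleO service definition.
   - hosts: overcloud
     become: true
     vars:
       tripleo_frr_bfd: true
       tripleo_frr_bgp: true
       tripleo_frr_zebra: true
       tripleo_frr_config_basedir: /var/lib/config-data/ansible-generated/frr
     tasks:
       - name: Ensure the FRR configuration directory exists
         ansible.builtin.file:
           path: "{{ tripleo_frr_config_basedir }}/etc/frr"
           state: directory
           mode: "0750"
       - name: Enable the selected daemons in the FRR daemons file
         ansible.builtin.copy:
           dest: "{{ tripleo_frr_config_basedir }}/etc/frr/daemons"
           mode: "0640"
           content: |
             bgpd={{ 'yes' if tripleo_frr_bgp else 'no' }}
             bfdd={{ 'yes' if tripleo_frr_bfd else 'no' }}
             zebra={{ 'yes' if tripleo_frr_zebra else 'no' }}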
- - -Documentation Impact -==================== - -This is a new TripleO deployment service and should be properly documented -to instruct installers in the configuration of FRR for their environment. - -The TripleO docs will need updates in many sections, including: - -* `TripleO OpenStack Deployment - `_ -* `Provisioning Baremetal Before Overcloud Deploy - `_ -* `Deploying with Custom Networks - `_ -* `Configuring Network Isolation - `_ -* `Deploying Overcloud with L3 routed networking - `_ - -The FRR daemons are documented elsewhere, and we should not need to document -usage of BGP in general, as this is a standard protocol. The configuration of -top-of-rack switches is different depending on the make and model of routing -switch used, and we should not expect to provide configuration examples for -network hardware. - -Implementation -============== - -The implementation will require a new TripleO deployment service, container -definition, and modifications to the existing role definitions. Those changes -are proposed upstream, see the References section for URL links. - - -Assignee(s) -=========== - -Primary assignee: - * Dan Sneddon - -Secondary assignees: - * Michele Baldessari - * Carlos Gonclaves - * Daniel Alvarez Sanchez - * Luis Tomas Bolivar - - -Work Items -========== - -* Develop the container definition -* Define the TripleO deployment service templates -* Define the TripleO Ansible role -* Modify the existing TripleO roles to support the above changes -* Merge the changes to the container, deployment service, and Ansible role -* Ensure FRR packages are available for supported OS versions - - -References -========== - -.. [1] `Review: DNR/DNM Frr support `_. -.. [2] `Review: Add tripleo_frr role `_. -.. [3] `Review: WIP/DNR/DNM FRR service `_. -.. [4] `OVN BGP Agent `_. diff --git a/specs/wallaby/triplo-network-data-v2-node-ports.rst b/specs/wallaby/triplo-network-data-v2-node-ports.rst deleted file mode 100644 index 79b1f51d..00000000 --- a/specs/wallaby/triplo-network-data-v2-node-ports.rst +++ /dev/null @@ -1,675 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -==================================================== -Network Data v2 - node ports and node network config -==================================================== - -With "Network Data v2" the goal is to move management of network resources -out of the heat stack. The schema spec [1]_ talked about the -``network_data.yaml`` format and managing networks, segments and subnets. This -spec follows up with node ports for composable networks and moving the node -network configuration action to the baremetal/network configuration workflow. - - -Problem description -=================== - -Applying a network change on day 2, currently requires a full stack update -since network resources such as ports are managed by heat. It has also been -problematic to create ports for large scale deployments; neutron on the single -node undercloud gets overwhelmed and it is difficult to throttle port creation -in Heat. As an early indication on the performance of port creation with the -proposed ansible module: - -Performance stats: 100 nodes x 3 networks = 300 ports - -.. code-block:: text - - 4xCPU 1.8 GHz (8GB) 8x CPU 2.6 GHz (12GB) - ------------------- -------------------------------- - Concurr: 10 20 10 4 - ........ .............. ......... ......... ......... 
- Create real 5m58.006s 1m48.518s 1m51.998s 1m25.022s - Delete: real 4m12.812s 0m47.475s 0m48.956s 1m19.543s - Re-run: real 0m19.386s 0m4.389s 0m4.453s 0m4.977s - - -Proposed Change -=============== - -Extend the baremetal provisioning workflow that runs before overcloud -deployment to also create ports for composable networks. The baremetal -provisioning step already create ports for the provisioning network. Moving -the management of ports for composable networks to this workflow will -consolidate all port management into one workflow. - -Also make baremetal provisioning workflow execute the tripleo-ansible -``tripleo_network_config`` role to configure node networking after -node provisioning. - -The deploy workflow would be: - -#. Operator defines composable networks in network data YAML file. -#. Operator provisions composable networks by running the - ``openstack overcloud network provision`` command, providing the network - data YAML file as input. -#. Operator defines roles and nodes in the baremetal deployment YAML file. This - YAML also defines the networks for each role. -#. Operator deploys baremetal nodes by running the - ``openstack overcloud node provision`` command. This step creates ports in - neutron, and also configures networking; including composable networks; on - the nodes using ansible role to apply network config with os-net-config - [2]_. -#. Operator deploys heat stack including the environment files produced by the - commands executed in the previous steps by running the - ``openstack overcloud deploy`` command. -#. Operator executes config-download to install and configure openstack on the - overcloud nodes. *(optional - only if overcloud deploy command executed with - ``-stack-only``)* - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Harald Jensås - -Approver(s) ------------ - -Primary approver: - TODO - - -Implementation Details ----------------------- - -The baremetal YAML definition will be extended, adding the ``networks`` and the -``network_config`` keys in role ``defaults`` as well as per-instance to support -``fixed_ip`` addressing, manually pre-created port resource and per-node -network configuration template. - -The ``networks`` will replace the current ``nic`` key, until the ``nic`` key is -deprecated either can be used but not both at the same time. Networks in -``networks`` will support a boolean key ``vif`` which indicate if the port -should be attached in Ironic or not. If no network with ``vif: true`` is -specified an implicit one for the control plane will be appended: - -.. code-block:: yaml - - - network: ctlplane - vif: true - -For networks with ``vif: true``, ports will be created by metalsmith. For -networks with ``vif: false`` (or ``vif`` not specified) the workflow will -create neutron ports based on the YAML definition. - -The neutron ports will initially be tagged with the *stack name* and the -instance *hostname*, these tags are used for idempotency. The ansible module -managing ports will get all ports with the relevant tags and then add/remove -ports based on the expanded roles defined in the Baremetal YAML definition. -(The *hostname* and *stack_name* tags are also added to ports created with heat -in this tripleo-heat-templates change [4]_, to enable *adoption* of neutron -ports created by heat for the upgrade scenario.) - -Additionally the ports will be tagged with the ironic node uuid when this is -available. Full set of tags are shown in the example below. - -.. 
code-block:: json - - { - "port": { - "name": "controller-1-External", - "tags": ["tripleo_ironic_uuid=", - "tripleo_hostname=controller-1", - "tripleo_stack_name=overcloud"] - } - }
- -.. Note:: In deployments where baremetal nodes have multiple physical NICs, - multiple networks can have ``vif: true``, so that VIF attach - in ironic and proper neutron port binding happen. In a scenario - where neutron on the Undercloud is managing the switch this would - enable automation of the Top-of-Rack switch configuration.
- -Mapping of the port data for overcloud nodes will go into a ``NodePortMap`` -parameter in tripleo-heat-templates. The map will contain submaps for each -node, keyed by the node name. Initially the ``NodePortMap`` will be consumed by -alternative *fake-port* -``OS::TripleO::{{role.name}}::Ports::{{network.name}}Port`` resource templates. -In the final implementation the environment file created can be extended and -the entire ``OS::TripleO::{{role.name}}`` resource can be replaced with a -template that references parameters in the generated environment directly, i.e. a -re-implemented ``puppet/role.role.j2.yaml`` without the server and port -resources. The ``NodePortMap`` will be added to the -*overcloud-baremetal-deployed.yaml* created by the workflow creating the -overcloud node port resources.
- -Network ports for ``vif: false`` networks will be managed by a new ansible -module ``tripleo_overcloud_network_ports``; the input for this module will be a -list of instance definitions as generated by the -``tripleo_baremetal_expand_roles`` ansible module. The -``tripleo_baremetal_expand_roles`` ansible module will be extended to add -network/subnet information from the baremetal deployment YAML definition.
- -The baremetal provision workflow will be extended to write an ansible inventory; -we should try to extend tripleo-ansible-inventory so that the baremetal -provisioning workflow can re-use existing code to create the inventory. -The inventory will be used to configure networking on the provisioned nodes -using the **tripleo-ansible** ``tripleo_network_config`` ansible role.
- - -Already Deployed Servers -~~~~~~~~~~~~~~~~~~~~~~~~
- -The Baremetal YAML definition will be used to describe the **pre-deployed** -servers baremetal deployment. In this scenario there is no Ironic node to -update, no ironic UUID to add to a port's tags and no ironic node to attach -VIFs to.
- -All ports, including the ctlplane port, will be managed by the -``tripleo_overcloud_network_ports`` ansible module. The Baremetal YAML -definition for a deployment with pre-deployed servers will have to include an -``instance`` entry for each pre-deployed server. This entry will have the -``managed`` key set to ``false``.
- -It should be possible for an already deployed server to have a management -address that is completely separate from the tripleo managed addresses. The -Baremetal YAML definition can be extended to carry a ``management_ip`` field -for this purpose. In case no management address is available the ctlplane -network entry for pre-deployed instances must have ``fixed_ip`` configured.
- -The deployment workflow will *short circuit* the baremetal provisioning of -``managed: false`` instances. The Baremetal YAML definition can define a -mix of *already deployed server* instances, and instances that should be -provisioned via metalsmith. See :ref:`baremetal_yaml_pre_provsioned`.
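To make the port management described above concrete, the following is a hypothetical invocation of the proposed ``tripleo_overcloud_network_ports`` module from a playbook task. The argument names and the ``expanded_instances`` variable are illustrative assumptions, not the final module interface.

.. code-block:: yaml

   # Hypothetical sketch only: argument names are assumptions, not the final
   # interface of the proposed tripleo_overcloud_network_ports module.
   - name: Manage composable network ports for the expanded instances
     tripleo_overcloud_network_ports:
       stack_name: overcloud
       # assumed to be the output of the tripleo_baremetal_expand_roles module
       instances: "{{ expanded_instances }}"
       state: present
     register: overcloud_network_ports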
- -YAML Examples -~~~~~~~~~~~~~ - -Example: Baremetal YAML definition with defaults properties -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: yaml - - - name: Controller - count: 1 - hostname_format: controller-%index% - defaults: - profile: control - network_config: - template: templates/multiple_nics/multiple_nics.j2 - physical_bridge_name: br-ex - public_interface_name: nic1 - network_deployment_actions: ['CREATE'] - net_config_data_lookup: {} - networks: - - network: ctlplane - vif: true - - network: external - subnet: external_subnet - - network: internal_api - subnet: internal_api_subnet - - network: storage - subnet: storage_subnet - - network: storage_mgmt - subnet: storage_mgmt_subnet - - network: Tenant - subnet: tenant_subnet - - name: Compute - count: 1 - hostname_format: compute-%index% - defaults: - profile: compute - network_config: - template: templates/multiple_nics/multiple_nics.j2 - physical_bridge_name: br-ex - public_interface_name: nic1 - network_deployment_actions: ['CREATE'] - net_config_data_lookup: {} - networks: - - network: ctlplane - vif: true - - network: internal_api - subnet: internal_api_subnet - - network: tenant - subnet: tenant_subnet - - network: storage - subnet: storage_subnet - -Example: Baremetal YAML definition with per-instance overrides -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: yaml - - - name: Controller - count: 1 - hostname_format: controller-%index% - defaults: - profile: control - network_config: - template: templates/multiple_nics/multiple_nics.j2 - physical_bridge_name: br-ex - public_interface_name: nic1 - network_deployment_actions: ['CREATE'] - net_config_data_lookup: {} - bond_interface_ovs_options: - networks: - - network: ctlplane - vif: true - - network: external - subnet: external_subnet - - network: internal_api - subnet: internal_api_subnet - - network: storage - subnet: storage_subnet - - network: storage_mgmt - subnet: storage_mgmt_subnet - - network: tenant - subnet: tenant_subnet - instances: - - hostname: controller-0 - name: node00 - networks: - - network: ctlplane - vif: true - - network: internal_api: - fixed_ip: 172.21.11.100 - - hostname: controller-1 - name: node01 - networks: - External: - port: controller-1-external - - hostname: controller-2 - name: node02 - - name: ComputeLeaf1 - count: 1 - hostname_format: compute-leaf1-%index% - defaults: - profile: compute-leaf1 - networks: - - network: internal_api - subnet: internal_api_subnet - - network: tenant - subnet: tenant_subnet - - network: storage - subnet: storage_subnet - instances: - - hostname: compute-leaf1-0 - name: node03 - network_config: - template: templates/multiple_nics/multiple_nics_dpdk.j2 - physical_bridge_name: br-ex - public_interface_name: nic1 - network_deployment_actions: ['CREATE'] - net_config_data_lookup: {} - num_dpdk_interface_rx_queues: 1 - networks: - - network: ctlplane - vif: true - - network: internal_api - fixed_ip: 172.21.12.105 - - network: tenant - port: compute-leaf1-0-tenant - - network: storage - subnet: storage_subnet - - -.. _baremetal_yaml_pre_provsioned: - -Example: Baremetal YAML for Already Deployed Servers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. 
code-block:: yaml - - - name: Controller - count: 3 - hostname_format: controller-%index% - defaults: - profile: control - network_config: - template: templates/multiple_nics/multiple_nics.j2 - networks: - - network: ctlplane - - network: external - subnet: external_subnet - - network: internal_api - subnet: internal_api_subnet - - network: storage - subnet: storage_subnet - - network: storage_mgmt - subnet: storage_mgmt_subnet - - network: tenant - subnet: tenant_subnet - managed: false - instances: - - hostname: controller-0 - networks: - - network: ctlplane - fixed_ip: 192.168.24.10 - - hostname: controller-1 - networks: - - network: ctlplane - fixed_ip: 192.168.24.11 - - hostname: controller-2 - networks: - - network: ctlplane - fixed_ip: 192.168.24.12 - - name: Compute - count: 2 - hostname_format: compute-%index% - defaults: - profile: compute - network_config: - template: templates/multiple_nics/multiple_nics.j2 - networks: - - network: ctlplane - - network: internal_api - subnet: internal_api_subnet - - network: tenant - subnet: tenant_subnet - - network: storage - subnet: storage_subnet - instances: - - hostname: compute-0 - managed: false - networks: - - network: ctlplane - fixed_ip: 192.168.24.100 - - hostname: compute-1 - managed: false - networks: - - network: ctlplane - fixed_ip: 192.168.24.101 - -Example: NodeNetworkDataMappings -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: yaml - - NodePortMap: - controller-0: - ctlplane: - ip_address: 192.168.24.9 (2001:DB8:24::9) - ip_subnet: 192.168.24.9/24 (2001:DB8:24::9/64) - ip_address_uri: 192.168.24.9 ([2001:DB8:24::9]) - internal_api: - ip_address: 172.18.0.9 (2001:DB8:18::9) - ip_subnet: 172.18.0.9/24 (2001:DB8:18::9/64) - ip_address_uri: 172.18.0.9 ([2001:DB8:18::9]) - tenant: - ip_address: 172.19.0.9 (2001:DB8:19::9) - ip_subnet: 172.19.0.9/24 (2001:DB8:19::9/64) - ip_address_uri: 172.19.0.9 ([2001:DB8:19::9]) - compute-0: - ctlplane: - ip_address: 192.168.24.15 (2001:DB8:24::15) - ip_subnet: 192.168.24.15/24 (2001:DB8:24::15/64) - ip_address_uri: 192.168.24.15 ([2001:DB8:24::15]) - internal_api: - ip_address: 172.18.0.15 (2001:DB8:18::1) - ip_subnet: 172.18.0.15/24 (2001:DB8:18::1/64) - ip_address_uri: 172.18.0.15 ([2001:DB8:18::1]) - tenant: - ip_address: 172.19.0.15 (2001:DB8:19::15) - ip_subnet: 172.19.0.15/24 (2001:DB8:19::15/64) - ip_address_uri: 172.19.0.15 ([2001:DB8:19::15]) - -Example: Ansible inventory -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. 
code-block:: yaml - - Controller: - vars: - role_networks: - - External - - InternalApi - - Tenant - role_networks_lower: - External: external - InternalApi: internal_api - Tenant: tenant - networks_all: - - External - - InternalApi - - Tenant - neutron_physical_bridge_name: br-ex - neutron_public_interface_name: nic1 - tripleo_network_config_os_net_config_mappings: {} - network_deployment_actions: ['CREATE', 'UPDATE'] - ctlplane_subnet_cidr: 24 - ctlplane_mtu: 1500 - ctlplane_gateway_ip: 192.168.24.254 - ctlplane_dns_nameservers: [] - dns_search_domains: [] - ctlplane_host_routes: {} - internal_api_cidr: 24 - internal_api_gateway_ip: 172.18.0.254 - internal_api_host_routes: [] - internal_api_mtu: 1500 - internal_api_vlan_id: 20 - tenant_cidr: 24 - tenant_api_gateway_ip: 172.19.0.254 - tenant_host_routes: [] - tenant_mtu: 1500 - hosts: - controller-0: - ansible_host: 192.168.24.9 - ctlplane_ip: 192.168.24.9 - internal_api_ip: 172.18.0.9 - tenant_ip: 172.19.0.9 - Compute: - vars: - role_networks: - - InternalApi - - Tenant - role_networks_lower: - InternalApi: internal_api - Tenant: tenant - networks_all: - - External - - InternalApi - - Tenant - neutron_physical_bridge_name: br-ex - neutron_public_interface_name: nic1 - tripleo_network_config_os_net_config_mappings: {} - network_deployment_actions: ['CREATE', 'UPDATE'] - ctlplane_subnet_cidr: 24 - ctlplane_mtu: 1500 - ctlplane_gateway_ip: 192.168.25.254 - ctlplane_dns_nameservers: [] - dns_search_domains: [] - ctlplane_host_routes: {} - internal_api_cidr: 24 - internal_api_gateway_ip: 172.18.1.254 - internal_api_host_routes: [] - internal_api_mtu: 1500 - internal_api_vlan_id: 20 - tenant_cidr: 24 - tenant_api_gateway_ip: 172.19.1.254 - tenant_host_routes: [] - tenant_mtu: 1500 - hosts: - compute-0: - ansible_host: 192.168.25.15 - ctlplane_ip: 192.168.25.15 - internal_ip: 172.18.1.15 - tenant_ip: 172.19.1.15 - - -TODO ----- - -* Constraint validation, for example ``BondInterfaceOvsOptions`` uses - ``allowed_pattern: ^((?!balance.tcp).)*$`` to ensure balance-tcp bond mode is - not used, as it is known to cause packet loss. - -Work Items ----------- - -#. Write ansible inventory after baremetal provisioning - - Create an ansible inventory, similar to the inventory created by config- - download. The ansible inventory is required to apply network - configuration to the deployed nodes. - - We should try to extend tripleo-ansible-inventory so that the baremetal - provisioning workflow can re-use existing code to create the inventory. - - It is likely that it makes sense for the workflow to also run the - tripleo-ansible role tripleo_create_admin to create the *tripleo-admin* - ansible user. - -#. Extend baremetal provisioning workflow to create neutron ports and - update the ironic node ``extra`` field with the ``tripleo_networks`` map. - -#. The baremetal provisioning workflow needs a *pre-deployed-server* option - that cause it to not deploy baremetal nodes, only create network ports. - When this option is used the baremetal deployment YAML file will also - describe the already provisioned nodes. - -#. Apply and validate network configuration using the **triple-ansible** - ``tripleo_network_config`` ansible role. This step will be integrated in - the provisioning command. - -#. Disable and remove management of composable network ports in - tripleo-heat-templates. - -#. Change the Undercloud and Standalone deploy to apply network configuration - prior to the creating the ephemeral heat stack using the - ``tripleo_network_config`` ansible role. 
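As a rough sketch of the network configuration step in the work items above, applying the node network configuration with the **tripleo-ansible** ``tripleo_network_config`` role could look like the play below. Only ``tripleo_network_config_os_net_config_mappings`` and the template path are taken from the examples earlier; the other variable names and the play targets are assumptions about the role interface.

.. code-block:: yaml

   # Illustrative sketch: variable names other than
   # tripleo_network_config_os_net_config_mappings are assumptions.
   - hosts: Controller:Compute
     become: true
     tasks:
       - name: Render and apply the os-net-config based network configuration
         ansible.builtin.include_role:
           name: tripleo_network_config
         vars:
           tripleo_network_config_template: templates/multiple_nics/multiple_nics.j2
           tripleo_network_config_os_net_config_mappings: {}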
- -Testing -=======
- -Multinode OVB CI jobs with network-isolation will be updated to test the new -workflow.
- -Upgrade Impact -==============
- -During an upgrade that switches to network ports managed outside of the heat stack, -the ``PortDeletionPolicy`` must be set to ``retain`` during the update/upgrade -*prepare* step, so that the existing neutron ports (which will be adopted by -the pre-heat port management workflow) are not deleted when running the update/upgrade -*converge* step.
- -Moving node network configuration out of tripleo-heat-templates will require -manual (or scripted) migration of settings controlled by heat template -parameters to the input file used for baremetal/network provisioning. At least -the following parameters are affected:
- -* NeutronPhysicalBridge -* NeutronPublicInterface -* NetConfigDataLookup -* NetworkDeploymentActions
- -Parameters that will be deprecated:
- -* NetworkConfigWithAnsible -* {{role.name}}NetworkConfigTemplate -* NetworkDeploymentActions -* {{role.name}}NetworkDeploymentActions -* BondInterfaceOvsOptions -* NumDpdkInterfaceRxQueues -* {{role.name}}LocalMtu -* NetConfigDataLookup -* DnsServers -* DnsSearchDomains -* ControlPlaneSubnetCidr -* HypervisorNeutronPublicInterface -* HypervisorNeutronPhysicalBridge
- -The environment files used to select one of the pre-defined nic config -templates will no longer work. The template to use must be set in the YAML -defining the baremetal/network deployment. This affects the following -environment files:
- -* environments/net-2-linux-bonds-with-vlans.j2.yaml -* environments/net-bond-with-vlans.j2.yaml -* environments/net-bond-with-vlans-no-external.j2.yaml -* environments/net-dpdkbond-with-vlans.j2.yaml -* environments/net-multiple-nics.j2.yaml -* environments/net-multiple-nics-vlans.j2.yaml -* environments/net-noop.j2.yaml -* environments/net-single-nic-linux-bridge-with-vlans.j2.yaml -* environments/net-single-nic-with-vlans.j2.yaml -* environments/net-single-nic-with-vlans-no-external.j2.yaml
- -Documentation Impact -====================
- -The documentation effort is **heavy** and will need to be incrementally -updated. As a minimum, a separate page explaining the new process must be -created.
- -The TripleO docs will need updates in many sections, including:
- -* `TripleO OpenStack Deployment - `_ -* `Provisioning Baremetal Before Overcloud Deploy - `_ -* `Deploying with Custom Networks - `_ -* `Configuring Network Isolation - `_ -* `Deploying Overcloud with L3 routed networking - `_
- - -Alternatives -============
- -#. **Not changing how ports are created**
- - In this case we keep creating the ports with heat; the do-nothing - alternative.
- -#. **Create a completely separate workflow for composable network ports**
- - A separate workflow that can run before/after node provisioning. It can read - the same YAML format as baremetal provisioning, or it can have its own YAML - format.
- - The problem with this approach is that we lose the possibility to store - relations between neutron ports and baremetal nodes in a database. As in, we'd - need our own database (a file) maintaining the relationships.
- - .. Note:: We need to implement this workflow anyway for a pre-deployed - server scenario, but instead of a completely separate workflow - the baremetal deploy workflow can take an option to not - provision nodes.
- -#. **Create ports in ironic and bind neutron ports**
- - Instead of creating ports unknown to ironic, create ports for the ironic - nodes in the baremetal service.
- - The issue is that ironic does not have a concept of virtual ports, so we - would have to either add this support in ironic, switch TripleO to use - neutron trunk ports, or create *fake* ironic ports that don't actually - reflect NICs on the baremetal node. (This abandoned ironic spec [3]_ discusses - one approach for virtual port support, but it was abandoned in favor of - neutron trunk ports.)
- - With each PTG there is a recurring suggestion to replace neutron with a - more lightweight IPAM solution. However, the effort to actually integrate - it properly with ironic and neutron for composable networks probably isn't - time well spent.
- - -References -==========
- -.. [1] `Review: Spec for network data v2 format `_. -.. [2] `os-net-config `_. -.. [3] `Abandoned spec for VLAN Aware Baremetal Instances `_. -.. [4] `Review: Add hostname and stack_name tags to ports `_. diff --git a/specs/wallaby/triplo-network-data-v2.rst b/specs/wallaby/triplo-network-data-v2.rst deleted file mode 100644 index 04d64d5c..00000000 --- a/specs/wallaby/triplo-network-data-v2.rst +++ /dev/null @@ -1,348 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode
- -=============================== -Network Data format/schema (v2) -===============================
- -The network data schema (``network_data.yaml``) used to define composable -networks in TripleO has had several additions since it was first introduced. -Due to legacy compatibility some additions make the schema somewhat -non-intuitive, such as the support for routed networks, where the ``subnets`` -map was introduced.
- -The goal of this spec is to drive discussion and settle on a new network data -(v2) format that will be used once management of network resources such -as networks, segments and subnets are moved out of the heat stack.
- -Problem description -===================
- -The current schema is somewhat inconsistent, and not as precise as it could -be. For example, the ``base`` subnet is at level-0, while additional -subnets are in the ``subnets`` map. It would be more intuitive to define -all subnets in the ``subnets`` map.
- -Currently the network resource properties are configured via a mix of -parameters in the heat environment and network data. For example -``dns_domain``, ``admin_state_up``, ``enable_dhcp``, ``ipv6_address_mode``, -``ipv6_ra_mode`` and ``shared`` properties are configured via Heat parameters, -while other properties such as ``cidr``, ``gateway_ip``, ``host_routes`` etc. -are defined in network data.
- -Proposed Change -===============
- -Overview ---------
- -Change the network data format so that all network properties are managed in -network data, so that network resources can be managed outside of the heat -stack.
- -.. note:: Network data v2 format will only be used with the new tooling that - will manage networks outside of the heat stack.
- -Network data v2 format should stay compatible with tripleo-heat-templates -jinja2 rendering outside of the ``OS::TripleO::Network`` resource and its -subresources ``OS::TripleO::Network::{{network.name}}``.
- -User Experience -^^^^^^^^^^^^^^^
- -Tooling will be provided for users to export the network information from -an existing deployment. This tooling will output a network data file in -v2 format, which from then on can be used to manage the network resources -using tripleoclient commands or tripleo-ansible cli playbooks.
- -The command line tool to manage the network resources will output the -environment file that must be included when deploying the heat stack. (Similar -to the environment file produced when provisioning baremetal nodes without -nova.) - -CLI Commands -^^^^^^^^^^^^ - -Command to export provisioned overcloud network information to network data v2 -format. - -.. code-block:: shell - - openstack overcloud network export \ - --stack \ - --output - -Command to create/update overcloud networks outside of heat. - -.. code-block:: shell - - openstack overcloud network provision \ - --networks-file \ - --output - - -Main difference between current network data schema and the v2 schema proposed -here: - -* Base subnet is moved to the ``subnets`` map, aligning configuration for - non-routed and routed deploymends (spine-and-leaf, DCN/Edge) -* The ``enabled`` (bool) is no longer used. Disabled networks should be - excluded from the file, removed or commented. -* The ``compat_name`` option is no longer required. This was used to change - the name of the heat resource internally. Since the heat resource will be a - thing of the past with network data v2, we don't need it. -* The keys ``ip_subnet``, ``gateway_ip``, ``allocation_pools``, ``routes``, - ``ipv6_subnet``, ``gateway_ipv6``, ``ipv6_allocation_pools`` and - ``routes_ipv6`` are no longer valid at the network level. -* New key ``physical_network``, our current physical_network names for base and - non-base segments are not quite compatible. Adding logic in code to - compensate is complex. (This field may come in handy when creating ironic - ports in metalsmith as well.) -* New keys ``network_type`` and ``segmentation_id`` since we could have users - that used ``{{network.name}}NetValueSpecs`` to set network_type vlan. - -.. note:: The new tooling should validate that non of the keys previously - valid in network data v1 are used in network data v2. - -Example network data v2 file for IPv4 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: yaml - - - name: Storage - name_lower: storage (optional, default: name.lower()) - admin_state_up: false (optional, default: false) - dns_domain: storage.localdomain. 
(optional, default: undef) - mtu: 1442 (optional, default: 1500) - shared: false (optional, default: false) - service_net_map_replace: storage (optional, default: undef) - ipv6: true (optional, default: false) - vip: true (optional, default: false) - subnets: - subnet01: - ip_subnet: 172.18.1.0/24 - gateway_ip: 172.18.1.254 (optional, default: undef) - allocation_pools: (optional, default: []) - - start: 172.18.1.10 - end: 172.18.1.250 - enable_dhcp: false (optional, default: false) - routes: (optional, default: []) - - destination: 172.18.0.0/24 - nexthop: 172.18.1.254 - vlan: 21 (optional, default: undef) - physical_network: storage_subnet01 (optional, default: {{name.lower}}_{{subnet name}}) - network_type: flat (optional, default: flat) - segmentation_id: 21 (optional, default: undef) - subnet02: - ip_subnet: 172.18.0.0/24 - gateway_ip: 172.18.0.254 (optional, default: undef) - allocation_pools: (optional, default: []) - - start: 172.18.0.10 - end: 172.18.0.250 - enable_dhcp: false (optional, default: false) - routes: (optional, default: []) - - destination: 172.18.1.0/24 - nexthop: 172.18.0.254 - vlan: 20 (optional, default: undef) - physical_network: storage_subnet02 (optional, default: {{name.lower}}_{{subnet name}}) - network_type: flat (optional, default: flat) - segmentation_id: 20 (optional, default: undef) - -Example network data v2 file for IPv6 -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: yaml - - - name: Storage - name_lower: storage - admin_state_up: false - dns_domain: storage.localdomain. - mtu: 1442 - shared: false - vip: true - subnets: - subnet01: - ipv6_subnet: 2001:db8:a::/64 - gateway_ipv6: 2001:db8:a::1 - ipv6_allocation_pools: - - start: 2001:db8:a::0010 - end: 2001:db8:a::fff9 - enable_dhcp: false - routes_ipv6: - - destination: 2001:db8:b::/64 - nexthop: 2001:db8:a::1 - ipv6_address_mode: null - ipv6_ra_mode: null - vlan: 21 - physical_network: storage_subnet01 (optional, default: {{name.lower}}_{{subnet name}}) - network_type: flat (optional, default: flat) - segmentation_id: 21 (optional, default: undef) - subnet02: - ipv6_subnet: 2001:db8:b::/64 - gateway_ipv6: 2001:db8:b::1 - ipv6_allocation_pools: - - start: 2001:db8:b::0010 - end: 2001:db8:b::fff9 - enable_dhcp: false - routes_ipv6: - - destination: 2001:db8:a::/64 - nexthop: 2001:db8:b::1 - ipv6_address_mode: null - ipv6_ra_mode: null - vlan: 20 - physical_network: storage_subnet02 (optional, default: {{name.lower}}_{{subnet name}}) - network_type: flat (optional, default: flat) - segmentation_id: 20 (optional, default: undef) - -Example network data v2 file for dual stack -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Dual IPv4/IPv6 with two subnets per-segment, one for IPv4 and the other for -IPv6. A single neutron port with an IP address in each subnet can be created. - -In this case ``ipv6`` key will control weather services are configured to -bind to IPv6 or IPv4. (default ipv6: false) - -.. code-block:: yaml - - - name: Storage - name_lower: storage - admin_state_up: false - dns_domain: storage.localdomain. 
- mtu: 1442 - shared: false - ipv6: true (default ipv6: false) - vip: true - subnets: - subnet01: - ip_subnet: 172.18.1.0/24 - gateway_ip: 172.18.1.254 - allocation_pools: - - start: 172.18.1.10 - end: 172.18.1.250 - routes: - - destination: 172.18.0.0/24 - nexthop: 172.18.1.254 - ipv6_subnet: 2001:db8:a::/64 - gateway_ipv6: 2001:db8:a::1 - ipv6_allocation_pools: - - start: 2001:db8:a::0010 - end: 2001:db8:a::fff9 - routes_ipv6: - - destination: 2001:db8:b::/64 - nexthop: 2001:db8:a::1 - vlan: 21 - subnet02: - ip_subnet: 172.18.0.0/24 - gateway_ip: 172.18.0.254 - allocation_pools: - - start: 172.18.0.10 - end: 172.18.0.250 - routes: - - destination: 172.18.1.0/24 - nexthop: 172.18.0.254 - ipv6_subnet: 2001:db8:b::/64 - gateway_ipv6: 2001:db8:b::1 - ipv6_allocation_pools: - - start: 2001:db8:b::0010 - end: 2001:db8:b::fff9 - routes_ipv6: - - destination: 2001:db8:a::/64 - nexthop: 2001:db8:b::1 - vlan: 20 - -Alternatives ------------- - -#. Not changing the network data format - - In this case we need an alternative to provide the values for resource - properties currently managed using heat parameters, when moving - management of the network resources outside the heat stack. - -#. Only add new keys for properties - - Keep the concept of the ``base`` subnet at level-0, and only add keys - for properties currently managed using heat parameters. - - -Security Impact -=============== - -N/A - - -Upgrade Impact -============== - -When (if) we remove the capability to manage network resources in the -overcloud heat stack, the user must run the export command to generate -a new network data v2 file. Use this file as input to the ``openstack -overcloud network provision`` command, to generate the environment file -required for heat stack without network resources. - - -Performance Impact -================== - -N/A - - -Documentation Impact -==================== - -The network data v2 format must be documented. Procedures to use the commands -to export network information from existing deployments as well as -procedures to provision/update/adopt network resources with the non-heat stack -tooling must be provided. - -Heat parameters which will be deprecated/removed: - -* ``{{network.name}}NetValueSpecs``: Deprecated, Removed. - This was used to set ``provider:physical_network`` and - ``provider:network_type``, or actually **any** network property. 
-* ``{{network.name}}NetShared``: Deprecated, replaced by network level - ``shared`` (bool) -* ``{{network.name}}NetAdminStateUp``: Deprecated, replaced by network - level ``admin_state_up`` (bool) -* ``{{network.name}}NetEnableDHCP``: Deprecated, replaced by subnet - level ``enable_dhcp`` (bool) -* ``IPv6AddressMode``: Deprecated, replaced by subnet level - ``ipv6_address_mode`` -* ``IPv6RAMode``: Deprecated, replaced by subnet level ``ipv6_ra_mode``
- -Once deployed_networks.yaml (https://review.opendev.org/751876) is used the -following parameters are Deprecated, since they will no longer be used:
- -* {{network.name}}NetCidr -* {{network.name}}SubnetName -* {{network.name}}Network -* {{network.name}}AllocationPools -* {{network.name}}Routes -* {{network.name}}SubnetCidr_{{subnet}} -* {{network.name}}AllocationPools_{{subnet}} -* {{network.name}}Routes_{{subnet}}
- - -Implementation -==============
- -Assignee(s) ------------
- -Primary assignee: - * Harald Jensås
- - -Work Items ----------
- -* Add tags to resources using heat stack - https://review.opendev.org/750666 -* Tools to extract provisioned networks from existing deployment - https://review.opendev.org/750671, https://review.opendev.org/750672 -* New tooling to provision/update/adopt networks - https://review.opendev.org/751739, https://review.opendev.org/751875 -* Deployed networks template in THT - https://review.opendev.org/751876 diff --git a/specs/xena/ansible-logging-tripleoclient.rst b/specs/xena/ansible-logging-tripleoclient.rst deleted file mode 100644 index e5713db3..00000000 --- a/specs/xena/ansible-logging-tripleoclient.rst +++ /dev/null @@ -1,304 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode
- -================================================== -Improve logging for ansible calls in tripleoclient -==================================================
- -Launchpad blueprint:
- -https://blueprints.launchpad.net/tripleo/+spec/ansible-logging-tripleoclient
- -Problem description -=================== -Currently, the ansible playbook logging shown during a deploy or day-2 -operations such as upgrade, update or scaling is either too verbose, or not -verbose enough.
- -Furthermore, since we're moving to ephemeral services on the Undercloud (see -`ephemeral heat`_ for instance), getting information about the state, content -and related things is a bit less intuitive. Proper logging, with an associated -CLI, can really improve that situation and provide a better user experience.
- - -Requirements for the solution -============================= -No new service addition ----------------------- -We are already trying to remove things from the Undercloud, such as Mistral; -the goal is not to add new services.
- -No increase in deployment and day-2 operations time --------------------------------------------------- -The solution must not increase the time taken for deploy, update, upgrades, -scaling and any other day-2 operations. It must be 100% transparent to the -operator.
- -Use existing tools ------------------ -In the same way we don't want to have new services, we don't want to reinvent -the wheel once more, and we must check the already huge catalog of existing -solutions.
- -KISS ---- -Keep It Simple Stupid is a key element - code must be easy to understand and -maintain.
- -Proposed Change -===============
- -Introduction ------------- -While working on the `Validation Framework`_, a big part was about the logging.
-There, we found a way to get an actual computable output, and store it in a -defined location, allowing us to provide a nice interface in order to list and -show validation runs.
- -This heavily relies on an ansible callback plugin with specific libs, which are -shipped in the `python-validations-libs`_ package.
- -Since the approach is modular, those libs can be re-used pretty easily in other -projects.
- -In addition, python-tripleoclient already depends on `python-validations-libs`_ -(via a dependency on validations-common), meaning we already have the needed -bits.
- -The Idea -------- -Since we have the mandatory code already present on the system (provided by the -new `python-validations-libs`_ package), we can modify how ansible-runner is -configured in order to inject a callback, and get the output we need in both -the shell (direct feedback to the operator) and in a dedicated file.
- -Since callbacks aren't cheap (but hopefully not expensive either), a proper PoC -must be conducted in order to gather metrics about CPU, RAM and time. Please -see the Performance Impact section.
- -Direct feedback --------------- -The direct feedback will tell the operator about the current task being done -and, when it ends, whether it's a success or not.
- -Using a callback might provide a "human suited" output.
- -File logging ------------ -Here, we must define multiple things, and take into account that we're running -multiple playbooks, with multiple calls to ansible-runner.
- -File location ............. -Nowadays, most if not all of the deploy related files are located in the -user home directory (i.e. ~/overcloud-deploy//). -It therefore sounds reasonable to get the log in the same location, or a -subdirectory in that location.
- -Keeping this location also solves the potential access right issue, since a -standard home directory has a 0700 mode, preventing any other user from -accessing its content.
- -We might even go a bit deeper, and enforce a 0600 mode, just to be sure.
- -Remember, logs might include sensitive data, especially when we're running with -extra debugging.
- -File format convention ...................... -In order to make the logs easily usable by automated tools, and since we -already heavily rely on JSON, the log output should be formatted as JSON. This -would allow adding some new CLI commands such as "history list", "history show" -and so on.
- -Also, JSON being well known by logging services such as ElasticSearch, using it -makes sending the logs to some central logging service really easy and convenient.
- -While JSON is nice, it will more than probably prevent a straight read by the -operator - but with a working CLI, we might get something closer to what we -have in the `Validation Framework`_, for instance (see `this example`_). We -might even consider a CLI that will allow converting from JSON to whatever -the operator might want, including but not limited to XML, plain text or JUnit -(Jenkins).
- -There should be a new parameter allowing the operator to switch the format, from "plain" to -"json" - the default value is still subject to discussion, but providing this -parameter will ensure Operators can do whatever they want with the default -format. A consensus seems to indicate "default to plain".
- -Filename convention ................... -As said, we're running multiple playbooks during the actions, and we also want -to have some kind of history.
- -In order to do that, the easiest way to get a name is to concatenate the time -and the playbook name, something like:
- -* *timestamp*-*playbookname*.json
- -Use systemd/journald instead of files ..................................... -One might want to use systemd/journald instead of plain files. While this -sounds appealing, there are multiple potential issues:
- -#. Sensitive data will be shown in the system's journald, accessible to any other - user -#. Journald has rate limits and thresholds, meaning we might hit them, and - therefore lose logs, or prevent other services from using journald for their - own logging -#. While we can configure a log service (rsyslog, syslog-ng, etc) in order to - output specific content to specific files, we will face access issues on - them
- -Therefore, we shouldn't use journald.
- -Does it meet the requirements? ------------------------------ -* No service addition: yes - it's only a change in the CLI, no new dependency is - needed (tripleoclient already depends on validations-common, which depends on - validations-libs) -* No increase in operation time: this has to be proven with a proper PoC and - metrics gathering/comparison. -* Existing Tool: yes -* Actively maintained: so far, yes - expected to be extended outside of TripleO -* KISS: yes, based on the validations-libs and a simple Ansible callback
- -Alternatives -============
- -ARA --- -`ARA Records Ansible`_ provides some of the functionalities we implemented in -the Validation Framework logging, but it lacks some of the wanted features, -such as:
- -* CLI integration within tripleoclient -* Third-party service independence -* plain file logging in order to scrape the logs with SOSReport or other tools
- -ARA needs a DB backend - we could inject results in the existing galera DB, but -that might create some issues with the concurrent accesses happening during a -deploy for instance. Using sqlite is also an option, but it means new packages, -a new file location to save, a binary format and so on.
- -It also needs some web server in order to show the reporting, meaning yet -another httpd configuration, and the need to access it on the undercloud.
- -Also, ARA being a whole service, it would require deploying, configuring -and maintaining it - plus ensuring it is properly running before each action in -order to ensure it gets the logs.
- -By default, ARA doesn't affect the actual playbook output, while the goal of -this spec is mostly about it: provide concise feedback to the operator, while -keeping the logs on disk, in files, with the ability to interact with them -through the CLI directly.
- -In the end, ARA might be a solution, but it will require more work to get it -integrated, and, since the TripleO UI has been deprecated, there isn't a real way -to integrate it in an existing UI tool.
- -Would it meet the requirements? ............................... -* No service addition: no, due to the "REST API" aspect. A service must answer - API calls -* No increase in operation time: probably yes, depending on the way ARA can - manage input queues. Since it's also using a callback, we have to account - for the potential resources used by it.
-* Existing tool: yes -* Actively maintained: yes -* KISS: yes, but it adds new dependencies (DB backend, Web server, ARA service, - and so on)
- -Note on the "new dependencies": while ARA can be launched -`without any service`_, this seems to be intended for development purposes only, -according to the informative note we can read on the documentation page::
- - Good for small scale usage but inefficient and contains a lot of small files - at a large scale.
- -Therefore, we shouldn't use ARA.
- -Proposed Roadmap ================ -In Xena:
- -* Ensure we have all the ABI capabilities within validations-libs in order to - set needed/wanted parameters for a different log location and file naming -* Start to work on the ansible-runner calls so that they use a tweaked callback, - using the validations-libs capabilities in order to get the direct feedback - as well as the formatted file in the right location
- -Security Impact =============== -As we're going to store the full ansible output on disk, we must ensure access -to the log location is closed to any unwanted user. As stated while talking -about the file location, the directory mode and ownership must be set so that -only the needed users can access its content (root + stack user).
- -Once this is sorted out, no other security impact is to be expected - furthermore, -it will even make things more secure than now, since the current way -ansible is launched within tripleoclient puts an "ansible.log" file in the -operator home directory without any specific rights.
- -Upgrade Impact ============== -Apart from ensuring the log location exists, there isn't any major upgrade -impact. A doc update must be done in order to point to the log location, as -well as some messages within the CLI.
- -End User Impact =============== -There are two impacts to the End User:
- -* CLI output will be reworked in order to provide useful information (see - Direct Feedback above) -* Log location will change a bit for the ansible part (see File Logging above)
- -Performance Impact ================== -A limited impact is to be expected - but a proper PoC with metrics must be -conducted to assess the actual change.
- -Multiple deploys must be done, with different Overcloud designs, in order to -see the actual impact as the number of nodes grows.
- -Deployer Impact =============== -Same as End User Impact: CLI output will be changed, and the log location will -be updated.
- -Developer Impact ================ -The callback is enabled by default, but the Developer might want to disable it. -Proper doc should reflect this. No real impact in the end.
- -Implementation ============== -Contributors ------------ -* Cédric Jeanneret -* Mathieu Bultel
- -Work Items ---------- -* Modify validations-libs in order to provide the needed interface (this shouldn't - really be needed, the libs are already modular and should expose the wanted - interfaces and parameters) -* Create a new callback in tripleo-ansible -* Ensure the log directory is created with the correct rights -* Update the ansible-runner calls to enable the callback by default -* Ensure tripleoclient outputs status updates on a regular basis while the logs - are being written in the right location -* Update/create the needed documentation about the new logging location and - management
- -.. _ephemeral heat: https://specs.openstack.org/openstack/tripleo-specs/specs/wallaby/ephemeral-heat-overcloud.html -.. _Validation Framework: https://specs.openstack.org/openstack/tripleo-specs/specs/stein/validation-framework.html -.. 
_this example: https://asciinema.org/a/283645 -.. _python-validations-libs: https://opendev.org/openstack/validations-libs -.. _ARA Records Ansible: https://ara.recordsansible.org/ -.. _without any service: https://ara.readthedocs.io/en/latest/cli.html#ara-manage-generate -.. _ansible "acl": https://docs.ansible.com/ansible/latest/modules/acl_module.html diff --git a/specs/xena/healthcheck-cleanup.rst b/specs/xena/healthcheck-cleanup.rst deleted file mode 100644 index c9e311b1..00000000 --- a/specs/xena/healthcheck-cleanup.rst +++ /dev/null @@ -1,217 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=============================== -Cleaning container healthchecks -=============================== - -https://blueprints.launchpad.net/tripleo/+spec/clean-container-healthchecks - -We don't rely on the `container healthcheck`_ results for anything in the -infrastructure. They are time and resource consuming, and their maintenance is -mostly random. We can at least remove the ones that aren't hitting an actual -API healthcheck endpoint. - -This proposal was discussed during a `session at the Xena PTG`_ - -Problem Description -=================== - -Since we moved the services to container, first with the docker engine, then -with podman, container healthchecks have been implemented and used. - -While the very idea of healthchecks isn't bad, the way we (TripleO) are -making and using them is mostly wrong: - -* no action is taken upon healthcheck failure -* some (most) aren't actually checking if the service is working, but merely - that the service container is running - -The healthchecks such as `healthcheck_port`_, `healthcheck_listen`_, -`healthcheck_socket`_ as well as most of the scripts calling -`healthcheck_curl`_ are mostly NOT doing anything more than ensuring a -service is running - and we already have this info when the container is -"running" (good), "restarting" (not good) or "exited" (with a non-0 code -- bad). - -Also, the way podman implements healthchecks is relying on systemd and its -transient service and `timers`_. Basically, for each container, a new systemd -unit is created and injected, as well as a new timer - meaning systemd calls -podman. This isn't really good for the hosts, especially the ones having -heavy load due to their usage. - -Proposed Change -=============== - -Overview --------- - -A deep cleaning of the current healthcheck is needed, such as the -`healthcheck_socket`_, `healthcheck_port`_, and `healthcheck_curl`_ -that aren't calling an actual API healthcheck endpoint. This list isn't -exhaustive. - -This will drastically reduce the amount of "podman" calls, leading -to less resource issues, and provide a better comprehension when we list -the processes or services. - -In case an Operator wants to get some status information, they can leverage -an existing validation:: - - openstack tripleo validator run --validation service-status - -This validation can be launched from the Undercloud directly, and will gather -remote status for every OC nodes, then provide a clear summary. - -Such a validation could also be launched from a third-party monitoring -instance, provided it has the needed info (mostly the inventory). 
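For example, a third-party monitoring host could schedule that validation with a small Ansible play using the cron module; the host group, schedule and user below are assumptions for illustration only.

.. code-block:: yaml

   # Illustrative sketch: host group, schedule and user are assumptions.
   - hosts: monitoring
     become: true
     tasks:
       - name: Periodically run the TripleO service-status validation
         ansible.builtin.cron:
           name: tripleo-service-status
           minute: "*/30"
           user: stack
           job: "openstack tripleo validator run --validation service-status"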
- -Alternatives ------------- - -There are multiple alternatives, which we could even implement as a step-by-step -solution, though any of them would most probably require their own -specifications and discussions: - -Replace the listed healthchecks by actual service healthchecks -.............................................................. - -Doing so would allow us to get a better understanding of the stack health, but -will not solve the issue with podman calls (hence resource consumption and related -issues). -Such healthchecks can be launched from an external tool, for instance based -on a host's cron, or an actual service. - -Call the healthchecks from an external tool -........................................... - -Doing so would prevent the potential resource issues with the "podman exec" -calls we're currently seeing, while allowing centralization of the results, -providing a better way to get metrics and stats. - -Keep things as-is -................. - -We have to list this one, but there are hints this isn't the right -thing to do (hence the current spec). - -Security Impact ---------------- - -No real Security impact. Fewer services/calls might lead to a smaller attack -surface, and it might prevent some *denial of service* situations. - -Upgrade Impact --------------- - -No Upgrade impact. - -Other End User Impact ---------------------- - -The End User doesn't have access to the healthcheck anyway - that's more for -the operator. - -Performance Impact ------------------- - -The systems will be less stressed, and this can improve the current situation -regarding performance and stability. - -Other Deployer Impact ---------------------- - -There is no "deployer impact" if we don't consider the deployer to be the operator. - -For the latter, there's a direct impact: ``podman ps`` won't be able to show -the health status anymore or, at least, not for the containers without such -checks. - -But the operator is able to leverage the service-status validation instead - -this validation will even provide more information since it takes into account -the failed containers, a thing ``podman ps`` doesn't show without the proper -option, and even with it, it's not that easy to filter. - -Developer Impact ----------------- - -In order to improve the healthchecks, especially for the API endpoints, service -developers will need to implement specific tests in the app. - -Once such a test exists, works and is reliable, they can push it to any healthcheck -tooling at their disposal - be it the embedded container healthcheck, or some -dedicated service as described in the third step. - - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - cjeanner - -Work Items ----------- - -#. Triage existing healthchecks and, if they aren't calling an actual endpoint, - deactivate them in tripleo-heat-templates -#. Ensure the stack stability isn't degraded by this change, and properly - document the "service-status" validation with the Validation Framework Team - -The second work item is more of a long-term empirical observation - we currently -don't have actual data, apart from a `Launchpad issue`_ pointing to a problem -maybe caused by the way healthchecks are launched. - -Possible future work items -.......................... - -#. Initiate a discussion with CloudOps (metrics team) regarding a dedicated - healthcheck service, and how to integrate it properly within TripleO -#.
Initiate a cross-Team work toward actual healthcheck endpoints for the - services in need - -Those are just here for the sake of evolution. Proper specs will be needed -in order to frame the work. - -Dependencies -============ - -For step 1 and 2, no real dependencies are needed. - -Testing -======= - -Testing will require different things: - -* Proper metrics in order to ensure there's no negative impact - and that any - impact is measurable -* Proper insurance the removal of the healthcheck doesn't affect the services - in a negative way -* Proper testing of the validations, especially "service-status" in order to - ensure it's reliable enough to be considered as a replacement at some point - -Documentation Impact -==================== - -A documentation update will be needed regarding the overall healthcheck topic. - -References -========== - -* `Podman Healthcheck implementation and usage`_ - - -.. _container healthcheck: https://opendev.org/openstack/tripleo-common/src/branch/master/healthcheck -.. _healthcheck_port: https://opendev.org/openstack/tripleo-common/src/commit/a072a7f07ea75933a2372b1a95ae960095a3250e/healthcheck/common.sh#L49 -.. _healthcheck_listen: https://opendev.org/openstack/tripleo-common/src/commit/a072a7f07ea75933a2372b1a95ae960095a3250e/healthcheck/common.sh#L85 -.. _healthcheck_socket: https://opendev.org/openstack/tripleo-common/src/commit/a072a7f07ea75933a2372b1a95ae960095a3250e/healthcheck/common.sh#L95 -.. _healthcheck_curl: https://opendev.org/openstack/tripleo-common/src/commit/a072a7f07ea75933a2372b1a95ae960095a3250e/healthcheck/common.sh#L28 -.. _session at the Xena PTG: https://etherpad.opendev.org/p/tripleo-xena-drop-healthchecks -.. _timers: https://www.freedesktop.org/software/systemd/man/systemd.timer.html -.. _Podman Healthcheck implementation and usage: https://developers.redhat.com/blog/2019/04/18/monitoring-container-vitality-and-availability-with-podman/ -.. _Launchpad issue: https://bugs.launchpad.net/tripleo/+bug/1923607 diff --git a/specs/xena/keystoneless-undercloud.rst b/specs/xena/keystoneless-undercloud.rst deleted file mode 100644 index 0fba8852..00000000 --- a/specs/xena/keystoneless-undercloud.rst +++ /dev/null @@ -1,196 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================================================== -Support Keystoneless Undercloud (basic auth or noauth) -====================================================== - -The goal of this proposal is to introduce the community to the idea of -removing Keystone from TripleO undercloud and run the remaining OpenStack -services either with basic authentication or noauth (i.e. Standalone mode). - - -Problem Description -=================== - -With the goal of having a thin undercloud we've been simplifying the -undercloud architecture since a few cycles and have removed a number -of OpenStack services. After moving to use `network_data_v2`_ and -`ephemeral_heat`_ by default, we are left only with neutron, ironic -and ironic-inspector services. - -Keystone authentication and authorization does not add lot of value to the -undercloud. We use `admin` and `admin` project for everything. There are -also few service users (one per service) for communication between services. -Most of the overcloud deployment and configuration is done as the os user. -Also, for large deployments we increase token expiration time to a large -value which is orthogonal to keystone security. 
- - -Proposed Change -=============== - -Overview -------- - -At present, we have keystone running in the undercloud providing catalog and -authentication/authorization services to the remaining deployed services: -neutron, ironic and ironic-inspector. Ephemeral heat uses a fake keystone -client which does not talk to keystone. - -All these remaining services are capable of running standalone using either -the `http_basic` or `noauth` auth_strategy, and clients using openstacksdk and -keystoneauth can use the `HTTPBasicAuth` or `NoAuth` identity plugins with the -standalone services. - -The proposal is to deploy these OpenStack services either with basic auth or -noauth and remove keystone from the undercloud by default. - -- Deploy ironic/ironic-inspector/neutron with `http_basic` (default) or `noauth` - -This would also allow us to remove some additional services like `memcached` -from the undercloud, mainly used for authtoken caching. - - -Alternatives ------------- - -- Keep keystone in the undercloud as before. - - -Security Impact ---------------- - -There should not be any significant security implications from disabling keystone -on the undercloud as there are no multi-tenancy and RBAC requirements for -undercloud users/operators. Deploying baremetal and networking services with `http_basic` authentication would protect against any possible intrusion as before. - - -Upgrade Impact --------------- - -There will be no upgrade impact; this change will be transparent to the -end-user. - - -Other End User Impact ---------------------- - -None. - - -Performance Impact ------------------- - -Disabling authentication and authorization would make the API calls faster, and -the overall resource requirements of the undercloud would be reduced. - - -Other Deployer Impact ---------------------- - -None - -Developer Impact ----------------- - -None. - - -Implementation -============== - -- Add THT support for configuring `auth_strategy` for the ironic and neutron - services and manage the htpasswd files used for basic authentication by the - ironic services. - -.. code-block:: yaml - - IronicAuthStrategy: http_basic - NeutronAuthStrategy: http_basic - -- Normally, Identity service middleware provides an X-Project-Id header based on - the authentication token submitted by the service client. However, when keystone - is not available neutron expects `project_id` in the `POST` requests (i.e. create - API). Also, metalsmith communicates with `neutron` to create `ctlplane` ports for - instances. - - Add a middleware for the neutron API `http_basic` pipeline to inject a fake project_id - in the context. - -- Add basic authentication middleware to oslo.middleware and use it for undercloud - neutron. - -- Create/Update clouds.yaml to use `auth_type: http_basic` and use endpoint overrides - for the public endpoints with `<service>_endpoint_override` entries. We - would leverage the `EndpointMap` and change `extraconfig/post_deploy` to create - and update clouds.yaml. - -..
code-block:: yaml - - clouds: - undercloud: - auth: - password: piJsuvz3lKUtCInsiaQd4GZ1w - username: admin - auth_type: http_basic - baremetal_api_version: '1' - baremetal_endpoint_override: https://192.168.24.2:13385 - baremetal_introspection_endpoint_override: https://192.168.24.2:13050 - network_api_version: '2' - network_endpoint_override: https://192.168.24.2:13696 - -Assignee(s) ------------ - -Primary assignee: - ramishra - -Other contributors: - - -Work Items ----------- - -- Add basic authentication middleware in oslo.middleware - https://review.opendev.org/c/openstack/oslo.middleware/+/802234 -- Support `auth_strategy` with ironic and neutron services - https://review.opendev.org/c/openstack/tripleo-heat-templates/+/798241 -- Neutron middleware to add fake project_id to noauth pipleline - https://review.opendev.org/c/openstack/neutron/+/799162 -- Configure neutron paste deploy for basic authentication - https://review.opendev.org/c/openstack/tripleo-heat-templates/+/804598 -- Disable keystone by default - https://review.opendev.org/c/openstack/tripleo-heat-templates/+/794912 -- Add option to enable keystone if required - https://review.opendev.org/c/openstack/python-tripleoclient/+/799409 -- Other patches: - https://review.opendev.org/c/openstack/tripleo-ansible/+/796991 - https://review.opendev.org/c/openstack/tripleo-common/+/796825 - https://review.opendev.org/c/openstack/tripleo-ansible/+/797381 - https://review.opendev.org/c/openstack/tripleo-heat-templates/+/799408 - - -Dependencies -============ - -Ephemeral heat and network-data-v2 are used as defaults. - - -Documentation Impact -==================== - -Update the undercloud installation and upgrade guides. - - -References -========== - -* `network_data_v2`_ specification -* `ephemeral_heat`_ specification - -.. _network_data_v2: https://specs.openstack.org/openstack/tripleo-specs/specs/wallaby/triplo-network-data-v2-node-ports.html -.. _ephemeral_heat: https://specs.openstack.org/openstack/tripleo-specs/specs/wallaby/ephemeral-heat-overcloud.html diff --git a/specs/xena/tripleo-independent-release.rst b/specs/xena/tripleo-independent-release.rst deleted file mode 100644 index a9d116bb..00000000 --- a/specs/xena/tripleo-independent-release.rst +++ /dev/null @@ -1,191 +0,0 @@ - -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -================================================= -Moving TripleO repos to independent release model -================================================= - -Include the URL of your launchpad blueprint: - -https://blueprints.launchpad.net/tripleo - -This spec proposes that we move all tripleo repos to the independent release -model. The proposal was first raised during tripleo irc meetings [1]_ and then -also on the openstack-discuss mailing list [2]_. - -Problem Description -=================== - -The TripleO repos [3]_ mostly follow the cycle-with-intermediary release -model, for example tripleo-heat-templates at [4]_. Mostly because some of -tripleo repos use the independent release model, for example tripleo-upgrade -at [5]_. A description of the different release models can be found at [6]_. - -By following the cycle-with-intermediary release model, TripleO is bound to -produce a release for each OpenStack development cycle and a corresponding -stable/branch in the tripleo repos. 
However, as we have seen, this causes an -ongoing maintenance burden; consider that currently TripleO supports 5 -active branches - Train, Ussuri, Victoria, Wallaby and Xena (current master). -In fact until very recently that list contained 7 branches, including Stein -and Queens (currently moving to End Of Life [7]_). - -This creates an ongoing maintenance and resource burden where for each -branch we are backporting changes, implementing, running and maintaining -upstream CI and ensuring compatibility with the rest of OpenStack with 3rd -party CI and the component and integration promotion pipelines [8]_, on an -ongoing basis. - -Finally, changes in the underlying OS between branches mean that for some -branches we maintain two "types" of CI job; for stable/train we have to support -both Centos 7 and Centos 8. With the coming stable/xena, we would likely have -to support Centos-Stream-8 as well as Centos-Stream-9 in the event that -Stream-9 is not fully available by the xena release, which further compounds -the resource burden. By adopting the proposal laid out here we can choose to -skip the Xena branch, thus avoiding this increased CI and maintenance cost. - -Proposed Change -=============== - -Overview --------- - -The proposal is for all TripleO repos that are currently using the -cycle-with-intermediary release model to switch to independent. This will -allow us to choose to skip a particular release and, more importantly, skip -the creation of the given stable/branch on those repos. - -This would allow the TripleO community to focus our resources on those branches -that are most 'important' to us, namely the 'FFU branches'. That is, the -branches that are part of the TripleO Fast Forward Upgrade chain (currently -these are Train -> Wallaby -> Z?). For example, it is highly likely that we -would not create a Xena branch. - -Developers will be freed from having to backport changes across stable/branches -and this will have a dramatic effect on our upstream CI resource consumption -and maintenance burden. - -Alternatives ------------- - -We can continue to create all the stable/branches and use the same release -model we currently have. This would mean we would continue to have an increased -maintenance burden and would have to address that with increased resources. - -Security Impact ---------------- - -None - -Upgrade Impact --------------- - -For upgrades it would mean that TripleO would no longer directly support all -OpenStack stable branches. So if, for example, we decide not to create stable/xena, -then you cannot upgrade from wallaby to xena using TripleO. In some respects -this would more closely match reality since the focus of the active tripleo -developer community has typically been on ensuring the Fast Forward Upgrade -(e.g. train to wallaby) and less so on ensuring the point to point upgrade -between 2 branches. - -Other End User Impact ---------------------- - -TripleO would no longer be able to deploy all versions of OpenStack. One idea -that was brought forward in the discussions around this topic thus far is that -we can attempt to address this by designating a range of git tags as compatible -with a particular OpenStack stable branch. - -For example, if TripleO doesn't create a stable/xena, but during the xena cycle -makes releases for the various Tripleo repos, then *those* releases will be -compatible for deploying OpenStack stable/xena.
We can maintain and publicise -a set of compatible tags for each of the affected repos (e.g., -tripleo-heat-templates versions 15.0.0 to 15.999.999 are compatible with -OpenStack stable/xena). - -Some rules around tagging will help us. Generally we can keep doing what we -currently do with respect to tagging; For major.minor.patch (e.g. 15.1.1) in -the release tag, we will always bump major to signal a new stable branch. - -One problem with this solution is that there is no place to backport fixes to. -For example if you are using tripleo-heat-templates 15.99.99 to deploy -OpenStack Xena (and there is no stable/xena for tht) then you'd have to apply -any fixes to the top of the 15.99.99 tag and use it. There would be no way -to commit these fixes into the code repo. - -Performance Impact ------------------- - -None - -Other Deployer Impact ---------------------- - -There were concerns raised in the openstack-discuss thread [2] about RDO -packaging and how it would be affected by this proposal. As was discussed -there are no plans for RDO to stop building packages for any branch. For the -building of tripleo repos we would have to rely on the latest compatible -git tag, as outlined above in `Other End User Impact`_. - -Developer Impact ----------------- - -Will have less stable/branches to backport fixes to. It is important to note -however that by skipping some branches, resulting backports across multiple -branches will result in a larger code diff and so be harder for developers to -implement. That is, there will be increased complexity in resulting backports if -we skip intermediate branches. - -As noted in the `Other End User Impact`_ section above, for those branches that -tripleo decides not to create, there will be no place for developers to commit -any branch specific fixes to. They can consume particular tagged releases of -TripleO repos that are compatible with the given branch, but will not be able -to commit those changes to the upstream repo since the branch will not exist. - -Implementation -============== - -Assignee(s) ------------ - -Wesley Hayutin -Marios Andreou - -Work Items ----------- - -Besides posting the review against the releases repo [9]_ we will need to -update documentation to reflect and inform about this change. - -Dependencies -============ - -None - -Testing -======= - -None - -Documentation Impact -==================== - -Yes we will at least need to add some section to the docs to explain this. -We may also add some landing page to show the currently 'active' or supported -TripleO branches. - -References -========== - -.. [1] `Tripleo IRC meeting logs 25 May 2021 `_ -.. [2] `openstack-discuss thread '[tripleo] Changing TripleO's release model' `_ -.. [3] `TripleO section in governance projects.yaml `_ -.. [4] `tripleo-heat-templates wallaby release file `_ -.. [5] `tripleo-upgrade independent release file `_ -.. [6] `OpenStack project release models `_ -.. [7] `openstack-discuss [TripleO] moving stable/stein and stable/queens to End of Life `_ -.. [8] `TripleO Docs - TripleO CI Promotions `_ -.. [9] `opendev.org openstack/releases git repo `_ diff --git a/specs/xena/tripleo-repos-single-source.rst b/specs/xena/tripleo-repos-single-source.rst deleted file mode 100644 index f67c781e..00000000 --- a/specs/xena/tripleo-repos-single-source.rst +++ /dev/null @@ -1,339 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. 
- - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================== -TripleO Repos Single Source -=========================== - -This proposal lays out the plan to use tripleo-repos as a single source -to install and configure non-base OS repos for TripleO - including -setting the required DLRN hashes. - -https://blueprints.launchpad.net/tripleo/+spec/tripleo-repos-single-source - -Problem Description -=================== - -Reviewing the code base, there are multiple places where repos are -specified. For example,in the release files we are setting the -configuration that is applied by `repo setup role`_. -Some of the other repo/version configurations are included in: - -* `tripleo repos`_ -* `repo setup role`_ -* `release config files`_ -* `container tooling (base tcib file)`_ -* `tripleo-ansible`_ -* `rdo config`_ (example) -* `tripleo-heat-templates`_ -* `tripleo-operator-ansible`_ - -.. _`tripleo repos`: https://opendev.org/openstack/tripleo-repos -.. _`repo setup role`: https://opendev.org/openstack/tripleo-quickstart/src/commit/d14d81204036a02562c3f4efd7acb3b38cb6ae95/roles/repo-setup/templates/repo_setup.sh.j2#L72 -.. _`release config files`: https://opendev.org/openstack/tripleo-quickstart/src/commit/d14d81204036a02562c3f4efd7acb3b38cb6ae95/config/release/tripleo-ci/CentOS-8/master.yml#L93 -.. _`container tooling (base tcib file)`: https://opendev.org/openstack/tripleo-common/src/commit/d3286377132ee6b0689a39e52858c07954711d13/container-images/tcib/base/base.yaml#L59 -.. _`tripleo-ansible`: https://opendev.org/openstack/tripleo-ansible/src/commit/509e630baa92673e72e641635d5742da01b4dc3b/tripleo_ansible/roles/tripleo_podman/vars/redhat-8.2.yml -.. _`rdo config`: https://review.rdoproject.org/r/31439 -.. _`tripleo-heat-templates`: https://opendev.org/openstack/tripleo-heat-templates/src/commit/125f45820255efe370af1024080bafc695892faa/environments/lifecycle/undercloud-upgrade-prepare.yaml -.. _`tripleo-operator-ansible`: https://opendev.org/openstack/tripleo-operator-ansible/src/commit/14a601a47be217386df83512fae3a2e5aa5444a3/roles/tripleo_container_image_build/molecule/default/converge.yml#L172 - - -The process of setting repo versions requires getting and -transforming DLRN hashes, for example resolving 'current-tripleo' -to a particular DLRN build ID and specifying the correct proxies. -Currently a large portion of this work is done in the release files -resulting in sections of complicated and fragile Bash scripts - -duplicated across numerous release files. - -This duplication, coupled with the various locations in use -for setting repo configurations, modules and supported versions -is confusing and error prone. - -There should be one source of truth for which repos are installed -within a tripleo deployment and how they are installed. -Single-sourcing all these functions will avoid the current -problems of duplication, over-writing settings and version confusion. - -Proposed Change -=============== - -Overview --------- - -This proposal puts forward using tripleo-repos as the 'source of truth' -for setting repo configurations, modules and supported versions - -including setting the DLRN hashes required to specify exact repo -versions to install for upstream development/CI workflows. - -Having a single source of truth for repo config, modules, etc. will make -development and testing more consistent, reliable and easier to debug. - -The intent is to use the existing tripleo-repos repo for this work and -not to create a new repo. 
It is as yet to be determined if we will add -a v2/versioned api or how we will handle the integration with the -existing functionality there. - -We aim to modularize the design and implementation of the proposed tripleo-repos -work. Two sub systems in particular have been identified that can be -implemented independently of, and ultimately to be consumed by, tripleo-repos; -the resolution of delorean build hashes from known tags (i.e. resolving -'current-tripleo' to a particular DLRN build ID) and the configuration of dnf -repos and modules will be implemented as independent python modules, with -their own unit tests, clis, ansible modules etc. - -Integration Points ------------------- - -The new work in tripleo-repos will have to support with all -the cases currently in use and will have to integrate with: - -* DLRN Repos -* release files -* container and overcloud image builds -* rdo config -* yum/dnf repos and modules -* Ansible (Ansible module) -* promotion pipeline - ensuring the correct DLRN hashes - -Incorporating the DLRN hash functionality makes the tool -more complex. Unit tests will be required to guard -against frequent breakages. This is one of the reasons that we decided to split -this DLRN hash resolution into its own dedicated python module -'tripleo-get-hash' for which we can have independent unit tests. - -The scope of the new tripleo-repos tool will be limited to upstream -development/CI workflows. - -Alternatives ------------- - -Functionality to set repos, modules and versions is already available today. -It would be possible to leave the status quo or: - -* Use rdo config to set one version per release - however, this would not - address the issue of changing DLRN hashes -* Create an rpm that lays down /etc/tripleo-release where container-tools could - be meta data in with that, similar to /etc/os-release - -Security Impact ---------------- - -No security impact is anticipated. The work is currently in the tripleo -open-source repos and will remain there - just in a consolidated -place and format. - -Upgrade Impact --------------- - -Currently there will be no upgrade impact. The new CLI will support -all release versions under support and in use. At a later date, -when the old CLI is deprecated there may be some update -implications. - -However,there may be work to make the emit_releases_file -https://opendev.org/openstack/tripleo-ci/src/branch/master/scripts/emit_releases_file/emit_releases_file.py -functionality compatible with the new CLI. - -Other End User Impact ---------------------- - -Work done on the new project branch will offer a different version of CLI, v2. -End users would be able to select which version of the CLI to use - until -the old CLI is deprecated. - - -Performance Impact ------------------- - -No performance impact is expected. Possible performance improvements could -result from ensuring that proxy handling (release file, mirrors, rdoproject) -is done correctly and consistently. - -Other Deployer Impact ---------------------- - - -Developer Impact ----------------- - -See ```Other End User Impact``` section. 
- -Implementation -============== - -The functionality added to tripleo-repos will be written as a Python module -with a CLI and will be able to perform the following primary functions: - -* Single-source the installation of all TripleO related repos -* Include the functionality currently available in the repo-setup role - including creating repos from templates and files -* Perform proxy handling such as is done in the release files - (mirrors, using rdoproject for DLRN repos) -* Get and transform human-readable DLRN hashes - to be implemented as an - independent module. -* Support setting yum modules such as container-tools - to be implemented - as an independent module. -* Support enabling and disabling repos and setting their priorities - -The repo-setup role shall remain but it will invoke tripleo-repos. -All options required to be passed to tripleo-repos should be in the -release file. - -Work done on the new project branch will offer a different version of the CLI, v2. -Unit tests will be added on this branch to test the new CLI directly. -CI would be flipped to run in the new branch when approved by TripleO teams. -All current unit tests should pass with the new code. - -An Ansible module will be added to call the tripleo-repos -options from Ansible directly without requiring the end -user to invoke the Python CLI from within Ansible. - -The aim is for tripleo-repos to be the single source for all repo related -configuration. In particular the goal is to serve the following 3 personas: - -* Upstream/OpenStack CI jobs -* Downstream/OSP/RHEL jobs -* Customer installations - -The configuration required to serve each of these use cases is slightly -different. In upstream CI jobs we need to configure the latest current-tripleo -promoted content repos. In downstream/OSP jobs we need to use rhos-release -and in customer installations we need to use subscription manager. - -Because of these differing requirements we are leaning towards storing the -configuration for each in their intended locations, with the upstream config -being the 'base' and the downstream config building on top of that (the -implication is that some form of inheritance will be used to avoid duplication). -This was discussed during the `Xena PTG session`_. - -.. _`Xena PTG session`: https://etherpad.opendev.org/p/ci-tripleo-repos - -Assignee(s) ------------ - -* sshnaidm (DF and CI) -* marios (CI and W-release PTL) -* weshay -* chandankumar -* ysandeep -* arxcruz -* rlandy -* other DF members (cloudnull) - -Work Items ----------- - -Proposed Schedule -================= - -Investigative work will begin in the W-release cycle on a project branch -in tripleo-repos. The spec will be put forward for approval in the X-release -cycle and impactful and integration work will be visible once the spec -is approved. - -Dependencies -============ - -This work has a dependency on the `DLRN API`_ and on yum/dnf. - -.. _`DLRN API`: https://dlrn.readthedocs.io/en/latest/api.html - -Testing -======= - -Specific unit tests will be added alongside the Python-based code. -All current CI tests will run through this work and will -test it on all releases and in various aspects such as: - -* container build -* overcloud image build -* TripleO deployments (standalone, multinode, scenarios, OVB) -* updates and upgrades - -CLI Design -========== - -Here is an abstract sketch of the intended CLI design for the -new tripleo-repos. - -It covers most of the needs discussed in multiple places.
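-
-As background for Scenario 1 below: resolving a named tag such as
-'current-tripleo' boils down to reading DLRN metadata that is already
-published over HTTP. A rough illustration, assuming the public RDO trunk
-server layout (paths may differ per release)::
-
-   # the generated repo file for the currently promoted content
-   curl -s https://trunk.rdoproject.org/centos8-master/current-tripleo/delorean.repo
-   # the corresponding commit/distro hash metadata consumed by tripleo-get-hash
-   curl -s https://trunk.rdoproject.org/centos8-master/current-tripleo/commit.yaml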
- -Scenario 1 ----------- - -The goal is to construct a repo with the correct hash for an integration -or a component pipeline. - -For this scenario: - -* Any combination of `hash, distro, commit, release, promotion, url` parameters can passed -* Use the `tripleo-get-hash`_ module to determine the DLRN build ID -* Use the calculated DLRN build ID to create and add a repo - -.. _`tripleo-get-hash`: https://opendev.org/openstack/tripleo-repos/src/branch/master/tripleo-get-hash - - -Scenario 2 ----------- - -The goal is to construct any type of yum/dnf repo. - -For this scenario: - -* Construct and add a yum/dnf repo using a combination of the following parameters -* filename - filename for saving the resulting repo (mandatory) -* reponame - name of repository (mandatory) -* baseurl - base URL of the repository (mandatory) -* down_url - URL to download repo file from (mandatory/multually exclusive to baseurl) -* priority - priority of resulting repo (optional) -* enabled - 0/1 whether the repo is enabled or not (default: 1 - enabled) -* gpgcheck - whether to check GPG keys for repo (default: 0 - don't check) -* module_hotfixes - whether to make all RPMs from the repository available (default: 0) -* sslverify - whether to use a cert to use repo metadata (default: 1) -* type - type of the repo(default: generic, others: custom and file) - - -Scenario 3 ----------- - -The goal is to enable or disable specific dnf module and also install or -remove a specific package. - -For this scenario: - -* Specify -* module name -* which version to disable -* which version to enable -* which specific package from the module to install (optional) - - -Scenario 4 ----------- - -The goal is to enable or disable some repos, -remove any associated repo files no longer needed, -and then perform a system update. - -For this scenario: - -* Specify -* repo names to be disabled -* repo names to be enabled -* the files to be removed -* whether to perform the system update - - -Documentation Impact -==================== - -tripleo-docs will be updated to point to the new supported -repo/modules/versions setting workflow in tripleo-repos. - -References to old sources of settings such as tripleo-ansible, -release files in tripleo-quickstart and the repo-setup role -will have to be removed and replaced to point to the new -workflow. diff --git a/specs/xena/whole-disk-default.rst b/specs/xena/whole-disk-default.rst deleted file mode 100644 index 7b7f67fd..00000000 --- a/specs/xena/whole-disk-default.rst +++ /dev/null @@ -1,307 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=================================== -Deploy whole disk images by default -=================================== - -https://blueprints.launchpad.net/tripleo/+spec/whole-disk-default - -This blueprint tracks the tasks required to switch to whole-disk overcloud -images by default instead of the current overcloud-full partition image. 
- -Whole disk images vs partition images -===================================== - -The current overcloud-full partition image consists of the following: - -* A compressed qcow2 image file which contains a single root partition with - all the image contents - -* A kernel image file for the kernel to boot - -* A ramdisk file to boot with the kernel - -Whereas the overcloud-hardened-uefi-full whole-disk image consists of a single -compressed qcow2 image containing the following: - -* A partition layout containing UEFI boot, legacy boot, and a root partition - -* The root partition contains a single lvm group with a number of logical - volumes of different sizes which are mounted at /, /tmp, /var, /var/log, etc. - -When a partition image is deployed, ironic-python-agent does the following on -the baremetal disk being deployed to: - -* Creates the boot and root partitions on the disk - -* Copies the partition image contents to the root partition - -* Populates the empty boot partition with everything required to boot, including - the kernel image, ramdisk file, a generated grub config, and an installed - grub binary - -When a whole-disk image is deployed, ironic-python-agent simply copies the whole -image to the disk. - -When the partition image deploy boots for the first time, the root partition -grows to take up all of the available disk space. This mechanism is provided -by the base cloud image. There is no equivalent partition growing mechanism -for a multi-volume LVM whole-disk image. - -Problem Description -=================== - -The capability to build and deploy a whole-disk overcloud image has been -available for many releases, but it is time to switch to this as the default. -Doing this will avoid the following issues and bring the following benefits: - -* As of CentOS-8.4, grub will stop support for installing the bootloader on a - UEFI system. ironic-python-agent depends on grub installs to set up EFI boot - with partition images, so UEFI boot will stop working when CentOS 8.4 is - used. - -* Other than this new grub behaviour, keeping partition boot working in - ironic-python-agent has been a development burden and involves code - complexity which is avoided for whole-disk deployments. - -* TripleO users are increasingly wanting to deploy with UEFI Secure Boot - enabled, this is only possible with whole-disk images that use the signed - shim bootloader. - -* Partition images need to be distributed with kernel and ramdisk files, adding - complexity to file management of deployed images compared to a single - whole-disk image file. - -* The `requirements for a hardened image`_ includes having separate volumes for - root, data etc. All TripleO users get the security benefit of hardened images - when a whole-disk image is used. - -* We currently need dedicated CI jobs both in the upstream check/gate (when the - relevant files changed) but also in periodic integration lines, to build and - publish the latest 'current-tripleo' version of the hardened images. In the long - term, only a single hardend UEFI whole-disk image needs to be built and - published, reducing the CI footprint. (in the short term, CI footprint may go up - so the whole-disk image can be published, and while hardened vs hardened-uefi - jobs are refactored. - -Proposed Change -=============== - -Overview --------- - -Wherever the partition image overcloud-full.qcow2 is built, published, or used -needs to be updated to use overcloud-hardened-uefi-full.qcow2 by default. 
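-
-For reference - exact options may vary between releases - an operator can
-already opt in to the whole-disk image explicitly today, which is roughly the
-behaviour the default path should converge on::
-
-   openstack overcloud image upload --image-path /home/stack/images \
-       --os-image-name overcloud-hardened-uefi-full.qcow2 --whole-disk
-
-The sections below describe making this the default rather than an explicit
-choice.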
- -This blueprint will be considered complete when it is possible to follow the -default path in the documentation and the result is an overcloud deployed -with whole-disk images. - -Image upload tool -+++++++++++++++++ - -The default behaviour of ``openstack overcloud image upload`` needs to be -aware that overcloud-hardened-uefi-full.qcow2 should be uploaded by default -when it is detected in the local directory. - -Reviewing image build YAML -++++++++++++++++++++++++++ - -Once the periodic jobs are updated, image YAML defining -overcloud-hardened-full can be deleted, leaving only -overcloud-hardened-uefi-full. Other refactoring can be done such as renaming --python3.yaml back to -base.yaml. - -Reviewing partition layout -++++++++++++++++++++++++++ - -Swift data is stored in ``/srv`` and according to the criteria of hardened -images this should be in its own partition. This will need to be added to the -existing partition layout for whole-disk UEFI images. - -Partition growing -+++++++++++++++++ - -On node first boot, a replacement mechanism for growing the root partition is -required. This is a harder problem for the multiple LVM volumes which the -whole-disk image creates. Generally the ``/var`` volume should grow to take -available disk space because this is where TripleO and OpenStack services store -their state, but sometimes ``/srv`` will need to grow for Swift storage, and -sometimes there may need to be a proportional split of multiple volumes. This -suggests that there will be new tripleo-heat-templates variables which will -specify the volume/proportion growth behaviour on a per-role basis. - -A new utility is required which automates this LVM volume growing -requirement. It could be implemented a number of ways: - -1. A new project/package containing the utility, installed on the image and - run by first-boot or early tripleo-ansible. - -2. A utility script installed by a diskimage-builder/tripleo-image-elements - element and run by first-boot or as a first-boot ansible task (post-provisioning - or early deploy). - -3. Implement entirely in an ansible role, either in its own repository, or as - part of tripleo-ansible. It would be run by early tripleo-ansible. - -This utility will also be useful to other cloud workloads which use LVM based -images, so some consideration is needed for making it a general purpose tool -which can be used outside an overcloud image. Because of this, option 2. is -proposed initially as the preferred way to install this utility, and it will -be proposed as a new element in diskimage-builder. Being coupled with -diskimage-builder means the utility can make assumptions about the partition -layout: - -* a single Volume Group that defaults to name ``vg`` - -* volume partitions are formatted with XFS, which can be resized while mounted - -Alternatives ------------- - -Because of the grub situation, the only real alternative is dropping support -for UEFI boot, which means only supporting legacy BIOS boot indefinitely. -This would likely have negative feedback from end-users. - -Security Impact ---------------- - -* All deployments will use images that comply with the hardened-image - requirements, so deployments will gain these security benefits - -* Whole disk images are UEFI Secure Boot enabled, so this blueprint brings us - closer to recommending that Secure Boot be switched on always. This will - validate to users that they have deployed boot/kernel binaries signed by Red - Hat. 
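-
-To make the "Partition growing" discussion above a bit more concrete: under the
-assumptions stated there (a single volume group named ``vg``, XFS-formatted
-logical volumes), the core of the proposed utility boils down to a few standard
-commands. A minimal sketch, where the device and volume names are assumptions
-for illustration only::
-
-   # grow the partition holding the LVM physical volume (cloud-utils-growpart)
-   growpart /dev/sda 3
-   # make the additional space visible to LVM
-   pvresize /dev/sda3
-   # hand the free space to the volume selected by the role's growth policy
-   lvextend -l +100%FREE /dev/vg/lv_var
-   # XFS can be grown online, while the volume is mounted
-   xfs_growfs /var
-
-The real utility additionally needs to consume the per-role volume/proportion
-mapping described above instead of hardcoding a single volume.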
- -Upgrade Impact --------------- - -Nodes upgraded in-place will continue to be partition image based, and -new/replaced nodes will be deployed with whole-disk images. This doesn't have -a specific upgrade implication, unless we document an option for replacing -every node in order to ensure all nodes are deployed with whole-disk images. - -Other End User Impact ---------------------- - -There is little end-user impact other than: - -* The change of habit required to use overcloud-hardened-uefi-full.qcow2 - instead of overcloud-full.qcow2 - -* The need to set the heat variable if custom partition growing behaviour is - required - -Performance Impact ------------------- - -There is no known performance impact with this change. - -Other Deployer Impact ---------------------- - -All deployer impacts have already been mentioned elsewhere. - -Developer Impact ----------------- - -There are no developer impacts beyond the already mentioned deployer impacts. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - Steve Baker - -Work Items ----------- - -* python-tripleoclient: image upload command, handle - overcloud-hardened-uefi-full.qcow2 as the default if it exists locally - -* tripleo-ansible/cli-overcloud-node-provision.yaml: detect - overcloud-hardened-uefi-full.(qcow2|raw) as the default if it exists in - /var/lib/ironic/images - -* RDO jobs: - * add periodic job for overcloud-hardened-uefi-full - * remove periodic job for overcloud-hardened-full - * modify image publishing jobs to publish overcloud-hardened-uefi-full.qcow2 - -* tripleo-image-elements/overcloud-partition-uefi: add ``/srv`` logical volume - for swift data - -* tripleo-quickstart-extras: Use the whole_disk_images=True variable to switch to - downloading/uploading/deploying overcloud-hardened-uefi-full.qcow2 - -* tripleo-ci/featureset001/002: Enable whole_disk_images=True - -* diskimage-builder: Add new element which installs utility for growing LVM - volumes based on specific volume/proportion mappings - -* tripleo-common/image-yaml: - * refactor to remove non-uefi hardened image - * rename -python3.yaml back to -base.yaml - * add the element which installs the grow partition utility - -* tripleo-heat-templates: Define variables for driving partition growth - volume/proportion mappings - -* tripleo-ansible: Consume the volume/proportion mapping and run the volume - growing utility on every node in early boot. - -* tripleo-docs: - * Update the documentation for deploying whole-disk images by default - * Document variables for controlling partition growth - -Dependencies -============ - -Unless diskimage-builder require separate tracking to add the partition -growth utility, all tasks can be tracked under this blueprint. - -Testing -======= - -Image building and publishing ------------------------------ - -Periodic jobs which build images, and jobs which build and publish images to -downloadable locations need to be updated to build and publish -overcloud-hardened-uefi-full.qcow2. Initially this can be in parallel with -the existing overcloud-full.qcow2 publishing, but eventually that can be -switched off. - -overcloud-hardened-full.qcow2 is the same as -overcloud-hardened-uefi-full.qcow2 except that it only supports legacy BIOS -booting. Since overcloud-hardened-uefi-full.qcow2 supports both legacy BIOS -and UEFI boot, the periodic jobs which build overcloud-hardened-full.qcow2 -can be switched off from Wallaby onwards (assuming these changes are backported -as far back as Wallaby). 
- -CI support ----------- - -CI jobs which consume published images need to be modified so they can -download overcloud-hardened-uefi-full.qcow2 and deploy it as a whole-disk -image. - -Documentation Impact -==================== - -The TripleO Deployment Guide needs to be modified so that -overcloud-hardened-uefi-full.qcow2 is referred to throughout, and so that it -correctly documents deploying a whole-disk image based overcloud. - -References -========== - -.. _requirements for a hardened image: https://teknoarticles.blogspot.com/2017/07/build-and-use-security-hardened-images.html diff --git a/specs/yoga/taskcore-directord.rst b/specs/yoga/taskcore-directord.rst deleted file mode 100644 index 86d914f9..00000000 --- a/specs/yoga/taskcore-directord.rst +++ /dev/null @@ -1,514 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================================================== -Unifying TripleO Orchestration with Task-Core and Directord -=========================================================== - -Include the URL of your launchpad blueprint: -https://blueprints.launchpad.net/tripleo/+spec/unified-orchestration - -The purpose of this spec is to introduce core concepts around Task-Core and -Directord, explain their benefits, and cover why the project should migrate -from using Ansible to using Directord and Task-Core. - -TripleO has long been established as an enterprise deployment solution for -OpenStack. Different task executions have been used at different times. -Originally, os-collect-config was used, then the switch to Ansible was -completed. A new task execution environment will enable moving forward -with a solution designed around the specific needs of TripleO. - -The tools being introduced are Task-Core and Directord. - -Task-Core_: - A dependency management and inventory graph solution which allows operators - to define tasks in simple terms with robust dominion over a given - environment. Declarative dependencies will ensure that if a container/config - is changed, only the necessary services are reloaded/restarted. Task-Core - provides access to the right tools for a given job with provenance, allowing - operators and developers to define outcomes confidently. - -Directord_: - A deployment framework built to manage the data center life cycle, which is - both modular and fast. Directord focuses on consistently maintaining - deployment expectations with a near real-time level of performance_ at almost - any scale. - - -Problem Description -=================== - -Task execution in TripleO is: - -* Slow -* Resource intensive -* Complex -* Defined in a static and sequential order -* Not optimized for scale - -TripleO presently uses Ansible to achieve its task execution orchestration -goals. While the TripleO tooling around Ansible (playbooks, roles, modules, -plugins) has worked and is likely to continue working should maintainers bear -an increased burden, future changes around direction due to `Ansible Execution -Environments`_ provide an inflection point. These upstream changes within -Ansible, where it is fundamentally moving away from the TripleO use case, force -TripleO maintainers to take on more ownership for no additional benefit. The -TripleO use case is actively working against the future direction of Ansible. - -Further, the Ansible lifecycle has never matched that of TripleO. 
A single -consistent and backwards compatible Ansible version can not be used across a -single version of TripleO without the tripleo-core team committing to maintain -that version of Ansible, or commit to updating the Ansible version in a stable -TripleO release. The cost to maintain a tool such as Ansible that the core team -does not own is high vs switching to custom tools designed specifically for the -TripleO use case. - -The additional cost of maintaining Ansible as the task execution engine for -TripleO, has a high likelihood of causing a significant disruption to the -TripleO project; this is especially true as the project looks to support future -OS versions. - -Presently, there are diminishing benefits that can be realized from any -meaningful performance, scale, or configurability improvments. The -simplification efforts and work around custom Ansible strategies and plugins -have reached a conclusion in terms of returns. - -While other framework changes to expose scaling mechanisms, such as using -``--limit`` or partitioning of the ansible execution across multiple stacks or -roles do help with the scaling problem, they are however in the category of -work arounds as they do not directly address the inherent scaling issues with -task executions. - -Proposed Change -=============== - -To make meaningful task execution orchestration improvements, TripleO must -simplify the framework with new tools, enable developers to build intelligent -tasks, and provide meaningful performance enhancements that scale to meet -operators' expectations. If TripleO can capitalize on this moment, it will -improve the quality of life for day one deployers and day two operations and -upgrades. - -The proposal is to replace all usage of Ansible with Directord for task -execution, and add the usage of Task-Core for dynamic task dependencies. - -In some ways, the move toward Task-Core and Directord creates a -General-Problem_, as it's proposing the replacement of many bespoke tools, which -are well known, with two new homegrown ones. Be that as it may, much attention -has been given to the user experience, addressing many well-known pain points -commonly associated with TripleO environments, including: scale, barrier to -entry, execution times, and the complex step process. - -Overview --------- - -This specification consists of two parts that work together to achieve the -project goals. - -Task-Core: - Task-Core builds upon native OpenStack libraries to create a dependency graph - and executes a compiled solution. With Task-Core, TripleO will be able to - define a deployment with dependencies instead of brute-forcing one. While - powerful, Task-Core keeps development easy and consistent, reducing the time - to deliver and allowing developers to focus on their actual deliverable, not - the orchestration details. Task-Core also guarantees reproducible builds, - runtime awareness, and the ability to resume when issues are encountered. - -* Templates containing step-logic and ad-hoc tasks will be refactored into - Task-Core definitions. - -* Each component can have its own Task-Core purpose, providing resources and - allowing other resources to depend on it. - -* The invocation of Task-Core will be baked into the TripleO client, it will - not have to be invoked as a separate deployment step. - -* Advanced users will be able to use Task-Core to meet their environment - expectations without fully understanding the deployment nuance of multiple - bespoke systems. 
- -* Employs a validation system around inputs to ensure they are correct before - starting the deployment. While the validation wont ensure an operational - deployment, it will remove some issues caused by incorrect user input, such - as missing dependent services or duplicate services; providing early feedback - to deployers so they're able to make corrections before running longer - operations. - -Directord: - Directord provides a modular execution platform that is aware of managed - nodes. Because Directord leverages messaging, the platform can guarantee - availability, transport, and performance. Directord has been built from the - ground up, making use of industry-standard messaging protocols which ensure - pseudo-real-time performance and limited resource utilization. The built-in - DSL provides most of what the TripleO project will require out of the box. - Because no solution is perfect, Directord utilizes a plugin system that will - allow developers to create new functionality without compromise or needing to - modify core components. Additionally, plugins are handled the same, allowing - Directord to ensure the delivery and execution performance remain consistent. - -* Directord is a single application that is ideally suited for containers while - also providing native hooks into systems; this allows Directord to operate in - heterogeneous environments. Because Directord is a simplified application, - operators can choose how they want to run it and are not forced into a one size - fits all solution. - -* Directord is platform-agnostic, allowing it to run across systems, versions, - and network topologies while simultaneously guaranteeing it maintains the - smallest possible footprint. - -* Directord is built upon messaging, giving it the unique ability to span - network topologies with varying latencies; messaging protocols compensate for - high latency environments and will finally give TripleO the ability to address - multiple data-centers and fully embrace "the edge." - -* Directord client/server communication is secured (TLS, etc) and encrypted. - -* Directord node management to address unreachable or flapping clients. - -With Task-Core and Directord, TripleO will have an intelligent dependency graph -that is both easy to understand and extend. TripleO will now be aware of things -like service dependencies, making it possible to run day two operations quickly -and more efficiently (e.g, update and restart only dependent services). -Finally, TripleO will shrink its maintenance burden by eliminating Ansible. - - -Alternatives ------------- - -Stay the course with Ansible - -Continuing with Ansible for task execution means that the TripleO core team -embraces maintaining Ansible for the specific TripleO use case. Additionally, -the TripleO project begins documenting the scale limitations and the boundaries -that exist due to the nature of task execution. Focus needs to shift to the -required maintenance necessary for functional expectations TripleO. Specific -Ansible versions also need to be maintained beyond their upstream lifecycle. -This maintenance would likely include maintaining an Ansible branch where -security and bug fixes could be backported, with our own project CI to validate -functionality. - -TripleO could also embrace the use of `Ansible Execution Environments`_ through -continued investigative efforts. Although, if TripleO is already maintaining -Ansible, this would not be strictly required. 
- - -Security Impact --------------- - -Task-Core and Directord are two new tools and attack surfaces, which will -require a new security assessment to be performed to ensure the tooling -exceeds the standard already set. That said, steps have already been taken to -ensure the new proposed architecture is FIPS_ compatible, and enforces -`transport encryption`_. - -Directord also uses `ssh-python`_ for bootstrapping tasks. - -Ansible will be removed, and will no longer have a security impact within -TripleO. - - -Upgrade Impact -------------- - -The undercloud can be upgraded in place to use Directord and Task-Core. There -will be upgrade tasks that will migrate the undercloud as necessary to use the -new tools. - -The overcloud can also be upgraded in place with the new tools. Upgrade tasks -will be migrated to use the Directord DSL just like deployment tasks. This spec -proposes no changes to the overcloud architecture itself. - -As part of the upgrade task migration, the tasks can be rewritten to take -advantage of the new features exposed by these tools. With the introduction of -Task-Core, upgrade tasks can use well-defined dependencies for dynamic -ordering. Just like deployment, update/upgrade times will be decreased due to -the anticipated performance increases. - - -Other End User Impact --------------------- - -When following the `happy path`_, the end-user, deployers, and operators will -not interact with this change as the user interface will effectively remain the -same. However, the user experience will change. Operators accustomed to Ansible -tasks, logging, and output will instead need to become familiar with those -same aspects of Directord and Task-Core. - -If an operator wishes to leverage the advanced capabilities of either -Task-Core or Directord, the tooling will have documented end user interfaces -available for things such as custom components and orchestrations. - -It should be noted that there's a change in deployment architecture in that -Directord follows a server/client model, albeit an ephemeral one. This change -aims to be fully transparent; however, it is something that end users and -deployers will need to be aware of. - - -Performance Impact ------------------ - -This specification will have a positive impact on performance. Due to the -messaging architecture of Directord, near-realtime task execution will be -possible in parallel across all nodes. - -* Performance_ analysis has been done comparing configurability and runtime of - Directord vs. Ansible, the TripleO default orchestration tool. This analysis - highlights some of the performance gains this specification will provide; - initial testing suggests that Task-Core and Directord are more than 10x - faster than our current tool chain, representing a potential 90% time savings - in just the task execution overhead. - -* One of the goals of this specification is to remove impediments in the time - to work. Deployers should not be spending exorbitant time waiting for tools to - do work; in some cases, waiting longer for a worker to be available than it - would take to perform a task manually. - -* Improvements from being able to execute more efficiently in parallel. The - Ansible strategy work allowed us to run tasks from a given Ansible play in - parallel across the nodes. However, this was limited to effectively a single - play per node in terms of execution.
-  The granularity was limited to a play: an Ansible play with 100 items of
-  work for one role and 10 items of work for another would run in parallel
-  across the nodes, but the role with 10 items of work would likely finish
-  first and the overall execution would still have to wait until the entire
-  play was completed everywhere. The long pole for a play's execution is the
-  node with the largest set of tasks. With the transition to task-core and
-  directord, the overall unit of work is an orchestration, which may have 5
-  tasks. If we take the same 100 tasks for one role and split them up into 20
-  orchestrations that can be run in parallel, and the 10 items of work into
-  two orchestrations for the other role, we are able to better execute the
-  work in parallel when there are no specific ordering requirements.
-  Improvements are expected around host prep tasks and other services where we
-  do not have specific ordering requirements. Today these tasks get put in a
-  random spot within a play and have to wait on other unrelated tasks to
-  complete before being run. We expect less execution overhead time per the
-  other items in this section; however, the overall improvement is limited by
-  how well we can remove unnecessary ordering requirements.
-
-* Deployers will no longer be required to run a massive server for a
-  medium-scale deployment. Regardless of size, the memory footprint and
-  compute cores needed to execute a deployment will be significantly reduced.
-
-
-Other Deployer Impact
----------------------
-
-Task-Core and Directord represent an unknown factor; as such, they are
-**not** battle-tested and will create uncertainty in an otherwise "stable_"
-project.
-
-Deployers will experience the time savings of doing deployments. Deployers who
-implement new services will need to do so with Directord and Task-Core.
-
-Extensive testing has been done;
-all known use-cases, from system-level configuration to container pod
-orchestration, have been covered, and automated tests have been created to
-ensure nothing breaks unexpectedly. Additionally, for the first time, these
-projects have expectations on performance, with tests backing up those claims,
-even at a large scale.
-
-At present, TripleO assumes SSH access between the Undercloud and
-Overcloud is always present. Additionally, TripleO assumes the infrastructure
-is relatively static, making day two operations risky and potentially painful.
-Task-Core will reduce the computational burden when crafting action plans, and
-Directord will ensure actions are always performed against the functional
-hosts.
-
-Another area this specification will improve is vendor integrations. Vendors
-will be able to provide meaningful task definitions which leverage an
-intelligent inventory and dependency system. No longer will TripleO require
-vendors to have in-depth knowledge of every deployment detail, even those
-outside of the scope of their deliverable. Easing the job definitions,
-simplifying the development process, and speeding up the execution of tasks
-are all positive impacts on deployers.
-
-Test clouds are still highly recommended sources of information; however,
-system requirements on the Undercloud will be reduced. By reducing the
-resources required to operate the Undercloud, the cost of test environments,
-in terms of both hardware and time, will be significantly lowered. With a
-lower barrier to entry, developers and operators alike will be able to more
-easily contribute to the overall project.
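-
-As an illustration of the orchestration split discussed in the Performance
-Impact section above, the following sketch shows two orchestration files with
-no ordering relationship between them, which could therefore be dispatched
-concurrently. The layout loosely follows the orchestration examples in the
-Directord documentation, but the target names, file names, and commands are
-assumptions made up for this example::
-
-  # host-prep.yaml - no dependency on any other orchestration
-  - targets:
-      - overcloud-all
-    jobs:
-      - RUN: systemctl enable --now chronyd
-
-  # keystone-db-sync.yaml - independent of host prep, can run concurrently
-  - targets:
-      - controller-primary
-    jobs:
-      - RUN: podman exec keystone keystone-manage db_sync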
- - -Developer Impact ----------------- - -To fully realize the benefits of this specification Ansible tasks will need to -be refactored into the Task-Core scheme. While Task-Core can run Ansible and -Directord has a plugin system which easily allows developers to port legacy -modules into Directord plugins, there will be a developer impact as the TripleO -development methodology will change. It's fair to say that the potential -developer impact will be huge, yet, the shift isn't monumental. Much of the -Ansible presently in TripleO is shell-oriented, and as such, it is easily -portable and as stated, compatibility layers exist allowing the TripleO project -to make the required shift gradually. Once the Ansible tasks are -ported, the time saved in execution will be significant. - -Example `Task-Core and Directord implementation for Keystone`_: - While this implementation example is fairly basic, it does result in a - functional Keystone environment and in roughly 5 minutes and includes - services like MySQL, RabbitMQ, Keystone as well as ensuring that the - operating systems is setup and configured for a cloud execution environment. - The most powerful aspect of this example is the inclusion of the graph - dependency system which will allow us easily externalize services. - -* The use of advanced messaging protocols instead of SSH means TripleO can more - efficiently address deployments in local data centers or at the edge - -* The Directord server and storage can be easily offloaded, making it possible - for the TripleO Client to be executed from simple environments without access - to the overcloud network; imagine running a massive deployment from a laptop. - - -Implementation -============== - -In terms of essential TripleO integration, most of the work will occur within -the tripleoclient_, with the following new workflow. - -`Execution Workflow`_:: - - ┌────┐ ┌─────────────┐ ┌────┐ ┌─────────┐ ┌─────────┬──────┐ ??????????? - │USER├──►│TripleOclient├──►│Heat├──►│Task-Core├──►│Directord│Server├──►? Network ? - └────┘ └─────────────┘ └────┘ └─────────┘ └─────────┴──────┘ ??????????? - ▲ ▲ ▲ - │ ┌─────────┬───────┐ | | - └──────────────────────►│Directord│Storage│◄──┘ | - └─────────┴───────┘ | - | - ┌─────────┬──────┐ | - │Directord│Client│◄───────┘ - └─────────┴──────┘ - -* Directord|Server - Task executor connecting to client. - -* Directord|Client - Client program running on remote hosts connecting back to - the Directord|Server. - -* Directord|Storage - An optional component, when not externalized, Directord will - maintain the runtime storage internally. In this configuration Directord is - ephemeral. - -To enable a gradual transition, ansible-runner_ has been implemented within -Task-Core, allowing the TripleO project to convert playbooks into tasks that -rely upon strongly typed dependencies without requiring a complete rewrite. The -initial implementation should be transparent. Once the Task-Core hooks are set -within tripleoclient_ functional groups can then convert their tripleo-ansible_ -roles or ad-hoc Ansible tasks into Directord orchestrations. Teams will have -the flexibility to transition code over time and are incentivized by a -significantly improved user experience and shorter time to delivery. - - -Assignee(s) ------------ - -Primary assignee: - * Cloudnull - Kevin Carter - * Mwhahaha - Alex Schultz - * Slagle - James Slagle - - -Other contributors: - * ??? - - -Work Items ----------- - -#. Migrate Directord and Task-Core to the OpenStack namespace. -#. 
Package all of Task-Core, Directord, and dependencies for pypi -#. RPM Package all of Task-Core, Directord, and dependencies for RDO -#. Directord container image build integration within TripleO / tcib -#. Converge on a Directord deployment model (container, system, hybrid). -#. Implement the Task-Core code path within TripleO client. -#. Port in template Ansible tasks to Directord orchestrations. -#. Port Ansible roles into Directord orchestrations. -#. Port Ansible modules and actions into pure Python or Directord components -#. Port Ansible workflows in tripleoclient into pure Python or Directord - orchestrations. -#. Migration tooling for Heat templates, Ansible roles/modules/actions. -#. Port Ansible playbook workflows in tripleoclient to pure Python or - Directord orchestrations. -#. Undercloud upgrade tasks to migrate to Directord + Task-Core architecture -#. Overcloud upgrade tasks to migrate to enable Directord client bootstrapping - - -Dependencies -============ - -Both Task-Core and Directord are dependencies, as they're new projects. These -dependencies may or may not be brought into the OpenStack namespace; -regardless, both of these projects, and their associated dependencies, will -need to be packaged and provided for by RDO. - - -Testing -======= - -If successful, the implementation of Task-Core and Directord will leave the -existing testing infrastructure unchanged. TripleO will continue to function as -it currently does through the use of the tripleoclient_. - -New tests will be created to ensure the Task-Core and Directord components -remain functional and provide an SLA around performance and configurability -expectations. - - -Documentation Impact -==================== - -Documentation around Ansible will need to be refactored. - -New documentation will need to be created to describe the advanced -usage of Task-Core and Directord. Much of the client interactions from the -"`happy path`_" will remain unchanged. - - -References -========== - -* Directord official documentation https://directord.com - -* Ansible's decision to pivot to execution environments: - https://ansible-runner.readthedocs.io/en/latest/execution_environments.html - -.. _Task-Core: https://github.com/mwhahaha/task-core - -.. _Directord: https://github.com/cloudnull/directord - -.. _General-Problem: https://xkcd.com/974 - -.. _`legacy tooling`: https://xkcd.com/1822 - -.. _`transport encryption`: https://directord.com/drivers.html - -.. _FIPS: https://en.wikipedia.org/wiki/Federal_Information_Processing_Standards - -.. _Performance: https://directord.com/overview.html#comparative-analysis - -.. _practical: https://xkcd.com/382 - -.. _stable: https://xkcd.com/1343 - -.. _validation: https://xkcd.com/327 - -.. _scheme: https://github.com/mwhahaha/task-core/tree/main/schema - -.. _`Task-Core and Directord implementation for Keystone`: https://raw.githubusercontent.com/mwhahaha/task-core/main/examples/directord/services/openstack-keystone.yaml - -.. _`happy path`: https://xkcd.com/85 - -.. _tripleoclient: https://github.com/openstack/python-tripleoclient - -.. _`Execution Workflow`: https://review.opendev.org/c/openstack/tripleo-heat-templates/+/798747 - -.. _ansible-runner: https://github.com/ansible/ansible-runner - -.. _tripleo-ansible: https://github.com/openstack/tripleo-ansible - -.. _`Ansible Execution Environments`: https://ansible-runner.readthedocs.io/en/latest/execution_environments.html - -.. 
_`ssh-python`: https://pypi.org/project/ssh-python diff --git a/specs/yoga/tripleo_ceph_ingress.rst b/specs/yoga/tripleo_ceph_ingress.rst deleted file mode 100644 index 725894b9..00000000 --- a/specs/yoga/tripleo_ceph_ingress.rst +++ /dev/null @@ -1,259 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================================== -TripleO Ceph Ingress Daemon Integration -=========================================== - -Starting in the Octopus release, Ceph introduced its own day1 tool called -cephadm and its own day2 tool called orchestrator which replaced ceph-ansible. -During the Wallaby and Xena cycles TripleO moved away from ceph-ansible and -adopted cephadm [1]_ as described in [2]_. -During Xena cycle a new approach of deploying Ceph in a TripleO context has -been established and now a Ceph cluster can be provisioned before the overcloud -is created, leaving to the overcloud deployment phase the final configuration -of the Ceph cluster which depends on the OpenStack enabled services defined by -the tripleo-heat-templates interface. -The next goal is to deploy as many Ceph services as possible using the deployed -ceph interface instead of during overcloud deployment. -As part of this effort, we should pay attention to the high-availability aspect, -how it's implemented in the current release and how it should be changed for -Ceph. -This spec represents a follow up of [3]_, it defines the requirements to rely -on the Ceph provided HA daemons and describes the changes required in TripleO -to meet this goal. - -Problem Description -=================== - -In the following description we are referring to the Ganesha daemon and the -need of the related Ceph Ingress daemon deployment, but the same applies to -all the existing daemons that requires an high-availability configuration -(e.g., RGW and the Ceph dashboard for the next Ceph release). -In TripleO we support deployment of Ganesha both when the Ceph cluster is -itself managed by TripleO and when the Ceph cluster is itself not managed by -TripleO. -When the cluster is managed by TripleO, as per spec [3]_, it is preferable to -have cephadm manage the lifecycle of the NFS container instead of deploying it -with tripleo-ansible, and this is broadly covered and solved by allowing the -tripleo Ceph mkspec module to support the new Ceph daemon [4]_. -The ceph-nfs daemon deployed by cephadm has its own HA mechanism, called -ingress, which is based on haproxy and keepalived [5]_ so we would no longer -use pcmk as the VIP owner. -This means we would run pcmk and keepalived in addition to haproxy (deployed by -tripleo) and another haproxy (deployed by cephadm) on the same server (though -with listeners on different ports). -This approach only relies on Ceph components, and both external and internal -scenarios are covered. -However, adopting the ingress daemon for a TripleO deployed Ceph cluster means -that we need to make the overcloud aware about the new running services: for -this reason the proposed change is meant to introduce a new TripleO resource -that properly handles the interface with the Ceph services and is consistent -with the tripleo-heat-templates roles. - -Proposed Change -=============== - -Overview --------- - -The change proposed by this spec requires the introduction of a new TripleO -Ceph Ingress resource that describes the ingress service that provides load -balancing and HA. 
-The impact of adding a new `OS::TripleO::Services::CephIngress` resource can
-be seen on the following projects.
-
-
-tripleo-common
---------------
-
-As described in Container Image Preparation [6]_ the undercloud may be used as
-a container registry for all the ceph related containers, and a new supported
-syntax has been introduced to `deployed ceph` to download containers from
-authenticated registries.
-However, as per [7]_, the Ceph ingress daemons won’t be baked into the Ceph
-daemon container, hence `tripleo container image prepare` should be executed to
-pull the new container images/tags into the undercloud, as is done for the Ceph
-Dashboard and the regular Ceph image.
-Once the ingress containers are available, it's possible to deploy the daemon
-on top of ceph-nfs or ceph-rgw.
-In particular, if this spec is implemented, `deployed ceph` will be
-the only way of setting up this daemon through cephadm for ceph-nfs, resulting
-in a simplified tripleo-heat-templates interface and fewer tripleo-ansible
-task executions, because part of the configuration is moved to before the
-overcloud is deployed.
-As part of this effort, considering that the Ceph related container images have
-grown over time, a new condition will be added to the tripleo-container jinja
-template [8]_ to avoid pulling additional ceph images if Ceph is not deployed by
-TripleO [10]_.
-This will result in a new optimization for all the Ceph external cluster use
-cases, as well as the existing CI jobs without Ceph.
-
-tripleo-heat-templates
-----------------------
-A Heat resource will be created within the cephadm space. The new resource will
-also be added to the existing Controller roles and all the relevant environment
-files will be updated with the new reference.
-In addition, as described in the spec [3]_, pacemaker constraints for ceph-nfs
-and the related vip will be removed.
-The tripleo-common ceph_spec library is already able to generate the spec for
-this kind of daemon and it will trigger cephadm [4]_ to deploy an ingress daemon
-provided that the NFS Ceph spec is applied against an existing cluster and the
-backend daemon is up and running.
-As mentioned before, the ingress daemon can also be deployed on top of an RGW
-instance, therefore the proposed change is valid for all the Ceph services that
-require an HA configuration.
-
-
-Security Impact
----------------
-
-The ingress daemon applied to an existing ceph-nfs instance is managed by
-cephadm, resulting in a simplified model in terms of lifecycle. A Ceph spec for
-the ingress daemon is generated right after the ceph-nfs instance is applied,
-and as per [5]_ it requires two additional options:
-
-* frontend_port
-* monitoring_port
-
-The two ports are required by haproxy to accept incoming requests and for
-monitoring purposes, hence we need to make TripleO aware of this new service
-and properly set up the firewall rules. As long as the ports defined by the
-spec are passed to the overcloud deployment process and defined in the
-tripleo-heat-templates CephIngress daemon resource, the `firewall_rules`
-tripleo ansible role is run and rules are applied for both the frontend and
-monitoring port. The usual network used by this daemon (and affected by the new
-applied rules) is the `StorageNFS`, but we might have cases where an operator
-overrides it.
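-
-For illustration, the following is a minimal sketch of an ingress
-specification carrying these two ports, following the cephadm ingress format
-referenced in [5]_; the service names, ports and addresses are placeholder
-values rather than the ones TripleO would actually generate::
-
-  service_type: ingress
-  service_id: nfs.storage-nfs
-  placement:
-    count: 2
-  spec:
-    backend_service: nfs.storage-nfs
-    frontend_port: 2049
-    monitoring_port: 9049
-    virtual_ip: 172.17.5.100/24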
-The lifecycle, builds and security aspects for the container images associated
-with the CephIngress resource are not managed by TripleO, and the Ceph
-organization takes care of maintenance and updates.
-
-
-
-Upgrade Impact
---------------
-
-The problem of an existing Ceph cluster is covered by the spec [8]_.
-
-
-Performance Impact
-------------------
-
-Since two new images (and the equivalent tripleo-heat-templates services) have
-been introduced, some time is required to pull these new additional containers
-into the undercloud. However, the tripleo_containers jinja template has been
-updated, splitting off the Ceph related container images. In particular, during
-the container image prepare phase, a new boolean option has been added and
-pulling the Ceph images can be avoided by setting the `ceph_images` boolean to
-false. By doing this we can improve performance when Ceph is not required.
-
-Developer Impact
-----------------
-This effort can be easily extended to move the RGW service to deployed ceph,
-which is out of scope of this spec.
-
-Implementation
-==============
-
-Deployment Flow
----------------
-
-The deployment and configuration described in this spec will happen during
-`openstack overcloud ceph deploy`, as described in [8]_.
-The current implementation of `openstack overcloud network vip provision`
-allows provisioning one VIP per network, which means that using the new Ceph
-Ingress daemon (which requires one VIP per service) can break components that
-are still using the VIP provisioned on the storage network (or any other
-network depending on the tripleo-heat-templates override specified) and
-are managed by pacemaker.
-A new option `--ceph-vip` for the `openstack overcloud ceph deploy` command
-will be added [11]_. This option may be used to reserve VIP(s) for each
-Ceph service specified by the 'service/network' mapping defined as input.
-For instance, a generic ceph service mapping can be something like the
-following::
-
-  ---
-  ceph_services:
-    - service: ceph_nfs
-      network: storage
-    - service: ceph_rgw
-      network: storage
-
-For each service added to the list above, a virtual IP on the specified
-network (which can be a composable network) will be created and used as the
-frontend_vip of the ingress daemon.
-As described in the overview section, an ingress object will be defined
-and deployed and this is supposed to manage both the VIP and the HA for
-this component.
-
-Assignee(s)
------------
-
-- fmount
-- fultonj
-- gfidente
-
-Work Items
-----------
-
-- Create a new Ceph prefixed Heat resource that describes the Ingress daemon
-  in the TripleO context.
-- Add both haproxy and keepalived containers to the Ceph container list so that
-  they can be pulled during the `Container Image preparation` phase.
-- Create a set of tasks to deploy both the nfs and the related ingress
-  daemon
-- Deprecate the pacemaker related configuration for ceph-nfs, including
-  pacemaker constraints between the manila-share service and ceph-nfs
-- Create upgrade playbooks to transition from TripleO/pcmk managed nfs
-  ganesha to nfs/ingress daemons deployed by cephadm and managed by ceph
-  orch
-
-Depending on the state of the directord/task-core migration we might skip the
-ansible part, though we could POC with it to get started, extending the
-existing tripleo-ansible cephadm role.
-
-Dependencies
-============
-
-This work depends on the tripleo_ceph_nfs spec [3]_ that moves from tripleo
-deployed ganesha to the cephadm approach.
- -Testing -======= - -The NFS daemon feature can be enabled at day1 and it will be tested against -the existing TripleO scenario004 [9]_. -As part of the implementation plan, the update of the existing heat templates -environment CI files, which contain both the Heat resources and the testing -job parameters, is one of the goals of this spec. - - -Documentation Impact -==================== - -The documentation will describe the new parameters introduced to the `deployed -ceph` cli to give the ability to deploy additional daemons (ceph-nfs and the -related ingress daemon) as part of deployed ceph. -However, we should provide upgrade instructions for pre existing environments -that need to transition from TripleO/pcmk managed nfs ganesha to nfs daemons -deployed by cephadm and managed by ceph orch. - - -References -========== - -.. [1] `cephadm `_ -.. [2] `tripleo-ceph `_ -.. [3] `tripleo-nfs-spec `_ -.. [4] `tripleo-ceph-mkspec `_ -.. [5] `cephadm-nfs-ingress `_ -.. [6] `container-image-preparation `_ -.. [7] `ceph-ingress-containers `_ -.. [8] `tripleo-common-j2 `_ -.. [9] `tripleo-scenario004 `_ -.. [10] `tripleo-common-split-off `_ -.. [11] `tripleo-ceph-vip `_ diff --git a/specs/yoga/tripleo_ceph_manila.rst b/specs/yoga/tripleo_ceph_manila.rst deleted file mode 100644 index 7f31fff0..00000000 --- a/specs/yoga/tripleo_ceph_manila.rst +++ /dev/null @@ -1,231 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=========================================== -TripleO Ceph Ganesha Integration for Manila -=========================================== - -Starting in the Octopus release, Ceph introduced its own day1 tool called -cephadm and its own day2 tool called orchestrator which replaced ceph-ansible. -During the Wallaby and Xena cycles TripleO moved away from ceph-ansible and -adopted cephadm [1]_ as described in [2]_. -However, the ganesha deamon deployment remained under the tripleo-ansible -control, with a set of tasks that are supposed to replicate the relevant part -of the ceph-nfs ceph-ansible role [3]_. -This choice ensured backward compatibility with the older releases. - -Problem Description -=================== - -In TripleO we support deployment of Ganesha both when the Ceph cluster is -itself managed by TripleO and when the Ceph cluster is itself not managed -by TripleO. -When the cluster is managed by TripleO, an NFS daemon can be deployed as a -regular TripleO service via the tripleo-ansible module [4]_. -It is preferable to have cephadm manage the lifecycle of the NFS container -instead of deploying it with tripleo-ansible. -In order to do this we will require the following changes on both TripleO -and Manila: - -- the orchestrator provides an interface that should be used by Manila to - interact with the ganesha instances. The nfs orchestrator interface is - described in [5]_ and can be used to manipulate the nfs daemon, as well - as create and delete exports. - In the past the ganesha configuration file was fully customized by - ceph-ansible; the orchestrator is going to have a set of overrides to - preserve backwards compatibility. This result is achieved by setting a - userconfig object that lives within the Ceph cluster [5]_. It's going - to be possible to check, change and reset the nfs daemon config using - the same interface provided by the orchestrator [11]_. 
-
-- The deployed NFS daemon is based on the watch_url mechanism [6]_:
-  adopting a cephadm deployed ganesha instance requires the Manila driver
-  be updated to support this new approach. This work is described in [10]_.
-
-- The ceph-nfs daemon deployed by cephadm has its own HA mechanism, called
-  ingress, which is based on haproxy and keepalived [7]_, so we would no
-  longer use pcmk as the VIP owner.
-  Note that this means we would run pcmk and keepalived in addition to haproxy
-  (deployed by tripleo) and another haproxy (deployed by cephadm) on the
-  same server (though with listeners on different ports).
-  Because cephadm is controlling the ganesha life cycle, the pcs cli will
-  no longer be used to interact with the ganesha daemon and we will change
-  where the ingress daemon is used.
-
-When the Ceph cluster is *not* managed by TripleO, the Ganesha service is
-currently deployed standalone on the overcloud and it's configured to use
-the external Ceph MON and MDS daemons.
-However, if this spec is implemented, then the standalone ganesha service
-will no longer be deployed by TripleO. Instead, we will require that the
-admin of the external ceph cluster add the ceph-nfs service to that cluster,
-though TripleO will still configure Manila to use that service.
-
-Thus in the external case, Ganesha won't be deployed and details about the
-external Ganesha must be provided as input during overcloud deployment. We
-will also provide tools to help someone who has deployed Ganesha on the
-overcloud transition the service to their external Ceph cluster. From a high
-level, the process will be the following:
-
-1. Generate a cephadm spec so that after the external ceph cluster becomes
-   managed by cephadm the spec can be used to add the ceph-nfs service
-   with the required properties.
-2. Disable the VIP PCS uses and provide a documented method for it to be
-   moved to the external ceph cluster.
-
-Proposed Change
-===============
-
-Overview
---------
-
-An ansible task will generate the Ceph NFS daemon spec and it will trigger
-cephadm [2]_ to deploy the Ganesha container.
-
-- the NFS spec should be rendered and applied against the existing Ceph
-  cluster
-- the ingress spec should be rendered (as part of the NFS deployment)
-  and applied against the cluster
-
-The container will no longer be controlled by pacemaker.
-
-
-Security Impact
----------------
-
-None, the same code which TripleO would already use for the generation of
-the Ceph cluster config and keyrings will be consumed.
-
-Upgrade Impact
---------------
-
-- We will deprecate the ganesha managed by PCS so that it will still work
-  up until Z.
-- We will provide playbooks which migrate from the old NFS service to the
-  new one.
-- We will assume these playbooks will be available in Z and run prior to
-  the upgrade to the next release.
-
-Other End User Impact
----------------------
-
-For fresh deployments, the existing input parameters will be reused to
-drive the newer deployment tool.
-For an existing environment, after the Ceph upgrade, the TripleO deployed
-NFS instance will be stopped and removed by the migration playbook provided,
-as well as the related pacemaker resources and constraints; cephadm will
-be able to deploy and manage the new NFS instances, and the end user will
-see a disruption in the NFS service.
-
-Performance Impact
-------------------
-
-No changes.
-
-Other Deployer Impact
----------------------
-
-* "deployed ceph": For the first implementation of this spec we'll deploy
-  during overcloud deployment but we will aim to deliver this so that it
-  is compatible with "deployed ceph". VIPs are provisioned with
-  `openstack overcloud network vip provision` before
-  `openstack overcloud network provision` and before
-  `openstack overcloud node provision`, so we would have an ingress VIP in
-  advance and could do this with "deployed ceph".
-
-* directord/task-core: We will ultimately need this implemented for the
-  directord/task-core tool but could start with ansible tasks added to
-  the tripleo_ceph role. Depending on the state of the directord/task-core
-  migration when we implement, we might skip the ansible part, though we
-  could POC with it to get started.
-
-Developer Impact
-----------------
-
-Assuming the manila services are able to interact with Ganesha using the
-watch_url mechanism, the NFS daemon can be generated as a regular Ceph
-daemon using the spec approach provided by the tripleo-ansible module [4]_.
-
-Implementation
-==============
-
-Deployment Flow
----------------
-
-The deployment and configuration described in this spec will happen during
-`openstack overcloud deploy`, as described in [8]_.
-This is consistent with how tripleo-ansible used to run during step2 to
-configure these services. The tripleo-ansible tasks should be moved from a
-pure ansible templating approach, which generates the systemd unit according
-to the input provided, to a cephadm based daemon that can be configured with
-the usual Ceph mgr config-key mechanism.
-As described in the overview section, an ingress object will be defined and
-deployed and this is supposed to manage both the VIP and the HA for this
-component.
-
-Assignee(s)
------------
-
-- fmount
-- fultonj
-- gfidente
-
-Work Items
-----------
-
-- Change the tripleo-ansible module to support the Ceph ingress daemon
-  type
-- Create a set of tasks to deploy both the nfs and the related ingress
-  daemons
-- Deprecate the pacemaker related configuration for ceph-nfs, including
-  pacemaker constraints between the manila-share service and ceph-nfs
-- Create upgrade playbooks to transition from TripleO/pcmk managed nfs
-  ganesha to nfs daemons deployed by cephadm and managed by ceph orch
-
-Dependencies
-============
-
-- This work depends on the manila spec [10]_ that moves from dbus to the
-  watch_url approach
-
-Testing
-=======
-
-The NFS daemon feature can be enabled at day1 and it will be tested against
-the existing TripleO scenario004 [9]_.
-As part of the implementation plan, the update of the existing heat templates
-environment CI files, which contain the testing job parameters, is one of the
-goals of this spec.
-An important aspect of the job definition process is the choice between
-standalone and multinode jobs.
-As seen in the past, multinode can help catch issues that are not visible
-in a standalone environment, but of course the job configuration can be
-improved in the next cycles, and we can start with standalone testing, which
-is what is present today in CI.
-
-Documentation Impact
-====================
-
-No changes should be necessary to the TripleO documentation, as the described
-interface remains unchanged.
-However, we should provide upgrade instructions for pre-existing environments
-that need to transition from TripleO/pcmk managed nfs ganesha to nfs daemons
-deployed by cephadm and managed by ceph orch.
-
-References
-==========
-
-.. [1] `cephadm `_
-..
[2] `tripleo-ceph `_ -.. [3] `tripleo-ceph-ganesha `_ -.. [4] `tripleo-ceph-mkspec `_ -.. [5] `tripleo-ceph-nfs `_ -.. [6] `ganesha-watch_url `_ -.. [7] `cephadm-nfs-ingress `_ -.. [8] `tripleo-cephadm `_ -.. [9] `tripleo-scenario004 `_ -.. [10] `cephfs-nfs-drop-dbus `_ -.. [11] `cephfs-get-config `_ - diff --git a/specs/zed/decouple-tripleo-tasks.rst b/specs/zed/decouple-tripleo-tasks.rst deleted file mode 100644 index 712a662c..00000000 --- a/specs/zed/decouple-tripleo-tasks.rst +++ /dev/null @@ -1,253 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -====================== -Decouple TripleO Tasks -====================== - -https://blueprints.launchpad.net/tripleo/+spec/decouple-tripleo-tasks - -This spec proposes decoupling tasks across TripleO by organizing tasks in a way -that they are grouped as a function of what they manage. The desire is to be -able to better isolate and minimize what tasks need to be run for specific -management operations. The process of decoupling tasks is implemented through -moving tasks into standalone native ansible roles and playbooks in tripleo-ansible. - - -Problem Description -=================== - -TripleO presently manages the entire software configuration of the overcloud at -once each time ``openstack overcloud deploy`` is executed. Regardless of -whether nodes were already deployed, require a full redeploy for some reason, -or are new nodes (scale up, replacement) all tasks are executed. The -functionality of only executing needed tasks lies within Ansible. - -The problem with relying entirely on Ansible to determine if any changes are -needed is that it results in long deploy times. Even if nothing needs to be -done, it can take hours just to have Ansible check each task in order to make -that determination. - -Additionally, TripleO's reliance on external tooling (Puppet, container config -scripts, bootstrap scripts, etc) means that tasks executing those tools -**must** be executed by Ansible as Ansible does not have the necessary data -needed in order to determine if those tasks need to be executed or not. These -tasks often have cascading effects in determining what other tasks need to be -run. This is a general problem across TripleO, and is why the model of just -executing all tasks on each deploy has been the accepted pattern. - - -Proposed Change -=============== - -The spec proposes decoupling tasks and separating them out as needed to manage -different functionality within TripleO. Depending on the desired management -operation, tripleoclient will contain the necessary functionality to trigger -the right tasks. Decoupling and refactoring tasks will be done by migrating to -standalone ansible role and playbooks within tripleo-ansible. This will allow -for reusing the standalone ansible artifacts from tripleo-ansible to be used -natively with just ``ansible-playbook``. At the same time, the -``tripleo-heat-templates`` interfaces are maintained by consuming the new roles -and playbooks from ``tripleo-ansible``. - -Overview --------- - -There are 3 main changes proposed to implement this spec: - -#. Refactor ansible tasks from ``tripleo-heat-templates`` into standalone roles - in tripleo-ansible. -#. Develop standalone playbooks within tripleo-ansible to consume the - tripleo-ansible roles. -#. 
Update tripleo-heat-templates to use the standalone roles and playbooks from - ``tripleo-ansible`` with new ``role_data`` interfaces to drive specific - functionality with new ``openstack overcloud`` commands. - -Writing standalone roles in ``tripleo-ansible`` will largely be an exercise of -copy/paste from tasks lists in ``tripleo-heat-templates``. As tasks are moved -into standalone roles, tripleo-heat-templates can be directly updated to run -tasks from the those roles using ``include_role``. This pattern is already well -established in tripleo-heat-templates with composable services that use -existing standalone roles. - -New playbooks will be developed within tripleo-ansible to drive the standalone -roles using pure ``ansible-playbook``. These playbooks will offer a native -ansible experience for deploying with tripleo-ansible. - -The design principles behind the standalone role and playbooks are: - -#. Native execution with ansible-playbook, an inventory, and variable files. -#. No Heat. While Heat remains part of the TripleO architecture, it has no - bearing on how the native ansible is developed in tripleo-ansible. - tripleo-heat-templates can consume the standalone ansible playbooks and - roles from tripleo-ansible, but it does not dictate the interface. The - interface should be defined for native ansible best practices. -#. No puppet. As the standalone roles are developed, they will not rely on - puppet for configuration or any other tasks. To allow integration with - tripleo-heat-templates and existing TripleO interfaces (Hiera, Heat - parameters), the roles will allow skipping config generation and other parts - that use puppet so that pieces can be overridden by - ``tripleo-heat-templates`` specific tasks. When using native Ansible, - templated config files and native ansible tasks will be used instead of - Puppet. -#. While the decoupled tasks will allow for cleaner interfaces for executing - just specific management operations, all tasks will remain idempotent. A - full deployment that re-runs all tasks will still work, and result in no - effective changes for an already deployed cloud with the same set of inputs. - -The standalone roles will use separated task files for each decoupled -management interface exposed. The playbooks will be separated by management -interface as well to allow for executing just specific management functionality. - -The decoupled management interfaces are defined as: - -* bootstrap -* install -* pre-network -* network -* configure -* container-config -* service-bootstrap - -New task interfaces in ``tripleo-heat-templates`` will be added under -``role_data`` to correspond with the new management interfaces, and consume the -standalone ansible from tripleo-ansible. This will allow executing just -specific management interfaces and using the standalone playbooks from -tripleo-ansible directly. - -New subcommands will be added to tripleoclient to trigger the new management -interface operations, ``openstack overcloud install``, ``openstack overcloud -configure``, etc. - -``openstack overcloud deploy`` would continue to function as it presently does -by doing a full assert of the system state with all tasks. The underlying -playbook, ``deploy-steps-playbook.yaml`` would be updated as necessary to -include the other playbooks so that all tasks can be executed. - -Alternatives ------------- - -:Alternative 1 - Use --tags/--skip-tags: - -With ``--tags`` / ``--skip-tags``, tasks could be selectively executed. 
In the past, this has posed other problems within TripleO. Using tags does not
-allow for composing tasks to the level needed, and often results in running
-tasks when not needed or forgetting to tag needed tasks. Having to add the
-special cased ``always`` tag becomes necessary so that certain tasks are run
-when needed. The tags become difficult to maintain as it is not apparent what
-tasks are tagged when looking at the entire execution. Additionally, not all
-operations within TripleO map to Ansible tasks one to one. Container startups
-are declared in a custom YAML format, and that format is then used as input to
-a task. It is not possible to tag individual container startups unless tag
-handling logic was added to the custom modules used for container startup.
-
-:Alternative 2 - Use --start-at-task:
-
-Using ``--start-at-task`` is likewise problematic, and it does not truly
-partition the full set of tasks. Tasks would need to be reordered anyway across
-much of TripleO so that ``--start-at-task`` would work. It would be more
-straightforward to separate by playbook if a significant number of tasks need
-to be reordered.
-
-Security Impact
----------------
-
-Special consideration should be given to security related tasks to ensure that
-the critical tasks are executed when needed.
-
-Upgrade Impact
---------------
-
-Upgrade and update tasks are already separated out into their own playbooks.
-There is an understanding, however, that the full ``deploy_steps_playbook.yaml``
-is executed after an update or upgrade. This full set of tasks could end
-up being reduced if tasks are sufficiently decoupled in order to run the
-necessary pieces in isolation (config, bootstrap, etc).
-
-Other End User Impact
----------------------
-
-Users will need to be aware of the limitations of using the new management
-commands and playbooks. The expectation within TripleO has always been that the
-entire state of the system is re-asserted on scale up and configure operations.
-
-While the ability to still do a full assert would be present, it would no
-longer be required. Operators and users will need to understand that only
-running certain management operations may not fully apply a desired change. If
-only a reconfiguration is done, it may not imply restarting containers. With
-the move to standalone and native ansible components, with less
-``config-download`` based generation, it should be more obvious what each
-playbook is responsible for managing. The native ansible interfaces will help
-operators reason about what needs to be run and when.
-
-Performance Impact
-------------------
-
-Performance should be improved for the affected management operations due to
-having to run fewer tasks, and being able to run only the tasks needed for a
-given operation.
-
-There should be no impact when running all tasks. Tasks must be refactored in
-such a way that the overall deploy process when all tasks are run is not made
-slower.
-
-Other Deployer Impact
----------------------
-
-Discuss things that will affect how you deploy and configure OpenStack
-that have not already been mentioned, such as:
-
-* What config options are being added? Should they be more generic than
-  proposed (for example a flag that other hypervisor drivers might want to
-  implement as well)? Are the default values ones which will work well in
-  real deployments?
-
-* Is this a change that takes immediate effect after it's merged, or is it
-  something that has to be explicitly enabled?
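-
-To make the decoupled interfaces described in the Overview more concrete, a
-per-interface standalone playbook in tripleo-ansible could look roughly like
-the following sketch; the playbook, role, and variable names here are
-assumptions for illustration rather than settled interfaces::
-
-  # playbooks/configure.yml (illustrative)
-  - name: Run only the configure tasks for the enabled services
-    hosts: overcloud
-    gather_facts: false
-    tasks:
-      - name: Include the configure task file from each standalone role
-        ansible.builtin.include_role:
-          name: "{{ item }}"
-          tasks_from: configure.yml
-        loop: "{{ tripleo_enabled_services | default([]) }}"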
- -Developer Impact ----------------- - -TripleO developers will be responsible for updating the service templates that -they maintain in order to refactor the tasks. - -Implementation -============== - -Assignee(s) ------------ - -Primary assignee: - James Slagle - -Work Items ----------- - -Work items or tasks -- break the feature up into the things that need to be -done to implement it. Those parts might end up being done by different people, -but we're mostly trying to understand the timeline for implementation. - - -Dependencies -============ - -None. - -Testing -======= - -Existing CI jobs would cover changes to task refactorings. -New CI jobs could be added for the new isolated management operations. - -Documentation Impact -==================== - -New commands and playbooks must be documented. - - -References -========== -`standalone-roles POC `_ diff --git a/specs/zed/placeholder.rst b/specs/zed/placeholder.rst deleted file mode 100644 index 58b0d0ce..00000000 --- a/specs/zed/placeholder.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. - This work is licensed under a Creative Commons Attribution 3.0 Unported - License. - - http://creativecommons.org/licenses/by/3.0/legalcode - -=============== -Zed placeholder -=============== diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_titles.py b/tests/test_titles.py deleted file mode 100644 index 2439a44c..00000000 --- a/tests/test_titles.py +++ /dev/null @@ -1,108 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import glob -import re - -import docutils.core -import testtools - - -class TestTitles(testtools.TestCase): - def _get_title(self, section_tree): - section = { - 'subtitles': [], - } - for node in section_tree: - if node.tagname == 'title': - section['name'] = node.rawsource - elif node.tagname == 'section': - subsection = self._get_title(node) - section['subtitles'].append(subsection['name']) - return section - - def _get_titles(self, spec): - titles = {} - for node in spec: - if node.tagname == 'section': - # Note subsection subtitles are thrown away - section = self._get_title(node) - titles[section['name']] = section['subtitles'] - return titles - - def _check_titles(self, filename, expect, actual): - missing_sections = [x for x in expect.keys() if x not in actual.keys()] - extra_sections = [x for x in actual.keys() if x not in expect.keys()] - - msgs = [] - if len(missing_sections) > 0: - msgs.append("Missing sections: %s" % missing_sections) - if len(extra_sections) > 0: - msgs.append("Extra sections: %s" % extra_sections) - - for section in expect.keys(): - missing_subsections = [x for x in expect[section] - if x not in actual[section]] - # extra subsections are allowed - if len(missing_subsections) > 0: - msgs.append("Section '%s' is missing subsections: %s" - % (section, missing_subsections)) - - if len(msgs) > 0: - self.fail("While checking '%s':\n %s" - % (filename, "\n ".join(msgs))) - - def _check_lines_wrapping(self, tpl, raw): - for i, line in enumerate(raw.split("\n")): - if "http://" in line or "https://" in line: - continue - self.assertTrue( - len(line) < 80, - msg="%s:%d: Line limited to a maximum of 79 characters." % - (tpl, i+1)) - - def _check_no_cr(self, tpl, raw): - matches = re.findall('\r', raw) - self.assertEqual( - len(matches), 0, - "Found %s literal carriage returns in file %s" % - (len(matches), tpl)) - - - def _check_trailing_spaces(self, tpl, raw): - for i, line in enumerate(raw.split("\n")): - trailing_spaces = re.findall(" +$", line) - self.assertEqual(len(trailing_spaces),0, - "Found trailing spaces on line %s of %s" % (i+1, tpl)) - - - def test_template(self): - with open("specs/template.rst") as f: - template = f.read() - spec = docutils.core.publish_doctree(template) - template_titles = self._get_titles(spec) - - releases = [x.split('/')[1] for x in glob.glob('specs/*/')] - for release in releases: - files = glob.glob("specs/%s/*/*" % release) - for filename in files: - self.assertTrue(filename.endswith(".rst"), - "spec filenames must use 'rst' extension.") - with open(filename) as f: - data = f.read() - - spec = docutils.core.publish_doctree(data) - titles = self._get_titles(spec) - self._check_titles(filename, template_titles, titles) - self._check_lines_wrapping(filename, data) - self._check_no_cr(filename, data) - self._check_trailing_spaces(filename, data) diff --git a/tools/abandon_old_reviews.sh b/tools/abandon_old_reviews.sh deleted file mode 100644 index 36a07c4e..00000000 --- a/tools/abandon_old_reviews.sh +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2019 Red Hat, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. -# -# WARNING! -# Please do not run this script without talking to the TripleO PTL. Auto -# abandoning people's changes is a good thing, but must be done with care. -# -# before you run this modify your .ssh/config to create a -# review.opendev.org entry: -# -# Host review.opendev.org -# User -# Port 29418 -# - -DRY_RUN=0 -CLEAN_PROJECT="" - -function print_help { - echo "Script to abandon patches without activity for more than 4 weeks." - echo "Usage:" - echo " ./abandon_old_reviews.sh [--dry-run] [--project ] [--help]" - echo " --dry-run In dry-run mode it will only print what patches would be abandoned " - echo " but will not take any real actions in gerrit" - echo " --project Only check patches from if passed." - echo " It must be one of the projects which are a part of the TripleO-group." - echo " If project is not provided, all projects from the TripleO-group will be checked" - echo " --help Print help message" -} - -while [ $# -gt 0 ]; do - key="${1}" - - case $key in - --dry-run) - echo "Enabling dry run mode" - DRY_RUN=1 - shift # past argument - ;; - --project) - CLEAN_PROJECT="project:openstack/${2}" - shift # past argument - shift # past value - ;; - --help) - print_help - exit 2 - esac -done - -set -o errexit -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -function abandon_review { - local gitid=$1 - shift - local msg=$@ - unassign_and_new_bug $gitid - if [ $DRY_RUN -eq 1 ]; then - echo "Would abandon $gitid" - else - echo "Abandoning $gitid" - ssh review.opendev.org gerrit review $gitid --abandon --message \"$msg\" - fi -} - -function unassign_and_new_bug { - # unassign current assignee and set bug to 'new' status - local gitid=$1 - cm=$(ssh review.opendev.org "gerrit query $gitid --current-patch-set --format json" | jq .commitMessage) - for closes in $(echo -e $cm | awk '/[cC]loses-[bB]ug/ {match($0,/[0-9]+/); bug=substr($0,RSTART,RLENGTH); print bug}'); do - if [ $DRY_RUN -eq 1 ]; then - echo "Would unassign and tag 'timeout-abandon' $closes" - else - echo "Attempting to change status of bug $closes to New" - python "$DIR/unassign_bug.py" $closes - fi - done -} - -PROJECTS="($( -python - < 90 days without comment and currently blocked by a -core reviewer with a -2. We are abandoning this for now. -Feel free to reactivate the review by pressing the restore button and -contacting the reviewer with the -2 on this review to ensure you -address their concerns. For more details check policy -https://specs.openstack.org/openstack/tripleo-specs/specs/policy/patch-abandonment.html -EOF -) - -for review in $blocked_reviews; do - echo "Blocked review $review" - abandon_review $review $blocked_msg -done - -# then purge all the reviews that are > 90d with no changes and Zuul has -1ed - -failing_reviews=$(ssh review.opendev.org "gerrit query --current-patch-set --format json $PROJECTS status:open age:90d NOT label:Verified>=1,Zuul" | jq .currentPatchSet.revision | grep -v null | sed 's/"//g') - -failing_msg=$(cat < 90 days without comment, and failed Zuul the last -time it was checked. We are abandoning this for now. -Feel free to reactivate the review by pressing the restore button and -leaving a 'recheck' comment to get fresh test results. 
-For more details check policy -https://specs.openstack.org/openstack/tripleo-specs/specs/policy/patch-abandonment.html -EOF -) - -for review in $failing_reviews; do - echo "Failing review $review" - abandon_review $review $failing_msg -done - -# then purge all the reviews that are > 180 days with WIP -1 - -very_old_reviews=$(ssh review.opendev.org "gerrit query --current-patch-set --format json $PROJECTS status:open age:180d Workflow<=-1" | jq .currentPatchSet.revision | grep -v null | sed 's/"//g') - -very_old_msg=$(cat < 180 days without comment and WIP -1. We are abandoning this for now. -Feel free to reactivate the review by pressing the restore button and -contacting the reviewers. -For more details check policy -https://specs.openstack.org/openstack/tripleo-specs/specs/policy/patch-abandonment.html -EOF -) - -for review in $very_old_reviews; do - echo "Workflow -1 review $review" - abandon_review $review $very_old_msg -done \ No newline at end of file diff --git a/tools/unassign_bug.py b/tools/unassign_bug.py deleted file mode 100644 index da9043fa..00000000 --- a/tools/unassign_bug.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright 2019 Red Hat, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -"""Unassigns assignee from tripleobug, adds message and tag. -If you get the following exception, you need X11 and python-dbus installed: -'RuntimeError: No recommended backend was available. Install -the keyrings.alt package if you want to use the non-recommended -backends. See README.rst for details.' -or check solutions from: -https://github.com/jaraco/keyring/issues/258 -""" - -import os -import sys - -from launchpadlib.launchpad import Launchpad - - -MSG_BODY = "\ -This bug has had a related patch abandoned and has been automatically \ -un-assigned due to inactivity. Please re-assign yourself if you are \ -continuing work or adjust the state as appropriate if it is no longer valid." 
- - -def unassign(bug_num): - login = os.environ.get('LAUNCHPAD_LOGIN', 'tripleo') - password = os.environ.get('LAUNCHPAD_PASSWORD', 'production') - launchpad = Launchpad.login_with(login, password) - b = launchpad.bugs[bug_num] - for task in b.bug_tasks: - for tag in task.bug_target_name: - if 'tripleo' not in tag: - # try not to interfere with non-tripleo projects too much - continue - task.assignee = None - if task.status == "In Progress": - task.status = 'New' - task.lp_save() - b.tags = b.tags + ['timeout-abandon'] - b.newMessage(content=MSG_BODY, subject='auto-abandon-script') - b.lp_save() - - -if __name__ == '__main__': - unassign(int(sys.argv[1])) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index b629604e..00000000 --- a/tox.ini +++ /dev/null @@ -1,18 +0,0 @@ -[tox] -minversion = 2.0 -envlist = docs,py36 -skipsdist = True - -[testenv] -basepython = python3 -usedevelop = True -setenv = VIRTUAL_ENV={envdir} -deps = -r{toxinidir}/requirements.txt -commands = stestr run --slowest {posargs} - -[testenv:venv] -commands = {posargs} - -[testenv:docs] -commands = - sphinx-build -W -b html doc/source doc/build/html