Kubernetes workflow

- Add Kubernetes workflow
- Add Sample VNFM to test against
- Add sample application (VNF)
- Add testing instructions
- Update documentation

Details in: fenix/tools/README.md

Also includes fixes to:
- API error handling
- Exceptions
- DB API
- Some other minor bugs

Story: 2007301
Task: #38778

Change-Id: Ia37dfe7ea57935e73868da89aaa9a413721078ad
Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
Tomi Juvonen <tomi.juvonen@nokia.com>, 2020-03-13 12:14:22 +02:00
parent d6f1f7bc2d, commit d94ba44c69
17 changed files with 2074 additions and 93 deletions


@@ -188,7 +188,9 @@ migration-type:
Own action is create new and delete old instance.
Note! VNF need to obey resource_mitigation with own action
This affects to order of delete old and create new to not over
commit the resources. In Kubernetes also EVICTION is supported: there the
admin deletes the instance and VNF automation, like a ReplicaSet, makes a
new instance.
in: body
required: true
type: string


@@ -114,7 +114,9 @@ payload
| | | 'MIGRATE', 'LIVE_MIGRATE' and 'OWN_ACTION'. 'OWN_ACTION' means |
| | | an action project manager can do itself. Usually this could be |
| | | re-instantiation even with a new flavor. Other actions are done by |
| | | Fenix as they need the admin privileges. In Kubernetes 'EVICTION' is |
| | | also supported: the admin deletes the instance and VNF automation like |
| | | a ReplicaSet makes a new instance. Valid for states: |
| | | 'SCALE_IN', 'PREPARE_MAINTENANCE' and 'PLANNED_MAINTENANCE'. |
+-----------------+------------+------------------------------------------------------------------------+
| instance_ids | string | Link to Fenix maintenance session and project specific API to get |
@@ -176,7 +178,7 @@ Example of notification for many instances:
"metadata": {"openstack_release": "Queens"}
}

Example of notification for single instance. Note the instance specific
'reply_url':

.. code-block:: json

@@ -194,5 +196,23 @@ Example of notification for single instances. Note the instance specific
"metadata": {"openstack_release": "Queens"}
}
Example of notification for single instance in Kubernetes. Note the instance
specific 'reply_url' and allowed actions for Kubernetes:
.. code-block:: json
{
"service": "fenix",
"allowed_actions": ["OWN_ACTION", "EVICTION"],
"instance_ids": ["28d226f3-8d06-444f-a3f1-c586d2e7cb39"],
"reply_url": "http://0.0.0.0:12347/v1/maintenance/76e55df8-1c51-11e8-9928-0242ac110002/ead0dbcaf3564cbbb04842e3e54960e3/28d226f3-8d06-444f-a3f1-c586d2e7cb39",
"state": "PREPARE_MAINTENANCE",
"session_id": "76e55df8-1c51-11e8-9928-0242ac110002",
"reply_at": "2018-02-28T06:40:16",
"actions_at": "2018-02-29T00:00:00",
"project_id": "ead0dbcaf3564cbbb04842e3e54960e3",
"metadata": {"openstack_release": "Queens"}
}
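The VNFM acknowledges such a notification by replying to the given
'reply_url'. Below is an illustrative sketch only (not part of the API),
modelled on fenix/tools/vnfm.py; the token value is assumed to come from the
VNFM's Keystone session:

.. code-block:: python

    import json

    import requests

    # Values taken from the received alarm payload above.
    state = "PREPARE_MAINTENANCE"
    session_id = "76e55df8-1c51-11e8-9928-0242ac110002"
    reply_url = ("http://0.0.0.0:12347/v1/maintenance/"
                 "76e55df8-1c51-11e8-9928-0242ac110002/"
                 "ead0dbcaf3564cbbb04842e3e54960e3/"
                 "28d226f3-8d06-444f-a3f1-c586d2e7cb39")

    headers = {"Content-Type": "application/json",
               "Accept": "application/json",
               "X-Auth-Token": "<token from the VNFM Keystone session>"}

    # 'EVICTION' asks Fenix to evict the POD; the ReplicaSet re-creates it.
    reply = {"session_id": session_id,
             "state": "ACK_%s" % state,
             "instance_action": "EVICTION"}
    requests.put(reply_url, data=json.dumps(reply), headers=headers)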
.. [1] http://docs.openstack.org/developer/oslo.messaging/notifier.html
.. [2] https://docs.openstack.org/aodh/latest/admin/telemetry-alarms.html#event-based-alarm


@@ -20,18 +20,49 @@ from pecan import expose
from pecan import request
from pecan import response
from pecan import rest
import six

from oslo_log import log
from oslo_messaging import RemoteError
from oslo_serialization import jsonutils

from fenix.api.v1 import maintenance
from fenix.api.v1 import schema
import fenix.exceptions as exceptions
import fenix.db.exceptions as db_exceptions
from fenix import policy

LOG = log.getLogger(__name__)


def _format_ex_message(ex):
    if len(ex.path) > 0:
        return ("Invalid input for field/attribute %(path)s."
                " Value: %(value)s. %(message)s" % {'path': ex.path.pop(),
                                                    'value': ex.instance,
                                                    'message': ex.message})
    else:
        return ex.message


class BaseController(rest.RestController):

    def handle_remote_error(self, e):
        cls = getattr(db_exceptions, e.exc_type, None)
        cls = cls or getattr(exceptions, e.exc_type, None)
        if cls is not None:
            if e.value:
                description = e.value
            elif "msg_fmt" in vars(cls).keys():
                description = cls.msg_fmt
            else:
                description = ""
            abort(cls.code, description)
        abort(500)


class ProjectController(BaseController):

    name = 'project'
@@ -49,8 +80,9 @@ class ProjectController(rest.RestController):
            jsonschema.validate(session_id, schema.uid)
            jsonschema.validate(project_id, schema.uid)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        engine_data = self.engine_rpcapi.project_get_session(session_id,
                                                             project_id)
        try:
@@ -68,18 +100,22 @@ class ProjectController(rest.RestController):
            jsonschema.validate(project_id, schema.uid)
            jsonschema.validate(data, schema.maintenance_session_project_put)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            engine_data = self.engine_rpcapi.project_update_session(session_id,
                                                                    project_id,
                                                                    data)
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:
            response.body = jsonutils.dumps(engine_data)


class ProjectInstanceController(BaseController):

    name = 'project_instance'

@@ -99,20 +135,24 @@ class ProjectInstanceController(rest.RestController):
                data,
                schema.maintenance_session_project_instance_put)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            engine_data = (
                self.engine_rpcapi.project_update_session_instance(session_id,
                                                                   project_id,
                                                                   instance_id,
                                                                   data))
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:
            response.body = jsonutils.dumps(engine_data)


class SessionController(BaseController):

    name = 'session'

@@ -126,15 +166,20 @@ class SessionController(rest.RestController):
        try:
            jsonschema.validate(session_id, schema.uid)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        if request.body:
            LOG.error("Unexpected data")
            abort(400)
        try:
            session = self.engine_rpcapi.admin_get_session(session_id)
        except RemoteError as e:
            self.handle_remote_error(e)
        if session is None:
            description = "Invalid session"
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            response.text = jsonutils.dumps(session)
        except TypeError:
@@ -149,9 +194,14 @@ class SessionController(rest.RestController):
            jsonschema.validate(session_id, schema.uid)
            jsonschema.validate(data, schema.maintenance_session_put)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            engine_data = self.engine_rpcapi.admin_update_session(session_id,
                                                                  data)
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:
@@ -164,19 +214,23 @@ class SessionController(rest.RestController):
        try:
            jsonschema.validate(session_id, schema.uid)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        if request.body:
            LOG.error("Unexpected data")
            abort(400)
        try:
            engine_data = self.engine_rpcapi.admin_delete_session(session_id)
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:
            response.body = jsonutils.dumps(engine_data)


class MaintenanceController(BaseController):

    name = 'maintenance'

@@ -190,7 +244,10 @@ class MaintenanceController(rest.RestController):
        if request.body:
            LOG.error("Unexpected data")
            abort(400)
        try:
            sessions = self.engine_rpcapi.admin_get()
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(sessions)
        except TypeError:
@@ -204,9 +261,13 @@ class MaintenanceController(rest.RestController):
        try:
            jsonschema.validate(data, schema.maintenance_post)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            session = self.engine_rpcapi.admin_create_session(data)
        except RemoteError as e:
            self.handle_remote_error(e)
        if session is None:
            LOG.error("Too many sessions")
            abort(509)
@@ -216,7 +277,7 @@
            response.body = jsonutils.dumps(session)


class InstanceController(BaseController):

    name = 'instance'

@@ -230,15 +291,20 @@ class InstanceController(rest.RestController):
        try:
            jsonschema.validate(instance_id, schema.uid)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        if request.body:
            LOG.error("Unexpected data")
            abort(400)
        try:
            instance = self.engine_rpcapi.get_instance(instance_id)
        except RemoteError as e:
            self.handle_remote_error(e)
        if instance is None:
            description = "Invalid instance: %s" % instance_id
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            response.text = jsonutils.dumps(instance)
        except TypeError:
@@ -253,10 +319,14 @@ class InstanceController(rest.RestController):
            jsonschema.validate(instance_id, schema.uid)
            jsonschema.validate(data, schema.instance_put)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            engine_data = self.engine_rpcapi.update_instance(instance_id,
                                                             data)
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:
@@ -269,19 +339,23 @@ class InstanceController(rest.RestController):
        try:
            jsonschema.validate(instance_id, schema.uid)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        if request.body:
            LOG.error("Unexpected data")
            abort(400)
        try:
            engine_data = self.engine_rpcapi.delete_instance(instance_id)
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:
            response.body = jsonutils.dumps(engine_data)


class InstanceGroupController(BaseController):

    name = 'instance_group'

@@ -295,15 +369,20 @@ class InstanceGroupController(rest.RestController):
        try:
            jsonschema.validate(group_id, schema.uid)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        if request.body:
            LOG.error("Unexpected data")
            abort(400)
        try:
            group = self.engine_rpcapi.get_instance_group(group_id)
        except RemoteError as e:
            self.handle_remote_error(e)
        if group is None:
            description = "Invalid instance_group: %s" % group_id
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            response.text = jsonutils.dumps(group)
        except TypeError:
@@ -318,10 +397,14 @@ class InstanceGroupController(rest.RestController):
            jsonschema.validate(group_id, schema.uid)
            jsonschema.validate(data, schema.instance_group_put)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        try:
            engine_data = (
                self.engine_rpcapi.update_instance_group(group_id, data))
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:
@@ -334,13 +417,17 @@ class InstanceGroupController(rest.RestController):
        try:
            jsonschema.validate(group_id, schema.uid)
        except jsonschema.exceptions.ValidationError as e:
            description = _format_ex_message(e)
            LOG.error(description)
            abort(422, six.text_type(description))
        if request.body:
            LOG.error("Unexpected data")
            abort(400)
        try:
            engine_data = (
                self.engine_rpcapi.delete_instance_group(group_id))
        except RemoteError as e:
            self.handle_remote_error(e)
        try:
            response.text = jsonutils.dumps(engine_data)
        except TypeError:

fenix/api/v1/schema.py

@@ -40,7 +40,7 @@ reply_states = ['ACK_MAINTENANCE',
                'NACK_PLANNED_MAINTENANCE',
                'NACK_MAINTENANCE_COMPLETE']

allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION', 'EVICTION']

maintenance_session_project_put = {
    'type': 'object',

fenix/db/exceptions.py

@@ -25,11 +25,11 @@ class FenixDBException(exceptions.FenixException):
    msg_fmt = 'An unknown database exception occurred'


class FenixDBDuplicateEntry(exceptions.Duplicate):
    msg_fmt = 'Duplicate entry for %(columns)s in %(model)s model was found'


class FenixDBNotFound(exceptions.NotFound):
    msg_fmt = '%(id)s %(model)s was not found'


@@ -516,11 +516,12 @@ def project_instance_get(instance_id):

def update_project_instance(values):
    values = values.copy()
    session = get_session()
    with session.begin():
        minstance = _project_instance_get(session, values['instance_id'])
        if not minstance:
            minstance = models.ProjectInstance()
        minstance.update(values)
        try:
            minstance.save(session=session)
        except common_db_exc.DBDuplicateEntry as e:
@@ -553,6 +554,15 @@ def instance_group_get(group_id):
    return _instance_group_get(get_session(), group_id)


def _instance_groups_get(session):
    query = model_query(models.InstanceGroup, session)
    return query.all()


def instance_groups_get():
    return _instance_groups_get(get_session())


def _group_instances_get(session, group_id):
    query = model_query(models.ProjectInstance, session)
    return query.filter_by(group_id=group_id).all()
@@ -564,28 +574,29 @@ def group_instances_get(group_id):

def update_instance_group(values):
    values = values.copy()
    session = get_session()
    with session.begin():
        ig = _instance_group_get(session, values['group_id'])
        if not ig:
            ig = models.InstanceGroup()
        ig.update(values)
        try:
            ig.save(session=session)
        except common_db_exc.DBDuplicateEntry as e:
            # raise exception about duplicated columns (e.columns)
            raise db_exc.FenixDBDuplicateEntry(
                model=ig.__class__.__name__, columns=e.columns)
    return instance_group_get(ig.group_id)


def remove_instance_group(group_id):
    session = get_session()
    with session.begin():
        ig = _instance_group_get(session, group_id)
        if not ig:
            # raise not found error
            raise db_exc.FenixDBNotFound(session, group_id=group_id,
                                         model='instance_groups')
        session.delete(ig)

fenix/exceptions.py

@@ -55,6 +55,12 @@ class NotFound(FenixException):
    code = 404


class Duplicate(FenixException):
    """Duplicate object exception."""
    msg_fmt = "Object with %(object)s already exists"
    code = 409


class NotAuthorized(FenixException):
    msg_fmt = "Not authorized"
    code = 403

fenix/tools/README.md (new file, 216 lines)

@@ -0,0 +1,216 @@
# fenix.tools
This directory contains tools and instructions to test Fenix workflows.
Currently OPNFV Doctor is used to test the OpenStack related workflows.
As Doctor only covers OpenStack and Fenix itself needs a way to be tested,
testing of the Kubernetes workflow (fenix/workflow/workflows/k8s.py) is
implemented here.
Files:
- 'demo-ha.yaml': demo-ha ReplicaSet to create 2 anti-affinity PODs.
- 'demo-nonha.yaml': demo-nonha ReplicaSet to create n non-HA PODs.
- 'vnfm.py': VNFM to test the k8s.py workflow.
## Kubernetes workflow (k8s.py)
First version of the workflow towards Kubernetes use cases.
### Requirements for testing
This workflow assumes PODs are managed by a ReplicaSet. A Kubernetes cluster
with 1 master and at least 3 worker nodes is required for testing. The master
node needs DevStack to provide Fenix and the OpenStack services Fenix still
uses. Later on there can be a version of Fenix that does not need Keystone and
AODH event alarming, but uses native Kubernetes services for RBAC and events.
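
As a quick sanity check of the cluster layout, the following illustrative
sketch (not shipped with Fenix) uses the same Kubernetes Python client as
vnfm.py and k8s.py; master nodes are detected via the
'node-role.kubernetes.io/master' label, as k8s.py does:

```python
from kubernetes import client, config

config.load_kube_config()
nodes = client.CoreV1Api().list_node().items

masters = [n for n in nodes
           if 'node-role.kubernetes.io/master' in n.metadata.labels]
workers = [n for n in nodes
           if 'node-role.kubernetes.io/master' not in n.metadata.labels]

print("masters: %d, workers: %d" % (len(masters), len(workers)))
assert len(masters) == 1 and len(workers) >= 3, "cluster too small for testing"
```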
As in Doctor testing, there is a pair of anti-affinity PODs (demo-ha) and the
rest of the worker node capacity is filled with demo-nonha PODs. PODs are
scaled via the ReplicaSet number of replicas. The idea is that each POD
requests ("number of worker node CPUs" - 2) / 2 CPUs. That makes sure the
scheduler fits 2 PODs on each node, while 2 CPUs per node are left for other
node services. This requires at least 6 CPUs on each node to work.
### Install a Kubernetes cluster with 1 master and 3 worker nodes
Here are some instructions:
https://docs.openstack.org/openstack-helm/latest/install/kubernetes-gate.html
https://phoenixnap.com/kb/how-to-install-kubernetes-on-a-bare-metal-server
https://phoenixnap.com/kb/how-to-install-kubernetes-on-centos
### On the master node, install DevStack including Fenix and its minimum services
Note! There is no conflict with Kubernetes, as DevStack is limited to only the
services Fenix needs.
Clone DevStack. Tested to work with the latest stable release, Train.
```sh
git clone https://github.com/openstack/devstack -b stable/train
```
Create local.conf. 'HOST_IP' should be the master node IP.
```sh
cd devstack
vi local.conf
```
```sh
[[local|localrc]]
GIT_BASE=https://git.openstack.org
HOST_IP=192.0.2.4
ADMIN_PASSWORD=admin
DATABASE_PASSWORD=admin
RABBIT_PASSWORD=admin
SERVICE_PASSWORD=admin
LOGFILE=/opt/stack/stack.sh.log
PUBLIC_INTERFACE=eth0
CEILOMETER_EVENT_ALARM=True
ENABLED_SERVICES=key,rabbit,mysql,fenix-engine,fenix-api,aodh-evaluator,aodh-notifier,aodh-api
enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer stable/train
enable_plugin aodh https://git.openstack.org/openstack/aodh stable/train
enable_plugin gnocchi https://github.com/openstack/gnocchi
enable_plugin fenix https://opendev.org/x/fenix master
```
Deploy needed OpenStack services with Fenix
```sh
./stack.sh
```
Now you should have a Kubernetes cluster and Fenix installed via DevStack. Any
hacking of Fenix can be done under '/opt/stack/fenix'.
### Running test
Use 3 terminal windows (Term1, Term2 and Term3) to test Fenix with the
Kubernetes cluster. Below is what you can run in the different terminals. All
terminals should be running on the master node. Here is a short description:
- Term1: Used for logging Fenix
- Term2: Infrastructure admin commands
- Term3: VNFM logging for testing and setting up the VNF
#### Term1: Fenix-engine logging
If you make any changes to Fenix, make them under '/opt/stack/fenix'; then
restart Fenix and follow the logs
```sh
sudo systemctl restart devstack@fenix*;sudo journalctl -f --unit devstack@fenix-engine
```
API logs can also be seen
```sh
sudo journalctl -f --unit devstack@fenix-api
```
Debugging and other configuration changes can be made in the conf files under '/etc/fenix'.
#### Term2: Infrastructure admin window
Use the DevStack admin user. Set the needed variables accordingly
```sh
. ~/devstack/openrc admin admin
USER_ID=`openstack user list | grep admin | awk '{print $2}'`
HOST=192.0.2.4
PORT=12347
```
Authenticate to Keystone as the admin user before calling Fenix. If you get a
not authorized error later on, you need to do this again.
```sh
OS_AUTH_TOKEN=`openstack token issue | grep " id " |awk '{print $4}'`
```
Once Fenix is running in Term1, the VNF has been created in Term3 and the VNFM
is running in Term3, you can create a maintenance session utilizing those
```sh
DATE=`date -d "+15 sec" "+%Y-%m-%d %H:%M:%S"`;MSESSION=`curl -g -i -X POST http://$HOST:$PORT/v1/maintenance -H "Accept: application/json" -H "Content-Type: application/json" -d '{"workflow": "k8s", "state": "MAINTENANCE","metadata": {} ,"maintenance_at": "'"$DATE"'"}' -H "X-Auth-Token: $OS_AUTH_TOKEN" -H "X-User-Id: $USER_ID" | grep session_id | jq -r '.session_id'`
```
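
Instead of checking the state manually, you can poll it with a small script.
This is an illustrative sketch using the same admin API as the curl commands
above; it assumes you have exported HOST, PORT, OS_AUTH_TOKEN, USER_ID and
MSESSION from this terminal and that the session GET returns its 'state':

```python
import os
import time

import requests

url = "http://%s:%s/v1/maintenance/%s" % (os.environ["HOST"],
                                          os.environ["PORT"],
                                          os.environ["MSESSION"])
headers = {"Accept": "application/json",
           "Content-Type": "application/json",
           "X-Auth-Token": os.environ["OS_AUTH_TOKEN"],
           "X-User-Id": os.environ["USER_ID"]}

# Poll the maintenance session until the workflow finishes.
while True:
    state = requests.get(url, headers=headers).json()["state"]
    print(state)
    if state in ("MAINTENANCE_DONE", "MAINTENANCE_FAILED"):
        break
    time.sleep(10)
```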
After the maintenance workflow reaches 'MAINTENANCE_DONE', first press
"Ctrl + C" in the VNFM window (Term3), so it removes its constraints from
Fenix and exits. Then you can remove the finished session from Fenix
```sh
curl -g -i -X DELETE http://$HOST:$PORT/v1/maintenance/$MSESSION -H "Accept: application/json" -H "Content-Type: application/json" -H "X-Auth-Token: $OS_AUTH_TOKEN" -H "X-User-Id: $USER_ID"
```
If maintenance ran to the end with 'MAINTENANCE_DONE', you are ready to run it
again if you wish. On 'MAINTENANCE_FAILED' or in case of exceptions, you should
recover the system before trying to test again. This is covered in Term3 below.
#### Term3: VNFM (fenix/tools/vnfm.py)
Go to Fenix Kubernetes tool directory for testing
```sh
cd /opt/stack/fenix/fenix/tools
```
Create demo namespace (we use demo namespace and demo user and project in
Keystone)
```sh
kubectl create namespace demo
```
Create the VNF (when done in this order, we make sure demo-ha PODs get nodes for anti-affinity):
```sh
kubectl apply -f demo-ha.yaml --namespace=demo;sleep 1;kubectl apply -f demo-nonha.yaml --namespace=demo
```
Note that you should modify the above yaml files so that "cpu:" has the value
'(workernode.status.capacity["cpu"] - 2) / 2'. The default expects 32 CPUs, so
the value is "15" in both yaml files. Replicas can be changed in
demo-nonha.yaml: minimum 2 (with the minimum of 3 worker nodes) up to a maximum
of '(amount_of_worker_nodes - 1) * 2'. A greater amount means more scaling is
needed and a longer maintenance window, as fewer parallel actions are possible.
The constraints in vnfm.py can also be changed for different behavior.
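
The cpu request and replica limits above can be computed with the same client
library the tools use. An illustrative sketch, assuming uniform worker nodes:

```python
from kubernetes import client, config

config.load_kube_config()
nodes = client.CoreV1Api().list_node().items
workers = [n for n in nodes
           if 'node-role.kubernetes.io/master' not in n.metadata.labels]

# Each POD requests (worker CPUs - 2) / 2 so that two PODs fit per node and
# 2 CPUs are left for other node services.
cpus = int(workers[0].status.capacity["cpu"])
print('set cpu: "%d" in demo-ha.yaml and demo-nonha.yaml' % ((cpus - 2) // 2))

# Maximum demo-nonha replicas with this sizing.
print("max demo-nonha replicas: %d" % ((len(workers) - 1) * 2))
```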
You can delete the PODs used like this
```sh
kubectl delete replicaset.apps demo-ha demo-nonha --namespace=demo
```
Start Kubernetes VNFM that we need for testing
```sh
python vnfm.py
```
Now you can start the maintenance session in Term2. When the workflow has
failed or completed, first kill vnfm.py with "Ctrl + C" and then delete the
maintenance session in Term2.
If the workflow failed, something might need to be fixed manually. Here you
uncordon your 3 worker nodes, if the maintenance workflow did not run to the end.
```sh
kubectl uncordon worker-node3 worker-node2 worker-node1
```
You can check that your PODs match the number of replicas mentioned in
demo-nonha.yaml and demo-ha.yaml:
```sh
kubectl get pods --all-namespaces --output=wide
```
If they do not match, the easiest solution is to delete and create them again
```sh
kubectl delete replicaset.apps demo-ha demo-nonha --namespace=demo;sleep 15;kubectl apply -f demo-ha.yaml --namespace=demo;sleep 1;kubectl apply -f demo-nonha.yaml --namespace=demo
```

fenix/tools/__init__.py (new empty file)

fenix/tools/demo-ha.yaml (new file, 54 lines)

@@ -0,0 +1,54 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
  name: demo-ha
  labels:
    app: demo-ha
spec:
  replicas: 2
  selector:
    matchLabels:
      app: demo-ha
  template:
    metadata:
      labels:
        app: demo-ha
        active: None
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - demo-ha
            topologyKey: "kubernetes.io/hostname"
      containers:
      - name: nginx
        image: nginx
        resources:
          requests:
            cpu: "15"
        ports:
        - containerPort: 80
        volumeMounts:
        - name: workdir
          mountPath: /usr/share/nginx/html
      initContainers:
      - name: install
        image: busybox
        command:
        - wget
        - "-O"
        - "/work-dir/index.html"
        - http://kubernetes.io
        volumeMounts:
        - name: workdir
          mountPath: "/work-dir"
      dnsPolicy: Default
      volumes:
      - name: workdir
        emptyDir: {}

fenix/tools/demo-nonha.yaml (new file, 42 lines)

@@ -0,0 +1,42 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
  name: demo-nonha
  labels:
    app: demo-nonha
spec:
  replicas: 3
  selector:
    matchLabels:
      app: demo-nonha
  template:
    metadata:
      labels:
        app: demo-nonha
    spec:
      containers:
      - name: nginx
        image: nginx
        resources:
          requests:
            cpu: "15"
        ports:
        - containerPort: 80
        volumeMounts:
        - name: workdir
          mountPath: /usr/share/nginx/html
      initContainers:
      - name: install
        image: busybox
        command:
        - wget
        - "-O"
        - "/work-dir/index.html"
        - http://kubernetes.io
        volumeMounts:
        - name: workdir
          mountPath: "/work-dir"
      dnsPolicy: Default
      volumes:
      - name: workdir
        emptyDir: {}

fenix/tools/vnfm.py (new file, 545 lines)

@@ -0,0 +1,545 @@
##############################################################################
# Copyright (c) 2020 Nokia Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import aodhclient.client as aodhclient
import datetime
from flask import Flask
from flask import request
import json
from keystoneclient import client as ks_client
from kubernetes import client
from kubernetes import config
import logging as lging
from oslo_config import cfg
from oslo_log import log as logging
import requests
import sys
from threading import Thread
import time
import yaml
try:
import fenix.utils.identity_auth as identity_auth
except ValueError:
sys.path.append('../utils')
import identity_auth
LOG = logging.getLogger(__name__)
streamlog = lging.StreamHandler(sys.stdout)
LOG.logger.addHandler(streamlog)
LOG.logger.setLevel(logging.INFO)
opts = [
cfg.StrOpt('ip',
default='127.0.0.1',
help='the ip of VNFM',
required=True),
cfg.IntOpt('port',
default=12348,
help='the port of VNFM',
required=True),
]
CONF = cfg.CONF
CONF.register_opts(opts)
CONF.register_opts(identity_auth.os_opts, group='service_user')
class VNFM(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
self.app = None
def start(self):
LOG.info('VNFM start......')
self.app = VNFManager(self.conf, self.log)
self.app.start()
def stop(self):
LOG.info('VNFM stop......')
if not self.app:
return
self.app.headers['X-Auth-Token'] = self.app.session.get_token()
self.app.delete_constraints()
headers = {
'Content-Type': 'application/json',
'Accept': 'application/json',
}
url = 'http://%s:%d/shutdown'\
% (self.conf.ip,
self.conf.port)
requests.post(url, data='', headers=headers)
class VNFManager(Thread):
def __init__(self, conf, log):
Thread.__init__(self)
self.conf = conf
self.log = log
self.port = self.conf.port
self.instance_ids = None
# VNFM is started with OS_* exported as admin user
# We need that to query Fenix endpoint url
# Still we work with our tenant/project/VNF as demo
self.project = "demo"
LOG.info('VNFM project: %s' % self.project)
self.auth = identity_auth.get_identity_auth(conf, project=self.project)
self.session = identity_auth.get_session(auth=self.auth)
self.ks = ks_client.Client(version='v3', session=self.session)
self.aodh = aodhclient.Client(2, self.session)
# Subscribe to maintenance event alarm from Fenix via AODH
self.create_alarm()
config.load_kube_config()
self.kaapi = client.AppsV1Api()
self.kapi = client.CoreV1Api()
self.headers = {
'Content-Type': 'application/json',
'Accept': 'application/json'}
self.headers['X-Auth-Token'] = self.session.get_token()
self.orig_number_of_instances = self.number_of_instances()
# List of instances
self.ha_instances = []
self.nonha_instances = []
# Different instance_id specific constraints {instance_id: {},...}
self.instance_constraints = None
# Update existing instances to instance lists
self.update_instances()
# How many instances needs to exists (with current VNF load)
# max_impacted_members need to be updated accordingly
# if number of instances is scaled. example for demo-ha:
# max_impacted_members = len(self.ha_instances) - ha_group_limit
self.ha_group_limit = 2
self.nonha_group_limit = 2
# Different instance groups constraints dict
self.ha_group = None
self.nonha_group = None
# VNF project_id (VNF ID)
self.project_id = None
# HA instance_id that is active has active label
self.active_instance_id = self.active_instance_id()
services = self.ks.services.list()
for service in services:
if service.type == 'maintenance':
LOG.info('maintenance service: %s:%s type %s'
% (service.name, service.id, service.type))
maint_id = service.id
self.maint_endpoint = [ep.url for ep in self.ks.endpoints.list()
if ep.service_id == maint_id and
ep.interface == 'public'][0]
LOG.info('maintenance endpoint: %s' % self.maint_endpoint)
self.update_constraints_lock = False
self.update_constraints()
# Instances waiting action to be done
self.pending_actions = {}
def create_alarm(self):
alarms = {alarm['name']: alarm for alarm in self.aodh.alarm.list()}
alarm_name = "%s_MAINTENANCE_ALARM" % self.project
if alarm_name in alarms:
return
alarm_request = dict(
name=alarm_name,
description=alarm_name,
enabled=True,
alarm_actions=[u'http://%s:%d/maintenance'
% (self.conf.ip,
self.conf.port)],
repeat_actions=True,
severity=u'moderate',
type=u'event',
event_rule=dict(event_type=u'maintenance.scheduled'))
self.aodh.alarm.create(alarm_request)
def delete_remote_instance_constraints(self, instance_id):
url = "%s/instance/%s" % (self.maint_endpoint, instance_id)
LOG.info('DELETE: %s' % url)
ret = requests.delete(url, data=None, headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
if ret.status_code == 404:
LOG.info('Already deleted: %s' % instance_id)
else:
raise Exception(ret.text)
def update_remote_instance_constraints(self, instance):
url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"])
LOG.info('PUT: %s' % url)
ret = requests.put(url, data=json.dumps(instance),
headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
raise Exception(ret.text)
def delete_remote_group_constraints(self, instance_group):
url = "%s/instance_group/%s" % (self.maint_endpoint,
instance_group["group_id"])
LOG.info('DELETE: %s' % url)
ret = requests.delete(url, data=None, headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
raise Exception(ret.text)
def update_remote_group_constraints(self, instance_group):
url = "%s/instance_group/%s" % (self.maint_endpoint,
instance_group["group_id"])
LOG.info('PUT: %s' % url)
ret = requests.put(url, data=json.dumps(instance_group),
headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
raise Exception(ret.text)
def delete_constraints(self):
for instance_id in self.instance_constraints:
self.delete_remote_instance_constraints(instance_id)
self.delete_remote_group_constraints(self.nonha_group)
self.delete_remote_group_constraints(self.ha_group)
def update_constraints(self):
while self.update_constraints_lock:
LOG.info('Waiting update_constraints_lock...')
time.sleep(1)
self.update_constraints_lock = True
LOG.info('Update constraints')
if self.project_id is None:
self.project_id = self.ks.projects.list(name=self.project)[0].id
# Pods groupped by ReplicaSet, so we use that id
rs = {r.metadata.name: r.metadata.uid for r in
self.kaapi.list_namespaced_replica_set('demo').items}
max_impacted_members = len(self.nonha_instances) - 1
nonha_group = {
"group_id": rs['demo-nonha'],
"project_id": self.project_id,
"group_name": "demo-nonha",
"anti_affinity_group": False,
"max_instances_per_host": 0,
"max_impacted_members": max_impacted_members,
"recovery_time": 10,
"resource_mitigation": True}
LOG.info('create demo-nonha constraints: %s'
% nonha_group)
ha_group = {
"group_id": rs['demo-ha'],
"project_id": self.project_id,
"group_name": "demo-ha",
"anti_affinity_group": True,
"max_instances_per_host": 1,
"max_impacted_members": 1,
"recovery_time": 10,
"resource_mitigation": True}
LOG.info('create demo-ha constraints: %s'
% ha_group)
instance_constraints = {}
for ha_instance in self.ha_instances:
instance = {
"instance_id": ha_instance.metadata.uid,
"project_id": self.project_id,
"group_id": ha_group["group_id"],
"instance_name": ha_instance.metadata.name,
"max_interruption_time": 120,
"migration_type": "EVICTION",
"resource_mitigation": True,
"lead_time": 40}
LOG.info('create ha instance constraints: %s' % instance)
instance_constraints[ha_instance.metadata.uid] = instance
for nonha_instance in self.nonha_instances:
instance = {
"instance_id": nonha_instance.metadata.uid,
"project_id": self.project_id,
"group_id": nonha_group["group_id"],
"instance_name": nonha_instance.metadata.name,
"max_interruption_time": 120,
"migration_type": "EVICTION",
"resource_mitigation": True,
"lead_time": 40}
LOG.info('create nonha instance constraints: %s' % instance)
instance_constraints[nonha_instance.metadata.uid] = instance
if not self.instance_constraints:
# Initial instance constraints
LOG.info('create initial instances constraints...')
for instance in [instance_constraints[i] for i
in instance_constraints]:
self.update_remote_instance_constraints(instance)
self.instance_constraints = instance_constraints.copy()
else:
LOG.info('check instances constraints changes...')
added = [i for i in instance_constraints.keys()
if i not in self.instance_constraints]
deleted = [i for i in self.instance_constraints.keys()
if i not in instance_constraints]
modified = [i for i in instance_constraints.keys()
if (i not in added and i not in deleted and
instance_constraints[i] !=
self.instance_constraints[i])]
for instance_id in deleted:
self.delete_remote_instance_constraints(instance_id)
updated = added + modified
for instance in [instance_constraints[i] for i in updated]:
self.update_remote_instance_constraints(instance)
if updated or deleted:
# Some instance constraints have changed
self.instance_constraints = instance_constraints.copy()
if not self.ha_group or self.ha_group != ha_group:
LOG.info('ha instance group need update')
self.update_remote_group_constraints(ha_group)
self.ha_group = ha_group.copy()
if not self.nonha_group or self.nonha_group != nonha_group:
LOG.info('nonha instance group need update')
self.update_remote_group_constraints(nonha_group)
self.nonha_group = nonha_group.copy()
self.update_constraints_lock = False
def active_instance_id(self):
# We dictate the active instance in the beginning
instance = self.ha_instances[0]
LOG.info('Initially Active instance: %s %s' %
(instance.metadata.name, instance.metadata.uid))
name = instance.metadata.name
namespace = instance.metadata.namespace
body = {"metadata": {"labels": {"active": "True"}}}
self.kapi.patch_namespaced_pod(name, namespace, body)
return instance.metadata.uid
def switch_over_ha_instance(self, instance_id):
if instance_id == self.active_instance_id:
# Need to switchover as instance_id will be affected and is active
for instance in self.ha_instances:
if instance_id == instance.metadata.uid:
LOG.info('Active to Standby: %s %s' %
(instance.metadata.name, instance.metadata.uid))
name = instance.metadata.name
namespace = instance.metadata.namespace
body = {"metadata": {"labels": {"active": None}}}
self.kapi.patch_namespaced_pod(name, namespace, body)
else:
LOG.info('Standby to Active: %s %s' %
(instance.metadata.name, instance.metadata.uid))
name = instance.metadata.name
namespace = instance.metadata.namespace
body = {"metadata": {"labels": {"active": "True"}}}
self.kapi.patch_namespaced_pod(name, namespace, body)
self.active_instance_id = instance.metadata.uid
self.update_instances()
def get_instance_ids(self):
instances = self.kapi.list_pod_for_all_namespaces().items
return [i.metadata.uid for i in instances
if i.metadata.name.startswith("demo-")
and i.metadata.namespace == "demo"]
def update_instances(self):
instances = self.kapi.list_pod_for_all_namespaces().items
self.ha_instances = [i for i in instances
if i.metadata.name.startswith("demo-ha")
and i.metadata.namespace == "demo"]
self.nonha_instances = [i for i in instances
if i.metadata.name.startswith("demo-nonha")
and i.metadata.namespace == "demo"]
def _alarm_data_decoder(self, data):
if "[" in data or "{" in data:
# string to list or dict removing unicode
data = yaml.safe_load(data.replace("u'", "'"))
return data
def _alarm_traits_decoder(self, data):
return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
for t in data['reason_data']['event']['traits']})
def get_session_instance_ids(self, url, session_id):
ret = requests.get(url, data=None, headers=self.headers)
if ret.status_code != 200:
raise Exception(ret.text)
LOG.info('get_instance_ids %s' % ret.json())
return ret.json()['instance_ids']
def scale_instances(self, scale_instances):
number_of_instances_before = len(self.nonha_instances)
replicas = number_of_instances_before + scale_instances
# We only scale nonha apps
namespace = "demo"
name = "demo-nonha"
body = {'spec': {"replicas": replicas}}
self.kaapi.patch_namespaced_replica_set_scale(name, namespace, body)
time.sleep(3)
# Let's check if scale has taken effect
self.update_instances()
number_of_instances_after = len(self.nonha_instances)
check = 20
while number_of_instances_after == number_of_instances_before:
if check == 0:
LOG.error('scale_instances with: %d failed, still %d instances'
% (scale_instances, number_of_instances_after))
raise Exception('scale_instances failed')
check -= 1
time.sleep(1)
self.update_instances()
number_of_instances_after = len(self.nonha_instances)
LOG.info('scaled instances from %d to %d' %
(number_of_instances_before, number_of_instances_after))
def number_of_instances(self):
instances = self.kapi.list_pod_for_all_namespaces().items
return len([i for i in instances
if i.metadata.name.startswith("demo-")])
def instance_action(self, instance_id, allowed_actions):
# We should keep the instance constraints in our internal structure
# and match instance_id specific allowed action. Now we assume EVICTION
if 'EVICTION' not in allowed_actions:
LOG.error('Action for %s not found from %s' %
(instance_id, allowed_actions))
return None
return 'EVICTION'
def instance_action_started(self, instance_id, action):
time_now = datetime.datetime.utcnow()
max_interruption_time = (
self.instance_constraints[instance_id]['max_interruption_time'])
self.pending_actions[instance_id] = {
'started': time_now,
'max_interruption_time': max_interruption_time,
'action': action}
def was_instance_action_in_time(self, instance_id):
time_now = datetime.datetime.utcnow()
started = self.pending_actions[instance_id]['started']
limit = self.pending_actions[instance_id]['max_interruption_time']
action = self.pending_actions[instance_id]['action']
td = time_now - started
if td.total_seconds() > limit:
LOG.error('%s %s took too long: %ds' %
(instance_id, action, td.total_seconds()))
LOG.error('%s max_interruption_time %ds might be too short' %
(instance_id, limit))
raise Exception('%s %s took too long: %ds' %
(instance_id, action, td.total_seconds()))
else:
LOG.info('%s %s with recovery time took %ds' %
(instance_id, action, td.total_seconds()))
del self.pending_actions[instance_id]
def run(self):
app = Flask('VNFM')
@app.route('/maintenance', methods=['POST'])
def maintenance_alarm():
data = json.loads(request.data.decode('utf8'))
try:
payload = self._alarm_traits_decoder(data)
except Exception:
payload = ({t[0]: t[2] for t in
data['reason_data']['event']['traits']})
LOG.error('cannot parse alarm data: %s' % payload)
raise Exception('VNFM cannot parse alarm.'
'Possibly trait data over 256 char')
LOG.info('VNFM received data = %s' % payload)
state = payload['state']
reply_state = None
reply = dict()
LOG.info('VNFM state: %s' % state)
if state == 'MAINTENANCE':
self.headers['X-Auth-Token'] = self.session.get_token()
instance_ids = (self.get_session_instance_ids(
payload['instance_ids'],
payload['session_id']))
reply['instance_ids'] = instance_ids
reply_state = 'ACK_MAINTENANCE'
elif state == 'SCALE_IN':
# scale down only nonha instances
nonha_instances = len(self.nonha_instances)
scale_in = nonha_instances // 2
self.scale_instances(-scale_in)
self.update_constraints()
reply['instance_ids'] = self.get_instance_ids()
reply_state = 'ACK_SCALE_IN'
elif state == 'MAINTENANCE_COMPLETE':
# possibly need to upscale
number_of_instances = self.number_of_instances()
if self.orig_number_of_instances > number_of_instances:
scale_instances = (self.orig_number_of_instances -
number_of_instances)
self.scale_instances(scale_instances)
self.update_constraints()
reply_state = 'ACK_MAINTENANCE_COMPLETE'
elif (state == 'PREPARE_MAINTENANCE'
or state == 'PLANNED_MAINTENANCE'):
instance_id = payload['instance_ids'][0]
instance_action = (self.instance_action(instance_id,
payload['allowed_actions']))
if not instance_action:
raise Exception('Allowed_actions not supported for %s' %
instance_id)
LOG.info('VNFM got instance: %s' % instance_id)
self.switch_over_ha_instance(instance_id)
reply['instance_action'] = instance_action
reply_state = 'ACK_%s' % state
self.instance_action_started(instance_id, instance_action)
elif state == 'INSTANCE_ACTION_DONE':
# TBD was action done in max_interruption_time (live migration)
# NOTE, in EVICTION instance_id reported that was in evicted
# node. New instance_id might be different
LOG.info('%s' % payload['instance_ids'])
self.was_instance_action_in_time(payload['instance_ids'][0])
self.update_instances()
self.update_constraints()
else:
raise Exception('VNFM received event with'
' unknown state %s' % state)
if reply_state:
reply['session_id'] = payload['session_id']
reply['state'] = reply_state
url = payload['reply_url']
LOG.info('VNFM reply: %s' % reply)
requests.put(url, data=json.dumps(reply), headers=self.headers)
return 'OK'
@app.route('/shutdown', methods=['POST'])
def shutdown():
LOG.info('shutdown VNFM server at %s' % time.time())
func = request.environ.get('werkzeug.server.shutdown')
if func is None:
raise RuntimeError('Not running with the Werkzeug Server')
func()
return 'VNFM shutting down...'
app.run(host="0.0.0.0", port=self.port)
if __name__ == '__main__':
app_manager = VNFM(CONF, LOG)
app_manager.start()
try:
LOG.info('Press CTRL + C to quit')
while True:
time.sleep(2)
except KeyboardInterrupt:
app_manager.stop()

fenix/utils/identity_auth.py

@@ -42,14 +42,14 @@ os_opts = [
]


def get_identity_auth(conf, project=None):
    loader = loading.get_plugin_loader('password')
    return loader.load_from_options(
        auth_url=conf.service_user.os_auth_url,
        username=conf.service_user.os_username,
        password=conf.service_user.os_password,
        user_domain_name=conf.service_user.os_user_domain_name,
        project_name=(project or conf.service_user.os_project_name),
        tenant_name=conf.service_user.os_project_name,
        project_domain_name=conf.service_user.os_project_domain_name)


@@ -37,6 +37,7 @@ from uuid import uuid1 as generate_uuid
from fenix import context
from fenix.db import api as db_api
from fenix import exceptions
from fenix.utils.download import download_url
import fenix.utils.identity_auth

@@ -159,6 +160,8 @@ class EngineEndpoint(object):
    def admin_delete_session(self, ctx, session_id):
        """Delete maintenance workflow session thread"""
        LOG.info("EngineEndpoint: admin_delete_session")
        if session_id not in self.workflow_sessions:
            raise exceptions.NotFound("session_id not found")
        self.workflow_sessions[session_id].cleanup()
        self.workflow_sessions[session_id].stop()
        self.workflow_sessions.pop(session_id)

fenix/workflow/workflow.py

@@ -398,13 +398,27 @@ class BaseWorkflow(Thread):
    def maintenance(self):
        LOG.error("%s: maintenance method not implemented!" % self.session_id)

    def maintenance_done(self):
        LOG.error("%s: maintenance_done method not implemented!" %
                  self.session_id)

    def maintenance_failed(self):
        LOG.error("%s: maintenance_failed method not implemented!" %
                  self.session_id)

    def state(self, state):
        # TBD we could notify admin for workflow state change
        self.session.prev_state = self.session.state
        self.session.state = state
        if state in ["MAINTENANCE_DONE", "MAINTENANCE_FAILED"]:
            try:
                statefunc = (getattr(self,
                             self.states_methods[self.session.state]))
                statefunc()
            except Exception as e:
                LOG.error("%s: %s Raised exception: %s" % (self.session_id,
                          statefunc, e), exc_info=True)
                self.state("MAINTENANCE_FAILED")

    def run(self):
        LOG.info("%s: started" % self.session_id)

@@ -555,3 +569,10 @@ class BaseWorkflow(Thread):
        LOG.error('%s: timer %s expired' %
                  (self.session_id, timer_name))
        return False

    def project_ids_with_instance_group(self):
        project_ids = []
        for ig in db_api.instance_groups_get():
            if ig.project_id not in project_ids:
                project_ids.append(ig.project_id)
        return project_ids

fenix/workflow/workflows/k8s.py (new file, 984 lines)

@@ -0,0 +1,984 @@
# Copyright (c) 2020 OpenStack Foundation.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
from importlib import import_module
try:
from importlib.machinery import SourceFileLoader
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = SourceFileLoader(mname, mpath).load_module()
return mi.ActionPlugin(session_instance, ap_db_instance)
except ImportError:
from imp import load_source
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = load_source(mname, mpath)
return mi.ActionPlugin(session_instance, ap_db_instance)
from keystoneclient import client as ks_client
from kubernetes import client
from kubernetes import config
from kubernetes.client.rest import ApiException
import os
from oslo_log import log as logging
import time
from fenix.db import api as db_api
from fenix.db import exceptions as db_exc
from fenix.utils.thread import run_async
from fenix.utils.time import datetime_to_str
from fenix.utils.time import is_time_after_time
from fenix.utils.time import reply_time_str
from fenix.utils.time import time_now_str
from fenix.workflow.workflow import BaseWorkflow
LOG = logging.getLogger(__name__)
class Workflow(BaseWorkflow):
def __init__(self, conf, session_id, data):
super(Workflow, self).__init__(conf, session_id, data)
config.load_kube_config()
v_api = client.VersionApi()
self.kapi = client.CoreV1Api()
self.ks = ks_client.Client(version='v3', session=self.auth_session)
LOG.info("%s: initialized with Kubernetes: %s" %
(self.session_id,
v_api.get_code_with_http_info()[0].git_version))
self.hosts = self._init_hosts_by_services()
LOG.info('%s: Execute pre action plugins' % (self.session_id))
self.maintenance_by_plugin_type("localhost", "pre")
self.group_impacted_members = {}
def _init_hosts_by_services(self):
LOG.info("%s: Dicovering hosts by services" % self.session_id)
nodes = self.kapi.list_node().items
hosts = []
for node in nodes:
host = {}
host['hostname'] = node.metadata.name
if 'node-role.kubernetes.io/master' in node.metadata.labels.keys():
host['type'] = 'controller'
else:
host['type'] = 'compute'
if node.spec.unschedulable:
host['disabled'] = True
else:
host['disabled'] = False
host['maintained'] = False
hosts.append(host)
return db_api.create_hosts_by_details(self.session_id, hosts)
def get_worker_nodes(self):
nodes = self.kapi.list_node().items
worker_hosts = self.get_compute_hosts()
return [n for n in nodes if n.metadata.name in worker_hosts]
def is_node_cordoned(self, node_name):
host = self.get_host_by_name(node_name)
return host.disabled
def cordon(self, node_name):
LOG.info("%s: cordon %s" % (self.session_id, node_name))
host = self.get_host_by_name(node_name)
body = {"apiVersion": "v1", "spec": {"unschedulable": True}}
self.kapi.patch_node(node_name, body)
host.disabled = True
def uncordon(self, node_name):
LOG.info("%s: uncordon %s" % (self.session_id, node_name))
host = self.get_host_by_name(node_name)
body = {"apiVersion": "v1", "spec": {"unschedulable": None}}
self.kapi.patch_node(node_name, body)
host.disabled = False
def _pod_by_id(self, pod_id):
return [p for p in self.kapi.list_pod_for_all_namespaces().items
if p.metadata.uid == pod_id][0]
def _pods_by_node_and_controller(self, node_name, controller):
return [p for p in self.kapi.list_pod_for_all_namespaces().items
if p.metadata.owner_references[0].kind == controller and
p.spec.node_name == node_name and
p.metadata.namespace != 'kube-system']
def _pods_by_nodes_and_controller(self, node_names, controller):
return [p for p in self.kapi.list_pod_for_all_namespaces().items
if p.metadata.owner_references[0].kind == controller and
p.spec.node_name in node_names and
p.metadata.namespace != 'kube-system']
def _get_pod_by_name_and_namespace(self, name, namespace):
try:
pod = self.kapi.read_namespaced_pod(name, namespace)
except ApiException:
pod = None
return pod
# TBD remove as deprecated
def _get_pod_host_and_state(self, name):
return [(p.spec.node_name, p.status.phase) for p in
self.kapi.list_pod_for_all_namespaces().items
if p.metadata.name == name][0]
# TBD remove as deprecated
def wait_pod_evicted(self, name, orig_host, orig_state):
host, state = self._get_pod_host_and_state(name)
check = 60
last_state = orig_state
last_host = orig_host
while host == orig_host or state != orig_state:
if host != last_host or state != last_state:
# log only if either value changed since last round
LOG.info("%s: pod: %s %s on host %s" %
(self.session_id, name, state, host))
last_state = state
last_host = host
if check == 0:
                raise Exception('Pod %s eviction timeout' % name)
check -= 1
time.sleep(1)
host, state = self._get_pod_host_and_state(name)
# TBD remove as deprecated
def drain(self, node_name):
LOG.info("%s: drain %s" % (self.session_id, node_name))
if not self.is_node_cordoned(node_name):
self.cordon(node_name)
for pod in self._pods_by_node_and_controller(node_name,
'ReplicaSet'):
namespace = pod.metadata.namespace
name = pod.metadata.name
orig_host = pod.spec.node_name
orig_state = pod.status.phase
            # For now the k8s namespace is used as the user and project in
            # OpenStack keystone. Keycloak or a webhook for keystone should
            # be used instead
body = client.V1beta1Eviction()
body.api_version = "policy/v1beta1"
body.kind = "Eviction"
body.metadata = {"name": name, "namespace": namespace}
LOG.info("%s: Evicting pod: %s %s on host %s" %
(self.session_id, name, orig_state, orig_host))
try:
self.kapi.create_namespaced_pod_eviction(name,
namespace,
body)
except ApiException as e:
LOG.error("Exception when calling create_namespaced_pod_"
"eviction: %s\n" % e)
# self.wait_pod_evicted(name, orig_host, orig_state)
LOG.info("%s: Evicted pod: %s" % (self.session_id, name))
# VNFM should keep track of constraints, not Fenix
# db_api.remove_project_instance(pod_id)
# self.notify_action_done(self.instance_by_id(pod_id))
LOG.info("%s: drained %s" % (self.session_id, node_name))
def evict(self, pod, recovery_time):
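        """Evict a pod via the Kubernetes Eviction API.

        Starts a recovery timer for the pod, then waits until the evicted
        pod is deleted. The owning controller (e.g. ReplicaSet) is expected
        to create a replacement pod.
        """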
namespace = pod.metadata.namespace
name = pod.metadata.name
pod_id = pod.metadata.uid
LOG.info("%s: Evict: %s: %s" % (self.session_id, pod_id, name))
orig_host = pod.spec.node_name
orig_state = pod.status.phase
        # For now the k8s namespace is used as the user and project in
        # OpenStack keystone. Keycloak or a webhook for keystone should
        # be used instead
body = client.V1beta1Eviction()
body.api_version = "policy/v1beta1"
body.kind = "Eviction"
body.metadata = {"name": name,
"namespace": namespace}
LOG.info("%s: Evicting pod: %s %s on host %s" %
(self.session_id, name, orig_state, orig_host))
try:
self.kapi.create_namespaced_pod_eviction(name,
namespace,
body)
except ApiException as e:
LOG.error("Exception when calling create_namespaced_pod_"
"eviction: %s\n" % e)
        # Start a timer to wait for new POD initialization (recovery time).
        # TBD this might first check that the new POD STATUS == Running and
        # then still wait instance_group.recovery_time. This might be tricky
        # as we do not know the new POD. We could check new pods, but there
        # might be more than one because of parallel actions. Somehow we
        # would need to be able to map the evicted POD to the new one to make
        # this enhancement. Adding "labels": {"previous_pod_id": pod_id} to
        # the eviction body above was tried, but the label did not end up in
        # the new POD.
timer = 'RECOVERY_%s_TIMEOUT' % pod_id
self.start_timer(recovery_time, timer)
time.sleep(1)
pod = self._get_pod_by_name_and_namespace(name, namespace)
check = 40
LOG.info("%s: Waiting pod: %s eviction from host %s ..." %
(self.session_id, name, orig_host))
while pod:
if check == 0:
raise Exception('Pod %s still not deleted in eviction' % name)
check -= 1
time.sleep(1)
pod = self._get_pod_by_name_and_namespace(name, namespace)
LOG.info("%s: Evicted pod: %s: %s" % (self.session_id, pod_id, name))
return True
def _fenix_instance(self, project_id, instance_id, instance_name, host,
state, details=None, action=None, project_state=None,
action_done=False):
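        """Build a Fenix instance dictionary from pod information."""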
        instance = {'session_id': self.session_id,
                    'instance_id': instance_id,
                    'action': action,
                    'project_id': project_id,
                    'project_state': project_state,
                    'state': state,
                    'instance_name': instance_name,
                    'action_done': action_done,
                    'host': host,
                    'details': details}
return instance
def initialize_server_info(self):
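        """Create Fenix project and instance entries from ReplicaSet pods.

        Kubernetes namespaces are mapped to Keystone projects and pod UIDs
        are used as instance IDs.
        """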
project_ids = {}
instances = []
worker_hosts = self.get_compute_hosts()
pods = self._pods_by_nodes_and_controller(worker_hosts, 'ReplicaSet')
for pod in pods:
host = pod.spec.node_name
# Map K8S namespace as user and project in keystone
if pod.metadata.namespace not in project_ids.keys():
project_id = str(self.ks.projects.list(
name=pod.metadata.namespace)[0].id)
project_ids[pod.metadata.namespace] = project_id
else:
project_id = project_ids[pod.metadata.namespace]
instance_name = pod.metadata.name
instance_id = pod.metadata.uid
state = pod.status.phase # Running
instances.append(self._fenix_instance(project_id, instance_id,
instance_name, host, state))
if project_ids:
self.projects = self.init_projects(project_ids.values())
else:
LOG.info('%s: No projects on nodes under maintenance' %
self.session_id)
if len(instances):
self.instances = self.add_instances(instances)
else:
LOG.info('%s: No instances on nodes under maintenance' %
self.session_id)
LOG.info(str(self))
def update_instance(self, project_id, instance_id, instance_name, host,
state, details=None):
if self.instance_id_found(instance_id):
# TBD Might need to update instance variables here if not done
# somewhere else
return
elif self.instance_name_found(instance_name):
# Project has made re-instantiation, remove old add new
old_instance = self.instance_by_name(instance_name)
instance = self._fenix_instance(project_id, instance_id,
instance_name, host,
state, details,
old_instance.action,
old_instance.project_state,
old_instance.action_done)
self.instances.append(self.add_instance(instance))
self.remove_instance(old_instance)
else:
# Instance new, as project has added instances
instance = self._fenix_instance(project_id, instance_id,
instance_name, host,
state, details)
self.instances.append(self.add_instance(instance))
def remove_non_existing_instances(self, instance_ids):
remove_instances = [instance for instance in
self.instances if instance.instance_id not in
instance_ids]
for instance in remove_instances:
# Instance deleted, as project possibly scaled down
self.remove_instance(instance)
def update_server_info(self):
        # TBD This keeps the internal instance information up-to-date and
        # prints it out. The same could be done by updating the information
        # when it changes. Anyhow, this also double-checks the information
        # against Kubernetes.
project_ids = {}
instance_ids = []
worker_hosts = self.get_compute_hosts()
pods = self._pods_by_nodes_and_controller(worker_hosts, 'ReplicaSet')
for pod in pods:
host = pod.spec.node_name
# Map K8S namespace as user and project in keystone
if pod.metadata.namespace not in project_ids.keys():
project_id = self.ks.projects.list(
name=pod.metadata.namespace)[0].id
project_ids[pod.metadata.namespace] = project_id
else:
project_id = project_ids[pod.metadata.namespace]
instance_name = pod.metadata.name
instance_id = pod.metadata.uid
state = pod.status.phase # Running
details = None
self.update_instance(project_id, instance_id, instance_name, host,
state, details)
instance_ids.append(instance_id)
self.remove_non_existing_instances(instance_ids)
LOG.info(str(self))
def projects_with_constraints(self):
project_ids = self.project_ids_with_instance_group()
for project_id in self.projects():
if project_id not in project_ids:
LOG.error('%s: project_id %s not '
'set any instance_group' %
(self.session_id, project_id))
return False
return True
def confirm_maintenance(self):
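        """Send the 'MAINTENANCE' state notification to all projects.

        Waits for every project to reply before the maintenance start time,
        retrying towards projects that have not yet replied.
        """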
allowed_actions = []
actions_at = self.session.maintenance_at
state = 'MAINTENANCE'
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nMAINTENANCE to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
reply_at = reply_time_str(self.conf.project_maintenance_reply)
if is_time_after_time(reply_at, actions_at):
LOG.error('%s: No time for project to answer in state: %s'
% (self.session_id, state))
self.state("MAINTENANCE_FAILED")
return False
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'MAINTENANCE_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance failed after retries')
break
else:
LOG.info('confirm_maintenance retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def worker_nodes_cpu_info(self, system_reserved):
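        """Return per-worker CPU capacity and usage of ReplicaSet pods.

        'system_reserved' CPUs are subtracted from each node's capacity.
        Raises an exception if any worker is overbooked.
        """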
        # TBD system_reserved is currently just an estimate of what
        # flannel, kubelet... need on top of the pods
workers_info = {}
worker_hosts = self.get_compute_hosts()
workers = self.get_worker_nodes()
pods = self._pods_by_nodes_and_controller(worker_hosts, 'ReplicaSet')
for worker in workers:
cpus = int(worker.status.capacity[u'cpu']) - system_reserved
name = worker.metadata.name
workers_info[name] = {'cpus_used': 0,
'cpus': cpus,
'name': name}
for pod in [p for p in pods if p.spec.node_name == name]:
cpus_used = 0
for container in pod.spec.containers:
                    try:
                        cpus_used += int(container.resources.requests[u'cpu'])
                    except (AttributeError, KeyError, TypeError):
                        # container does not need to have
                        # resources.requests.cpu
                        pass
if cpus_used > 0:
workers_info[name]['cpus_used'] += cpus_used
if workers_info[name]['cpus_used'] > workers_info[name]['cpus']:
LOG.error('%s overbooked: %s' %
(name, workers_info[name]))
raise Exception('%s overbooked: %s' %
(name, workers_info[name]))
LOG.info('workers_info:\n%s' % workers_info)
return workers_info
def confirm_scale_in(self):
allowed_actions = []
actions_at = reply_time_str(self.conf.project_scale_in_reply)
reply_at = actions_at
state = 'SCALE_IN'
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nSCALE_IN to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'SCALE_IN_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_scale_in failed after retries')
break
else:
LOG.info('confirm_scale_in retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def need_scale_in(self):
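        """Return True if scaling in is needed to free one worker's capacity.

        Compares the total free CPUs against the capacity of a single
        (uniform) worker node.
        """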
# TBD see if there is enough free capacity, so we do not need to scale
# TBD this should be calculated according to instance and
# instance_group constraints
workers_info = self.worker_nodes_cpu_info(2)
prev_cpus = 0
free_cpus = 0
prev_hostname = ''
LOG.info('checking workers CPU capacity')
for worker in workers_info.values():
hostname = worker['name']
cpus = worker['cpus']
cpus_used = worker['cpus_used']
            if prev_cpus != 0 and prev_cpus != cpus:
                raise Exception('%s: %d cpus on %s does not match to '
                                '%d on %s'
                                % (self.session_id, cpus, hostname,
                                   prev_cpus, prev_hostname))
free_cpus += cpus - cpus_used
prev_cpus = cpus
prev_hostname = hostname
if free_cpus >= cpus:
# TBD cpu capacity might be too scattered so moving instances from
# one host to other host still might not succeed. At least with
# NUMA and CPU pinning, one should calculate and ask specific
# instances
return False
else:
return True
def find_host_to_be_empty(self, need_empty, weighted_hosts):
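        """Pick 'need_empty' hosts to be emptied.

        'weighted_hosts' maps the number of hosted instances to a list of
        host names; hosts with fewer instances are preferred.
        """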
print("need_empty: %s" % need_empty)
hosts_to_be_empty = []
for instances in sorted(weighted_hosts.keys()):
print("instances in weighted_hosts: %s" % instances)
weighted_candidates = weighted_hosts[instances]
if len(weighted_candidates) == need_empty:
# Happened to be exact match to needed
hosts_to_be_empty = weighted_hosts[instances]
print("hosts to be empty: %s" % hosts_to_be_empty)
elif len(weighted_candidates) > need_empty:
# More candidates than we need, dig deeper to act_instances
for host in weighted_candidates:
print("host to be empty: %s" % host)
hosts_to_be_empty.append(host)
if len(hosts_to_be_empty) == need_empty:
break
if len(hosts_to_be_empty) == need_empty:
break
if len(hosts_to_be_empty) != need_empty:
print("we failed to search hosts to be empty!!!")
return hosts_to_be_empty
def make_empty_hosts(self, state):
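        """Ensure there is at least one empty, cordoned worker node.

        Uses already empty hosts when available, otherwise evicts the
        instances of the least loaded host(s).
        """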
# TBD, calculate how many nodes can be made empty, now just very simple
# According to where is least pods
weighted_hosts = {}
empty_hosts = []
for host in self.get_compute_hosts():
instances = len(self.instances_by_host(host))
if instances == 0:
                empty_hosts.append(host)
self.cordon(host)
LOG.info("host %s empty" % host)
else:
if instances not in weighted_hosts:
weighted_hosts[instances] = [host]
else:
weighted_hosts[instances].append(host)
if len(empty_hosts):
            # TBD We just need one empty host for initial POC testing
return True
else:
need_empty = 1
hosts_to_be_empty = self.find_host_to_be_empty(need_empty,
weighted_hosts)
thrs = []
for host in hosts_to_be_empty:
thrs.append(self.actions_to_have_empty_host(host, state))
# self._wait_host_empty(host)
for thr in thrs:
thr.join()
return True
@run_async
def instance_action(self, instance, state, target_host=None):
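        """Confirm and execute the chosen action for a single instance.

        Honors the instance group 'max_impacted_members' and
        'recovery_time' constraints; currently supports 'OWN_ACTION' and
        'EVICTION'.
        """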
if not self.confirm_instance_action(instance, state):
raise Exception('%s: instance %s action %s '
'confirmation failed' %
(self.session_id, instance.instance_id,
instance.action))
# TBD from constraints or override in instance.action
LOG.info('Action %s instance %s ' % (instance.action,
instance.instance_id))
try:
instance_constraints = (
db_api.project_instance_get(instance.instance_id))
group_id = instance_constraints.group_id
instance_group = db_api.instance_group_get(group_id)
if group_id not in self.group_impacted_members:
self.group_impacted_members[group_id] = 0
max_parallel = instance_group.max_impacted_members
LOG.info("%s - instance_group: %s max_impacted_members: %s "
"recovery_time: %s" %
(instance.instance_id, instance_group.group_name,
max_parallel, instance_group.recovery_time))
except db_exc.FenixDBNotFound:
raise Exception('failed to get %s constraints' %
(instance.instance_id))
        while self.group_impacted_members[group_id] >= max_parallel:
LOG.info('%s waiting in group queue / max_parallel %s/%s' %
(instance.instance_id,
self.group_impacted_members[group_id],
max_parallel))
time.sleep(5)
self.group_impacted_members[group_id] += 1
LOG.debug("%s Reserved / max_impacted_members: %s/%s" %
(instance.instance_id, self.group_impacted_members[group_id],
max_parallel))
if instance.action == 'OWN_ACTION':
pass
elif instance.action == 'EVICTION':
pod = self._pod_by_id(instance.instance_id)
if not self.evict(pod, instance_group.recovery_time):
self.group_impacted_members[group_id] -= 1
LOG.debug("%s Reservation freed. remain / "
"max_impacted_members:%s/%s"
% (instance.instance_id,
self.group_impacted_members[group_id],
max_parallel))
raise Exception('%s: instance %s action '
'%s failed' %
(self.session_id, instance.instance_id,
instance.action))
else:
self.group_impacted_members[group_id] -= 1
LOG.debug("%s Reservation freed. remain / "
"max_impacted_members:%s/%s"
% (instance.instance_id,
self.group_impacted_members[group_id],
max_parallel))
raise Exception('%s: instance %s action '
'%s not supported' %
(self.session_id, instance.instance_id,
instance.action))
        # We need to obey the recovery time of the instance group before
        # decreasing self.group_impacted_members[group_id], to allow one
        # more instance of the same group to be affected by any move
        # operation
if instance_group.recovery_time > 0:
timer = 'RECOVERY_%s_TIMEOUT' % instance.instance_id
LOG.info("%s wait POD to recover from move..."
% instance.instance_id)
while not self.is_timer_expired(timer):
time.sleep(1)
self.notify_action_done(instance)
self.group_impacted_members[group_id] -= 1
LOG.debug("%s Reservation freed. remain / max_impacted_members: %s/%s"
% (instance.instance_id,
self.group_impacted_members[group_id],
max_parallel))
@run_async
def actions_to_have_empty_host(self, host, state, target_host=None):
        # TBD we only support EVICTION of all pods, as with drain(host)
        # Need parallel hosts and make_empty_hosts to calculate
thrs = []
LOG.info('actions_to_have_empty_host %s' % host)
instances = self.instances_by_host(host)
if not instances:
raise Exception('No instances on host: %s' % host)
self.cordon(host)
for instance in instances:
LOG.info('move %s from %s' % (instance.instance_name, host))
thrs.append(self.instance_action(instance, state,
target_host))
# thrs.append(self.confirm_instance_action(instance, state))
for thr in thrs:
thr.join()
if state == 'PLANNED_MAINTENANCE':
self.host_maintenance(host)
def confirm_instance_action(self, instance, state):
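        """Notify the project about a pending action on one instance.

        Allowed actions are 'EVICTION' and 'OWN_ACTION'; the reply deadline
        comes from the instance 'lead_time' constraint when available.
        """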
instance_id = instance.instance_id
LOG.info('%s to instance %s' % (state, instance_id))
allowed_actions = ['EVICTION', 'OWN_ACTION']
try:
instance_constraints = db_api.project_instance_get(instance_id)
wait_time = instance_constraints.lead_time
LOG.info("%s actions_at from constraints lead_time: %s" %
(instance_id, wait_time))
except db_exc.FenixDBNotFound:
wait_time = self.conf.project_maintenance_reply
actions_at = reply_time_str(wait_time)
reply_at = actions_at
instance.project_state = state
metadata = self.session.meta
retry = 2
replied = False
while not replied:
metadata = self.session.meta
self._project_notify(instance.project_id, [instance_id],
allowed_actions, actions_at, reply_at,
state, metadata)
timer = '%s_%s_TIMEOUT' % (state, instance_id)
self.start_timer(self.conf.project_maintenance_reply, timer)
replied = self.wait_instance_reply_state(state, instance, timer)
if not replied:
if retry == 0:
LOG.info('confirm_instance_action for %s failed after '
'retries' % instance.instance_id)
break
else:
LOG.info('confirm_instance_action for %s retry'
% instance.instance_id)
else:
break
retry -= 1
return replied
def confirm_maintenance_complete(self):
state = 'MAINTENANCE_COMPLETE'
metadata = self.session.meta
actions_at = reply_time_str(self.conf.project_scale_in_reply)
reply_at = actions_at
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
allowed_actions = []
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'%s_TIMEOUT' % state)
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance_complete failed after '
'retries')
break
else:
LOG.info('confirm_maintenance_complete retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def notify_action_done(self, instance):
instance_ids = [instance.instance_id]
project = instance.project_id
allowed_actions = []
actions_at = None
reply_at = None
state = "INSTANCE_ACTION_DONE"
instance.project_state = state
metadata = "{}"
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
def maintenance_by_plugin_type(self, hostname, plugin_type):
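        """Run all action plug-ins of the given type against a host.

        Plug-ins are imported from fenix.workflow.actions or, if not found
        there, loaded from the session's downloaded action files.
        """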
aps = self.get_action_plugins_by_type(plugin_type)
session_dir = "%s/%s" % (self.conf.engine.local_cache_dir,
self.session_id)
download_plugin_dir = session_dir + "/actions/"
if aps:
LOG.info("%s: Calling action plug-ins with type %s" %
(self.session_id, plugin_type))
for ap in aps:
ap_name = "fenix.workflow.actions.%s" % ap.plugin
LOG.info("%s: Calling action plug-in module: %s" %
(self.session_id, ap_name))
ap_db_instance = self._create_action_plugin_instance(ap.plugin,
hostname)
try:
action_plugin = getattr(import_module(ap_name),
'ActionPlugin')
ap_instance = action_plugin(self, ap_db_instance)
except ImportError:
download_plugin_file = "%s/%s.py" % (download_plugin_dir,
ap.plugin)
LOG.info("%s: Trying from: %s" % (self.session_id,
download_plugin_file))
if os.path.isfile(download_plugin_file):
ap_instance = (
mod_loader_action_instance(ap_name,
download_plugin_file,
self,
ap_db_instance))
else:
raise Exception('%s: could not find action plugin %s' %
(self.session_id, ap.plugin))
ap_instance.run()
if ap_db_instance.state:
LOG.info('%s: %s finished with %s host %s' %
(self.session_id, ap.plugin,
ap_db_instance.state, hostname))
if 'FAILED' in ap_db_instance.state:
raise Exception('%s: %s finished with %s host %s' %
(self.session_id, ap.plugin,
ap_db_instance.state, hostname))
else:
raise Exception('%s: %s reported no state for host %s' %
(self.session_id, ap.plugin, hostname))
# If ap_db_instance failed, we keep it for state
db_api.remove_action_plugin_instance(ap_db_instance)
else:
LOG.info("%s: No action plug-ins with type %s" %
(self.session_id, plugin_type))
def _wait_host_empty(self, host):
check = 60
pods = self._pods_by_node_and_controller(host, 'ReplicaSet')
while pods:
if check == 0:
                raise Exception('Wait empty host %s timeout' % host)
elif not check % 5:
LOG.info('...waiting host %s empty' % host)
check -= 1
time.sleep(1)
pods = self._pods_by_node_and_controller(host, 'ReplicaSet')
LOG.info('Host %s empty' % host)
@run_async
def host_maintenance_async(self, hostname):
self.host_maintenance(hostname)
def host_maintenance(self, hostname):
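        """Run maintenance on a single host.

        For compute hosts, waits until the host is empty, runs the 'host'
        and host-type specific action plug-ins, then uncordons the host and
        marks it maintained.
        """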
host = self.get_host_by_name(hostname)
if host.type == "compute":
self._wait_host_empty(hostname)
LOG.info('IN_MAINTENANCE %s' % hostname)
self._admin_notify(self.conf.service_user.os_project_name,
hostname,
'IN_MAINTENANCE',
self.session_id)
for plugin_type in ["host", host.type]:
LOG.info('%s: Execute %s action plugins' % (self.session_id,
plugin_type))
self.maintenance_by_plugin_type(hostname, plugin_type)
self._admin_notify(self.conf.service_user.os_project_name,
hostname,
'MAINTENANCE_COMPLETE',
self.session_id)
if host.type == "compute":
self.uncordon(hostname)
LOG.info('MAINTENANCE_COMPLETE %s' % hostname)
host.maintained = True
def maintenance(self):
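        """Handle the MAINTENANCE state.

        Initializes project and instance information, confirms the
        maintenance window with projects and decides whether to scale in,
        prepare an empty host or start maintenance directly. Waits until
        the requested maintenance start time.
        """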
LOG.info("%s: maintenance called" % self.session_id)
self.initialize_server_info()
time.sleep(1)
self.state('START_MAINTENANCE')
        if not self.projects_with_constraints():
self.state('MAINTENANCE_FAILED')
return
if not self.confirm_maintenance():
self.state('MAINTENANCE_FAILED')
return
maintenance_empty_hosts = self.get_empty_computes()
if len(maintenance_empty_hosts) == 0:
if self.need_scale_in():
LOG.info('%s: Need to scale in to get capacity for '
'empty host' % (self.session_id))
self.state('SCALE_IN')
else:
LOG.info('%s: Free capacity, but need empty host' %
(self.session_id))
self.state('PREPARE_MAINTENANCE')
else:
LOG.info('Empty host found')
self.state('START_MAINTENANCE')
if self.session.maintenance_at > datetime.datetime.utcnow():
time_now = time_now_str()
LOG.info('Time now: %s maintenance starts: %s....' %
(time_now, datetime_to_str(self.session.maintenance_at)))
td = self.session.maintenance_at - datetime.datetime.utcnow()
self.start_timer(td.total_seconds(), 'MAINTENANCE_START_TIMEOUT')
while not self.is_timer_expired('MAINTENANCE_START_TIMEOUT'):
time.sleep(1)
time_now = time_now_str()
LOG.info('Time to start maintenance: %s' % time_now)
def scale_in(self):
LOG.info("%s: scale in" % self.session_id)
# TBD we just blindly ask to scale_in to have at least one
# empty compute. With NUMA and CPU pinning and together with
# how many instances can be affected at the same time, we should
# calculate and ask scaling of specific instances
if not self.confirm_scale_in():
self.state('MAINTENANCE_FAILED')
return
# TBD it takes time to have proper information updated about free
# capacity. Should make sure instances removed by other means than
# sleeping here
time.sleep(4)
self.update_server_info()
maintenance_empty_hosts = self.get_empty_computes()
if len(maintenance_empty_hosts) == 0:
if self.need_scale_in():
LOG.info('%s: Need to scale in more to get capacity for '
'empty host' % (self.session_id))
self.state('SCALE_IN')
else:
LOG.info('%s: Free capacity, but need empty host' %
(self.session_id))
self.state('PREPARE_MAINTENANCE')
else:
LOG.info('Empty host found')
self.state('START_MAINTENANCE')
def prepare_maintenance(self):
LOG.info("%s: prepare_maintenance called" % self.session_id)
if not self.make_empty_hosts('PREPARE_MAINTENANCE'):
LOG.error('make_empty_hosts failed')
self.state('MAINTENANCE_FAILED')
else:
self.state('START_MAINTENANCE')
self.update_server_info()
def start_maintenance(self):
LOG.info("%s: start_maintenance called" % self.session_id)
empty_hosts = self.get_empty_computes()
if not empty_hosts:
LOG.error("%s: No empty host to be maintained" % self.session_id)
self.state('MAINTENANCE_FAILED')
return
for host_name in self.get_compute_hosts():
self.cordon(host_name)
thrs = []
for host_name in empty_hosts:
# LOG.info("%s: Maintaining %s" % (self.session_id, host_name))
thrs.append(self.host_maintenance_async(host_name))
# LOG.info("%s: Maintained %s" % (self.session_id, host_name))
for thr in thrs:
thr.join()
time.sleep(1)
self.update_server_info()
self.state('PLANNED_MAINTENANCE')
def planned_maintenance(self):
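        """Handle the PLANNED_MAINTENANCE state.

        Empties and maintains the remaining worker nodes, as many in
        parallel as there are already maintained empty hosts.
        """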
LOG.info("%s: planned_maintenance called" % self.session_id)
maintained_hosts = self.get_maintained_hosts_by_type('compute')
compute_hosts = self.get_compute_hosts()
not_maintained_hosts = ([host for host in compute_hosts if host
not in maintained_hosts])
empty_compute_hosts = self.get_empty_computes()
parallel = len(empty_compute_hosts)
not_maintained = len(not_maintained_hosts)
while not_maintained:
if not_maintained < parallel:
parallel = not_maintained
thrs = []
for index in range(parallel):
shost = not_maintained_hosts[index]
thost = empty_compute_hosts[index]
thrs.append(
self.actions_to_have_empty_host(shost,
'PLANNED_MAINTENANCE',
thost))
for thr in thrs:
thr.join()
empty_compute_hosts = self.get_empty_computes()
del not_maintained_hosts[:parallel]
parallel = len(empty_compute_hosts)
not_maintained = len(not_maintained_hosts)
self.update_server_info()
LOG.info("%s: planned_maintenance done" % self.session_id)
self.state('MAINTENANCE_COMPLETE')
def maintenance_complete(self):
LOG.info("%s: maintenance_complete called" % self.session_id)
LOG.info('%s: Execute post action plugins' % self.session_id)
self.maintenance_by_plugin_type("localhost", "post")
        LOG.info('Projects may still need to scale back up to full '
                 'capacity')
if not self.confirm_maintenance_complete():
self.state('MAINTENANCE_FAILED')
return
self.update_server_info()
self.state('MAINTENANCE_DONE')
def maintenance_done(self):
LOG.info("%s: MAINTENANCE_DONE" % self.session_id)
def maintenance_failed(self):
LOG.info("%s: MAINTENANCE_FAILED" % self.session_id)
def cleanup(self):
LOG.info("%s: cleanup" % self.session_id)
db_api.remove_session(self.session_id)

View File

@@ -170,18 +170,6 @@ class Workflow(BaseWorkflow):
         self.nova.services.enable(hostname, "nova-compute")
         host.disabled = False
-    def get_compute_hosts(self):
-        return [host.hostname for host in self.hosts
-                if host.type == 'compute']
-    def get_empty_computes(self):
-        all_computes = self.get_compute_hosts()
-        instance_computes = []
-        for instance in self.instances:
-            if instance.host not in instance_computes:
-                instance_computes.append(instance.host)
-        return [host for host in all_computes if host not in instance_computes]
     def get_instance_details(self, instance):
         network_interfaces = next(iter(instance.addresses.values()))
         for network_interface in network_interfaces:
@@ -401,6 +389,8 @@ class Workflow(BaseWorkflow):
         return all_replied
     def need_scale_in(self):
+        # TBD this should be calculated according to instance and
+        # instance_group constraints
         hvisors = self.nova.hypervisors.list(detailed=True)
         prev_vcpus = 0
         free_vcpus = 0
@@ -991,7 +981,7 @@ class Workflow(BaseWorkflow):
     def scale_in(self):
         LOG.info("%s: scale in" % self.session_id)
         # TBD we just blindly ask to scale_in to have at least one
-        # empty compute. With NUMA and CPI pinning and together with
+        # empty compute. With NUMA and CPU pinning and together with
         # how many instances can be affected at the same time, we should
         # calculate and ask scaling of specific instances
         if not self.confirm_scale_in():