
Kubernetes workflow

- Add Kubernetes workflow
- Add Sample VNFM to test against
- Add sample application (VNF)
- Add testing instructions
- Update documentation

Details in: fenix/tools/README.md

Also includes fixes to:
- API error handling
- Exceptions
- DB API
- Some other minor bugs

Story: 2007301
Task: #38778

Change-Id: Ia37dfe7ea57935e73868da89aaa9a413721078ad
Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
17 changed files with 2074 additions and 93 deletions
  1. +3 -1 doc/source/api-ref/v1/parameters.yaml
  2. +22 -2 doc/source/user/notifications.rst
  3. +147 -60 fenix/api/v1/controllers/maintenance.py
  4. +1 -1 fenix/api/v1/schema.py
  5. +2 -2 fenix/db/exceptions.py
  6. +23 -12 fenix/db/sqlalchemy/api.py
  7. +6 -0 fenix/exceptions.py
  8. +216 -0 fenix/tools/README.md
  9. +0 -0 fenix/tools/__init__.py
  10. +54 -0 fenix/tools/demo-ha.yaml
  11. +42 -0 fenix/tools/demo-nonha.yaml
  12. +545 -0 fenix/tools/vnfm.py
  13. +2 -2 fenix/utils/identity_auth.py
  14. +3 -0 fenix/utils/service.py
  15. +21 -0 fenix/workflow/workflow.py
  16. +984 -0 fenix/workflow/workflows/k8s.py
  17. +3 -13 fenix/workflow/workflows/vnf.py
+3 -1 doc/source/api-ref/v1/parameters.yaml

@ -188,7 +188,9 @@ migration-type:
Own action is create new and delete old instance.
Note! VNF need to obey resource_mitigation with own action
This affects to order of delete old and create new to not over
commit the resources.
commit the resources. In Kubernetes, EVICTION is also supported: the admin
will delete the instance and VNF automation such as a ReplicaSet will create
a new instance.
in: body
required: true
type: string
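To illustrate (not part of the diff; the values are made up and the field layout follows the sample VNFM added in fenix/tools/vnfm.py), an instance update body that selects EVICTION could look like:

# Hypothetical instance constraints sent to PUT /v1/instance/<instance_id>;
# with EVICTION the admin deletes the POD and the ReplicaSet recreates it.
instance = {
    "instance_id": "28d226f3-8d06-444f-a3f1-c586d2e7cb39",
    "project_id": "ead0dbcaf3564cbbb04842e3e54960e3",
    "group_id": "<replicaset uid>",
    "instance_name": "demo-nonha-abcde",
    "max_interruption_time": 120,
    "migration_type": "EVICTION",
    "resource_mitigation": True,
    "lead_time": 40}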


+22 -2 doc/source/user/notifications.rst

@ -114,7 +114,9 @@ payload
| | | 'MIGRATE', 'LIVE_MIGRATE' and 'OWN_ACTION'. 'OWN_ACTION' means |
| | | an action project manager can do itself. Usually this could be |
| | | re-instantiation even with a new flavor. Other actions are done by |
| | | Fenix as they need the admin privileges. Valid for states: |
| | | Fenix as they need the admin privileges. In Kubernetes 'EVICTION' is |
| | | also supported: the admin will delete the instance and VNF automation |
| | | like a ReplicaSet will create a new instance. Valid for states: |
| | | 'SCALE_IN', 'PREPARE_MAINTENANCE' and 'PLANNED_MAINTENANCE'. |
+-----------------+------------+------------------------------------------------------------------------+
| instance_ids | string | Link to Fenix maintenance session and project specific API to get |
@ -176,7 +178,7 @@ Example of notification for many instances:
"metadata": {"openstack_release": "Queens"}
}
Example of notification for single instances. Note the instance specific
Example of notification for single instance. Note the instance specific
'reply_url':
.. code-block:: json
@ -194,5 +196,23 @@ Example of notification for single instances. Note the instance specific
"metadata": {"openstack_release": "Queens"}
}
Example of notification for single instance in Kubernetes. Note the instance
specific 'reply_url' and allowed actions for Kubernetes:
.. code-block:: json
{
"service": "fenix",
"allowed_actions": ["OWN_ACTION", "EVICTION"],
"instance_ids": ["28d226f3-8d06-444f-a3f1-c586d2e7cb39"],
"reply_url": "http://0.0.0.0:12347/v1/maintenance/76e55df8-1c51-11e8-9928-0242ac110002/ead0dbcaf3564cbbb04842e3e54960e3/28d226f3-8d06-444f-a3f1-c586d2e7cb39",
"state": "PREPARE_MAINTENANCE",
"session_id": "76e55df8-1c51-11e8-9928-0242ac110002",
"reply_at": "2018-02-28T06:40:16",
"actions_at": "2018-02-29T00:00:00",
"project_id": "ead0dbcaf3564cbbb04842e3e54960e3",
"metadata": {"openstack_release": "Queens"}
}
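A minimal sketch (not in the original document; the token value is a placeholder and the URL is taken from the example above) of how the sample VNFM in fenix/tools/vnfm.py replies to this notification:
.. code-block:: python

    import json

    import requests

    # Reply to the 'PREPARE_MAINTENANCE' notification above; the chosen
    # action must be one of 'allowed_actions', here 'EVICTION'.
    headers = {"Content-Type": "application/json",
               "Accept": "application/json",
               "X-Auth-Token": "<keystone token>"}
    reply = {"session_id": "76e55df8-1c51-11e8-9928-0242ac110002",
             "state": "ACK_PREPARE_MAINTENANCE",
             "instance_action": "EVICTION"}
    requests.put("http://0.0.0.0:12347/v1/maintenance/"
                 "76e55df8-1c51-11e8-9928-0242ac110002/"
                 "ead0dbcaf3564cbbb04842e3e54960e3/"
                 "28d226f3-8d06-444f-a3f1-c586d2e7cb39",
                 data=json.dumps(reply), headers=headers)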
.. [1] http://docs.openstack.org/developer/oslo.messaging/notifier.html
.. [2] https://docs.openstack.org/aodh/latest/admin/telemetry-alarms.html#event-based-alarm

+147 -60 fenix/api/v1/controllers/maintenance.py

@ -20,18 +20,49 @@ from pecan import expose
from pecan import request
from pecan import response
from pecan import rest
import six
from oslo_log import log
from oslo_messaging import RemoteError
from oslo_serialization import jsonutils
from fenix.api.v1 import maintenance
from fenix.api.v1 import schema
import fenix.exceptions as exceptions
import fenix.db.exceptions as db_exceptions
from fenix import policy
LOG = log.getLogger(__name__)
class ProjectController(rest.RestController):
def _format_ex_message(ex):
if len(ex.path) > 0:
return ("Invalid input for field/attribute %(path)s."
" Value: %(value)s. %(message)s" % {'path': ex.path.pop(),
'value': ex.instance,
'message': ex.message})
else:
return ex.message
return
class BaseController(rest.RestController):
def handle_remote_error(self, e):
cls = getattr(db_exceptions, e.exc_type, None)
cls = cls or getattr(exceptions, e.exc_type, None)
if cls is not None:
if e.value:
description = e.value
elif "msg_fmt" in vars(cls).keys():
description = cls.msg_fmt
else:
description = ""
abort(cls.code, description)
abort(500)
class ProjectController(BaseController):
name = 'project'
@ -49,8 +80,9 @@ class ProjectController(rest.RestController):
jsonschema.validate(session_id, schema.uid)
jsonschema.validate(project_id, schema.uid)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
engine_data = self.engine_rpcapi.project_get_session(session_id,
project_id)
try:
@ -68,18 +100,22 @@ class ProjectController(rest.RestController):
jsonschema.validate(project_id, schema.uid)
jsonschema.validate(data, schema.maintenance_session_project_put)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
engine_data = self.engine_rpcapi.project_update_session(session_id,
project_id,
data)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
try:
engine_data = self.engine_rpcapi.project_update_session(session_id,
project_id,
data)
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
response.body = jsonutils.dumps(engine_data)
class ProjectInstanceController(rest.RestController):
class ProjectInstanceController(BaseController):
name = 'project_instance'
@ -99,20 +135,24 @@ class ProjectInstanceController(rest.RestController):
data,
schema.maintenance_session_project_instance_put)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
engine_data = (
self.engine_rpcapi.project_update_session_instance(session_id,
project_id,
instance_id,
data))
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
try:
engine_data = (
self.engine_rpcapi.project_update_session_instance(session_id,
project_id,
instance_id,
data))
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
response.body = jsonutils.dumps(engine_data)
class SessionController(rest.RestController):
class SessionController(BaseController):
name = 'session'
@ -126,15 +166,20 @@ class SessionController(rest.RestController):
try:
jsonschema.validate(session_id, schema.uid)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
if request.body:
LOG.error("Unexpected data")
abort(400)
session = self.engine_rpcapi.admin_get_session(session_id)
try:
session = self.engine_rpcapi.admin_get_session(session_id)
except RemoteError as e:
self.handle_remote_error(e)
if session is None:
LOG.error("Invalid session")
abort(404)
description = "Invalid session"
LOG.error(description)
abort(422, six.text_type(description))
try:
response.text = jsonutils.dumps(session)
except TypeError:
@ -149,9 +194,14 @@ class SessionController(rest.RestController):
jsonschema.validate(session_id, schema.uid)
jsonschema.validate(data, schema.maintenance_session_put)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
engine_data = self.engine_rpcapi.admin_update_session(session_id, data)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
try:
engine_data = self.engine_rpcapi.admin_update_session(session_id,
data)
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
@ -164,19 +214,23 @@ class SessionController(rest.RestController):
try:
jsonschema.validate(session_id, schema.uid)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
if request.body:
LOG.error("Unexpected data")
abort(400)
engine_data = self.engine_rpcapi.admin_delete_session(session_id)
try:
engine_data = self.engine_rpcapi.admin_delete_session(session_id)
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
response.body = jsonutils.dumps(engine_data)
class MaintenanceController(rest.RestController):
class MaintenanceController(BaseController):
name = 'maintenance'
@ -190,7 +244,10 @@ class MaintenanceController(rest.RestController):
if request.body:
LOG.error("Unexpected data")
abort(400)
sessions = self.engine_rpcapi.admin_get()
try:
sessions = self.engine_rpcapi.admin_get()
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(sessions)
except TypeError:
@ -204,9 +261,13 @@ class MaintenanceController(rest.RestController):
try:
jsonschema.validate(data, schema.maintenance_post)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
session = self.engine_rpcapi.admin_create_session(data)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
try:
session = self.engine_rpcapi.admin_create_session(data)
except RemoteError as e:
self.handle_remote_error(e)
if session is None:
LOG.error("Too many sessions")
abort(509)
@ -216,7 +277,7 @@ class MaintenanceController(rest.RestController):
response.body = jsonutils.dumps(session)
class InstanceController(rest.RestController):
class InstanceController(BaseController):
name = 'instance'
@ -230,15 +291,20 @@ class InstanceController(rest.RestController):
try:
jsonschema.validate(instance_id, schema.uid)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
if request.body:
LOG.error("Unexpected data")
abort(400)
instance = self.engine_rpcapi.get_instance(instance_id)
try:
instance = self.engine_rpcapi.get_instance(instance_id)
except RemoteError as e:
self.handle_remote_error(e)
if instance is None:
LOG.error("Invalid instance: %s" % instance_id)
abort(404)
description = "Invalid instance: %s" % instance_id
LOG.error(description)
abort(422, six.text_type(description))
try:
response.text = jsonutils.dumps(instance)
except TypeError:
@ -253,10 +319,14 @@ class InstanceController(rest.RestController):
jsonschema.validate(instance_id, schema.uid)
jsonschema.validate(data, schema.instance_put)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
engine_data = self.engine_rpcapi.update_instance(instance_id,
data)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
try:
engine_data = self.engine_rpcapi.update_instance(instance_id,
data)
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
@ -269,19 +339,23 @@ class InstanceController(rest.RestController):
try:
jsonschema.validate(instance_id, schema.uid)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
if request.body:
LOG.error("Unexpected data")
abort(400)
engine_data = self.engine_rpcapi.delete_instance(instance_id)
try:
engine_data = self.engine_rpcapi.delete_instance(instance_id)
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
response.body = jsonutils.dumps(engine_data)
class InstanceGroupController(rest.RestController):
class InstanceGroupController(BaseController):
name = 'instance_group'
@ -295,15 +369,20 @@ class InstanceGroupController(rest.RestController):
try:
jsonschema.validate(group_id, schema.uid)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
if request.body:
LOG.error("Unexpected data")
abort(400)
group = self.engine_rpcapi.get_instance_group(group_id)
try:
group = self.engine_rpcapi.get_instance_group(group_id)
except RemoteError as e:
self.handle_remote_error(e)
if group is None:
LOG.error("Invalid instance_group: %s" % group_id)
abort(404)
description = "Invalid instance_group: %s" % group_id
LOG.error(description)
abort(422, six.text_type(description))
try:
response.text = jsonutils.dumps(group)
except TypeError:
@ -318,10 +397,14 @@ class InstanceGroupController(rest.RestController):
jsonschema.validate(group_id, schema.uid)
jsonschema.validate(data, schema.instance_group_put)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
engine_data = (
self.engine_rpcapi.update_instance_group(group_id, data))
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
try:
engine_data = (
self.engine_rpcapi.update_instance_group(group_id, data))
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
@ -334,13 +417,17 @@ class InstanceGroupController(rest.RestController):
try:
jsonschema.validate(group_id, schema.uid)
except jsonschema.exceptions.ValidationError as e:
LOG.error(str(e.message))
abort(422)
description = _format_ex_message(e)
LOG.error(description)
abort(422, six.text_type(description))
if request.body:
LOG.error("Unexpected data")
abort(400)
engine_data = (
self.engine_rpcapi.delete_instance_group(group_id))
try:
engine_data = (
self.engine_rpcapi.delete_instance_group(group_id))
except RemoteError as e:
self.handle_remote_error(e)
try:
response.text = jsonutils.dumps(engine_data)
except TypeError:
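To illustrate the new error handling (a sketch, not part of the diff; the error values are made up): a RemoteError whose exc_type names a Fenix or DB exception class is now turned into that class's HTTP code and message instead of a blanket 500.

from oslo_messaging import RemoteError

import fenix.db.exceptions as db_exceptions

# FenixDBNotFound now inherits exceptions.NotFound (code 404), so an engine
# side "not found" surfaces as 404 with its message instead of 500.
e = RemoteError(exc_type="FenixDBNotFound", value="session_id not found")
cls = getattr(db_exceptions, e.exc_type, None)
assert cls is not None and cls.code == 404
# BaseController.handle_remote_error(e) calls abort(404, "session_id not found")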


+1 -1 fenix/api/v1/schema.py

@ -40,7 +40,7 @@ reply_states = ['ACK_MAINTENANCE',
'NACK_PLANNED_MAINTENANCE',
'NACK_MAINTENANCE_COMPLETE']
allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']
allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION', 'EVICTION']
maintenance_session_project_put = {
'type': 'object',


+2 -2 fenix/db/exceptions.py

@ -25,11 +25,11 @@ class FenixDBException(exceptions.FenixException):
msg_fmt = 'An unknown database exception occurred'
class FenixDBDuplicateEntry(FenixDBException):
class FenixDBDuplicateEntry(exceptions.Duplicate):
msg_fmt = 'Duplicate entry for %(columns)s in %(model)s model was found'
class FenixDBNotFound(FenixDBException):
class FenixDBNotFound(exceptions.NotFound):
msg_fmt = '%(id)s %(model)s was not found'


+23 -12 fenix/db/sqlalchemy/api.py

@ -516,11 +516,12 @@ def project_instance_get(instance_id):
def update_project_instance(values):
values = values.copy()
minstance = models.ProjectInstance()
minstance.update(values)
session = get_session()
with session.begin():
minstance = _project_instance_get(session, values['instance_id'])
if not minstance:
minstance = models.ProjectInstance()
minstance.update(values)
try:
minstance.save(session=session)
except common_db_exc.DBDuplicateEntry as e:
@ -553,6 +554,15 @@ def instance_group_get(group_id):
return _instance_group_get(get_session(), group_id)
def _instance_groups_get(session):
query = model_query(models.InstanceGroup, session)
return query.all()
def instance_groups_get():
return _instance_groups_get(get_session())
def _group_instances_get(session, group_id):
query = model_query(models.ProjectInstance, session)
return query.filter_by(group_id=group_id).all()
@ -564,28 +574,29 @@ def group_instances_get(group_id):
def update_instance_group(values):
values = values.copy()
minstance = models.InstanceGroup()
minstance.update(values)
session = get_session()
with session.begin():
ig = _instance_group_get(session, values['group_id'])
if not ig:
ig = models.InstanceGroup()
ig.update(values)
try:
minstance.save(session=session)
ig.save(session=session)
except common_db_exc.DBDuplicateEntry as e:
# raise exception about duplicated columns (e.columns)
raise db_exc.FenixDBDuplicateEntry(
model=minstance.__class__.__name__, columns=e.columns)
model=ig.__class__.__name__, columns=e.columns)
return instance_group_get(minstance.group_id)
return instance_group_get(ig.group_id)
def remove_instance_group(group_id):
session = get_session()
with session.begin():
minstance = _instance_group_get(session, group_id)
if not minstance:
ig = _instance_group_get(session, group_id)
if not ig:
# raise not found error
raise db_exc.FenixDBNotFound(session, group_id=group_id,
model='instance_groups')
session.delete(minstance)
session.delete(ig)

+6 -0 fenix/exceptions.py

@ -55,6 +55,12 @@ class NotFound(FenixException):
code = 404
class Duplicate(FenixException):
"""Object not found exception."""
msg_fmt = "Object with %(object)s not found"
code = 409
class NotAuthorized(FenixException):
msg_fmt = "Not authorized"
code = 403


+216 -0 fenix/tools/README.md

@ -0,0 +1,216 @@
# fenix.tools
This directory contains tools and instructions to test Fenix workflows.
Currently OPNFV Doctor has been used to test the OpenStack-related workflows.
As Doctor is at the moment only for OpenStack and Fenix itself needs a way to
be tested, the testing of the Kubernetes workflow
(fenix/workflow/workflows/k8s.py) is implemented here.
Files:
- 'demo-ha.yaml': demo-ha ReplicaSet to make 2 anti-affinity PODs.
- 'demo-nonha.yaml': demo-nonha ReplicaSet to make n nonha PODs.
- 'vnfm.py': VNFM to test k8s.py workflow.
## Kubernetes workflow (k8s.py)
First version of the workflow towards Kubernetes use cases.
### Requirements for testing
This workflow assumes a ReplicaSet is used for the PODs. A Kubernetes cluster
with 1 master and at least 3 workers is required for testing. The master node
needs DevStack to provide Fenix and the OpenStack services it still uses. Later
on there can be a version of Fenix that does not need Keystone and AODH event
alarming, but uses native Kubernetes services for RBAC and events.
As in Doctor testing, there is a pair of anti-affinity PODs (demo-ha) and the
rest of the worker node capacity is filled with demo-nonha PODs. Scaling of
PODs is done via the ReplicaSet number of replicas. The idea is that each POD
requests ("number of worker node CPUs" - 2) / 2 CPUs. That makes sure the
scheduler fits 2 PODs on each node, while 2 CPUs of capacity are left for other
node services. This requires at least 6 CPUs on each node to work.
### Install Kubernetes cluster with 1 master and 3 worker nodes
Here are instructions:
https://docs.openstack.org/openstack-helm/latest/install/kubernetes-gate.html
https://phoenixnap.com/kb/how-to-install-kubernetes-on-a-bare-metal-server
https://phoenixnap.com/kb/how-to-install-kubernetes-on-centos
### On the master node, install DevStack including Fenix and its minimum services
Note! There is no conflict with Kubernetes, as only the services needed by
Fenix are enabled.
Clone DevStack. Tested to work with the latest stable release, Train.
```sh
git clone https://github.com/openstack/devstack -b stable/train
```
Make local.conf. 'HOST_IP' should be the master node IP.
```sh
cd devstack
vi local.conf
```
```sh
[[local|localrc]]
GIT_BASE=https://git.openstack.org
HOST_IP=192.0.2.4
ADMIN_PASSWORD=admin
DATABASE_PASSWORD=admin
RABBIT_PASSWORD=admin
SERVICE_PASSWORD=admin
LOGFILE=/opt/stack/stack.sh.log
PUBLIC_INTERFACE=eth0
CEILOMETER_EVENT_ALARM=True
ENABLED_SERVICES=key,rabbit,mysql,fenix-engine,fenix-api,aodh-evaluator,aodh-notifier,aodh-api
enable_plugin ceilometer https://git.openstack.org/openstack/ceilometer stable/train
enable_plugin aodh https://git.openstack.org/openstack/aodh stable/train
enable_plugin gnocchi https://github.com/openstack/gnocchi
enable_plugin fenix https://opendev.org/x/fenix master
```
Deploy needed OpenStack services with Fenix
```sh
./stack.sh
```
Now you should have a Kubernetes cluster and Fenix via DevStack. Any hacking of
Fenix can be done under '/opt/stack/fenix'.
### Running test
Use 3 terminal windows (Term1, Term2 and Term3) to test Fenix with the
Kubernetes cluster. Below is what you can run in the different terminals. The
terminals should be running on the master node. Here is a short description:
- Term1: Used for logging Fenix
- Term2: Infrastructure admin commands
- Term3: VNFM logging for testing and setting up the VNF
#### Term1: Fenix-engine logging
If you make any changes to Fenix, make them under '/opt/stack/fenix'; then
restart Fenix and watch the logs
```sh
sudo systemctl restart devstack@fenix*;sudo journalctl -f --unit devstack@fenix-engine
```
API logs can also be seen
```sh
sudo journalctl -f --unit devstack@fenix-api
```
Debugging and other configuration changes can be made in the conf files under '/etc/fenix'
#### Term2: Infrastructure admin window
Use the DevStack admin user. Set the variables you need accordingly
```sh
. ~/devstack/openrc admin admin
USER_ID=`openstack user list | grep admin | awk '{print $2}'`
HOST=192.0.2.4
PORT=12347
```
Authenticate to Keystone as the admin user before calling Fenix. If you get a
'not authorized' error later on, you need to do this again.
```sh
OS_AUTH_TOKEN=`openstack token issue | grep " id " |awk '{print $4}'`
```
Once you have Fenix running in Term1, the VNF created in Term3 and the VNFM
running in Term3, you can create a maintenance session utilizing those
```sh
DATE=`date -d "+15 sec" "+%Y-%m-%d %H:%M:%S"`;MSESSION=`curl -g -i -X POST http://$HOST:$PORT/v1/maintenance -H "Accept: application/json" -H "Content-Type: application/json" -d '{"workflow": "k8s", "state": "MAINTENANCE","metadata": {} ,"maintenance_at": "'"$DATE"'"}' -H "X-Auth-Token: $OS_AUTH_TOKEN" -H "X-User-Id: $USER_ID" | grep session_id | jq -r '.session_id'`
```
After the maintenance workflow is in 'MAINTENANCE_DONE', you should first press
"ctrl + c" in the VNFM window (Term3), so it removes its constraints from Fenix
and exits. Then you can remove the finished session from Fenix
```sh
curl -g -i -X DELETE http://$HOST:$PORT/v1/maintenance/$MSESSION -H "Accept: application/json" -H "Content-Type: application/json" -H "X-Auth-Token: $OS_AUTH_TOKEN" -H "X-User-Id: $USER_ID"
```
If the maintenance ran to the end with 'MAINTENANCE_DONE', you are ready to run
it again if you wish. With 'MAINTENANCE_FAILED' or in case of exceptions, you
should recover the system before trying to test again. This is covered in Term3
below.
#### Term3: VNFM (fenix/tools/vnfm.py)
Go to the Fenix Kubernetes tools directory for testing
```sh
cd /opt/stack/fenix/fenix/tools
```
Create demo namespace (we use demo namespace and demo user and project in
Keystone)
```sh
kubectl create namespace demo
```
Start the VNF (when applied in this order, we make sure demo-ha gets nodes for anti-affinity):
```sh
kubectl apply -f demo-ha.yaml --namespace=demo;sleep 1;kubectl apply -f demo-nonha.yaml --namespace=demo
```
Note that you should modify the above yaml files so that "cpu:" has the value
'(workernode.status.capacity["cpu"] - 2) / 2'. The default expects 32 CPUs, so
the value is "15" in both yaml files. Replicas can be changed in
demo-nonha.yaml: minimum 2 (with the minimum of 3 worker nodes) to maximum
'(amount_of_worker_nodes-1)*2'. A greater amount means more scaling is needed
and a longer maintenance window, as fewer parallel actions are possible. The
constraints in vnfm.py can also be changed for different behavior.
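As a rough helper (not part of this change; the master-label check is an assumption about your cluster), the suggested "cpu:" value and the replica bounds can be derived like this:
```python
from kubernetes import client, config

# Suggest the "cpu:" request for demo-ha.yaml / demo-nonha.yaml and the
# replica bounds for demo-nonha.yaml, following the sizing rule above.
config.load_kube_config()
nodes = client.CoreV1Api().list_node().items
workers = [n for n in nodes
           if "node-role.kubernetes.io/master" not in (n.metadata.labels or {})]
for node in workers:
    cpus = int(node.status.capacity["cpu"])
    print("%s: cpu request per POD = %d" % (node.metadata.name, (cpus - 2) // 2))
print("demo-nonha replicas: min 2, max %d" % ((len(workers) - 1) * 2))
```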
You can delete the pods like this
```sh
kubectl delete replicaset.apps demo-ha demo-nonha --namespace=demo
```
Start Kubernetes VNFM that we need for testing
```sh
python vnfm.py
```
Now you can start a maintenance session in Term2. When the workflow has failed
or completed, you first kill vnfm.py with "ctrl+c" and then delete the
maintenance session in Term2.
If the workflow failed, something might need to be fixed manually. Here you
uncordon your 3 worker nodes, if the maintenance workflow did not run to the end.
```sh
kubectl uncordon worker-node3 worker-node2 worker-node1
```
You can check that your pods match the amount of replicas mentioned in
demo-nonha.yaml and demo-ha.yaml:
```sh
kubectl get pods --all-namespaces --output=wide
```
If they do not match, delete and create them again as the easiest solution
```sh
kubectl delete replicaset.apps demo-ha demo-nonha --namespace=demo;sleep 15;kubectl apply -f demo-ha.yaml --namespace=demo;sleep 1;kubectl apply -f demo-nonha.yaml --namespace=demo
```

+0 -0 fenix/tools/__init__.py


+54 -0 fenix/tools/demo-ha.yaml

@ -0,0 +1,54 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
name: demo-ha
labels:
app: demo-ha
spec:
replicas: 2
selector:
matchLabels:
app: demo-ha
template:
metadata:
labels:
app: demo-ha
active: None
spec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- demo-ha
topologyKey: "kubernetes.io/hostname"
spec:
containers:
- name: nginx
image: nginx
resources:
requests:
cpu: "15"
ports:
- containerPort: 80
volumeMounts:
- name: workdir
mountPath: /usr/share/nginx/html
initContainers:
- name: install
image: busybox
command:
- wget
- "-O"
- "/work-dir/index.html"
- http://kubernetes.io
volumeMounts:
- name: workdir
mountPath: "/work-dir"
dnsPolicy: Default
volumes:
- name: workdir
emptyDir: {}

+42 -0 fenix/tools/demo-nonha.yaml

@ -0,0 +1,42 @@
apiVersion: apps/v1
kind: ReplicaSet
metadata:
name: demo-nonha
labels:
app: demo-nonha
spec:
replicas: 3
selector:
matchLabels:
app: demo-nonha
template:
metadata:
labels:
app: demo-nonha
spec:
containers:
- name: nginx
image: nginx
resources:
requests:
cpu: "15"
ports:
- containerPort: 80
volumeMounts:
- name: workdir
mountPath: /usr/share/nginx/html
initContainers:
- name: install
image: busybox
command:
- wget
- "-O"
- "/work-dir/index.html"
- http://kubernetes.io
volumeMounts:
- name: workdir
mountPath: "/work-dir"
dnsPolicy: Default
volumes:
- name: workdir
emptyDir: {}

+545 -0 fenix/tools/vnfm.py

@ -0,0 +1,545 @@
##############################################################################
# Copyright (c) 2020 Nokia Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import aodhclient.client as aodhclient
import datetime
from flask import Flask
from flask import request
import json
from keystoneclient import client as ks_client
from kubernetes import client
from kubernetes import config
import logging as lging
from oslo_config import cfg
from oslo_log import log as logging
import requests
import sys
from threading import Thread
import time
import yaml
try:
import fenix.utils.identity_auth as identity_auth
except ValueError:
sys.path.append('../utils')
import identity_auth
LOG = logging.getLogger(__name__)
streamlog = lging.StreamHandler(sys.stdout)
LOG.logger.addHandler(streamlog)
LOG.logger.setLevel(logging.INFO)
opts = [
cfg.StrOpt('ip',
default='127.0.0.1',
help='the ip of VNFM',
required=True),
cfg.IntOpt('port',
default='12348',
help='the port of VNFM',
required=True),
]
CONF = cfg.CONF
CONF.register_opts(opts)
CONF.register_opts(identity_auth.os_opts, group='service_user')
class VNFM(object):
def __init__(self, conf, log):
self.conf = conf
self.log = log
self.app = None
def start(self):
LOG.info('VNFM start......')
self.app = VNFManager(self.conf, self.log)
self.app.start()
def stop(self):
LOG.info('VNFM stop......')
if not self.app:
return
self.app.headers['X-Auth-Token'] = self.app.session.get_token()
self.app.delete_constraints()
headers = {
'Content-Type': 'application/json',
'Accept': 'application/json',
}
url = 'http://%s:%d/shutdown'\
% (self.conf.ip,
self.conf.port)
requests.post(url, data='', headers=headers)
class VNFManager(Thread):
def __init__(self, conf, log):
Thread.__init__(self)
self.conf = conf
self.log = log
self.port = self.conf.port
self.instance_ids = None
# VNFM is started with OS_* exported as admin user
# We need that to query Fenix endpoint url
# Still we work with our tenant/project/vnf as demo
self.project = "demo"
LOG.info('VNFM project: %s' % self.project)
self.auth = identity_auth.get_identity_auth(conf, project=self.project)
self.session = identity_auth.get_session(auth=self.auth)
self.ks = ks_client.Client(version='v3', session=self.session)
self.aodh = aodhclient.Client(2, self.session)
# Subscribe to maintenance event alarm from Fenix via AODH
self.create_alarm()
config.load_kube_config()
self.kaapi = client.AppsV1Api()
self.kapi = client.CoreV1Api()
self.headers = {
'Content-Type': 'application/json',
'Accept': 'application/json'}
self.headers['X-Auth-Token'] = self.session.get_token()
self.orig_number_of_instances = self.number_of_instances()
# List of instances
self.ha_instances = []
self.nonha_instances = []
# Different instance_id specific constraints {instance_id: {},...}
self.instance_constraints = None
# Update existing instances to instance lists
self.update_instances()
# How many instances needs to exists (with current VNF load)
# max_impacted_members need to be updated accordingly
# if number of instances is scaled. example for demo-ha:
# max_impacted_members = len(self.ha_instances) - ha_group_limit
self.ha_group_limit = 2
self.nonha_group_limit = 2
# Different instance groups constraints dict
self.ha_group = None
self.nonha_group = None
# VNF project_id (VNF ID)
self.project_id = None
# HA instance_id that is active has active label
self.active_instance_id = self.active_instance_id()
services = self.ks.services.list()
for service in services:
if service.type == 'maintenance':
LOG.info('maintenance service: %s:%s type %s'
% (service.name, service.id, service.type))
maint_id = service.id
self.maint_endpoint = [ep.url for ep in self.ks.endpoints.list()
if ep.service_id == maint_id and
ep.interface == 'public'][0]
LOG.info('maintenance endpoint: %s' % self.maint_endpoint)
self.update_constraints_lock = False
self.update_constraints()
# Instances waiting action to be done
self.pending_actions = {}
def create_alarm(self):
alarms = {alarm['name']: alarm for alarm in self.aodh.alarm.list()}
alarm_name = "%s_MAINTENANCE_ALARM" % self.project
if alarm_name in alarms:
return
alarm_request = dict(
name=alarm_name,
description=alarm_name,
enabled=True,
alarm_actions=[u'http://%s:%d/maintenance'
% (self.conf.ip,
self.conf.port)],
repeat_actions=True,
severity=u'moderate',
type=u'event',
event_rule=dict(event_type=u'maintenance.scheduled'))
self.aodh.alarm.create(alarm_request)
def delete_remote_instance_constraints(self, instance_id):
url = "%s/instance/%s" % (self.maint_endpoint, instance_id)
LOG.info('DELETE: %s' % url)
ret = requests.delete(url, data=None, headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
if ret.status_code == 404:
LOG.info('Already deleted: %s' % instance_id)
else:
raise Exception(ret.text)
def update_remote_instance_constraints(self, instance):
url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"])
LOG.info('PUT: %s' % url)
ret = requests.put(url, data=json.dumps(instance),
headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
raise Exception(ret.text)
def delete_remote_group_constraints(self, instance_group):
url = "%s/instance_group/%s" % (self.maint_endpoint,
instance_group["group_id"])
LOG.info('DELETE: %s' % url)
ret = requests.delete(url, data=None, headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
raise Exception(ret.text)
def update_remote_group_constraints(self, instance_group):
url = "%s/instance_group/%s" % (self.maint_endpoint,
instance_group["group_id"])
LOG.info('PUT: %s' % url)
ret = requests.put(url, data=json.dumps(instance_group),
headers=self.headers)
if ret.status_code != 200 and ret.status_code != 204:
raise Exception(ret.text)
def delete_constraints(self):
for instance_id in self.instance_constraints:
self.delete_remote_instance_constraints(instance_id)
self.delete_remote_group_constraints(self.nonha_group)
self.delete_remote_group_constraints(self.ha_group)
def update_constraints(self):
while self.update_constraints_lock:
LOG.info('Waiting update_constraints_lock...')
time.sleep(1)
self.update_constraints_lock = True
LOG.info('Update constraints')
if self.project_id is None:
self.project_id = self.ks.projects.list(name=self.project)[0].id
# Pods grouped by ReplicaSet, so we use that id
rs = {r.metadata.name: r.metadata.uid for r in
self.kaapi.list_namespaced_replica_set('demo').items}
max_impacted_members = len(self.nonha_instances) - 1
nonha_group = {
"group_id": rs['demo-nonha'],
"project_id": self.project_id,
"group_name": "demo-nonha",
"anti_affinity_group": False,
"max_instances_per_host": 0,
"max_impacted_members": max_impacted_members,
"recovery_time": 10,
"resource_mitigation": True}
LOG.info('create demo-nonha constraints: %s'
% nonha_group)
ha_group = {
"group_id": rs['demo-ha'],
"project_id": self.project_id,
"group_name": "demo-ha",
"anti_affinity_group": True,
"max_instances_per_host": 1,
"max_impacted_members": 1,
"recovery_time": 10,
"resource_mitigation": True}
LOG.info('create demo-ha constraints: %s'
% ha_group)
instance_constraints = {}
for ha_instance in self.ha_instances:
instance = {
"instance_id": ha_instance.metadata.uid,
"project_id": self.project_id,
"group_id": ha_group["group_id"],
"instance_name": ha_instance.metadata.name,
"max_interruption_time": 120,
"migration_type": "EVICTION",
"resource_mitigation": True,
"lead_time": 40}
LOG.info('create ha instance constraints: %s' % instance)
instance_constraints[ha_instance.metadata.uid] = instance
for nonha_instance in self.nonha_instances:
instance = {
"instance_id": nonha_instance.metadata.uid,
"project_id": self.project_id,
"group_id": nonha_group["group_id"],
"instance_name": nonha_instance.metadata.name,
"max_interruption_time": 120,
"migration_type": "EVICTION",
"resource_mitigation": True,
"lead_time": 40}
LOG.info('create nonha instance constraints: %s' % instance)
instance_constraints[nonha_instance.metadata.uid] = instance
if not self.instance_constraints:
# Initial instance constraints
LOG.info('create initial instances constraints...')
for instance in [instance_constraints[i] for i
in instance_constraints]:
self.update_remote_instance_constraints(instance)
self.instance_constraints = instance_constraints.copy()
else:
LOG.info('check instances constraints changes...')
added = [i for i in instance_constraints.keys()
if i not in self.instance_constraints]
deleted = [i for i in self.instance_constraints.keys()
if i not in instance_constraints]
modified = [i for i in instance_constraints.keys()
if (i not in added and i not in deleted and
instance_constraints[i] !=
self.instance_constraints[i])]
for instance_id in deleted:
self.delete_remote_instance_constraints(instance_id)
updated = added + modified
for instance in [instance_constraints[i] for i in updated]:
self.update_remote_instance_constraints(instance)
if updated or deleted:
# Some instance constraints have changed
self.instance_constraints = instance_constraints.copy()
if not self.ha_group or self.ha_group != ha_group:
LOG.info('ha instance group need update')
self.update_remote_group_constraints(ha_group)
self.ha_group = ha_group.copy()
if not self.nonha_group or self.nonha_group != nonha_group:
LOG.info('nonha instance group need update')
self.update_remote_group_constraints(nonha_group)
self.nonha_group = nonha_group.copy()
self.update_constraints_lock = False
def active_instance_id(self):
# We dictate the active one in the beginning
instance = self.ha_instances[0]
LOG.info('Initially Active instance: %s %s' %
(instance.metadata.name, instance.metadata.uid))
name = instance.metadata.name
namespace = instance.metadata.namespace
body = {"metadata": {"labels": {"active": "True"}}}
self.kapi.patch_namespaced_pod(name, namespace, body)
self.active_instance_id = instance.metadata.uid
def switch_over_ha_instance(self, instance_id):
if instance_id == self.active_instance_id:
# Need to switchover as instance_id will be affected and is active
for instance in self.ha_instances:
if instance_id == instance.metadata.uid:
LOG.info('Active to Standby: %s %s' %
(instance.metadata.name, instance.metadata.uid))
name = instance.metadata.name
namespace = instance.metadata.namespace
body = {"metadata": {"labels": {"active": None}}}
self.kapi.patch_namespaced_pod(name, namespace, body)
else:
LOG.info('Standby to Active: %s %s' %
(instance.metadata.name, instance.metadata.uid))
name = instance.metadata.name
namespace = instance.metadata.namespace
body = {"metadata": {"labels": {"active": "True"}}}
self.kapi.patch_namespaced_pod(name, namespace, body)
self.active_instance_id = instance.metadata.uid
self.update_instances()
def get_instance_ids(self):
instances = self.kapi.list_pod_for_all_namespaces().items
return [i.metadata.uid for i in instances
if i.metadata.name.startswith("demo-")
and i.metadata.namespace == "demo"]
def update_instances(self):
instances = self.kapi.list_pod_for_all_namespaces().items
self.ha_instances = [i for i in instances
if i.metadata.name.startswith("demo-ha")
and i.metadata.namespace == "demo"]
self.nonha_instances = [i for i in instances
if i.metadata.name.startswith("demo-nonha")
and i.metadata.namespace == "demo"]
def _alarm_data_decoder(self, data):
if "[" in data or "{" in data:
# string to list or dict removing unicode
data = yaml.load(data.replace("u'", "'"))
return data
def _alarm_traits_decoder(self, data):
return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
for t in data['reason_data']['event']['traits']})
def get_session_instance_ids(self, url, session_id):
ret = requests.get(url, data=None, headers=self.headers)
if ret.status_code != 200:
raise Exception(ret.text)
LOG.info('get_instance_ids %s' % ret.json())
return ret.json()['instance_ids']
def scale_instances(self, scale_instances):
number_of_instances_before = len(self.nonha_instances)
replicas = number_of_instances_before + scale_instances
# We only scale nonha apps
namespace = "demo"
name = "demo-nonha"
body = {'spec': {"replicas": replicas}}
self.kaapi.patch_namespaced_replica_set_scale(name, namespace, body)
time.sleep(3)
# Let's check if scale has taken effect
self.update_instances()
number_of_instances_after = len(self.nonha_instances)
check = 20
while number_of_instances_after == number_of_instances_before:
if check == 0:
LOG.error('scale_instances with: %d failed, still %d instances'
% (scale_instances, number_of_instances_after))
raise Exception('scale_instances failed')
check -= 1
time.sleep(1)
self.update_instances()
number_of_instances_after = len(self.nonha_instances)
LOG.info('scaled instances from %d to %d' %
(number_of_instances_before, number_of_instances_after))
def number_of_instances(self):
instances = self.kapi.list_pod_for_all_namespaces().items
return len([i for i in instances
if i.metadata.name.startswith("demo-")])
def instance_action(self, instance_id, allowed_actions):
# We should keep the instance constraints in our internal structure
# and match instance_id specific allowed action. Now we assume EVICTION
if 'EVICTION' not in allowed_actions:
LOG.error('Action for %s not found from %s' %
(instance_id, allowed_actions))
return None
return 'EVICTION'
def instance_action_started(self, instance_id, action):
time_now = datetime.datetime.utcnow()
max_interruption_time = (
self.instance_constraints[instance_id]['max_interruption_time'])
self.pending_actions[instance_id] = {
'started': time_now,
'max_interruption_time': max_interruption_time,
'action': action}
def was_instance_action_in_time(self, instance_id):
time_now = datetime.datetime.utcnow()
started = self.pending_actions[instance_id]['started']
limit = self.pending_actions[instance_id]['max_interruption_time']
action = self.pending_actions[instance_id]['action']
td = time_now - started
if td.total_seconds() > limit:
LOG.error('%s %s took too long: %ds' %
(instance_id, action, td.total_seconds()))
LOG.error('%s max_interruption_time %ds might be too short' %
(instance_id, limit))
raise Exception('%s %s took too long: %ds' %
(instance_id, action, td.total_seconds()))
else:
LOG.info('%s %s with recovery time took %ds' %
(instance_id, action, td.total_seconds()))
del self.pending_actions[instance_id]
def run(self):
app = Flask('VNFM')
@app.route('/maintenance', methods=['POST'])
def maintenance_alarm():
data = json.loads(request.data.decode('utf8'))
try:
payload = self._alarm_traits_decoder(data)
except Exception:
payload = ({t[0]: t[2] for t in
data['reason_data']['event']['traits']})
LOG.error('cannot parse alarm data: %s' % payload)
raise Exception('VNFM cannot parse alarm.'
'Possibly trait data over 256 char')
LOG.info('VNFM received data = %s' % payload)
state = payload['state']
reply_state = None
reply = dict()
LOG.info('VNFM state: %s' % state)
if state == 'MAINTENANCE':
self.headers['X-Auth-Token'] = self.session.get_token()
instance_ids = (self.get_session_instance_ids(
payload['instance_ids'],
payload['session_id']))
reply['instance_ids'] = instance_ids
reply_state = 'ACK_MAINTENANCE'
elif state == 'SCALE_IN':
# scale down only nonha instances
nonha_instances = len(self.nonha_instances)
scale_in = nonha_instances // 2
self.scale_instances(-scale_in)
self.update_constraints()
reply['instance_ids'] = self.get_instance_ids()
reply_state = 'ACK_SCALE_IN'
elif state == 'MAINTENANCE_COMPLETE':
# possibly need to upscale
number_of_instances = self.number_of_instances()
if self.orig_number_of_instances > number_of_instances:
scale_instances = (self.orig_number_of_instances -
number_of_instances)
self.scale_instances(scale_instances)
self.update_constraints()
reply_state = 'ACK_MAINTENANCE_COMPLETE'
elif (state == 'PREPARE_MAINTENANCE'
or state == 'PLANNED_MAINTENANCE'):
instance_id = payload['instance_ids'][0]
instance_action = (self.instance_action(instance_id,
payload['allowed_actions']))
if not instance_action:
raise Exception('Allowed_actions not supported for %s' %
instance_id)
LOG.info('VNFM got instance: %s' % instance_id)
self.switch_over_ha_instance(instance_id)
reply['instance_action'] = instance_action
reply_state = 'ACK_%s' % state
self.instance_action_started(instance_id, instance_action)
elif state == 'INSTANCE_ACTION_DONE':
# TBD was action done in max_interruption_time (live migration)
# NOTE, in EVICTION the reported instance_id is the one that was on
# the evicted node. The new instance_id might be different
LOG.info('%s' % payload['instance_ids'])
self.was_instance_action_in_time(payload['instance_ids'][0])
self.update_instances()
self.update_constraints()
else:
raise Exception('VNFM received event with'
' unknown state %s' % state)
if reply_state:
reply['session_id'] = payload['session_id']
reply['state'] = reply_state
url = payload['reply_url']
LOG.info('VNFM reply: %s' % reply)
requests.put(url, data=json.dumps(reply), headers=self.headers)
return 'OK'
@app.route('/shutdown', methods=['POST'])
def shutdown():
LOG.info('shutdown VNFM server at %s' % time.time())
func = request.environ.get('werkzeug.server.shutdown')
if func is None:
raise RuntimeError('Not running with the Werkzeug Server')
func()
return 'VNFM shutting down...'
app.run(host="0.0.0.0", port=self.port)
if __name__ == '__main__':
app_manager = VNFM(CONF, LOG)
app_manager.start()
try:
LOG.info('Press CTRL + C to quit')
while True:
time.sleep(2)
except KeyboardInterrupt:
app_manager.stop()

+2 -2 fenix/utils/identity_auth.py

@ -42,14 +42,14 @@ os_opts = [
]
def get_identity_auth(conf):
def get_identity_auth(conf, project=None):
loader = loading.get_plugin_loader('password')
return loader.load_from_options(
auth_url=conf.service_user.os_auth_url,
username=conf.service_user.os_username,
password=conf.service_user.os_password,
user_domain_name=conf.service_user.os_user_domain_name,
project_name=conf.service_user.os_project_name,
project_name=(project or conf.service_user.os_project_name),
tenant_name=conf.service_user.os_project_name,
project_domain_name=conf.service_user.os_project_domain_name)


+3 -0 fenix/utils/service.py

@ -37,6 +37,7 @@ from uuid import uuid1 as generate_uuid
from fenix import context
from fenix.db import api as db_api
from fenix import exceptions
from fenix.utils.download import download_url
import fenix.utils.identity_auth
@ -159,6 +160,8 @@ class EngineEndpoint(object):
def admin_delete_session(self, ctx, session_id):
"""Delete maintenance workflow session thread"""
LOG.info("EngineEndpoint: admin_delete_session")
if session_id not in self.workflow_sessions:
raise exceptions.NotFound("session_id not found")
self.workflow_sessions[session_id].cleanup()
self.workflow_sessions[session_id].stop()
self.workflow_sessions.pop(session_id)


+21 -0 fenix/workflow/workflow.py

@ -398,13 +398,27 @@ class BaseWorkflow(Thread):
def maintenance(self):
LOG.error("%s: maintenance method not implemented!" % self.session_id)
def maintenance_done(self):
LOG.error("%s: maintenance_done method not implemented!" %
self.session_id)
def maintenance_failed(self):
LOG.error("%s: maintenance_failed method not implemented!" %
self.session_id)
def state(self, state):
# TBD we could notify admin for workflow state change
self.session.prev_state = self.session.state
self.session.state = state
if state in ["MAINTENANCE_DONE", "MAINTENANCE_FAILED"]:
try:
statefunc = (getattr(self,
self.states_methods[self.session.state]))
statefunc()
except Exception as e:
LOG.error("%s: %s Raised exception: %s" % (self.session_id,
statefunc, e), exc_info=True)
self.state("MAINTENANCE_FAILED")
def run(self):
LOG.info("%s: started" % self.session_id)
@ -555,3 +569,10 @@ class BaseWorkflow(Thread):
LOG.error('%s: timer %s expired' %
(self.session_id, timer_name))
return False
def project_ids_with_instance_group(self):
igs = db_api.instance_groups_get()
project_ids = []
for ig in igs:
    if ig.project_id not in project_ids:
        project_ids.append(ig.project_id)
return project_ids

+984 -0 fenix/workflow/workflows/k8s.py

@ -0,0 +1,984 @@
# Copyright (c) 2020 OpenStack Foundation.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
from importlib import import_module
try:
from importlib.machinery import SourceFileLoader
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = SourceFileLoader(mname, mpath).load_module()
return mi.ActionPlugin(session_instance, ap_db_instance)
except ImportError:
from imp import load_source
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = load_source(mname, mpath)
return mi.ActionPlugin(session_instance, ap_db_instance)
from keystoneclient import client as ks_client
from kubernetes import client
from kubernetes import config
from kubernetes.client.rest import ApiException
import os
from oslo_log import log as logging
import time
from fenix.db import api as db_api
from fenix.db import exceptions as db_exc
from fenix.utils.thread import run_async
from fenix.utils.time import datetime_to_str
from fenix.utils.time import is_time_after_time
from fenix.utils.time import reply_time_str
from fenix.utils.time import time_now_str
from fenix.workflow.workflow import BaseWorkflow
LOG = logging.getLogger(__name__)
class Workflow(BaseWorkflow):
def __init__(self, conf, session_id, data):
super(Workflow, self).__init__(conf, session_id, data)
config.load_kube_config()
v_api = client.VersionApi()
self.kapi = client.CoreV1Api()
self.ks = ks_client.Client(version='v3', session=self.auth_session)
LOG.info("%s: initialized with Kubernetes: %s" %
(self.session_id,
v_api.get_code_with_http_info()[0].git_version))
self.hosts = self._init_hosts_by_services()
LOG.info('%s: Execute pre action plugins' % (self.session_id))
self.maintenance_by_plugin_type("localhost", "pre")
self.group_impacted_members = {}
def _init_hosts_by_services(self):
LOG.info("%s: Dicovering hosts by services" % self.session_id)
nodes = self.kapi.list_node().items