From 12f988aecec82f1e67b8a68b7b8bebd3dd4f1e9b Mon Sep 17 00:00:00 2001 From: Tomi Juvonen Date: Wed, 12 Feb 2020 12:09:00 +0200 Subject: [PATCH] Ability to continue failed session -API to update session state after session failed -Thread exception passed to parent -Session DB knows previous state -Changing state will save previous state to help continue session after a failure -API error codes documented better Story: 2005583 Task: #30772 Change-Id: Ifd5eb29a1d3d969b2d9b9648d823e80b435f7cb3 Signed-off-by: Tomi Juvonen PI schema validation --- doc/source/api-ref/v1/maintenance.inc | 34 +++++++++-- doc/source/api-ref/v1/parameters.yaml | 8 +++ doc/source/api-ref/v1/project.inc | 53 +++++++++++++++- .../samples/maintenance-session-put-200.json | 3 + doc/source/api-ref/v1/status.yaml | 16 ++++- fenix/api/v1/controllers/maintenance.py | 23 ++++--- fenix/api/v1/schema.py | 14 +++-- .../versions/001_initial.py | 1 + fenix/db/sqlalchemy/models.py | 1 + fenix/utils/service.py | 12 +++- fenix/utils/thread.py | 22 ++++++- fenix/workflow/workflow.py | 6 +- fenix/workflow/workflows/vnf.py | 61 +++++++++++++------ 13 files changed, 208 insertions(+), 46 deletions(-) create mode 100644 doc/source/api-ref/v1/samples/maintenance-session-put-200.json diff --git a/doc/source/api-ref/v1/maintenance.inc b/doc/source/api-ref/v1/maintenance.inc index fecabe4..bd77612 100644 --- a/doc/source/api-ref/v1/maintenance.inc +++ b/doc/source/api-ref/v1/maintenance.inc @@ -42,7 +42,9 @@ Response codes .. rest_status_code:: error status.yaml - - 404 + - 400 + - 500 + - 509 Update maintenance session (planned future functionality) ========================================================= @@ -50,7 +52,8 @@ Update maintenance session (planned future functionality) .. rest_method:: PUT /v1/maintenance/{session_id}/ Update existing maintenance session. This can be used to continue a failed -session. +session after manually fixing what failed. Workflow should then run +successfully to the end. 
Request ------- @@ -58,6 +61,20 @@ Request .. rest_parameters:: parameters.yaml - session_id: session_id + - state: workflow-state-optional + +Response codes +-------------- + +.. rest_status_code:: success status.yaml + + - 200: maintenance-session-put + +.. rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 Get maintenance sessions ======================== @@ -75,7 +92,8 @@ Response codes .. rest_status_code:: error status.yaml - - 404 + - 400 + - 500 Get maintenance session ======================= @@ -100,7 +118,10 @@ Response codes .. rest_status_code:: error status.yaml + - 400 - 404 + - 422 + - 500 Delete maintenance session ========================== @@ -117,7 +138,9 @@ finished. .. rest_status_code:: error status.yaml - - 404 + - 400 + - 422 + - 500 Future ====== @@ -125,4 +148,5 @@ Future On top of some expected changes mentioned above, it will also be handy to get detailed information about the steps run already in the maintenance session. This will be helpful when need to figure out any correcting actions to -successfully finish a failed session. +successfully finish a failed session. For now an admin can update the failed session +state to the previous or any other desired state to try to continue a failed session. diff --git a/doc/source/api-ref/v1/parameters.yaml b/doc/source/api-ref/v1/parameters.yaml index 208cef9..a412cfa 100644 --- a/doc/source/api-ref/v1/parameters.yaml +++ b/doc/source/api-ref/v1/parameters.yaml @@ -258,6 +258,14 @@ workflow-state: required: true type: string +workflow-state-optional: + description: | + Maintenance workflow state or previous state if not given. + The workflow will continue from this state. 
+ in: body + required: false + type: string + workflow-state-reply: description: | There can have different values depending on what is the maintenance diff --git a/doc/source/api-ref/v1/project.inc b/doc/source/api-ref/v1/project.inc index ca14d95..d6441fa 100644 --- a/doc/source/api-ref/v1/project.inc +++ b/doc/source/api-ref/v1/project.inc @@ -35,7 +35,9 @@ Response codes .. rest_status_code:: error status.yaml - - 404 + - 400 + - 422 + - 500 Input from project to maintenance session ========================================= @@ -70,6 +72,12 @@ Response codes - 200 +.. rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 + ============================ Project with NFV constraints ============================ @@ -110,6 +118,12 @@ Response codes - 200 +.. rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 + Get instance constraints saved in Fenix DB ========================================== @@ -143,6 +157,13 @@ Response codes - 200 +.. rest_status_code:: error status.yaml + + - 400 + - 404 + - 422 + - 500 + Update instance constraints saved to Fenix DB ============================================= @@ -176,6 +197,12 @@ Response codes - 200 +.. rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 + Delete instance constraints from Fenix DB ========================================= @@ -200,6 +227,12 @@ Response codes - 200 +.. rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 + Get instance group constraints saved in Fenix DB ================================================ @@ -234,6 +267,12 @@ Response codes - 200 +.. rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 + Update instance group constraints saved to Fenix DB =================================================== @@ -268,6 +307,12 @@ Response codes - 200 +.. 
rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 + Delete instance group constraints from Fenix DB =============================================== @@ -292,5 +337,11 @@ Response codes - 200 +.. rest_status_code:: error status.yaml + + - 400 + - 422 + - 500 + diff --git a/doc/source/api-ref/v1/samples/maintenance-session-put-200.json b/doc/source/api-ref/v1/samples/maintenance-session-put-200.json new file mode 100644 index 0000000..5f5e483 --- /dev/null +++ b/doc/source/api-ref/v1/samples/maintenance-session-put-200.json @@ -0,0 +1,3 @@ +{ + "state": "PLANNED_MAINTENANCE" +} diff --git a/doc/source/api-ref/v1/status.yaml b/doc/source/api-ref/v1/status.yaml index 2149aee..7f4d908 100644 --- a/doc/source/api-ref/v1/status.yaml +++ b/doc/source/api-ref/v1/status.yaml @@ -11,6 +11,14 @@ .. literalinclude:: samples/create-maintenance-session-post-200.json :language: javascript + maintenance-session-put: | + .. rest_parameters:: parameters.yaml + + - session_id: uuid + + .. literalinclude:: samples/maintenance-session-put-200.json + :language: javascript + get-maintenance-sessions-get: | .. rest_parameters:: parameters.yaml @@ -82,6 +90,9 @@ default: | The entity of the request is in a format not supported by the requested resource for the method. +422: + default: | + The entity of the request is not in line with the resource schema. 500: default: | Something went wrong with the service which prevents it from fulfilling @@ -92,4 +103,7 @@ request. 503: default: | - The service cannot handle the request right now. \ No newline at end of file + The service cannot handle the request right now. +509: + default: | + There are too many parallel sessions. 
diff --git a/fenix/api/v1/controllers/maintenance.py b/fenix/api/v1/controllers/maintenance.py index 847bbeb..19e244b 100644 --- a/fenix/api/v1/controllers/maintenance.py +++ b/fenix/api/v1/controllers/maintenance.py @@ -147,8 +147,7 @@ class SessionController(rest.RestController): data = json.loads(request.body.decode('utf8')) try: jsonschema.validate(session_id, schema.uid) - # TBD implement this API - # jsonschema.validate(data, schema.maintenance_session_put) + jsonschema.validate(data, schema.maintenance_session_put) except jsonschema.exceptions.ValidationError as e: LOG.error(str(e.message)) abort(422) @@ -236,14 +235,14 @@ class InstanceController(rest.RestController): if request.body: LOG.error("Unexpected data") abort(400) - session = self.engine_rpcapi.get_instance(instance_id) - if session is None: - LOG.error("Invalid session") + instance = self.engine_rpcapi.get_instance(instance_id) + if instance is None: + LOG.error("Invalid instance: %s" % instance_id) abort(404) try: - response.text = jsonutils.dumps(session) + response.text = jsonutils.dumps(instance) except TypeError: - response.body = jsonutils.dumps(session) + response.body = jsonutils.dumps(instance) # PUT /v1/instance/ @policy.authorize('instance', 'put') @@ -301,14 +300,14 @@ class InstanceGroupController(rest.RestController): if request.body: LOG.error("Unexpected data") abort(400) - session = self.engine_rpcapi.get_instance_group(group_id) - if session is None: - LOG.error("Invalid session") + group = self.engine_rpcapi.get_instance_group(group_id) + if group is None: + LOG.error("Invalid instance_group: %s" % group_id) abort(404) try: - response.text = jsonutils.dumps(session) + response.text = jsonutils.dumps(group) except TypeError: - response.body = jsonutils.dumps(session) + response.body = jsonutils.dumps(group) # PUT /v1/instance_group/ @policy.authorize('instance_group', 'put') diff --git a/fenix/api/v1/schema.py b/fenix/api/v1/schema.py index d041ac8..2e46fb8 100644 --- 
a/fenix/api/v1/schema.py +++ b/fenix/api/v1/schema.py @@ -71,10 +71,16 @@ maintenance_session_project_instance_put = { 'required': ['instance_action', 'state'] } -# TBD -# maintenance_session_put = { -# -# } + +maintenance_session_put = { + 'type': 'object', + 'properties': { + 'state': { + 'type': 'string', + 'enum': states, + } + } +} maintenance_post = { 'type': 'object', diff --git a/fenix/db/migration/alembic_migrations/versions/001_initial.py b/fenix/db/migration/alembic_migrations/versions/001_initial.py index b15bd3f..abd87d2 100644 --- a/fenix/db/migration/alembic_migrations/versions/001_initial.py +++ b/fenix/db/migration/alembic_migrations/versions/001_initial.py @@ -38,6 +38,7 @@ def upgrade(): sa.Column('created_at', sa.DateTime(), nullable=False), sa.Column('updated_at', sa.DateTime(), nullable=True), sa.Column('session_id', sa.String(36), primary_key=True), + sa.Column('prev_state', sa.String(length=32), nullable=True), sa.Column('state', sa.String(length=32), nullable=True), sa.Column('maintenance_at', sa.DateTime(), nullable=True), sa.Column('meta', MediumText(), nullable=True), diff --git a/fenix/db/sqlalchemy/models.py b/fenix/db/sqlalchemy/models.py index 837746e..87b17ef 100644 --- a/fenix/db/sqlalchemy/models.py +++ b/fenix/db/sqlalchemy/models.py @@ -44,6 +44,7 @@ class MaintenanceSession(mb.FenixBase): __tablename__ = 'sessions' session_id = sa.Column(sa.String(36), primary_key=True) + prev_state = sa.Column(sa.String(length=32), nullable=True) state = sa.Column(sa.String(length=32), nullable=True) maintenance_at = sa.Column(sa.DateTime(), nullable=True) meta = sa.Column(MediumText(), nullable=False) diff --git a/fenix/utils/service.py b/fenix/utils/service.py index d761b09..c431cbd 100644 --- a/fenix/utils/service.py +++ b/fenix/utils/service.py @@ -169,8 +169,16 @@ class EngineEndpoint(object): def admin_update_session(self, ctx, session_id, data): """Update maintenance workflow session""" LOG.info("EngineEndpoint: admin_update_session") - 
# TBD Update data to workflow and return updated data - return data + # We assume we can now continue the previous state + ses = self.workflow_sessions[session_id].session + ses.stopped = False + if "state" in data.keys() and len(data["state"]): + ses.prev_state = ses.state + ses.state = data["state"] + else: + ses.state, ses.prev_state = ses.prev_state, ses.state + LOG.info("admin_update_session %s state %s" % (session_id, ses.state)) + return ({"state": ses.state}) def project_get_session(self, ctx, session_id, project_id): """Get maintenance workflow session project specific details""" diff --git a/fenix/utils/thread.py b/fenix/utils/thread.py index a035b14..53e4da2 100644 --- a/fenix/utils/thread.py +++ b/fenix/utils/thread.py @@ -13,6 +13,26 @@ # License for the specific language governing permissions and limitations # under the License. +from threading import Thread + + +class PropagatingThread(Thread): + def run(self): + self.exc = None + try: + if hasattr(self, '_Thread__target'): + # Thread uses name mangling prior to Python 3. + self.ret = self._Thread__target(*self._Thread__args, **self._Thread__kwargs) + else: + self.ret = self._target(*self._args, **self._kwargs) + except BaseException as e: + self.exc = e + + def join(self): + super(PropagatingThread, self).join() + if self.exc: + raise self.exc + return self.ret def run_async(func): from functools import wraps @@ -20,7 +40,7 @@ def run_async(func): @wraps(func) def async_func(*args, **kwargs): - thread = Thread(target=func, args=args, kwargs=kwargs) + thread = PropagatingThread(target=func, args=args, kwargs=kwargs) thread.start() return thread diff --git a/fenix/workflow/workflow.py b/fenix/workflow/workflow.py index 54c0a7e..489844c 100644 --- a/fenix/workflow/workflow.py +++ b/fenix/workflow/workflow.py @@ -402,6 +402,10 @@ class BaseWorkflow(Thread): LOG.error("%s: maintenance_failed method not implemented!" 
% self.session_id) + def state(self, state): + self.session.prev_state = self.session.state + self.session.state = state + def run(self): LOG.info("%s: started" % self.session_id) while not self.stopped: @@ -414,7 +418,7 @@ class BaseWorkflow(Thread): except Exception as e: LOG.error("%s: %s Raised exception: %s" % (self.session_id, statefunc, e), exc_info=True) - self.session.state = "MAINTENANCE_FAILED" + self.state("MAINTENANCE_FAILED") else: time.sleep(1) # IDLE while session removed diff --git a/fenix/workflow/workflows/vnf.py b/fenix/workflow/workflows/vnf.py index 776c0f7..27a2de0 100644 --- a/fenix/workflow/workflows/vnf.py +++ b/fenix/workflow/workflows/vnf.py @@ -342,7 +342,7 @@ class Workflow(BaseWorkflow): if is_time_after_time(reply_at, actions_at): LOG.error('%s: No time for project to answer in state: %s' % (self.session_id, state)) - self.session.state = "MAINTENANCE_FAILED" + self.state("MAINTENANCE_FAILED") return False metadata = self.session.meta self._project_notify(project, instance_ids, allowed_actions, @@ -656,16 +656,39 @@ class Workflow(BaseWorkflow): max_parallel)) if instance.action == 'MIGRATE': if not self.migrate_server(instance, target_host): - return False + self.group_impacted_members[group_id] -= 1 + LOG.debug("%s Reservation freed. remain / " + "max_impacted_members:%s/%s" + % (instance.instance_id, + self.group_impacted_members[group_id], + max_parallel)) + raise Exception('%s: instance %s action ' + '%s failed' % + (self.session_id, instance.instance_id, + instance.action)) self.notify_action_done(instance) elif instance.action == 'OWN_ACTION': pass elif instance.action == 'LIVE_MIGRATE': if not self.live_migrate_server(instance, target_host): - return False + self.group_impacted_members[group_id] -= 1 + LOG.debug("%s Reservation freed. 
remain / " + "max_impacted_members:%s/%s" + % (instance.instance_id, + self.group_impacted_members[group_id], + max_parallel)) + raise Exception('%s: instance %s action ' + '%s failed' % + (self.session_id, instance.instance_id, + instance.action)) self.notify_action_done(instance) else: self.group_impacted_members[group_id] -= 1 + LOG.debug("%s Reservation freed. remain / " + "max_impacted_members:%s/%s" + % (instance.instance_id, + self.group_impacted_members[group_id], + max_parallel)) raise Exception('%s: instance %s action ' '%s not supported' % (self.session_id, instance.instance_id, @@ -931,11 +954,11 @@ class Workflow(BaseWorkflow): self.initialize_server_info() if not self.projects_listen_alarm('maintenance.scheduled'): - self.session.state = 'MAINTENANCE_FAILED' + self.state('MAINTENANCE_FAILED') return if not self.confirm_maintenance(): - self.session.state = 'MAINTENANCE_FAILED' + self.state('MAINTENANCE_FAILED') return maintenance_empty_hosts = self.get_empty_computes() @@ -944,14 +967,14 @@ class Workflow(BaseWorkflow): if self.need_scale_in(): LOG.info('%s: Need to scale in to get capacity for ' 'empty host' % (self.session_id)) - self.session.state = 'SCALE_IN' + self.state('SCALE_IN') else: LOG.info('%s: Free capacity, but need empty host' % (self.session_id)) - self.session.state = 'PREPARE_MAINTENANCE' + self.state('PREPARE_MAINTENANCE') else: LOG.info('Empty host found') - self.session.state = 'START_MAINTENANCE' + self.state('START_MAINTENANCE') if self.session.maintenance_at > datetime.datetime.utcnow(): time_now = time_now_str() @@ -972,7 +995,7 @@ class Workflow(BaseWorkflow): # how many instances can be affected at the same time, we should # calculate and ask scaling of specific instances if not self.confirm_scale_in(): - self.session.state = 'MAINTENANCE_FAILED' + self.state('MAINTENANCE_FAILED') return # TBD it takes time to have proper information updated about free # capacity. 
Should make sure instances removed has also VCPUs removed @@ -983,24 +1006,24 @@ class Workflow(BaseWorkflow): if self.need_scale_in(): LOG.info('%s: Need to scale in more to get capacity for ' 'empty host' % (self.session_id)) - self.session.state = 'SCALE_IN' + self.state('SCALE_IN') else: LOG.info('%s: Free capacity, but need empty host' % (self.session_id)) - self.session.state = 'PREPARE_MAINTENANCE' + self.state('PREPARE_MAINTENANCE') else: LOG.info('Empty host found') for host in maintenance_empty_hosts: self._wait_host_empty(host) - self.session.state = 'START_MAINTENANCE' + self.state('START_MAINTENANCE') def prepare_maintenance(self): LOG.info("%s: prepare_maintenance called" % self.session_id) if not self.make_empty_hosts('PREPARE_MAINTENANCE'): LOG.error('make_empty_hosts failed') - self.session.state = 'MAINTENANCE_FAILED' + self.state('MAINTENANCE_FAILED') else: - self.session.state = 'START_MAINTENANCE' + self.state('START_MAINTENANCE') self.update_server_info() def start_maintenance(self): @@ -1008,7 +1031,7 @@ class Workflow(BaseWorkflow): empty_hosts = self.get_empty_computes() if not empty_hosts: LOG.info("%s: No empty host to be maintained" % self.session_id) - self.session.state = 'MAINTENANCE_FAILED' + self.state('MAINTENANCE_FAILED') return maintained_hosts = self.get_maintained_hosts_by_type('compute') if not maintained_hosts: @@ -1028,7 +1051,7 @@ class Workflow(BaseWorkflow): thrs.append(self.host_maintenance_async(host)) for thr in thrs: thr.join() - self.session.state = 'PLANNED_MAINTENANCE' + self.state('PLANNED_MAINTENANCE') def planned_maintenance(self): LOG.info("%s: planned_maintenance called" % self.session_id) @@ -1065,7 +1088,7 @@ class Workflow(BaseWorkflow): self.update_server_info() LOG.info("%s: planned_maintenance done" % self.session_id) - self.session.state = 'MAINTENANCE_COMPLETE' + self.state('MAINTENANCE_COMPLETE') def maintenance_complete(self): LOG.info("%s: maintenance_complete called" % self.session_id) @@ -1074,10 
+1097,10 @@ class Workflow(BaseWorkflow): LOG.info('Projects may still need to up scale back to full ' 'capcity') if not self.confirm_maintenance_complete(): - self.session.state = 'MAINTENANCE_FAILED' + self.state('MAINTENANCE_FAILED') return self.update_server_info() - self.session.state = 'MAINTENANCE_DONE' + self.state('MAINTENANCE_DONE') def maintenance_done(self): pass