Ability to continue failed session
-API to update session state after session failed -Thread exception passed to parent -Session DB knows previous state -Changing state will save previous state to help continue session after a failure -API error codes documented better Story: 2005583 Task: #30772 Change-Id: Ifd5eb29a1d3d969b2d9b9648d823e80b435f7cb3 Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>PI schema validation
This commit is contained in:
parent
76fdc1aba5
commit
b193ff5a81
|
@ -42,7 +42,9 @@ Response codes
|
|||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 404
|
||||
- 400
|
||||
- 500
|
||||
- 509
|
||||
|
||||
Update maintenance session (planned future functionality)
|
||||
=========================================================
|
||||
|
@ -50,7 +52,8 @@ Update maintenance session (planned future functionality)
|
|||
.. rest_method:: PUT /v1/maintenance/{session_id}/
|
||||
|
||||
Update existing maintenance session. This can be used to continue a failed
|
||||
session.
|
||||
session after manually fixing what failed. Workflow should then run
|
||||
successfully to the end.
|
||||
|
||||
Request
|
||||
-------
|
||||
|
@ -58,6 +61,20 @@ Request
|
|||
.. rest_parameters:: parameters.yaml
|
||||
|
||||
- session_id: session_id
|
||||
- state: workflow-state-optional
|
||||
|
||||
Response codes
|
||||
--------------
|
||||
|
||||
.. rest_status_code:: success status.yaml
|
||||
|
||||
- 200: maintenance-session-put
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Get maintenance sessions
|
||||
========================
|
||||
|
@ -75,7 +92,8 @@ Response codes
|
|||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 404
|
||||
- 400
|
||||
- 500
|
||||
|
||||
Get maintenance session
|
||||
=======================
|
||||
|
@ -100,7 +118,10 @@ Response codes
|
|||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 404
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Delete maintenance session
|
||||
==========================
|
||||
|
@ -117,7 +138,9 @@ finished.
|
|||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 404
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Future
|
||||
======
|
||||
|
@ -125,4 +148,5 @@ Future
|
|||
On top of some expected changes mentioned above, it will also be handy to get
|
||||
detailed information about the steps run already in the maintenance session.
|
||||
This will be helpful when there is a need to figure out any corrective actions to
|
||||
successfully finish a failed session.
|
||||
successfully finish a failed session. For now admin can update failed session
|
||||
state to the previous state or to any desired state to try to continue a failed session.
|
||||
|
|
|
@ -258,6 +258,14 @@ workflow-state:
|
|||
required: true
|
||||
type: string
|
||||
|
||||
workflow-state-optional:
|
||||
description: |
|
||||
Maintenance workflow state or previous state if not given.
|
||||
The workflow will continue from this state.
|
||||
in: body
|
||||
required: false
|
||||
type: string
|
||||
|
||||
workflow-state-reply:
|
||||
description: |
|
||||
There can have different values depending on what is the maintenance
|
||||
|
|
|
@ -35,7 +35,9 @@ Response codes
|
|||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 404
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Input from project to maintenance session
|
||||
=========================================
|
||||
|
@ -70,6 +72,12 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
============================
|
||||
Project with NFV constraints
|
||||
============================
|
||||
|
@ -110,6 +118,12 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Get instance constraints saved in Fenix DB
|
||||
==========================================
|
||||
|
||||
|
@ -143,6 +157,13 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 404
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Update instance constraints saved to Fenix DB
|
||||
=============================================
|
||||
|
||||
|
@ -176,6 +197,12 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Delete instance constraints from Fenix DB
|
||||
=========================================
|
||||
|
||||
|
@ -200,6 +227,12 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Get instance group constraints saved in Fenix DB
|
||||
================================================
|
||||
|
||||
|
@ -234,6 +267,12 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Update instance group constraints saved to Fenix DB
|
||||
===================================================
|
||||
|
||||
|
@ -268,6 +307,12 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
Delete instance group constraints from Fenix DB
|
||||
===============================================
|
||||
|
||||
|
@ -292,5 +337,11 @@ Response codes
|
|||
|
||||
- 200
|
||||
|
||||
.. rest_status_code:: error status.yaml
|
||||
|
||||
- 400
|
||||
- 422
|
||||
- 500
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"state": "PLANNED_MAINTENANCE"
|
||||
}
|
|
@ -11,6 +11,14 @@
|
|||
.. literalinclude:: samples/create-maintenance-session-post-200.json
|
||||
:language: javascript
|
||||
|
||||
maintenance-session-put: |
|
||||
.. rest_parameters:: parameters.yaml
|
||||
|
||||
- session_id: uuid
|
||||
|
||||
.. literalinclude:: samples/maintenance-session-put-200.json
|
||||
:language: javascript
|
||||
|
||||
get-maintenance-sessions-get: |
|
||||
.. rest_parameters:: parameters.yaml
|
||||
|
||||
|
@ -82,6 +90,9 @@
|
|||
default: |
|
||||
The entity of the request is in a format not supported by the requested
|
||||
resource for the method.
|
||||
422:
|
||||
default: |
|
||||
The entity of the request is not in line with the resource schema.
|
||||
500:
|
||||
default: |
|
||||
Something went wrong with the service which prevents it from fulfilling
|
||||
|
@ -92,4 +103,7 @@
|
|||
request.
|
||||
503:
|
||||
default: |
|
||||
The service cannot handle the request right now.
|
||||
The service cannot handle the request right now.
|
||||
509:
|
||||
default: |
|
||||
There are too many parallel sessions.
|
||||
|
|
|
@ -147,8 +147,7 @@ class SessionController(rest.RestController):
|
|||
data = json.loads(request.body.decode('utf8'))
|
||||
try:
|
||||
jsonschema.validate(session_id, schema.uid)
|
||||
# TBD implement this API
|
||||
# jsonschema.validate(data, schema.maintenance_session_put)
|
||||
jsonschema.validate(data, schema.maintenance_session_put)
|
||||
except jsonschema.exceptions.ValidationError as e:
|
||||
LOG.error(str(e.message))
|
||||
abort(422)
|
||||
|
@ -236,14 +235,14 @@ class InstanceController(rest.RestController):
|
|||
if request.body:
|
||||
LOG.error("Unexpected data")
|
||||
abort(400)
|
||||
session = self.engine_rpcapi.get_instance(instance_id)
|
||||
if session is None:
|
||||
LOG.error("Invalid session")
|
||||
instance = self.engine_rpcapi.get_instance(instance_id)
|
||||
if instance is None:
|
||||
LOG.error("Invalid instance: %s" % instance_id)
|
||||
abort(404)
|
||||
try:
|
||||
response.text = jsonutils.dumps(session)
|
||||
response.text = jsonutils.dumps(instance)
|
||||
except TypeError:
|
||||
response.body = jsonutils.dumps(session)
|
||||
response.body = jsonutils.dumps(instance)
|
||||
|
||||
# PUT /v1/instance/<instance_id>
|
||||
@policy.authorize('instance', 'put')
|
||||
|
@ -301,14 +300,14 @@ class InstanceGroupController(rest.RestController):
|
|||
if request.body:
|
||||
LOG.error("Unexpected data")
|
||||
abort(400)
|
||||
session = self.engine_rpcapi.get_instance_group(group_id)
|
||||
if session is None:
|
||||
LOG.error("Invalid session")
|
||||
group = self.engine_rpcapi.get_instance_group(group_id)
|
||||
if group is None:
|
||||
LOG.error("Invalid instance_group: %s" % group_id)
|
||||
abort(404)
|
||||
try:
|
||||
response.text = jsonutils.dumps(session)
|
||||
response.text = jsonutils.dumps(group)
|
||||
except TypeError:
|
||||
response.body = jsonutils.dumps(session)
|
||||
response.body = jsonutils.dumps(group)
|
||||
|
||||
# PUT /v1/instance_group/<group_id>
|
||||
@policy.authorize('instance_group', 'put')
|
||||
|
|
|
@ -71,10 +71,16 @@ maintenance_session_project_instance_put = {
|
|||
'required': ['instance_action', 'state']
|
||||
}
|
||||
|
||||
# TBD
|
||||
# maintenance_session_put = {
|
||||
#
|
||||
# }
|
||||
|
||||
maintenance_session_put = {
|
||||
'type': 'object',
|
||||
'properties': {
|
||||
'state': {
|
||||
'type': 'string',
|
||||
'enum': states,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
maintenance_post = {
|
||||
'type': 'object',
|
||||
|
|
|
@ -38,6 +38,7 @@ def upgrade():
|
|||
sa.Column('created_at', sa.DateTime(), nullable=False),
|
||||
sa.Column('updated_at', sa.DateTime(), nullable=True),
|
||||
sa.Column('session_id', sa.String(36), primary_key=True),
|
||||
sa.Column('prev_state', sa.String(length=32), nullable=True),
|
||||
sa.Column('state', sa.String(length=32), nullable=True),
|
||||
sa.Column('maintenance_at', sa.DateTime(), nullable=True),
|
||||
sa.Column('meta', MediumText(), nullable=True),
|
||||
|
|
|
@ -44,6 +44,7 @@ class MaintenanceSession(mb.FenixBase):
|
|||
__tablename__ = 'sessions'
|
||||
|
||||
session_id = sa.Column(sa.String(36), primary_key=True)
|
||||
prev_state = sa.Column(sa.String(length=32), nullable=True)
|
||||
state = sa.Column(sa.String(length=32), nullable=True)
|
||||
maintenance_at = sa.Column(sa.DateTime(), nullable=True)
|
||||
meta = sa.Column(MediumText(), nullable=False)
|
||||
|
|
|
@ -169,8 +169,16 @@ class EngineEndpoint(object):
|
|||
def admin_update_session(self, ctx, session_id, data):
|
||||
"""Update maintenance workflow session"""
|
||||
LOG.info("EngineEndpoint: admin_update_session")
|
||||
# TBD Update data to workflow and return updated data
|
||||
return data
|
||||
# We assume we can now continue the previous state
|
||||
ses = self.workflow_sessions[session_id].session
|
||||
ses.stopped = False
|
||||
if "state" in data.keys() and len(data["state"]):
|
||||
ses.prev_state = ses.state
|
||||
ses.state = data["state"]
|
||||
else:
|
||||
ses.state, ses.prev_state = ses.prev_state, ses.state
|
||||
LOG.info("admin_update_session %s state %s" % (session_id, ses.state))
|
||||
return ({"state": ses.state})
|
||||
|
||||
def project_get_session(self, ctx, session_id, project_id):
|
||||
"""Get maintenance workflow session project specific details"""
|
||||
|
|
|
@ -13,14 +13,34 @@
|
|||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from threading import Thread
|
||||
|
||||
|
||||
class PropagatingThread(Thread):
    """Thread that passes the target's result or exception to the joiner.

    A plain ``Thread`` swallows exceptions raised by its target. This
    subclass records the exception (or the return value) in ``run`` and
    re-raises (or returns) it from ``join`` so the parent thread sees
    what happened in the child.
    """

    def run(self):
        """Call the target, storing its return value or raised exception."""
        # Initialise both attributes up front so join() can always read them.
        self.exc = None
        self.ret = None
        try:
            if hasattr(self, '_Thread__target'):
                # Thread uses name mangling prior to Python 3.
                self.ret = self._Thread__target(*self._Thread__args,
                                                **self._Thread__kwargs)
            else:
                self.ret = self._target(*self._args, **self._kwargs)
        except BaseException as e:
            # Keep the exception; it is re-raised in the joining thread.
            self.exc = e

    def join(self, timeout=None):
        """Wait for the thread and propagate its outcome.

        :param timeout: optional seconds to wait, as in ``Thread.join``.
        :returns: the target's return value, or ``None`` if the timeout
                  expired while the thread was still running.
        :raises: whatever exception the target raised.
        """
        super(PropagatingThread, self).join(timeout)
        if self.is_alive():
            # Timed out: there is no result or exception to report yet.
            return None
        # getattr guards against run() having been overridden or not run.
        exc = getattr(self, 'exc', None)
        if exc is not None:
            raise exc
        return getattr(self, 'ret', None)
|
||||
|
||||
|
||||
def run_async(func):
|
||||
from functools import wraps
|
||||
from threading import Thread
|
||||
|
||||
@wraps(func)
|
||||
def async_func(*args, **kwargs):
|
||||
thread = Thread(target=func, args=args, kwargs=kwargs)
|
||||
thread = PropagatingThread(target=func, args=args, kwargs=kwargs)
|
||||
thread.start()
|
||||
return thread
|
||||
|
||||
|
|
|
@ -402,6 +402,10 @@ class BaseWorkflow(Thread):
|
|||
LOG.error("%s: maintenance_failed method not implemented!" %
|
||||
self.session_id)
|
||||
|
||||
def state(self, state):
|
||||
self.session.prev_state = self.session.state
|
||||
self.session.state = state
|
||||
|
||||
def run(self):
|
||||
LOG.info("%s: started" % self.session_id)
|
||||
while not self.stopped:
|
||||
|
@ -414,7 +418,7 @@ class BaseWorkflow(Thread):
|
|||
except Exception as e:
|
||||
LOG.error("%s: %s Raised exception: %s" % (self.session_id,
|
||||
statefunc, e), exc_info=True)
|
||||
self.session.state = "MAINTENANCE_FAILED"
|
||||
self.state("MAINTENANCE_FAILED")
|
||||
else:
|
||||
time.sleep(1)
|
||||
# IDLE while session removed
|
||||
|
|
|
@ -342,7 +342,7 @@ class Workflow(BaseWorkflow):
|
|||
if is_time_after_time(reply_at, actions_at):
|
||||
LOG.error('%s: No time for project to answer in state: %s'
|
||||
% (self.session_id, state))
|
||||
self.session.state = "MAINTENANCE_FAILED"
|
||||
self.state("MAINTENANCE_FAILED")
|
||||
return False
|
||||
metadata = self.session.meta
|
||||
self._project_notify(project, instance_ids, allowed_actions,
|
||||
|
@ -656,16 +656,39 @@ class Workflow(BaseWorkflow):
|
|||
max_parallel))
|
||||
if instance.action == 'MIGRATE':
|
||||
if not self.migrate_server(instance, target_host):
|
||||
return False
|
||||
self.group_impacted_members[group_id] -= 1
|
||||
LOG.debug("%s Reservation freed. remain / "
|
||||
"max_impacted_members:%s/%s"
|
||||
% (instance.instance_id,
|
||||
self.group_impacted_members[group_id],
|
||||
max_parallel))
|
||||
raise Exception('%s: instance %s action '
|
||||
'%s failed' %
|
||||
(self.session_id, instance.instance_id,
|
||||
instance.action))
|
||||
self.notify_action_done(instance)
|
||||
elif instance.action == 'OWN_ACTION':
|
||||
pass
|
||||
elif instance.action == 'LIVE_MIGRATE':
|
||||
if not self.live_migrate_server(instance, target_host):
|
||||
return False
|
||||
self.group_impacted_members[group_id] -= 1
|
||||
LOG.debug("%s Reservation freed. remain / "
|
||||
"max_impacted_members:%s/%s"
|
||||
% (instance.instance_id,
|
||||
self.group_impacted_members[group_id],
|
||||
max_parallel))
|
||||
raise Exception('%s: instance %s action '
|
||||
'%s failed' %
|
||||
(self.session_id, instance.instance_id,
|
||||
instance.action))
|
||||
self.notify_action_done(instance)
|
||||
else:
|
||||
self.group_impacted_members[group_id] -= 1
|
||||
LOG.debug("%s Reservation freed. remain / "
|
||||
"max_impacted_members:%s/%s"
|
||||
% (instance.instance_id,
|
||||
self.group_impacted_members[group_id],
|
||||
max_parallel))
|
||||
raise Exception('%s: instance %s action '
|
||||
'%s not supported' %
|
||||
(self.session_id, instance.instance_id,
|
||||
|
@ -931,11 +954,11 @@ class Workflow(BaseWorkflow):
|
|||
self.initialize_server_info()
|
||||
|
||||
if not self.projects_listen_alarm('maintenance.scheduled'):
|
||||
self.session.state = 'MAINTENANCE_FAILED'
|
||||
self.state('MAINTENANCE_FAILED')
|
||||
return
|
||||
|
||||
if not self.confirm_maintenance():
|
||||
self.session.state = 'MAINTENANCE_FAILED'
|
||||
self.state('MAINTENANCE_FAILED')
|
||||
return
|
||||
|
||||
maintenance_empty_hosts = self.get_empty_computes()
|
||||
|
@ -944,14 +967,14 @@ class Workflow(BaseWorkflow):
|
|||
if self.need_scale_in():
|
||||
LOG.info('%s: Need to scale in to get capacity for '
|
||||
'empty host' % (self.session_id))
|
||||
self.session.state = 'SCALE_IN'
|
||||
self.state('SCALE_IN')
|
||||
else:
|
||||
LOG.info('%s: Free capacity, but need empty host' %
|
||||
(self.session_id))
|
||||
self.session.state = 'PREPARE_MAINTENANCE'
|
||||
self.state('PREPARE_MAINTENANCE')
|
||||
else:
|
||||
LOG.info('Empty host found')
|
||||
self.session.state = 'START_MAINTENANCE'
|
||||
self.state('START_MAINTENANCE')
|
||||
|
||||
if self.session.maintenance_at > datetime.datetime.utcnow():
|
||||
time_now = time_now_str()
|
||||
|
@ -972,7 +995,7 @@ class Workflow(BaseWorkflow):
|
|||
# how many instances can be affected at the same time, we should
|
||||
# calculate and ask scaling of specific instances
|
||||
if not self.confirm_scale_in():
|
||||
self.session.state = 'MAINTENANCE_FAILED'
|
||||
self.state('MAINTENANCE_FAILED')
|
||||
return
|
||||
# TBD it takes time to have proper information updated about free
|
||||
# capacity. Should make sure instances removed has also VCPUs removed
|
||||
|
@ -983,24 +1006,24 @@ class Workflow(BaseWorkflow):
|
|||
if self.need_scale_in():
|
||||
LOG.info('%s: Need to scale in more to get capacity for '
|
||||
'empty host' % (self.session_id))
|
||||
self.session.state = 'SCALE_IN'
|
||||
self.state('SCALE_IN')
|
||||
else:
|
||||
LOG.info('%s: Free capacity, but need empty host' %
|
||||
(self.session_id))
|
||||
self.session.state = 'PREPARE_MAINTENANCE'
|
||||
self.state('PREPARE_MAINTENANCE')
|
||||
else:
|
||||
LOG.info('Empty host found')
|
||||
for host in maintenance_empty_hosts:
|
||||
self._wait_host_empty(host)
|
||||
self.session.state = 'START_MAINTENANCE'
|
||||
self.state('START_MAINTENANCE')
|
||||
|
||||
def prepare_maintenance(self):
|
||||
LOG.info("%s: prepare_maintenance called" % self.session_id)
|
||||
if not self.make_empty_hosts('PREPARE_MAINTENANCE'):
|
||||
LOG.error('make_empty_hosts failed')
|
||||
self.session.state = 'MAINTENANCE_FAILED'
|
||||
self.state('MAINTENANCE_FAILED')
|
||||
else:
|
||||
self.session.state = 'START_MAINTENANCE'
|
||||
self.state('START_MAINTENANCE')
|
||||
self.update_server_info()
|
||||
|
||||
def start_maintenance(self):
|
||||
|
@ -1008,7 +1031,7 @@ class Workflow(BaseWorkflow):
|
|||
empty_hosts = self.get_empty_computes()
|
||||
if not empty_hosts:
|
||||
LOG.info("%s: No empty host to be maintained" % self.session_id)
|
||||
self.session.state = 'MAINTENANCE_FAILED'
|
||||
self.state('MAINTENANCE_FAILED')
|
||||
return
|
||||
maintained_hosts = self.get_maintained_hosts_by_type('compute')
|
||||
if not maintained_hosts:
|
||||
|
@ -1028,7 +1051,7 @@ class Workflow(BaseWorkflow):
|
|||
thrs.append(self.host_maintenance_async(host))
|
||||
for thr in thrs:
|
||||
thr.join()
|
||||
self.session.state = 'PLANNED_MAINTENANCE'
|
||||
self.state('PLANNED_MAINTENANCE')
|
||||
|
||||
def planned_maintenance(self):
|
||||
LOG.info("%s: planned_maintenance called" % self.session_id)
|
||||
|
@ -1065,7 +1088,7 @@ class Workflow(BaseWorkflow):
|
|||
self.update_server_info()
|
||||
|
||||
LOG.info("%s: planned_maintenance done" % self.session_id)
|
||||
self.session.state = 'MAINTENANCE_COMPLETE'
|
||||
self.state('MAINTENANCE_COMPLETE')
|
||||
|
||||
def maintenance_complete(self):
|
||||
LOG.info("%s: maintenance_complete called" % self.session_id)
|
||||
|
@ -1074,10 +1097,10 @@ class Workflow(BaseWorkflow):
|
|||
LOG.info('Projects may still need to up scale back to full '
|
||||
'capcity')
|
||||
if not self.confirm_maintenance_complete():
|
||||
self.session.state = 'MAINTENANCE_FAILED'
|
||||
self.state('MAINTENANCE_FAILED')
|
||||
return
|
||||
self.update_server_info()
|
||||
self.session.state = 'MAINTENANCE_DONE'
|
||||
self.state('MAINTENANCE_DONE')
|
||||
|
||||
def maintenance_done(self):
|
||||
pass
|
||||
|
|
Loading…
Reference in New Issue