Support pre, host, hostype and post action plug-ins

Support different type of action plug-ins
Fix some findings during demo implementation

story: 2003846
Task: #30227

Change-Id: Ie2363d865786afaf64d101f3bf7da97827f6b3e6
Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
This commit is contained in:
Tomi Juvonen 2019-03-28 13:18:35 +02:00
parent ac9fe415e6
commit 4fec501cd0
4 changed files with 179 additions and 67 deletions

View File

@ -46,7 +46,7 @@ opts = [
default=os.environ.get('OS_PROJECT_NAME', 'admin'),
help="API host IP"),
cfg.IntOpt('project_maintenance_reply',
default=20,
default=40,
help="Project maintenance reply confirmation time in seconds"),
cfg.IntOpt('project_scale_in_reply',
default=60,

View File

@ -13,6 +13,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log as logging
import subprocess
LOG = logging.getLogger(__name__)
@ -28,4 +29,11 @@ class ActionPlugin(object):
def run(self):
LOG.info("%s: Dummy action plugin run %s" % (self.wf.session_id,
self.hostname))
self.ap_dbi.state = "DONE"
try:
output = subprocess.check_output("echo Dummy running in %s" %
self.hostname,
shell=True)
self.ap_dbi.state = "DONE"
except subprocess.CalledProcessError:
self.ap_dbi.state = "FAILED"
LOG.debug("%s: OUTPUT: %s" % (self.wf.session_id, output))

View File

@ -138,6 +138,10 @@ class BaseWorkflow(Thread):
return [host.hostname for host in self.hosts
if host.type == 'compute']
def get_controller_hosts(self):
return [host.hostname for host in self.hosts
if host.type == 'controller']
def get_empty_computes(self):
all_computes = self.get_compute_hosts()
instance_computes = []

View File

@ -56,15 +56,21 @@ class Workflow(BaseWorkflow):
LOG.info("%s: initialized. Nova version %f" % (self.session_id,
nova_version))
LOG.info('%s: Execute pre action plugins' % (self.session_id))
self.maintenance_by_plugin_type("localhost", "pre")
def _init_hosts_by_services(self):
LOG.info("%s: Dicovering hosts by Nova services" % self.session_id)
hosts = []
contoller_hostnames = []
controllers = self.nova.services.list(binary='nova-conductor')
for controller in controllers:
host = {}
service_host = str(controller.__dict__.get(u'host'))
if service_host in contoller_hostnames:
continue
host['hostname'] = service_host
contoller_hostnames.append(service_host)
host['type'] = 'controller'
if str(controller.__dict__.get(u'status')) == 'disabled':
LOG.error("%s: %s nova-conductor disabled before maintenance"
@ -283,23 +289,42 @@ class Workflow(BaseWorkflow):
actions_at = self.session.maintenance_at
state = 'MAINTENANCE'
self.set_projets_state(state)
for project in self.project_names():
LOG.info('\nMAINTENANCE to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
reply_at = reply_time_str(self.conf.project_maintenance_reply)
if is_time_after_time(reply_at, actions_at):
LOG.error('%s: No time for project to answer in state: %s' %
(self.session_id, state))
self.session.state = "MAINTENANCE_FAILED"
return False
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'MAINTENANCE_TIMEOUT')
return self.wait_projects_state(state, 'MAINTENANCE_TIMEOUT')
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nMAINTENANCE to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
reply_at = reply_time_str(self.conf.project_maintenance_reply)
if is_time_after_time(reply_at, actions_at):
LOG.error('%s: No time for project to answer in state: %s'
% (self.session_id, state))
self.session.state = "MAINTENANCE_FAILED"
return False
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'MAINTENANCE_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance failed after retries')
break
else:
LOG.info('confirm_maintenance retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def confirm_scale_in(self):
allowed_actions = []
@ -307,17 +332,36 @@ class Workflow(BaseWorkflow):
reply_at = actions_at
state = 'SCALE_IN'
self.set_projets_state(state)
for project in self.project_names():
LOG.info('\nSCALE_IN to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'SCALE_IN_TIMEOUT')
return self.wait_projects_state(state, 'SCALE_IN_TIMEOUT')
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nSCALE_IN to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'SCALE_IN_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_scale_in failed after retries')
break
else:
LOG.info('confirm_scale_in retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def need_scale_in(self):
hvisors = self.nova.hypervisors.list(detailed=True)
@ -408,20 +452,38 @@ class Workflow(BaseWorkflow):
actions_at = reply_time_str(self.conf.project_maintenance_reply)
reply_at = actions_at
self.set_projects_state_and_hosts_instances(state, [host])
for project in self.project_names():
if not self.project_has_state_instances(project):
continue
LOG.info('%s to project %s' % (state, project))
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if not self.project_has_state_instances(project):
continue
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'%s_TIMEOUT' % state)
return self.wait_projects_state(state, '%s_TIMEOUT' % state)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'%s_TIMEOUT' % state)
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_host_to_be_emptied failed after retries')
break
else:
LOG.info('confirm_host_to_be_emptied retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def confirm_maintenance_complete(self):
state = 'MAINTENANCE_COMPLETE'
@ -429,17 +491,37 @@ class Workflow(BaseWorkflow):
actions_at = reply_time_str(self.conf.project_scale_in_reply)
reply_at = actions_at
self.set_projets_state(state)
for project in self.project_names():
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
allowed_actions = []
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'%s_TIMEOUT' % state)
return self.wait_projects_state(state, '%s_TIMEOUT' % state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
allowed_actions = []
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'%s_TIMEOUT' % state)
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance_complete failed after '
'retries')
break
else:
LOG.info('confirm_maintenance_complete retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def notify_action_done(self, project, instance):
instance_ids = [instance.instance_id]
@ -501,31 +583,34 @@ class Workflow(BaseWorkflow):
server_id = instance.instance_id
server = self.nova.servers.get(server_id)
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
LOG.info('server %s state %s' % (server_id, instance.state))
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
LOG.info('server %s state %s host %s' % (server_id, instance.state,
orig_host))
last_vm_state = instance.state
retry_migrate = 2
while True:
try:
server.migrate()
time.sleep(5)
retries = 36
retries = 48
while instance.state != 'resized' and retries > 0:
# try to confirm within 3min
# try to confirm within 4min
server = self.nova.servers.get(server_id)
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
if instance.state == 'resized':
server.confirm_resize()
LOG.info('instance %s migration confirmed' %
server_id)
instance.host = (
str(server.__dict__.get('OS-EXT-SRV-ATTR:host')))
LOG.info('instance %s migration resized to host %s' %
(server_id, host))
instance.host = host
return True
if last_vm_state != instance.state:
LOG.info('instance %s state: %s' % (server_id,
LOG.info('instance %s state changed: %s' % (server_id,
instance.state))
if instance.state == 'error':
LOG.error('instance %s migration failed, state: %s'
% (server_id, instance.state))
instance.host = host
return False
time.sleep(5)
retries = retries - 1
@ -555,7 +640,7 @@ class Workflow(BaseWorkflow):
(server_id, instance.state))
return False
def host_maintenance_by_plugin_type(self, hostname, plugin_type):
def maintenance_by_plugin_type(self, hostname, plugin_type):
aps = self.get_action_plugins_by_type(plugin_type)
if aps:
LOG.info("%s: Calling action plug-ins with type %s" %
@ -590,7 +675,9 @@ class Workflow(BaseWorkflow):
host = self.get_host_by_name(hostname)
LOG.info('%s: Maintaining host %s' % (self.session_id, hostname))
for plugin_type in ["host", host.type]:
self.host_maintenance_by_plugin_type(hostname, plugin_type)
LOG.info('%s: Execute %s action plugins' % (self.session_id,
plugin_type))
self.maintenance_by_plugin_type(hostname, plugin_type)
LOG.info('%s: Maintaining host %s complete' % (self.session_id,
hostname))
@ -690,6 +777,17 @@ class Workflow(BaseWorkflow):
# nova-compute service is disabled, so projects cannot have
# instances scheduled to not maintained hosts
self.disable_host_nova_compute(compute)
for host in self.get_controller_hosts():
LOG.info('IN_MAINTENANCE controller %s' % host)
self._admin_notify(self.conf.workflow_project, host,
'IN_MAINTENANCE',
self.session_id)
self.host_maintenance(host)
self._admin_notify(self.conf.workflow_project, host,
'MAINTENANCE_COMPLETE',
self.session_id)
LOG.info('MAINTENANCE_COMPLETE controller %s' % host)
self.host_maintained(host)
# First we maintain all empty hosts
for host in empty_hosts:
# TBD we wait host VCPUs to report right, but this is not
@ -697,7 +795,7 @@ class Workflow(BaseWorkflow):
# also this could be made parallel if more than one empty host
self._wait_host_empty(host)
LOG.info('IN_MAINTENANCE host %s' % host)
LOG.info('IN_MAINTENANCE compute %s' % host)
self._admin_notify(self.conf.workflow_project, host,
'IN_MAINTENANCE',
self.session_id)
@ -707,7 +805,7 @@ class Workflow(BaseWorkflow):
self.session_id)
self.enable_host_nova_compute(host)
LOG.info('MAINTENANCE_COMPLETE host %s' % host)
LOG.info('MAINTENANCE_COMPLETE compute %s' % host)
self.host_maintained(host)
else:
# Now we maintain hosts gone trough PLANNED_MAINTENANCE
@ -758,6 +856,8 @@ class Workflow(BaseWorkflow):
def maintenance_complete(self):
LOG.info("%s: maintenance_complete called" % self.session_id)
LOG.info('%s: Execute post action plugins' % self.session_id)
self.maintenance_by_plugin_type("localhost", "post")
LOG.info('Projects may still need to up scale back to full '
'capcity')
if not self.confirm_maintenance_complete():