Support pre, host, host type and post action plug-ins

Support different types of action plug-ins. Fix some findings during demo implementation. Story: 2003846 Task: #30227 Change-Id: Ie2363d865786afaf64d101f3bf7da97827f6b3e6 Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
2019-03-28 13:18:35 +02:00 · 2019-03-28 13:18:35 +02:00 · 4fec501cd0
parent ac9fe415e6
commit 4fec501cd0
4 changed files with 179 additions and 67 deletions
--- a/fenix/utils/service.py
+++ b/fenix/utils/service.py
@ -46,7 +46,7 @@ opts = [
               default=os.environ.get('OS_PROJECT_NAME', 'admin'),
               help="API host IP"),
    cfg.IntOpt('project_maintenance_reply',
-               default=20,
+               default=40,
               help="Project maintenance reply confirmation time in seconds"),
    cfg.IntOpt('project_scale_in_reply',
               default=60,
--- a/fenix/workflow/actions/dummy.py
+++ b/fenix/workflow/actions/dummy.py
@ -13,6 +13,7 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 from oslo_log import log as logging
+import subprocess

 LOG = logging.getLogger(__name__)

@ -28,4 +29,11 @@ class ActionPlugin(object):
    def run(self):
        LOG.info("%s: Dummy action plugin run %s" % (self.wf.session_id,
                                                     self.hostname))
-        self.ap_dbi.state = "DONE"
+        try:
+            output = subprocess.check_output("echo Dummy running in %s" %
+                                             self.hostname,
+                                             shell=True)
+            self.ap_dbi.state = "DONE"
+        except subprocess.CalledProcessError:
+            self.ap_dbi.state = "FAILED"
+        LOG.debug("%s: OUTPUT: %s" % (self.wf.session_id, output))
--- a/fenix/workflow/workflow.py
+++ b/fenix/workflow/workflow.py
@ -138,6 +138,10 @@ class BaseWorkflow(Thread):
        return [host.hostname for host in self.hosts
                if host.type == 'compute']

+    def get_controller_hosts(self):
+        return [host.hostname for host in self.hosts
+                if host.type == 'controller']
+
    def get_empty_computes(self):
        all_computes = self.get_compute_hosts()
        instance_computes = []
--- a/fenix/workflow/workflows/default.py
+++ b/fenix/workflow/workflows/default.py
@ -56,15 +56,21 @@ class Workflow(BaseWorkflow):
        LOG.info("%s: initialized. Nova version %f" % (self.session_id,
                                                       nova_version))

+        LOG.info('%s: Execute pre action plugins' % (self.session_id))
+        self.maintenance_by_plugin_type("localhost", "pre")
+
    def _init_hosts_by_services(self):
        LOG.info("%s: Dicovering hosts by Nova services" % self.session_id)
        hosts = []
-
+        contoller_hostnames = []
        controllers = self.nova.services.list(binary='nova-conductor')
        for controller in controllers:
            host = {}
            service_host = str(controller.__dict__.get(u'host'))
+            if service_host in contoller_hostnames:
+                continue
            host['hostname'] = service_host
+            contoller_hostnames.append(service_host)
            host['type'] = 'controller'
            if str(controller.__dict__.get(u'status')) == 'disabled':
                LOG.error("%s: %s nova-conductor disabled before maintenance"
@ -283,23 +289,42 @@ class Workflow(BaseWorkflow):
        actions_at = self.session.maintenance_at
        state = 'MAINTENANCE'
        self.set_projets_state(state)
-        for project in self.project_names():
-            LOG.info('\nMAINTENANCE to project %s\n' % project)
-            instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
-                                                        self.session_id,
-                                                        project)
-            reply_at = reply_time_str(self.conf.project_maintenance_reply)
-            if is_time_after_time(reply_at, actions_at):
-                LOG.error('%s: No time for project to answer in state: %s' %
-                          (self.session_id, state))
-                self.session.state = "MAINTENANCE_FAILED"
-                return False
-            metadata = self.session.meta
-            self._project_notify(project, instance_ids, allowed_actions,
-                                 actions_at, reply_at, state, metadata)
-        self.start_timer(self.conf.project_maintenance_reply,
-                         'MAINTENANCE_TIMEOUT')
-        return self.wait_projects_state(state, 'MAINTENANCE_TIMEOUT')
+        all_replied = False
+        project_not_replied = None
+        retry = 2
+        while not all_replied:
+            for project in self.project_names():
+                if (project_not_replied is not None and project not in
+                        project_not_replied):
+                    continue
+                LOG.info('\nMAINTENANCE to project %s\n' % project)
+                instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
+                                                            self.session_id,
+                                                            project)
+                reply_at = reply_time_str(self.conf.project_maintenance_reply)
+                if is_time_after_time(reply_at, actions_at):
+                    LOG.error('%s: No time for project to answer in state: %s'
+                              % (self.session_id, state))
+                    self.session.state = "MAINTENANCE_FAILED"
+                    return False
+                metadata = self.session.meta
+                self._project_notify(project, instance_ids, allowed_actions,
+                                     actions_at, reply_at, state, metadata)
+            self.start_timer(self.conf.project_maintenance_reply,
+                             'MAINTENANCE_TIMEOUT')
+
+            all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
+            if not all_replied:
+                if retry == 0:
+                    LOG.info('confirm_maintenance failed after retries')
+                    break
+                else:
+                    LOG.info('confirm_maintenance retry')
+                    projects = self.get_projects_with_state()
+                    project_not_replied = (
+                        self._project_names_in_state(projects, state))
+            retry -= 1
+        return all_replied

    def confirm_scale_in(self):
        allowed_actions = []
@ -307,17 +332,36 @@ class Workflow(BaseWorkflow):
        reply_at = actions_at
        state = 'SCALE_IN'
        self.set_projets_state(state)
-        for project in self.project_names():
-            LOG.info('\nSCALE_IN to project %s\n' % project)
-            instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
-                                                        self.session_id,
-                                                        project)
-            metadata = self.session.meta
-            self._project_notify(project, instance_ids, allowed_actions,
-                                 actions_at, reply_at, state, metadata)
-        self.start_timer(self.conf.project_scale_in_reply,
-                         'SCALE_IN_TIMEOUT')
-        return self.wait_projects_state(state, 'SCALE_IN_TIMEOUT')
+        all_replied = False
+        project_not_replied = None
+        retry = 2
+        while not all_replied:
+            for project in self.project_names():
+                if (project_not_replied is not None and project not in
+                        project_not_replied):
+                    continue
+                LOG.info('\nSCALE_IN to project %s\n' % project)
+                instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
+                                                            self.session_id,
+                                                            project)
+                metadata = self.session.meta
+                self._project_notify(project, instance_ids, allowed_actions,
+                                     actions_at, reply_at, state, metadata)
+            self.start_timer(self.conf.project_scale_in_reply,
+                             'SCALE_IN_TIMEOUT')
+
+            all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
+            if not all_replied:
+                if retry == 0:
+                    LOG.info('confirm_scale_in failed after retries')
+                    break
+                else:
+                    LOG.info('confirm_scale_in retry')
+                    projects = self.get_projects_with_state()
+                    project_not_replied = (
+                        self._project_names_in_state(projects, state))
+            retry -= 1
+        return all_replied

    def need_scale_in(self):
        hvisors = self.nova.hypervisors.list(detailed=True)
@ -408,20 +452,38 @@ class Workflow(BaseWorkflow):
        actions_at = reply_time_str(self.conf.project_maintenance_reply)
        reply_at = actions_at
        self.set_projects_state_and_hosts_instances(state, [host])
-        for project in self.project_names():
-            if not self.project_has_state_instances(project):
-                continue
-            LOG.info('%s to project %s' % (state, project))
+        all_replied = False
+        project_not_replied = None
+        retry = 2
+        while not all_replied:
+            for project in self.project_names():
+                if not self.project_has_state_instances(project):
+                    continue
+                if (project_not_replied is not None and project not in
+                        project_not_replied):
+                    continue
+                LOG.info('%s to project %s' % (state, project))

-            instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
-                                                        self.session_id,
-                                                        project)
-            metadata = self.session.meta
-            self._project_notify(project, instance_ids, allowed_actions,
-                                 actions_at, reply_at, state, metadata)
-        self.start_timer(self.conf.project_maintenance_reply,
-                         '%s_TIMEOUT' % state)
-        return self.wait_projects_state(state, '%s_TIMEOUT' % state)
+                instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
+                                                            self.session_id,
+                                                            project)
+                metadata = self.session.meta
+                self._project_notify(project, instance_ids, allowed_actions,
+                                     actions_at, reply_at, state, metadata)
+            self.start_timer(self.conf.project_maintenance_reply,
+                             '%s_TIMEOUT' % state)
+            all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
+            if not all_replied:
+                if retry == 0:
+                    LOG.info('confirm_host_to_be_emptied failed after retries')
+                    break
+                else:
+                    LOG.info('confirm_host_to_be_emptied retry')
+                    projects = self.get_projects_with_state()
+                    project_not_replied = (
+                        self._project_names_in_state(projects, state))
+            retry -= 1
+        return all_replied

    def confirm_maintenance_complete(self):
        state = 'MAINTENANCE_COMPLETE'
@ -429,17 +491,37 @@ class Workflow(BaseWorkflow):
        actions_at = reply_time_str(self.conf.project_scale_in_reply)
        reply_at = actions_at
        self.set_projets_state(state)
-        for project in self.project_names():
-            LOG.info('%s to project %s' % (state, project))
-            instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
-                                                        self.session_id,
-                                                        project)
-            allowed_actions = []
-            self._project_notify(project, instance_ids, allowed_actions,
-                                 actions_at, reply_at, state, metadata)
-        self.start_timer(self.conf.project_scale_in_reply,
-                         '%s_TIMEOUT' % state)
-        return self.wait_projects_state(state, '%s_TIMEOUT' % state)
+        all_replied = False
+        project_not_replied = None
+        retry = 2
+        while not all_replied:
+            for project in self.project_names():
+                if (project_not_replied is not None and project not in
+                        project_not_replied):
+                    continue
+                LOG.info('%s to project %s' % (state, project))
+                instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
+                                                            self.session_id,
+                                                            project)
+                allowed_actions = []
+                self._project_notify(project, instance_ids, allowed_actions,
+                                     actions_at, reply_at, state, metadata)
+            self.start_timer(self.conf.project_scale_in_reply,
+                             '%s_TIMEOUT' % state)
+
+            all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
+            if not all_replied:
+                if retry == 0:
+                    LOG.info('confirm_maintenance_complete failed after '
+                             'retries')
+                    break
+                else:
+                    LOG.info('confirm_maintenance_complete retry')
+                    projects = self.get_projects_with_state()
+                    project_not_replied = (
+                        self._project_names_in_state(projects, state))
+            retry -= 1
+        return all_replied

    def notify_action_done(self, project, instance):
        instance_ids = [instance.instance_id]
@ -501,31 +583,34 @@ class Workflow(BaseWorkflow):
        server_id = instance.instance_id
        server = self.nova.servers.get(server_id)
        instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
-        LOG.info('server %s state %s' % (server_id, instance.state))
+        orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
+        LOG.info('server %s state %s host %s' % (server_id, instance.state,
+                                                 orig_host))
        last_vm_state = instance.state
        retry_migrate = 2
        while True:
            try:
                server.migrate()
                time.sleep(5)
-                retries = 36
+                retries = 48
                while instance.state != 'resized' and retries > 0:
-                    # try to confirm within 3min
+                    # try to confirm within 4min
                    server = self.nova.servers.get(server_id)
+                    host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
                    instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
                    if instance.state == 'resized':
                        server.confirm_resize()
-                        LOG.info('instance %s migration confirmed' %
-                                 server_id)
-                        instance.host = (
-                            str(server.__dict__.get('OS-EXT-SRV-ATTR:host')))
+                        LOG.info('instance %s migration resized to host %s' %
+                                 (server_id, host))
+                        instance.host = host
                        return True
                    if last_vm_state != instance.state:
-                        LOG.info('instance %s state: %s' % (server_id,
+                        LOG.info('instance %s state changed: %s' % (server_id,
                                 instance.state))
                    if instance.state == 'error':
                        LOG.error('instance %s migration failed, state: %s'
                                  % (server_id, instance.state))
+                        instance.host = host
                        return False
                    time.sleep(5)
                    retries = retries - 1
@ -555,7 +640,7 @@ class Workflow(BaseWorkflow):
                  (server_id, instance.state))
        return False

-    def host_maintenance_by_plugin_type(self, hostname, plugin_type):
+    def maintenance_by_plugin_type(self, hostname, plugin_type):
        aps = self.get_action_plugins_by_type(plugin_type)
        if aps:
            LOG.info("%s: Calling action plug-ins with type %s" %
@ -590,7 +675,9 @@ class Workflow(BaseWorkflow):
        host = self.get_host_by_name(hostname)
        LOG.info('%s: Maintaining host %s' % (self.session_id, hostname))
        for plugin_type in ["host", host.type]:
-            self.host_maintenance_by_plugin_type(hostname, plugin_type)
+            LOG.info('%s: Execute %s action plugins' % (self.session_id,
+                                                        plugin_type))
+            self.maintenance_by_plugin_type(hostname, plugin_type)
        LOG.info('%s: Maintaining host %s complete' % (self.session_id,
                                                       hostname))

@ -690,6 +777,17 @@ class Workflow(BaseWorkflow):
                # nova-compute service is disabled, so projects cannot have
                # instances scheduled to not maintained hosts
                self.disable_host_nova_compute(compute)
+            for host in self.get_controller_hosts():
+                LOG.info('IN_MAINTENANCE controller %s' % host)
+                self._admin_notify(self.conf.workflow_project, host,
+                                   'IN_MAINTENANCE',
+                                   self.session_id)
+                self.host_maintenance(host)
+                self._admin_notify(self.conf.workflow_project, host,
+                                   'MAINTENANCE_COMPLETE',
+                                   self.session_id)
+                LOG.info('MAINTENANCE_COMPLETE controller %s' % host)
+                self.host_maintained(host)
            # First we maintain all empty hosts
            for host in empty_hosts:
                # TBD we wait host VCPUs to report right, but this is not
@ -697,7 +795,7 @@ class Workflow(BaseWorkflow):
                # also this could be made parallel if more than one empty host
                self._wait_host_empty(host)

-                LOG.info('IN_MAINTENANCE host %s' % host)
+                LOG.info('IN_MAINTENANCE compute %s' % host)
                self._admin_notify(self.conf.workflow_project, host,
                                   'IN_MAINTENANCE',
                                   self.session_id)
@ -707,7 +805,7 @@ class Workflow(BaseWorkflow):
                                   self.session_id)

                self.enable_host_nova_compute(host)
-                LOG.info('MAINTENANCE_COMPLETE host %s' % host)
+                LOG.info('MAINTENANCE_COMPLETE compute %s' % host)
                self.host_maintained(host)
        else:
            # Now we maintain hosts gone trough PLANNED_MAINTENANCE
@ -758,6 +856,8 @@ class Workflow(BaseWorkflow):

    def maintenance_complete(self):
        LOG.info("%s: maintenance_complete called" % self.session_id)
+        LOG.info('%s: Execute post action plugins' % self.session_id)
+        self.maintenance_by_plugin_type("localhost", "post")
        LOG.info('Projects may still need to up scale back to full '
                 'capcity')
        if not self.confirm_maintenance_complete():