Improve WMI jobs logging

This change provides additional log information when handling WMI jobs. This allows us to get a better picture of the job states. This will be extremely helpful for debugging purposes, as currently we don't get much information about pending jobs or errors that occur when stopping jobs. Closes-Bug: #1748208 Change-Id: Ibb4d5969eea13a0b11a6d5b46b797482a6d2a20b
2018-02-08 13:57:10 +02:00 · 2018-02-08 13:57:10 +02:00 · 58509209b9
commit 58509209b9
parent 6aa0f9ff9f
2 changed files with 70 additions and 10 deletions
--- a/os_win/tests/unit/utils/test_jobutils.py
+++ b/os_win/tests/unit/utils/test_jobutils.py
@ -77,6 +77,20 @@ class JobUtilsTestCase(test_base.OsWinBaseTestCase):
                          self.jobutils._wait_for_job,
                          self._FAKE_JOB_PATH)

+    @ddt.data({"extended": False,
+               "expected_fields": ["InstanceID"]},
+              {"extended": True,
+               "expected_fields": ["InstanceID", "DetailedStatus"]})
+    @ddt.unpack
+    def test_get_job_details(self, expected_fields, extended):
+        mock_job = mock.Mock()
+
+        details = self.jobutils._get_job_details(mock_job, extended=extended)
+
+        for field in expected_fields:
+            self.assertEqual(getattr(mock_job, field),
+                             details[field])
+
    def test_get_pending_jobs(self):
        mock_killed_job = mock.Mock(JobState=constants.JOB_STATE_KILLED)
        mock_running_job = mock.Mock(JobState=constants.WMI_JOB_STATE_RUNNING)
--- a/os_win/utils/jobutils.py
+++ b/os_win/utils/jobutils.py
@ -20,6 +20,7 @@ Base Utility class for operations on Hyper-V.

 import time

+import monotonic
 from oslo_log import log as logging

 from os_win import _utils
@ -81,7 +82,17 @@ class JobUtils(baseutils.BaseUtilsVirt):
        job_wmi_path = job_path.replace('\\', '/')
        job = self._get_wmi_obj(job_wmi_path)

+        # We'll log the job status from time to time.
+        last_report_time = 0
+        report_interval = 5
+
        while not self._is_job_completed(job):
+            now = monotonic.monotonic()
+            if now - last_report_time > report_interval:
+                job_details = self._get_job_details(job)
+                LOG.debug("Waiting for WMI job: %s.", job_details)
+                last_report_time = now
+
            time.sleep(0.1)
            job = self._get_wmi_obj(job_wmi_path)

@ -90,25 +101,54 @@ class JobUtils(baseutils.BaseUtilsVirt):

        # We'll raise an exception for killed jobs.
        job_failed = job_state not in self._successful_job_states or err_code
+        job_warnings = job_state == constants.JOB_STATE_COMPLETED_WITH_WARNINGS
+        job_details = self._get_job_details(
+            job, extended=(job_failed or job_warnings))
+
        if job_failed:
            err_sum_desc = getattr(job, 'ErrorSummaryDescription', None)
            err_desc = job.ErrorDescription

+            LOG.error("WMI job failed: %s.", job_details)
            raise exceptions.WMIJobFailed(job_state=job_state,
                                          error_code=err_code,
                                          error_summ_desc=err_sum_desc,
                                          error_desc=err_desc)

-        if job_state == constants.JOB_STATE_COMPLETED_WITH_WARNINGS:
+        if job_warnings:
            LOG.warning("WMI job completed with warnings. For detailed "
-                        "information, please check the Windows event logs.")
+                        "information, please check the Windows event logs. "
+                        "Job details: %s.", job_details)
+        else:
+            LOG.debug("WMI job succeeded: %s.", job_details)

-        desc = job.Description
-        elap = job.ElapsedTime
-        LOG.debug("WMI job succeeded: %(desc)s, Elapsed=%(elap)s",
-                  {'desc': desc, 'elap': elap})
        return job

+    @staticmethod
+    def _get_job_details(job, extended=False):
+        basic_details = [
+            "InstanceID", "Description", "ElementName", "JobStatus",
+            "ElapsedTime", "Cancellable", "JobType", "Owner",
+            "PercentComplete"]
+        extended_details = [
+            "JobState", "StatusDescriptions", "OperationalStatus",
+            "TimeSubmitted", "UntilTime", "TimeOfLastStateChange",
+            "DetailedStatus", "LocalOrUtcTime",
+            "ErrorCode", "ErrorDescription", "ErrorSummaryDescription"]
+
+        fields = list(basic_details)
+        if extended:
+            fields += extended_details
+
+        details = {}
+        for field in fields:
+            try:
+                details[field] = getattr(job, field)
+            except AttributeError:
+                continue
+
+        return details
+
    def _get_pending_jobs_affecting_element(self, element):
        # Msvm_AffectedJobElement is in fact an association between
        # the affected element and the affecting job.
@ -131,10 +171,11 @@ class JobUtils(baseutils.BaseUtilsVirt):
    def _stop_jobs(self, element):
        pending_jobs = self._get_pending_jobs_affecting_element(element)
        for job in pending_jobs:
+            job_details = self._get_job_details(job, extended=True)
            try:
                if not job.Cancellable:
                    LOG.debug("Got request to terminate "
-                              "non-cancelable job.")
+                              "non-cancelable job: %s.", job_details)
                    continue

                job.RequestStateChange(
@ -143,15 +184,20 @@ class JobUtils(baseutils.BaseUtilsVirt):
                # The job may had been completed right before we've
                # attempted to kill it.
                if not _utils._is_not_found_exc(ex):
-                    LOG.debug("Failed to stop job. Exception: %s", ex)
+                    LOG.debug("Failed to stop job. Exception: %s. "
+                              "Job details: %s.", ex, job_details)

        pending_jobs = self._get_pending_jobs_affecting_element(element)
        if pending_jobs:
+            pending_job_details = [self._get_job_details(job, extended=True)
+                                   for job in pending_jobs]
            LOG.debug("Attempted to terminate jobs "
                      "affecting element %(element)s but "
-                      "%(pending_count)s jobs are still pending.",
+                      "%(pending_count)s jobs are still pending: "
+                      "%(pending_jobs)s.",
                      dict(element=element,
-                           pending_count=len(pending_jobs)))
+                           pending_count=len(pending_jobs),
+                           pending_jobs=pending_job_details))
            raise exceptions.JobTerminateFailed()

    def _is_job_completed(self, job):