From ae8bb959392a25550c2f432271e41295f5076195 Mon Sep 17 00:00:00 2001
From: Helen Walsh
Date: Tue, 26 Jan 2016 22:26:55 +0000
Subject: [PATCH] EMC VMAX - Operations and timeout issues

If a volume operation (e.g. create, delete, or expand) timed out
before fully completing, the error was not being caught with the
appropriate rc (return code) or errordesc (error description). This
new method performs an additional check to capture the status of the
job, i.e. whether it failed to complete in the allocated time.

Change-Id: I0b74f9941d722083b70c12bbebeb269d315c02b3
Closes-Bug: #1538214
---
 cinder/tests/unit/test_emc_vmax.py          |  54 +++++++++--
 cinder/volume/drivers/emc/emc_vmax_fc.py    |   1 +
 cinder/volume/drivers/emc/emc_vmax_iscsi.py |   1 +
 cinder/volume/drivers/emc/emc_vmax_utils.py | 100 ++++++++++++--------
 4 files changed, 107 insertions(+), 49 deletions(-)

diff --git a/cinder/tests/unit/test_emc_vmax.py b/cinder/tests/unit/test_emc_vmax.py
index 92fc8a6d6e3..322911710e2 100644
--- a/cinder/tests/unit/test_emc_vmax.py
+++ b/cinder/tests/unit/test_emc_vmax.py
@@ -1172,7 +1172,8 @@ class FakeEcomConnection(object):
         else:
             jobinstance['JobState'] = 7
             jobinstance['ErrorCode'] = 0
-            jobinstance['ErrorDescription'] = ''
+            jobinstance['ErrorDescription'] = None
+            jobinstance['OperationalStatus'] = (2, 17)
         return jobinstance
 
     def _getinstance_policycapabilities(self, policycapabilitypath):
@@ -2316,8 +2317,9 @@ class EMCVMAXISCSIDriverNoFastTestCase(test.TestCase):
 
         self.driver.utils._is_job_finished = mock.Mock(
             return_value=True)
-        rc = self.driver.utils._wait_for_job_complete(conn, myjob)
-        self.assertIsNone(rc)
+        rc, errordesc = self.driver.utils.wait_for_job_complete(conn, myjob)
+        self.assertEqual(0, rc)
+        self.assertIsNone(errordesc)
         self.driver.utils._is_job_finished.assert_called_once_with(
             conn, myjob)
         self.assertTrue(self.driver.utils._is_job_finished.return_value)
@@ -2326,13 +2328,31 @@ class EMCVMAXISCSIDriverNoFastTestCase(test.TestCase):
         # Save the original state and restore it after this test
         loopingcall_orig = loopingcall.FixedIntervalLoopingCall
         loopingcall.FixedIntervalLoopingCall = mock.Mock()
-        rc = self.driver.utils._wait_for_job_complete(conn, myjob)
-        self.assertIsNone(rc)
+        rc, errordesc = self.driver.utils.wait_for_job_complete(conn, myjob)
+        self.assertEqual(0, rc)
+        self.assertIsNone(errordesc)
         loopingcall.FixedIntervalLoopingCall.assert_called_once_with(
             mock.ANY)
         loopingcall.FixedIntervalLoopingCall.reset_mock()
         loopingcall.FixedIntervalLoopingCall = loopingcall_orig
 
+    def test_wait_for_job_complete_bad_job_state(self):
+        myjob = SE_ConcreteJob()
+        myjob.classname = 'SE_ConcreteJob'
+        myjob['InstanceID'] = '9999'
+        myjob['status'] = 'success'
+        myjob['type'] = 'type'
+        myjob['CreationClassName'] = 'SE_ConcreteJob'
+        myjob['Job'] = myjob
+        conn = self.fake_ecom_connection()
+        self.driver.utils._is_job_finished = mock.Mock(
+            return_value=True)
+        self.driver.utils._verify_job_state = mock.Mock(
+            return_value=(-1, 'Job finished with an error'))
+        rc, errordesc = self.driver.utils.wait_for_job_complete(conn, myjob)
+        self.assertEqual(-1, rc)
+        self.assertEqual('Job finished with an error', errordesc)
+
     def test_wait_for_sync(self):
         mysync = 'fakesync'
         conn = self.fake_ecom_connection()
@@ -3068,6 +3088,10 @@ class EMCVMAXISCSIDriverNoFastTestCase(test.TestCase):
 
         self.driver.delete_volume(notfound_delete_vol)
 
+    @mock.patch.object(
+        emc_vmax_utils.EMCVMAXUtils,
+        'wait_for_job_complete',
+        return_value=(-1, 'error'))
     @mock.patch.object(
         emc_vmax_common.EMCVMAXCommon,
         '_get_pool_and_storage_system',
@@ -3077,7 +3101,7 @@ class EMCVMAXISCSIDriverNoFastTestCase(test.TestCase):
         'get_volume_type_extra_specs',
         return_value={'volume_backend_name': 'ISCSINoFAST'})
     def test_delete_volume_failed(
-            self, _mock_volume_type, mock_storage_system):
+            self, _mock_volume_type, mock_storage_system, mock_wait):
         self.driver.create_volume(self.data.failed_delete_vol)
         self.assertRaises(exception.VolumeBackendAPIException,
                           self.driver.delete_volume,
@@ -4039,6 +4063,10 @@ class EMCVMAXISCSIDriverFastTestCase(test.TestCase):
         notfound_delete_vol['provider_location'] = None
         self.driver.delete_volume(notfound_delete_vol)
 
+    @mock.patch.object(
+        emc_vmax_utils.EMCVMAXUtils,
+        'wait_for_job_complete',
+        return_value=(-1, 'error'))
     @mock.patch.object(
         emc_vmax_fast.EMCVMAXFast,
         'get_pool_associated_to_policy',
@@ -4057,7 +4085,7 @@ class EMCVMAXISCSIDriverFastTestCase(test.TestCase):
         return_value={'volume_backend_name': 'ISCSIFAST'})
     def test_delete_volume_fast_failed(
             self, _mock_volume_type, _mock_storage_group,
-            mock_storage_system, mock_policy_pool):
+            mock_storage_system, mock_policy_pool, mock_wait):
         self.driver.create_volume(self.data.failed_delete_vol)
         self.assertRaises(exception.VolumeBackendAPIException,
                           self.driver.delete_volume,
@@ -4642,6 +4670,10 @@ class EMCVMAXFCDriverNoFastTestCase(test.TestCase):
         notfound_delete_vol['provider_location'] = None
         self.driver.delete_volume(notfound_delete_vol)
 
+    @mock.patch.object(
+        emc_vmax_utils.EMCVMAXUtils,
+        'wait_for_job_complete',
+        return_value=(-1, 'error'))
     @mock.patch.object(
         emc_vmax_common.EMCVMAXCommon,
         '_get_pool_and_storage_system',
@@ -4651,7 +4683,7 @@ class EMCVMAXFCDriverNoFastTestCase(test.TestCase):
         'get_volume_type_extra_specs',
         return_value={'volume_backend_name': 'FCNoFAST'})
     def test_delete_volume_failed(
-            self, _mock_volume_type, mock_storage_system):
+            self, _mock_volume_type, mock_storage_system, mock_wait):
         self.driver.create_volume(self.data.failed_delete_vol)
         self.assertRaises(exception.VolumeBackendAPIException,
                           self.driver.delete_volume,
@@ -5228,6 +5260,10 @@ class EMCVMAXFCDriverFastTestCase(test.TestCase):
 
         self.driver.delete_volume(notfound_delete_vol)
 
+    @mock.patch.object(
+        emc_vmax_utils.EMCVMAXUtils,
+        'wait_for_job_complete',
+        return_value=(-1, 'error'))
     @mock.patch.object(
         emc_vmax_fast.EMCVMAXFast,
         'get_pool_associated_to_policy',
@@ -5246,7 +5282,7 @@ class EMCVMAXFCDriverFastTestCase(test.TestCase):
         return_value={'volume_backend_name': 'FCFAST'})
     def test_delete_volume_fast_failed(
             self, _mock_volume_type, mock_wrapper,
-            mock_storage_system, mock_pool_policy):
+            mock_storage_system, mock_pool_policy, mock_wait):
         self.driver.create_volume(self.data.failed_delete_vol)
         self.assertRaises(exception.VolumeBackendAPIException,
                           self.driver.delete_volume,
diff --git a/cinder/volume/drivers/emc/emc_vmax_fc.py b/cinder/volume/drivers/emc/emc_vmax_fc.py
index 960b09abf0a..51406b673a7 100644
--- a/cinder/volume/drivers/emc/emc_vmax_fc.py
+++ b/cinder/volume/drivers/emc/emc_vmax_fc.py
@@ -61,6 +61,7 @@ class EMCVMAXFCDriver(driver.FibreChannelDriver):
         - Getting iscsi ip from port in existing masking view
         - Replacement of EMCGetTargetEndpoints api (bug #1512791)
         - VMAX3 snapvx improvements (bug #1522821)
+        - Operations and timeout issues (bug #1538214)
     """
 
     VERSION = "2.3.0"
diff --git a/cinder/volume/drivers/emc/emc_vmax_iscsi.py b/cinder/volume/drivers/emc/emc_vmax_iscsi.py
index ec7ebf456b0..397484e32b9 100644
--- a/cinder/volume/drivers/emc/emc_vmax_iscsi.py
+++ b/cinder/volume/drivers/emc/emc_vmax_iscsi.py
@@ -67,6 +67,7 @@ class EMCVMAXISCSIDriver(driver.ISCSIDriver):
         - Getting iscsi ip from port in existing masking view
         - Replacement of EMCGetTargetEndpoints api (bug #1512791)
         - VMAX3 snapvx improvements (bug #1522821)
+        - Operations and timeout issues (bug #1538214)
     """
 
     VERSION = "2.3.0"
diff --git a/cinder/volume/drivers/emc/emc_vmax_utils.py b/cinder/volume/drivers/emc/emc_vmax_utils.py
index 2b185cbe845..74f86e3af9e 100644
--- a/cinder/volume/drivers/emc/emc_vmax_utils.py
+++ b/cinder/volume/drivers/emc/emc_vmax_utils.py
@@ -283,67 +283,50 @@ class EMCVMAXUtils(object):
         :param conn: connection to the ecom server
         :param job: the job dict
         :param extraSpecs: the extraSpecs dict. Defaults to None
-        :returns: int -- the return code
-        :returns: errorDesc - the error description string
-        """
-
-        jobInstanceName = job['Job']
-        if extraSpecs and (INTERVAL in extraSpecs or RETRIES in extraSpecs):
-            self._wait_for_job_complete(conn, job, extraSpecs)
-        else:
-            self._wait_for_job_complete(conn, job)
-        jobinstance = conn.GetInstance(jobInstanceName,
-                                       LocalOnly=False)
-        rc = jobinstance['ErrorCode']
-        errorDesc = jobinstance['ErrorDescription']
-        LOG.debug("Return code is: %(rc)lu. "
-                  "Error Description is: %(errorDesc)s.",
-                  {'rc': rc,
-                   'errorDesc': errorDesc})
-
-        return rc, errorDesc
-
-    def _wait_for_job_complete(self, conn, job, extraSpecs=None):
-        """Given the job wait for it to complete.
-
-        :param conn: connection to the ecom server
-        :param job: the job dict
-        :param extraSpecs: the extraSpecs dict. Defaults to None
-        :raises: loopingcall.LoopingCallDone
         :raises: VolumeBackendAPIException
         """
         def _wait_for_job_complete():
             # Called at an interval until the job is finished.
-            maxJobRetries = self._get_max_job_retries(extraSpecs)
             retries = kwargs['retries']
-            wait_for_job_called = kwargs['wait_for_job_called']
-            if self._is_job_finished(conn, job):
-                raise loopingcall.LoopingCallDone()
-            if retries > maxJobRetries:
-                LOG.error(_LE("_wait_for_job_complete "
-                              "failed after %(retries)d "
-                              "tries."),
-                          {'retries': retries})
-
-                raise loopingcall.LoopingCallDone()
             try:
                 kwargs['retries'] = retries + 1
-                if not wait_for_job_called:
+                if not kwargs['wait_for_job_called']:
                     if self._is_job_finished(conn, job):
+                        kwargs['rc'], kwargs['errordesc'] = (
+                            self._verify_job_state(conn, job))
                         kwargs['wait_for_job_called'] = True
             except Exception:
                 exceptionMessage = (_("Issue encountered waiting for job."))
                 LOG.exception(exceptionMessage)
                 raise exception.VolumeBackendAPIException(exceptionMessage)
+            if retries > maxJobRetries:
+                kwargs['rc'], kwargs['errordesc'] = (
+                    self._verify_job_state(conn, job))
+                LOG.error(_LE("_wait_for_job_complete "
                              "failed after %(retries)d "
+                              "tries."),
+                          {'retries': retries})
+
+                raise loopingcall.LoopingCallDone()
+            if kwargs['wait_for_job_called']:
+                raise loopingcall.LoopingCallDone()
 
+        maxJobRetries = self._get_max_job_retries(extraSpecs)
         kwargs = {'retries': 0,
-                  'wait_for_job_called': False}
+                  'wait_for_job_called': False,
+                  'rc': 0,
+                  'errordesc': None}
         intervalInSecs = self._get_interval_in_secs(extraSpecs)
         timer = loopingcall.FixedIntervalLoopingCall(_wait_for_job_complete)
         timer.start(interval=intervalInSecs).wait()
+        LOG.debug("Return code is: %(rc)lu. "
" + "Error Description is: %(errordesc)s.", + {'rc': kwargs['rc'], + 'errordesc': kwargs['errordesc']}) + return kwargs['rc'], kwargs['errordesc'] def _get_max_job_retries(self, extraSpecs): """Get max job retries either default or user defined @@ -393,6 +376,43 @@ class EMCVMAXUtils(object): else: return True + def _verify_job_state(self, conn, job): + """Check if the job is finished. + + :param conn: connection to the ecom server + :param job: the job dict + :returns: boolean -- True if finished; False if not finished; + """ + jobstatedict = {2: 'New', + 3: 'Starting', + 4: 'Running', + 5: 'Suspended', + 6: 'Shutting Down', + 7: 'Completed', + 8: 'Terminated', + 9: 'Killed', + 10: 'Exception', + 11: 'Service', + 32767: 'Queue Pending', + 32768: 'DMTF Reserved', + 65535: 'Vendor Reserved'} + jobInstanceName = job['Job'] + jobinstance = conn.GetInstance(jobInstanceName, + LocalOnly=False) + operationalstatus = jobinstance['OperationalStatus'] + if not operationalstatus: + jobstate = jobinstance['JobState'] + errordescription = (_( + "The job has not completed and is in a %(state)s " + "state.") + % {'state': jobstatedict[int(jobstate)]}) + LOG.error(errordescription) + errorcode = -1 + else: + errordescription = jobinstance['ErrorDescription'] + errorcode = jobinstance['ErrorCode'] + return errorcode, errordescription + def wait_for_sync(self, conn, syncName, extraSpecs=None): """Given the sync name wait for it to fully synchronize.