diff --git a/doc/source/user/db_sync_error_handling.rst b/doc/source/user/db_sync_error_handling.rst index 3413ec147..5bce24b6c 100644 --- a/doc/source/user/db_sync_error_handling.rst +++ b/doc/source/user/db_sync_error_handling.rst @@ -29,8 +29,9 @@ and some error-handling operations. * The maximum or minimum number of pods is out of range * Error compute scale_level -* Conflict with LCM operation - +* LCM operation + * Conflict with LCM operation + * Abnormal LCM operation status The maximum or minimum number of pods is out of range ----------------------------------------------------- @@ -221,7 +222,7 @@ When tacker-conductor.log contains the following error log, it means compute scale_level error. .. note:: If you don't have tacker-conductor.log, - you can execute the following CLI command to create tacker-conductor.log. + you can execute the following CLI command to show tacker-conductor.log. .. code-block:: console @@ -333,22 +334,26 @@ the initial increment is a multiple of the scale level. for details. +LCM operation +------------- + Conflict with LCM operation ---------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + There are two kinds of conflicts: * Database synchronization occurs while LCM operation is in progress. * LCM operation occurs during DB synchronization. Database synchronization occurs while a LCM operation is in progress -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' When tacker-conductor.log contains the following info log, it means database synchronization conflict with LCM operation, and database synchronization will skip. .. note:: If you don't have tacker-conductor.log, - you can execute the following CLI command to create tacker-conductor.log. + you can execute the following CLI command to show tacker-conductor.log. .. 
code-block:: console @@ -365,7 +370,7 @@ Waiting for LCM operation completes and database synchronization will be repeated at a default time. LCM operation occurs during DB synchronization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +'''''''''''''''''''''''''''''''''''''''''''''' When LCM operation responds 409, it conflicts with Database synchronization. @@ -380,3 +385,37 @@ Debug log: .. code-block:: console Ended sync_db + + +Abnormal LCM operation status +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +During synchronization, Tacker checks the operationState of VnfLcmOpOcc. +For the same vnf instance, if ``FAILED_TEMP`` exists in the operationState, +or the latest operationState is ``FAILED``, +Tacker will output an error log and will not update the database. + +.. note:: If you don't have tacker-conductor.log, + you can execute the following CLI command to show tacker-conductor.log. + +.. code-block:: console + + journalctl -u devstack@tacker-conductor + +Error log: + +.. code-block:: console + + The LCM operation status of the vnf: 81c4be9d-25ad-4726-8640-f2c4c326de2e is abnormal, so skip this DB synchronization. + +Error-handling operations: + +To resolve this error, use one of the following approaches. + +* For the operation state of ``FAILED_TEMP``, please refer to + `VNF LCM error-handling`_. + +* For the operation state of ``FAILED``, please perform other LCM operations + on this vnf instance until the result is ``COMPLETED``. + +.. 
def is_lcmocc_failure_status(context, inst_id):
    """Report whether a VNF instance's LCM operation history is abnormal.

    The history is considered abnormal when at least one operation
    occurrence of the instance is in ``FAILED_TEMP`` state, or when the
    occurrence with the greatest ``startTime`` is in ``FAILED`` state.

    :param context: request context passed through to the object layer.
    :param inst_id: ID of the VNF instance whose occurrences are checked.
    :returns: True if the history is abnormal, False otherwise (including
        when the instance has no operation occurrence at all).
    """
    # Materialize once: the result is scanned twice below.
    inst_lcmoccs = list(objects.VnfLcmOpOccV2.get_by_filter(
        context, vnfInstanceId=inst_id))

    # Without any occurrence there is no "latest" one to inspect; indexing
    # the (empty) list of latest occurrences used to raise IndexError here.
    if not inst_lcmoccs:
        return False

    if any(lcmocc.operationState ==
           fields.LcmOperationStateType.FAILED_TEMP
           for lcmocc in inst_lcmoccs):
        return True

    # max() returns the first occurrence carrying the greatest startTime,
    # which is the same element the previous equality scan selected.
    latest_lcmocc = max(inst_lcmoccs, key=lambda lcmocc: lcmocc.startTime)
    return (latest_lcmocc.operationState ==
            fields.LcmOperationStateType.FAILED)
+ if lcmocc_utils.is_lcmocc_failure_status(context, inst.id): + raise sol_ex.DbSyncFailed( + f"The LCM operation status of the vnf: {inst.id} is abnormal, " + "so skip this DB synchronization.") + vnf_inst = inst_utils.get_inst(context, inst.id) self.vnflcm_driver.sync_db( context, vnf_inst, vim_info) diff --git a/tacker/tests/unit/vnfm/infra_drivers/kubernetes/test_kubernetes_driver.py b/tacker/tests/unit/vnfm/infra_drivers/kubernetes/test_kubernetes_driver.py index 3fc4a80bd..59ca5ccc9 100644 --- a/tacker/tests/unit/vnfm/infra_drivers/kubernetes/test_kubernetes_driver.py +++ b/tacker/tests/unit/vnfm/infra_drivers/kubernetes/test_kubernetes_driver.py @@ -3858,6 +3858,7 @@ class TestKubernetes(base.TestCase): heal_vnf_request=heal_request_data_obj) self.assertEqual(mock_list_namespaced_pod.call_count, 0) + @mock.patch.object(objects.VnfLcmOpOccList, "get_by_filters") @mock.patch.object(kubernetes_driver.Kubernetes, "_sync_vnfc_resource_and_pod_resource") @mock.patch.object(objects.VimConnectionInfo, "obj_from_primitive") @@ -3869,11 +3870,15 @@ class TestKubernetes(base.TestCase): def test_sync_db( self, mock_list_namespaced_pod, mock_check_pod_information, mock_get_by_id, mock_save, mock_get_vim, mock_vim, - mock_sync_vnfc): + mock_sync_vnfc, mock_op_occs): mock_list_namespaced_pod.return_value = client.V1PodList( items=[fakes.get_fake_pod_info(kind='Deployment')]) mock_check_pod_information.return_value = True + vnf_lcm_op_occ = vnflcm_fakes.vnflcm_scale_out_cnf() + vnf_lcm_op_occs = objects.VnfLcmOpOccList(objects=[vnf_lcm_op_occ]) + mock_op_occs.return_value = vnf_lcm_op_occs + vnf_instance_obj = vnflcm_fakes.return_vnf_instance( fields.VnfInstanceState.INSTANTIATED) vnf_instance_obj.vnf_metadata['namespace'] = "default" @@ -3966,6 +3971,7 @@ class TestKubernetes(base.TestCase): f"Failed to synchronize database vnf: " f"{vnf_instance_obj.id}", cm.output[0]) + @mock.patch.object(objects.VnfLcmOpOccList, "get_by_filters") 
@mock.patch.object(objects.VimConnectionInfo, "obj_from_primitive") @mock.patch.object(vnflcm_utils, "get_vim") @mock.patch.object(VnfInstance, "save") @@ -3974,7 +3980,7 @@ class TestKubernetes(base.TestCase): @mock.patch.object(client.CoreV1Api, 'list_namespaced_pod') def test_sync_db_check_pod_false( self, mock_list_namespaced_pod, mock_check_pod_information, - mock_get_by_id, mock_save, mock_get_vim, mock_vim): + mock_get_by_id, mock_save, mock_get_vim, mock_vim, mock_op_occs): mock_list_namespaced_pod.return_value = client.V1PodList( items=[fakes.get_fake_pod_info(kind='Pod')]) mock_check_pod_information.side_effect = [True, False] @@ -3992,6 +3998,10 @@ class TestKubernetes(base.TestCase): mock_get_by_id.return_value = vnf_instance_obj mock_vim.return_value = vim_connection_object + vnf_lcm_op_occ = vnflcm_fakes.vnflcm_scale_out_cnf() + vnf_lcm_op_occs = objects.VnfLcmOpOccList(objects=[vnf_lcm_op_occ]) + mock_op_occs.return_value = vnf_lcm_op_occs + self.kubernetes.sync_db( context=self.context, vnf_instance=vnf_instance_obj, vim_info=vim_connection_object) @@ -3999,6 +4009,7 @@ class TestKubernetes(base.TestCase): self.assertEqual(2, mock_check_pod_information.call_count) self.assertEqual(2, mock_save.call_count) + @mock.patch.object(objects.VnfLcmOpOccList, "get_by_filters") @mock.patch.object(kubernetes_driver.Kubernetes, "_sync_vnfc_resource_and_pod_resource") @mock.patch.object(objects.VimConnectionInfo, "obj_from_primitive") @@ -4010,7 +4021,7 @@ class TestKubernetes(base.TestCase): def test_sync_db_not_succeeded( self, mock_list_namespaced_pod, mock_check_pod_information, mock_get_by_id, mock_save, mock_get_vim, mock_vim, - mock_sync_vnfc): + mock_sync_vnfc, mock_op_occs): mock_list_namespaced_pod.return_value = client.V1PodList( items=[fakes.get_fake_pod_info(kind='Pod')]) mock_check_pod_information.return_value = True @@ -4029,11 +4040,16 @@ class TestKubernetes(base.TestCase): mock_vim.return_value = vim_connection_object 
mock_sync_vnfc.return_value = False + vnf_lcm_op_occ = vnflcm_fakes.vnflcm_scale_out_cnf() + vnf_lcm_op_occs = objects.VnfLcmOpOccList(objects=[vnf_lcm_op_occ]) + mock_op_occs.return_value = vnf_lcm_op_occs + self.kubernetes.sync_db( context=self.context, vnf_instance=vnf_instance_obj, vim_info=vim_connection_object) self.assertEqual(1, mock_sync_vnfc.call_count) + @mock.patch.object(objects.VnfLcmOpOccList, "get_by_filters") @mock.patch.object(objects.VimConnectionInfo, "obj_from_primitive") @mock.patch.object(vnflcm_utils, "get_vim") @mock.patch.object(VnfInstance, "save") @@ -4042,7 +4058,7 @@ class TestKubernetes(base.TestCase): @mock.patch.object(client.CoreV1Api, 'list_namespaced_pod') def test_sync_db_failed_update_db( self, mock_list_namespaced_pod, mock_check_pod_information, - mock_get_by_id, mock_save, mock_get_vim, mock_vim): + mock_get_by_id, mock_save, mock_get_vim, mock_vim, mock_op_occs): mock_list_namespaced_pod.return_value = client.V1PodList( items=[fakes.get_fake_pod_info(kind='Deployment')]) mock_check_pod_information.return_value = True @@ -4060,6 +4076,10 @@ class TestKubernetes(base.TestCase): mock_get_by_id.return_value = vnf_instance_obj mock_vim.return_value = vim_connection_object + vnf_lcm_op_occ = vnflcm_fakes.vnflcm_scale_out_cnf() + vnf_lcm_op_occs = objects.VnfLcmOpOccList(objects=[vnf_lcm_op_occ]) + mock_op_occs.return_value = vnf_lcm_op_occs + log_name = "tacker.vnfm.infra_drivers.kubernetes.kubernetes_driver" with self.assertLogs(logger=log_name, level=logging.ERROR) as cm: self.kubernetes.sync_db( diff --git a/tacker/vnfm/infra_drivers/kubernetes/kubernetes_driver.py b/tacker/vnfm/infra_drivers/kubernetes/kubernetes_driver.py index 38b6764fb..671e682cd 100644 --- a/tacker/vnfm/infra_drivers/kubernetes/kubernetes_driver.py +++ b/tacker/vnfm/infra_drivers/kubernetes/kubernetes_driver.py @@ -2734,6 +2734,15 @@ class Kubernetes(abstract_driver.VnfAbstractDriver, context, vnf_inst.id) if vnf_instance.instantiation_state != 
def is_lcmocc_failure_status(context, inst_id):
    """Report whether a VNF instance's LCM operation history is abnormal.

    The history is considered abnormal when at least one operation
    occurrence of the instance is in ``FAILED_TEMP`` state, or when the
    occurrence with the greatest ``start_time`` is in ``FAILED`` state.

    :param context: request context passed through to the object layer.
    :param inst_id: ID of the VNF instance whose occurrences are checked.
    :returns: True if the history is abnormal, False otherwise (including
        when the instance has no operation occurrence at all).
    """
    filters = {'field': 'vnf_instance_id', 'model': 'VnfLcmOpOccs',
               'value': inst_id, 'op': '=='}
    vnf_lcm_op_occs = objects.VnfLcmOpOccList.get_by_filters(
        context, read_deleted='no', filters=filters)
    lcmoccs = vnf_lcm_op_occs.objects

    # Without any occurrence there is no "latest" one to inspect; indexing
    # the (empty) list of latest occurrences used to raise IndexError here.
    if not lcmoccs:
        return False

    if any(lcmocc.operation_state ==
           fields.LcmOccsOperationState.FAILED_TEMP
           for lcmocc in lcmoccs):
        return True

    # max() returns the first occurrence carrying the greatest start_time,
    # which is the same element the previous equality scan selected.
    latest_lcmocc = max(lcmoccs, key=lambda lcmocc: lcmocc.start_time)
    return (latest_lcmocc.operation_state ==
            fields.LcmOccsOperationState.FAILED)