Never raise an exception in notify()

notify() is called from python-ovs code which is not built to
recover from an exception in this user-overridden code. If there
is an exception (e.g. the DB server is down when we process
the hash ring), this exception can cause an unrecoverable error
in processing OVSDB messages, rendering the neutron worker useless.

Change-Id: I5f703d82175d71a222c76df37a82b5ccad890d14
(cherry picked from commit 67e616b238)
(cherry picked from commit 848787785e)
Conflicts: neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovsdb_monitor.py
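For readers outside the neutron tree, here is a minimal sketch of the pattern this change applies. The names below (MyIdl, _do_real_work) are hypothetical and only illustrate the shape of the fix, not neutron's actual code: the body of the overridden notify() is wrapped in a broad try/except so nothing it does can raise back into the python-ovs loop that invoked it.

import logging

LOG = logging.getLogger(__name__)


class MyIdl:  # stand-in for a subclass of ovs.db.idl.Idl
    def notify(self, event, row, updates=None):
        # Never let an exception escape: python-ovs calls notify() and is
        # not built to recover if user-overridden code raises.
        try:
            self._do_real_work(event, row, updates)
        except Exception as e:
            LOG.exception(e)

    def _do_real_work(self, event, row, updates):
        # Hypothetical handler body: hash-ring lookups, DB heartbeats,
        # dispatching row events... any of which may fail, e.g. when the
        # DB server is down.
        raise RuntimeError("DB server unreachable")


MyIdl().notify("update", row=None)  # logs the traceback instead of raising

The trade-off is that genuine bugs are reduced to logged tracebacks, but the IDL connection keeps processing OVSDB messages instead of leaving the worker useless.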
Terry Wilson, 2023-01-26 08:37:24 -06:00 (committed by yatinkarel)
parent 72aa15c2a4
commit 3566cc065e
1 changed file with 33 additions and 29 deletions


neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovsdb_monitor.py

@@ -567,38 +567,42 @@ class OvnIdlDistributedLock(BaseOvnIdl):
         self._last_touch = None
 
     def notify(self, event, row, updates=None):
-        self.notify_handler.notify(event, row, updates, global_=True)
         try:
-            target_node = self._hash_ring.get_node(str(row.uuid))
-        except exceptions.HashRingIsEmpty as e:
-            LOG.error('HashRing is empty, error: %s', e)
-            return
-        if target_node != self._node_uuid:
-            return
-
-        # If the worker hasn't been health checked by the maintenance
-        # thread (see bug #1834498), indicate that it's alive here
-        time_now = timeutils.utcnow()
-        touch_timeout = time_now - datetime.timedelta(
-            seconds=ovn_const.HASH_RING_TOUCH_INTERVAL)
-        if not self._last_touch or touch_timeout >= self._last_touch:
-            # NOTE(lucasagomes): Guard the db operation with an exception
-            # handler. If heartbeating fails for whatever reason, log
-            # the error and continue with processing the event
+            self.notify_handler.notify(event, row, updates, global_=True)
             try:
-                ctx = neutron_context.get_admin_context()
-                ovn_hash_ring_db.touch_node(ctx, self._node_uuid)
-                self._last_touch = time_now
-            except Exception:
-                LOG.exception('Hash Ring node %s failed to heartbeat',
-                              self._node_uuid)
-
-        LOG.debug('Hash Ring: Node %(node)s (host: %(hostname)s) '
-                  'handling event "%(event)s" for row %(row)s '
-                  '(table: %(table)s)',
-                  {'node': self._node_uuid, 'hostname': CONF.host,
-                   'event': event, 'row': row.uuid, 'table': row._table.name})
-        self.notify_handler.notify(event, row, updates)
+                target_node = self._hash_ring.get_node(str(row.uuid))
+            except exceptions.HashRingIsEmpty as e:
+                LOG.error('HashRing is empty, error: %s', e)
+                return
+            if target_node != self._node_uuid:
+                return
+
+            # If the worker hasn't been health checked by the maintenance
+            # thread (see bug #1834498), indicate that it's alive here
+            time_now = timeutils.utcnow()
+            touch_timeout = time_now - datetime.timedelta(
+                seconds=ovn_const.HASH_RING_TOUCH_INTERVAL)
+            if not self._last_touch or touch_timeout >= self._last_touch:
+                # NOTE(lucasagomes): Guard the db operation with an exception
+                # handler. If heartbeating fails for whatever reason, log
+                # the error and continue with processing the event
+                try:
+                    ctx = neutron_context.get_admin_context()
+                    ovn_hash_ring_db.touch_node(ctx, self._node_uuid)
+                    self._last_touch = time_now
+                except Exception:
+                    LOG.exception('Hash Ring node %s failed to heartbeat',
+                                  self._node_uuid)
+
+            LOG.debug('Hash Ring: Node %(node)s (host: %(hostname)s) '
+                      'handling event "%(event)s" for row %(row)s '
+                      '(table: %(table)s)',
+                      {'node': self._node_uuid, 'hostname': CONF.host,
+                       'event': event, 'row': row.uuid,
+                       'table': row._table.name})
+            self.notify_handler.notify(event, row, updates)
+        except Exception as e:
+            LOG.exception(e)
 
     @abc.abstractmethod
     def post_connect(self):