Mask system OVS services instead of erroring out

OVS crashes if there are multiple conflicting installations.
As such, the charm currently errors out when detecting system
OVS services.

The problem is that with DPDK, we expect OVS bridges and bonds
to be preconfigured through MAAS/Netplan, which currently
use system (deb) OVS packages.

Instead of erroring out, we'll stop and mask system OVS services,
then reapply Netplan to move the OVS bridges to the snap based
OVS installation. Netplan is expected to contain the following
patch: https://github.com/canonical/netplan/pull/549

Note that any configuration defined in the system OVS db that's
not set in Netplan will be lost.

Change-Id: Id398a2e4ef4cd138aa2db35d71e021f61f94b67d
Signed-off-by: Lucian Petrut <lpetrut@cloudbasesolutions.com>
This commit is contained in:
Lucian Petrut
2025-08-27 11:50:41 +00:00
parent 41812ee91b
commit 722857842c
6 changed files with 294 additions and 322 deletions

View File

@@ -2,7 +2,6 @@ external-libraries:
- charms.data_platform_libs.v0.data_interfaces
- charms.grafana_agent.v0.cos_agent
- charms.operator_libs_linux.v2.snap
- charms.operator_libs_linux.v1.systemd
- charms.rabbitmq_k8s.v0.rabbitmq
- charms.traefik_k8s.v2.ingress
- charms.tls_certificates_interface.v3.tls_certificates

View File

@@ -26,6 +26,7 @@ import io
import logging
import os
import secrets
import shutil
import socket
import string
import subprocess
@@ -63,9 +64,6 @@ from charms.nova_k8s.v0.nova_service import (
NovaConfigChangedEvent,
NovaServiceGoneAwayEvent,
)
from charms.operator_libs_linux.v1.systemd import (
service_running,
)
from cryptography import (
x509,
)
@@ -544,9 +542,85 @@ class HypervisorOperatorCharm(sunbeam_charm.OSBaseOperatorCharm):
else:
logger.debug("Snap settings do not need updating")
def _clear_system_ovs_datapaths(self):
logger.info("Clearing system OVS datapaths.")
if not shutil.which("ovs-dpctl"):
logger.info(
"ovs-dpctl not found, skipped clearing system OVS datapaths."
)
return
result = subprocess.run(
["ovs-dpctl", "dump-dps"],
capture_output=True,
text=True,
check=True,
)
datapaths = result.stdout.strip().split("\n")
for datapath in datapaths:
logger.info("Removing OVS datapath: %s", datapath)
subprocess.run(["ovs-dpctl", "del-dp", datapath], check=True)
def _disable_system_ovs(self) -> bool:
"""Disable deb installed OVS.
OVS crashes if there are multiple conflicting installations, as such
we are going to mask system OVS services and use the snap based
installation from "openstack-hypervisor" instead.
OVS bridges and bonds defined in MAAS will be included in the Netplan
configuration. We expect Netplan to contain the following in
order to work with snap based installations:
https://github.com/canonical/netplan/pull/549
Note that any configuration defined in the system OVS db that's
not set in Netplan will be lost.
Returns a boolean, stating if any changes were made.
"""
ovs_services = [
"openvswitch-switch.service",
"ovs-vswitchd.service",
"ovsdb-server.service",
"ovs-record-hostname.service",
]
changes_made = False
for service_name in ovs_services:
unit_info = utils.get_systemd_unit_status(service_name)
if not unit_info:
logger.debug("%s unit not found.", service_name)
continue
if unit_info["active_state"] != "inactive":
logging.info("Stopping unit: %s", service_name)
subprocess.run(["systemctl", "stop", service_name], check=True)
changes_made = True
else:
logger.debug("%s unit already stopped.", service_name)
if unit_info["load_state"] != "masked":
logging.info("Masking unit: %s", service_name)
subprocess.run(["systemctl", "mask", service_name], check=True)
changes_made = True
else:
logger.debug("%s unit already masked.", service_name)
if changes_made:
self._clear_system_ovs_datapaths()
return changes_made
def ensure_snap_present(self):
"""Install snap if it is not already present."""
config = self.model.config.get
# If we've just disabled the system OVS services, reapply
# the netplan configuration to the snap based installation.
netplan_apply_needed = self._disable_system_ovs()
try:
cache = self.get_snap_cache()
hypervisor = cache["openstack-hypervisor"]
@@ -556,6 +630,14 @@ class HypervisorOperatorCharm(sunbeam_charm.OSBaseOperatorCharm):
snap.SnapState.Latest, channel=config("snap-channel")
)
self._connect_to_epa_orchestrator()
# Netplan expects the "ovs-vsctl" alias in order to pick up the
# snap installation. The other aliases are there for consistency
# and convenience.
hypervisor.alias("ovs-vsctl", "ovs-vsctl")
hypervisor.alias("ovs-appctl", "ovs-appctl")
hypervisor.alias("ovs-dpctl", "ovs-dpctl")
hypervisor.alias("ovs-ofctl", "ovs-ofctl")
except (snap.SnapError, snap.SnapNotFoundError) as e:
logger.error(
"An exception occurred when installing openstack-hypervisor. Reason: %s",
@@ -566,6 +648,14 @@ class HypervisorOperatorCharm(sunbeam_charm.OSBaseOperatorCharm):
"openstack-hypervisor installation failed"
)
if netplan_apply_needed:
logger.info("System OVS services masked, reapplying netplan.")
subprocess.run(["netplan", "apply"], check=True)
else:
logger.debug(
"No OVS changes made, skipping netplan configuration."
)
def _connect_to_epa_orchestrator(self):
"""Connect openstack-hypervisor snap plug to epa-orchestrator snap slot."""
cache = self.get_snap_cache()
@@ -598,21 +688,8 @@ class HypervisorOperatorCharm(sunbeam_charm.OSBaseOperatorCharm):
"""Return snap cache."""
return snap.SnapCache()
def check_system_services(self) -> None:
"""Check if system services are in desired state."""
if service_running("openvswitch-switch.service"):
logger.error(
"OpenVSwitch service is running, please stop it before proceeding. "
"OpenVSwitch is managed by the openstack-hypervisor snap and will "
"conflict with the snap's operation."
)
raise sunbeam_guard.BlockedExceptionError(
"Breaking: OpenVSwitch service is running on the host."
)
def configure_unit(self, event) -> None:
"""Run configuration on this unit."""
self.check_system_services()
self.check_leader_ready()
self.check_relation_handlers_ready(event)
config = self.model.config.get

View File

@@ -18,6 +18,7 @@
import collections
import json
import logging
import os
import subprocess
@@ -162,3 +163,34 @@ def get_cpu_numa_architecture() -> dict:
numa_nodes[int(numa_node_id)].append(int(core_id))
return dict(numa_nodes)
def get_systemd_unit_status(unit_name: str) -> dict[str, str] | None:
"""Get Systemd unit status information."""
result = subprocess.run(
[
"systemctl",
"list-units",
"--all",
"-o",
"json",
"--no-pager",
unit_name,
],
capture_output=True,
text=True,
check=True,
)
units = json.loads(result.stdout)
for unit_info in units:
name = unit_info["unit"]
if name != unit_name:
continue
return {
"name": name,
"description": unit_info["description"],
"load_state": unit_info["load"],
"active_state": unit_info["active"],
"substate": unit_info["sub"],
}

View File

@@ -32,7 +32,6 @@ import jsonschema
import ops
import ops.testing
import ops_sunbeam.test_utils as test_utils
from ops_sunbeam import guard as sunbeam_guard
class _HypervisorOperatorCharm(charm.HypervisorOperatorCharm):
@@ -65,7 +64,6 @@ class TestCharm(test_utils.CharmTestCase):
"get_local_ip_by_default_route",
"subprocess",
"ConsulNotifyRequirer",
"service_running",
"epa_client",
]
@@ -73,7 +71,6 @@ class TestCharm(test_utils.CharmTestCase):
"""Setup OpenStack Hypervisor tests."""
super().setUp(charm, self.PATCHES)
self.patch_obj(os, "system")
self.service_running.return_value = False
self.snap.SnapError = Exception
self.consul_notify_mock = MagicMock()
@@ -477,20 +474,6 @@ class TestCharm(test_utils.CharmTestCase):
with self.assertRaises(snap.SnapError):
self.harness.charm._connect_to_epa_orchestrator()
def test_check_system_services_raises_when_ovs_running(self):
"""Test check_system_services raises BlockedExceptionError if OVS is running."""
self.service_running.return_value = True
self.harness.begin()
with self.assertRaises(sunbeam_guard.BlockedExceptionError):
self.harness.charm.check_system_services()
def test_check_system_services_passes_when_ovs_not_running(self):
"""Test check_system_services does nothing if OVS is not running."""
self.service_running.return_value = False
self.harness.begin()
# Should not raise
self.harness.charm.check_system_services()
@contextlib.contextmanager
def _mock_dpdk_settings_file(self, dpdk_yaml):
with tempfile.NamedTemporaryFile(mode="w") as f:
@@ -751,3 +734,105 @@ network:
self.harness.charm._epa_client.allocate_hugepages.assert_called_once_with(
charm.EPA_ALLOCATION_OVS_DPDK_HUGEPAGES, 2, 1024 * 1024, 0
)
@mock.patch("shutil.which")
def test_clear_system_ovs_datapaths(self, mock_which):
"""Test clearing system ovs datapaths."""
self.harness.begin()
mock_which.return_value = "/usr/bin/ovs-dpctl"
self.subprocess.run.side_effect = [
mock.Mock(stdout="dp1\ndp2"),
mock.Mock(stdout=None),
mock.Mock(stdout=None),
]
self.harness.charm._clear_system_ovs_datapaths()
self.subprocess.run.assert_has_calls(
[
mock.call(
["ovs-dpctl", "dump-dps"],
capture_output=True,
text=True,
check=True,
),
mock.call(["ovs-dpctl", "del-dp", "dp1"], check=True),
mock.call(["ovs-dpctl", "del-dp", "dp2"], check=True),
]
)
@mock.patch("shutil.which")
def test_clear_system_ovs_datapaths_missing_ovs_dpctl(self, mock_which):
"""Test clearing system ovs datapaths, no ovs-dpctl found."""
self.harness.begin()
mock_which.return_value = None
self.harness.charm._clear_system_ovs_datapaths()
self.subprocess.run.assert_not_called()
@mock.patch("utils.get_systemd_unit_status")
@mock.patch.object(
charm.HypervisorOperatorCharm, "_clear_system_ovs_datapaths"
)
def test_disable_system_ovs(self, mock_clear_datapaths, mock_get_status):
"""Test disabling system ovs services."""
self.harness.begin()
mock_get_status.side_effect = [
{
"name": "openvswitch-switch.service",
"load_state": "masked",
"active_state": "active",
"substate": "running",
},
{
"name": "ovs-vswitchd.service",
"load_state": "loaded",
"active_state": "inactive",
"substate": "dead",
},
{
"name": "ovsdb-server.service",
"load_state": "loaded",
"active_state": "active",
"substate": "running",
},
None,
]
ret_val = self.harness.charm._disable_system_ovs()
self.assertTrue(ret_val)
self.subprocess.run.assert_has_calls(
[
mock.call(
["systemctl", "stop", "openvswitch-switch.service"],
check=True,
),
mock.call(
["systemctl", "mask", "ovs-vswitchd.service"], check=True
),
mock.call(
["systemctl", "stop", "ovsdb-server.service"], check=True
),
mock.call(
["systemctl", "mask", "ovsdb-server.service"], check=True
),
]
)
mock_clear_datapaths.assert_called_once_with()
@mock.patch("utils.get_systemd_unit_status")
@mock.patch.object(
charm.HypervisorOperatorCharm, "_clear_system_ovs_datapaths"
)
def test_disable_system_ovs_missing(
self, mock_clear_datapaths, mock_get_status
):
"""Test disabling system ovs, no services found."""
self.harness.begin()
mock_get_status.return_value = None
ret_val = self.harness.charm._disable_system_ovs()
self.assertFalse(ret_val)

View File

@@ -0,0 +1,67 @@
# Copyright 2023 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Openstack hypervisor utils."""
import unittest
from unittest import (
mock,
)
import utils
class TestOpenstackHypervisorUtils(unittest.TestCase):
"""Tests for Openstack hypervisor utils."""
@mock.patch("subprocess.run")
def test_get_systemd_unit_status_not_found(self, mock_run):
"""Test retrieving inexistent systemd unit status."""
mock_run.return_value.stdout = "[]"
self.assertIsNone(utils.get_systemd_unit_status("test.service"))
@mock.patch("subprocess.run")
def test_get_systemd_unit_status(self, mock_run):
"""Test retrieving systemd unit status."""
mock_run.return_value.stdout = (
'[{"unit":"ovs-vswitchd.service","load":"masked",'
'"active":"inactive","sub":"dead",'
'"description":"ovs-vswitchd.service"}]'
)
exp_out = {
"name": "ovs-vswitchd.service",
"load_state": "masked",
"active_state": "inactive",
"substate": "dead",
"description": "ovs-vswitchd.service",
}
out = utils.get_systemd_unit_status("ovs-vswitchd.service")
self.assertEqual(exp_out, out)
mock_run.assert_called_once_with(
[
"systemctl",
"list-units",
"--all",
"-o",
"json",
"--no-pager",
"ovs-vswitchd.service",
],
capture_output=True,
text=True,
check=True,
)

View File

@@ -1,288 +0,0 @@
# Copyright 2021 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstractions for stopping, starting and managing system services via systemd.
This library assumes that your charm is running on a platform that uses systemd. E.g.,
Centos 7 or later, Ubuntu Xenial (16.04) or later.
For the most part, we transparently provide an interface to a commonly used selection of
systemd commands, with a few shortcuts baked in. For example, service_pause and
service_resume with run the mask/unmask and enable/disable invocations.
Example usage:
```python
from charms.operator_libs_linux.v0.systemd import service_running, service_reload
# Start a service
if not service_running("mysql"):
success = service_start("mysql")
# Attempt to reload a service, restarting if necessary
success = service_reload("nginx", restart_on_failure=True)
```
"""
__all__ = [ # Don't export `_systemctl`. (It's not the intended way of using this lib.)
"SystemdError",
"daemon_reload",
"service_disable",
"service_enable",
"service_failed",
"service_pause",
"service_reload",
"service_restart",
"service_resume",
"service_running",
"service_start",
"service_stop",
]
import logging
import subprocess
logger = logging.getLogger(__name__)
# The unique Charmhub library identifier, never change it
LIBID = "045b0d179f6b4514a8bb9b48aee9ebaf"
# Increment this major API version when introducing breaking changes
LIBAPI = 1
# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
class SystemdError(Exception):
"""Custom exception for SystemD related errors."""
def _systemctl(*args: str, check: bool = False) -> int:
"""Control a system service using systemctl.
Args:
*args: Arguments to pass to systemctl.
check: Check the output of the systemctl command. Default: False.
Returns:
Returncode of systemctl command execution.
Raises:
SystemdError: Raised if calling systemctl returns a non-zero returncode and check is True.
"""
cmd = ["systemctl", *args]
logger.debug(f"Executing command: {cmd}")
try:
proc = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
encoding="utf-8",
check=check,
)
logger.debug(
f"Command {cmd} exit code: {proc.returncode}. systemctl output:\n{proc.stdout}"
)
return proc.returncode
except subprocess.CalledProcessError as e:
raise SystemdError(
f"Command {cmd} failed with returncode {e.returncode}. systemctl output:\n{e.stdout}"
)
def service_running(service_name: str) -> bool:
"""Report whether a system service is running.
Args:
service_name: The name of the service to check.
Return:
True if service is running/active; False if not.
"""
# If returncode is 0, this means that is service is active.
return _systemctl("--quiet", "is-active", service_name) == 0
def service_failed(service_name: str) -> bool:
"""Report whether a system service has failed.
Args:
service_name: The name of the service to check.
Returns:
True if service is marked as failed; False if not.
"""
# If returncode is 0, this means that the service has failed.
return _systemctl("--quiet", "is-failed", service_name) == 0
def service_start(*args: str) -> bool:
"""Start a system service.
Args:
*args: Arguments to pass to `systemctl start` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl start ...` returns a non-zero returncode.
"""
return _systemctl("start", *args, check=True) == 0
def service_stop(*args: str) -> bool:
"""Stop a system service.
Args:
*args: Arguments to pass to `systemctl stop` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl stop ...` returns a non-zero returncode.
"""
return _systemctl("stop", *args, check=True) == 0
def service_restart(*args: str) -> bool:
"""Restart a system service.
Args:
*args: Arguments to pass to `systemctl restart` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl restart ...` returns a non-zero returncode.
"""
return _systemctl("restart", *args, check=True) == 0
def service_enable(*args: str) -> bool:
"""Enable a system service.
Args:
*args: Arguments to pass to `systemctl enable` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl enable ...` returns a non-zero returncode.
"""
return _systemctl("enable", *args, check=True) == 0
def service_disable(*args: str) -> bool:
"""Disable a system service.
Args:
*args: Arguments to pass to `systemctl disable` (normally the service name).
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl disable ...` returns a non-zero returncode.
"""
return _systemctl("disable", *args, check=True) == 0
def service_reload(service_name: str, restart_on_failure: bool = False) -> bool:
"""Reload a system service, optionally falling back to restart if reload fails.
Args:
service_name: The name of the service to reload.
restart_on_failure:
Boolean indicating whether to fall back to a restart if the reload fails.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl reload|restart ...` returns a non-zero returncode.
"""
try:
return _systemctl("reload", service_name, check=True) == 0
except SystemdError:
if restart_on_failure:
return service_restart(service_name)
else:
raise
def service_pause(service_name: str) -> bool:
"""Pause a system service.
Stops the service and prevents the service from starting again at boot.
Args:
service_name: The name of the service to pause.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if service is still running after being paused by systemctl.
"""
_systemctl("disable", "--now", service_name)
_systemctl("mask", service_name)
if service_running(service_name):
raise SystemdError(f"Attempted to pause {service_name!r}, but it is still running.")
return True
def service_resume(service_name: str) -> bool:
"""Resume a system service.
Re-enable starting the service again at boot. Start the service.
Args:
service_name: The name of the service to resume.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if service is not running after being resumed by systemctl.
"""
_systemctl("unmask", service_name)
_systemctl("enable", "--now", service_name)
if not service_running(service_name):
raise SystemdError(f"Attempted to resume {service_name!r}, but it is not running.")
return True
def daemon_reload() -> bool:
"""Reload systemd manager configuration.
Returns:
On success, this function returns True for historical reasons.
Raises:
SystemdError: Raised if `systemctl daemon-reload` returns a non-zero returncode.
"""
return _systemctl("daemon-reload", check=True) == 0