Fix systemd service start rate limiting
The default limit is to allow 5 restarts in a 10sec period. If a
service goes over that threshold due to the Restart= config option in
the service definition, it will not attempt to restart any further.
We should not set StartLimitIntervalSec to 0 to disable any kind of
rate limiting as that may end up impacting the node load.
Instead, use tenacity to retry with an exponential backoff when the
service unit enablement fails. Before each retry, reset the unit's
failure counters with the systemctl wrapper. This is a crash-loop
approach that provides efficient feature parity with the classic
rate limiting, should we want to implement that for the systemctl
command wrapper instead.
Closes-bug: #1839841
Change-Id: I537fbf9933f2cbe6e1c2f627ba77da645bd55f25
Signed-off-by: Bogdan Dobrelya <bdobreli@redhat.com>
(cherry picked from commit 2eaebe2cd9)
This commit is contained in:
parent
6d7c756cb2
commit
5b5e578cc5
|
@ -13,6 +13,7 @@
|
||||||
# License for the specific language governing permissions and limitations
|
# License for the specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import tenacity
|
||||||
|
|
||||||
from paunch.utils import common
|
from paunch.utils import common
|
||||||
|
|
||||||
|
@ -45,12 +46,33 @@ def daemon_reload(log=None):
|
||||||
systemctl(['daemon-reload'], log)
|
systemctl(['daemon-reload'], log)
|
||||||
|
|
||||||
|
|
||||||
|
def reset_failed(service, log=None):
|
||||||
|
systemctl(['reset-failed', service], log)
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(bogdando): this implements a crash-loop with reset-failed
|
||||||
|
# counters approach that provides efficient feature parity with the
|
||||||
|
# classic rate limiting, should we want to implement that for the
|
||||||
|
# systemctl command wrapper instead.
|
||||||
|
@tenacity.retry( # Try up to 5 times in total, with jittered exponential backoff
|
||||||
|
reraise=True,
|
||||||
|
retry=tenacity.retry_if_exception_type(
|
||||||
|
SystemctlException
|
||||||
|
),
|
||||||
|
wait=tenacity.wait_random_exponential(multiplier=1, max=10),
|
||||||
|
stop=tenacity.stop_after_attempt(5)
|
||||||
|
)
|
||||||
def enable(service, now=True, log=None):
|
def enable(service, now=True, log=None):
|
||||||
cmd = ['enable']
|
cmd = ['enable']
|
||||||
if now:
|
if now:
|
||||||
cmd.append('--now')
|
cmd.append('--now')
|
||||||
cmd.append(service)
|
cmd.append(service)
|
||||||
|
try:
|
||||||
systemctl(cmd, log)
|
systemctl(cmd, log)
|
||||||
|
except SystemctlException as err:
|
||||||
|
# Reset failure counters for the service unit and retry
|
||||||
|
reset_failed(service, log)
|
||||||
|
raise SystemctlException(str(err))
|
||||||
|
|
||||||
|
|
||||||
def disable(service, log=None):
|
def disable(service, log=None):
|
||||||
|
|
Loading…
Reference in New Issue