Fix queue_manager in a containerized environment

This patch fixes the operation of queue_manager in a
containerized environment by adding an additional check
on the start_time in ticks since boot.
This way, we can detect a restart even when the PID remains
unchanged, as is usual in containers, because the start_time
is different.

[1] https://www.man7.org/linux/man-pages//man5/proc_pid_stat.5.html

From man page above:

(22) starttime  %llu
    The time the process started after system boot.
    Before Linux 2.6, this value was expressed in
    jiffies.  Since Linux 2.6, the value is expressed
    in clock ticks (divide by sysconf(_SC_CLK_TCK)).

Closes-Bug: #2078935
Change-Id: I9e22433ec039ad6783593d9cb7fbe22c9090534e
(cherry picked from commit 6790f702fa5a56c9d25a4becba670bf1810a4162)
(cherry picked from commit 09f808f5383b7deff22a6f8a90ed21fda746f243)
This commit is contained in:
Michal Arbet 2024-09-04 19:01:56 +02:00 committed by Takashi Kajinami
parent 8ab852f334
commit 5369e1ed7e

View File

@ -66,6 +66,13 @@ class QManager(object):
# We use the process group to restart the counter on service restart # We use the process group to restart the counter on service restart
self.pg = os.getpgrp() self.pg = os.getpgrp()
# We need to also handle containerized deployments, so let's
# parse start time (in clock ticks) since system boot
#
# https://www.man7.org/linux/man-pages//man5/proc_pid_stat.5.html
with open(f'/proc/{self.pg}/stat', 'r') as f:
self.start_time = int(f.read().split()[21])
def get(self): def get(self):
lock_name = 'oslo_read_shm_%s_%s' % (self.hostname, self.processname) lock_name = 'oslo_read_shm_%s_%s' % (self.hostname, self.processname)
@ -75,28 +82,32 @@ class QManager(object):
# This function is thread and process safe thanks to lockutils # This function is thread and process safe thanks to lockutils
try: try:
with open(self.file_name, 'r') as f: with open(self.file_name, 'r') as f:
pg, c = f.readline().split(':') pg, counter, start_time = f.readline().split(':')
pg = int(pg) pg = int(pg)
c = int(c) counter = int(counter)
start_time = int(start_time)
except (FileNotFoundError, ValueError): except (FileNotFoundError, ValueError):
pg = self.pg pg = self.pg
c = 0 counter = 0
start_time = self.start_time
# Increment the counter # Increment the counter
if pg == self.pg: if pg == self.pg and start_time == self.start_time:
c += 1 counter += 1
else: else:
# The process group changed, maybe service restarted? # The process group is changed, or start time since system boot
# differs. Maybe service restarted ?
# Start over the counter # Start over the counter
c = 1 counter = 1
# Write the new counter # Write the new counter
with open(self.file_name, 'w') as f: with open(self.file_name, 'w') as f:
f.write(str(self.pg) + ':' + str(c)) f.write(str(self.pg) + ':' + str(counter) + ':' +
return c str(start_time))
return counter
c = read_from_shm() counter = read_from_shm()
return self.hostname + ":" + self.processname + ":" + str(c) return self.hostname + ":" + self.processname + ":" + str(counter)
class MessageOperationsHandler(object): class MessageOperationsHandler(object):