From 97e5c0e9b1906f2993b4c12820ac3cb9ddcfe821 Mon Sep 17 00:00:00 2001
From: Mark Goddard <mark@stackhpc.com>
Date: Fri, 5 Jan 2024 11:02:39 +0000
Subject: [PATCH] cadvisor: Set housekeeping interval to Prometheus scrape
 interval

The prometheus_cadvisor container has high CPU usage. On various
production systems I checked, it sits around 13-16% on controllers,
averaged over the Prometheus 1m scrape interval. When viewed with top,
we can see that it is a bit spiky and can jump over 100%.

There are various bugs about this, but I found
https://github.com/google/cadvisor/issues/2523 which suggests increasing
the per-container housekeeping interval. This defaults to 1s, which
provides far greater granularity than we need with the default
Prometheus scrape interval of 60s.

Increasing the housekeeping interval to 60s on a production controller
reduced the average CPU usage from 13% to 3.5%. This still seems high,
but is more reasonable.

Change-Id: I89c62a45b1f358aafadcc0317ce882f4609543e7
Closes-Bug: #2048223
---
 ansible/roles/prometheus/defaults/main.yml           | 2 +-
 releasenotes/notes/bug-2048223-bb66fa11c6b36c5e.yaml | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 releasenotes/notes/bug-2048223-bb66fa11c6b36c5e.yaml

diff --git a/ansible/roles/prometheus/defaults/main.yml b/ansible/roles/prometheus/defaults/main.yml
index 8818f19a2f..f18256d442 100644
--- a/ansible/roles/prometheus/defaults/main.yml
+++ b/ansible/roles/prometheus/defaults/main.yml
@@ -354,7 +354,7 @@ prometheus_server_command: >-
   --storage.tsdb.path /var/lib/prometheus{% if prometheus_cmdline_extras %} {{ prometheus_cmdline_extras }}{% endif %}
 
 prometheus_blackbox_exporter_cmdline_extras: ""
-prometheus_cadvisor_cmdline_extras: "--docker_only --store_container_labels=false --disable_metrics=percpu,referenced_memory,cpu_topology,resctrl,udp,advtcp,sched,hugetlb,memory_numa,tcp,process"
+prometheus_cadvisor_cmdline_extras: "--docker_only --store_container_labels=false --disable_metrics=percpu,referenced_memory,cpu_topology,resctrl,udp,advtcp,sched,hugetlb,memory_numa,tcp,process --housekeeping_interval={{ prometheus_scrape_interval }}"
 prometheus_elasticsearch_exporter_cmdline_extras: ""
 prometheus_haproxy_exporter_cmdline_extras: ""
 prometheus_memcached_exporter_cmdline_extras: ""
diff --git a/releasenotes/notes/bug-2048223-bb66fa11c6b36c5e.yaml b/releasenotes/notes/bug-2048223-bb66fa11c6b36c5e.yaml
new file mode 100644
index 0000000000..eb06cd79c7
--- /dev/null
+++ b/releasenotes/notes/bug-2048223-bb66fa11c6b36c5e.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with high CPU usage of the cAdvisor container by setting the
+    per-container housekeeping interval to the same value as the Prometheus
+    scrape interval. `LP#2048223
+    <https://bugs.launchpad.net/kolla-ansible/+bug/2048223>`__