Fix driver task pattern to reduce periodic db load

Previously, periodic tasks followed a pattern where nodes, and in
many cases every node that was neither actively locked nor in a
maintenance state, were pulled in by the periodic task. The periodic
task would then create a task object for each node, generating
additional database queries in order to populate that object.

With the task object populated, the periodic task would then check
whether the node was actually using the driver interface in question
and *then* evaluate whether any work had to be performed.

However, the field indicating whether work needed to be performed
was often already returned by the initial database query used to
generate the list of nodes to evaluate.

In essence, this change moves that check earlier in the sequence so
the field is evaluated before a task is created. This applies
potentially across every conductor, depending on the query and
ultimately on which drivers are enabled.

This potentially saves hundreds of thousands of needless database
queries per day on a medium-sized deployment, depending on which
drivers and driver interfaces are in use.
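
For illustration only, a minimal runnable sketch of the new ordering
follows. The names used here (iter_nodes, acquire, and the
'raid_config_job_ids' key) are simplified stand-ins for the
conductor's manager.iter_nodes(), task_manager.acquire(), and the
driver_internal_info fields touched in the diffs below; they are not
the actual Ironic API.

    import contextlib

    def iter_nodes():
        # Stand-in for the initial DB query; it already returns
        # driver_internal_info alongside the node identity.
        yield ('uuid-1', 'idrac', '', {})
        yield ('uuid-2', 'idrac', '',
               {'raid_config_job_ids': ['42']})

    @contextlib.contextmanager
    def acquire(node_uuid, purpose):
        # Stand-in for task_manager.acquire(); building the task
        # object is the expensive step that triggers the extra
        # database queries this change avoids.
        print('building task for %s (%s)' % (node_uuid, purpose))
        yield node_uuid

    def check_raid_jobs():
        for (node_uuid, driver, group, info) in iter_nodes():
            # New pattern: decide from the data the initial query
            # already returned, *before* creating a task.
            if not info.get('raid_config_job_ids'):
                continue
            with acquire(node_uuid, 'checking raid jobs') as task:
                print('checking jobs via %s' % task)

    check_raid_jobs()
    # Only uuid-2 results in a task; uuid-1 is skipped with no
    # further database access.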

Change-Id: I409e87de2808d442d39e4d0ae6e995668230cbba
Julia Kreger 2021-09-13 09:46:08 -07:00
parent fa1c60cbce
commit 4fc1abf91f
7 changed files with 76 additions and 30 deletions


@ -169,6 +169,14 @@ class DracWSManBIOS(base.BIOSInterface):
for (node_uuid, driver, conductor_group,
driver_internal_info) in node_list:
try:
# NOTE(TheJulia): Evaluate if work is actually required before
# creating a task for every node in the deployment which does
# not have a lock and is not in maintenance mode.
if (not driver_internal_info.get("bios_config_job_ids")
and not driver_internal_info.get(
"factory_reset_time_before_reboot")):
continue
lock_purpose = 'checking async bios configuration jobs'
# Performing read-only/non-destructive work with shared lock
with task_manager.acquire(context, node_uuid,


@ -502,6 +502,17 @@ class DracRedfishManagement(redfish_management.RedfishManagement):
for (node_uuid, driver, conductor_group,
driver_internal_info) in node_list:
try:
task_monitor_url = driver_internal_info.get(
'import_task_monitor_url')
# NOTE(TheJulia): Evaluate if a task monitor URL exists
# based upon our initial DB query before pulling a task for
# every node in the deployment which reduces the overall
# number of DB queries triggering in the background where
# no work is required.
if not task_monitor_url:
continue
lock_purpose = 'checking async import configuration task'
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -509,10 +520,6 @@ class DracRedfishManagement(redfish_management.RedfishManagement):
if not isinstance(task.driver.management,
DracRedfishManagement):
continue
task_monitor_url = driver_internal_info.get(
'import_task_monitor_url')
if not task_monitor_url:
continue
self._check_import_configuration_task(
task, task_monitor_url)
except exception.NodeNotFound:


@ -1482,6 +1482,14 @@ class DracWSManRAID(base.RAIDInterface):
for (node_uuid, driver, conductor_group,
driver_internal_info) in node_list:
try:
job_ids = driver_internal_info.get('raid_config_job_ids')
# NOTE(TheJulia): Evaluate if there is work to be done
# based upon the original DB query's results so we don't
# proceed creating tasks for every node in the deployment.
if not job_ids:
continue
lock_purpose = 'checking async raid configuration jobs'
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -1489,10 +1497,6 @@ class DracWSManRAID(base.RAIDInterface):
if not isinstance(task.driver.raid, DracWSManRAID):
continue
job_ids = driver_internal_info.get('raid_config_job_ids')
if not job_ids:
continue
self._check_node_raid_jobs(task)
except exception.NodeNotFound:


@ -434,6 +434,13 @@ class IRMCRAID(base.RAIDInterface):
node_list = manager.iter_nodes(fields=fields, filters=filters)
for (node_uuid, driver, conductor_group, raid_config) in node_list:
try:
# NOTE(TheJulia): Evaluate based upon presence of raid
# configuration before triggering a task, as opposed to after
# so we don't create excess node task objects with related
# DB queries.
if not raid_config or raid_config.get('fgi_status'):
continue
lock_purpose = 'checking async RAID configuration tasks'
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -444,8 +451,6 @@ class IRMCRAID(base.RAIDInterface):
continue
if task.node.target_raid_config is None:
continue
if not raid_config or raid_config.get('fgi_status'):
continue
task.upgrade_lock()
if node.provision_state != states.CLEANWAIT:
continue


@ -872,6 +872,15 @@ class RedfishManagement(base.ManagementInterface):
for (node_uuid, driver, conductor_group,
driver_internal_info) in node_list:
try:
firmware_updates = driver_internal_info.get(
'firmware_updates')
# NOTE(TheJulia): If we don't have an entry upfront, we can
# safely skip past the node as we know work here is not
# required, otherwise minimizing the number of potential
# nodes to visit.
if not firmware_updates:
continue
lock_purpose = 'checking async firmware update failed.'
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -880,11 +889,6 @@ class RedfishManagement(base.ManagementInterface):
RedfishManagement):
continue
firmware_updates = driver_internal_info.get(
'firmware_updates')
if not firmware_updates:
continue
node = task.node
# A firmware update failed. Discard any remaining firmware
@ -921,6 +925,14 @@ class RedfishManagement(base.ManagementInterface):
for (node_uuid, driver, conductor_group,
driver_internal_info) in node_list:
try:
firmware_updates = driver_internal_info.get(
'firmware_updates')
# NOTE(TheJulia): Check and skip upfront before creating a
# task so we don't generate additional tasks and db queries
# for every node in CLEANWAIT which is not locked.
if not firmware_updates:
continue
lock_purpose = 'checking async firmware update tasks.'
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -929,11 +941,6 @@ class RedfishManagement(base.ManagementInterface):
RedfishManagement):
continue
firmware_updates = driver_internal_info.get(
'firmware_updates')
if not firmware_updates:
continue
self._check_node_firmware_update(task)
except exception.NodeNotFound:


@ -1032,6 +1032,15 @@ class RedfishRAID(base.RAIDInterface):
for (node_uuid, driver, conductor_group,
driver_internal_info) in node_list:
try:
raid_configs = driver_internal_info.get(
'raid_configs')
# NOTE(TheJulia): Evaluate the presence of raid configuration
# activity before pulling the task, so we don't needlessly
# create database queries with tasks which would be skipped
# anyhow.
if not raid_configs:
continue
lock_purpose = 'checking async RAID config failed.'
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -1039,11 +1048,6 @@ class RedfishRAID(base.RAIDInterface):
if not isinstance(task.driver.raid, RedfishRAID):
continue
raid_configs = driver_internal_info.get(
'raid_configs')
if not raid_configs:
continue
node = task.node
# A RAID config failed. Discard any remaining RAID
@ -1080,6 +1084,14 @@ class RedfishRAID(base.RAIDInterface):
for (node_uuid, driver, conductor_group,
driver_internal_info) in node_list:
try:
raid_configs = driver_internal_info.get(
'raid_configs')
# NOTE(TheJulia): Skip to next record if we do not
# have raid configuration tasks, so we don't pull tasks
# for every unrelated node in CLEANWAIT.
if not raid_configs:
continue
lock_purpose = 'checking async RAID config tasks.'
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -1087,11 +1099,6 @@ class RedfishRAID(base.RAIDInterface):
if not isinstance(task.driver.raid, RedfishRAID):
continue
raid_configs = driver_internal_info.get(
'raid_configs')
if not raid_configs:
continue
self._check_node_raid_config(task)
except exception.NodeNotFound:


@ -0,0 +1,8 @@
---
fixes:
- |
Fixes the pattern of execution for periodic tasks such that the majority
of drivers now evaluate *if* work needs to be performed in advance of
creating a node task. Depending on the individual driver query pattern,
this prevents excess database queries from being triggered with every
task execution.