From ffff76a682b37d7485fe4d261520d102ab3e5fb8 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Tue, 27 Apr 2021 10:22:42 -0700 Subject: [PATCH] Add basic tools for benchmarking Adds a horribly written, just hacked together little tool to help provide sizing insight into an ironic deployment's state and underlying performance. Key data: * Queries the list of nodes from a pure python interface level with the database and reports timing for the list of nodes to be returned. This information helps convey how long a periodic hits the database just for the query. * Requests *all* nodes using the query pattern/structure of the nova resource tracker, and uses the marker to make any additional requests. The data is parsed, and collected, and counts identified vendors, if any. * Collects basic data on conductors in terms of running, conductor groups as well as currently loaded drivers in the deployment. All of this information provides operational insight into *what* conditions exist within the deployment allowing developers to try and identify solutions based on the unique circumstances of larger deployments. Also adds a utility to generate and semi-randomize data to allow us to create a benchmark job in CI. Change-Id: Iae660aea82db8f1c4567ee2982595ccfdf434fe3 --- tools/benchmark/README | 13 ++ .../do_not_run_create_benchmark_data.py | 99 +++++++++ tools/benchmark/generate-statistics.py | 195 ++++++++++++++++++ 3 files changed, 307 insertions(+) create mode 100644 tools/benchmark/README create mode 100644 tools/benchmark/do_not_run_create_benchmark_data.py create mode 100644 tools/benchmark/generate-statistics.py diff --git a/tools/benchmark/README b/tools/benchmark/README new file mode 100644 index 0000000000..25590fe1e7 --- /dev/null +++ b/tools/benchmark/README @@ -0,0 +1,13 @@ +This folder contains two files: + +* do_not_run_create_benchmark_data.py - This script will destroy your + ironic database. DO NOT RUN IT. You have been warned! 
+ It is intended to generate a semi-random database of node data + which can be used for benchmarks, instead of crafting a raw SQL file + representing a test model. + +* generate-statistics.py - This is a utility to generate some statistics to both + aid in basic benchmarking of ironic operations *and* provide developers + with conceptual information regarding a deployment's size. It operates + only by reading the data present and timing how long the results take to + return as well as isolating some key details about the deployment. diff --git a/tools/benchmark/do_not_run_create_benchmark_data.py b/tools/benchmark/do_not_run_create_benchmark_data.py new file mode 100644 index 0000000000..1e050a0f1a --- /dev/null +++ b/tools/benchmark/do_not_run_create_benchmark_data.py @@ -0,0 +1,99 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
"""Generate semi-random ironic node data for use in benchmarks.

WARNING: per the accompanying README, this script will destroy your ironic
database. Do not run it against a deployment you care about.
"""

import sys
import time

from oslo_db.sqlalchemy import enginefacade
from sqlalchemy import sql

from ironic.common import service
from ironic.conf import CONF  # noqa To Load Configuration
from ironic.objects import node

# Minimum number of seconds between two progress reports while creating nodes.
_PROGRESS_INTERVAL = 10


def _create_test_nodes(node_count=10000):
    """Create fake nodes one by one, printing progress along the way.

    :param node_count: Number of fake nodes to create. Defaults to 10000,
                       which is the value the script has always used.
    """
    print("Starting creation of fake nodes.")
    # perf_counter() is used for durations so that wall-clock adjustments
    # cannot skew the reported numbers.
    start = time.perf_counter()
    checkin = start
    for i in range(node_count):
        # NOTE(review): the field dict is passed as the first positional
        # argument of Node(); confirm the constructor interprets it as field
        # values and not as the request context.
        new_node = node.Node({
            'power_state': 'power off',
            'driver': 'ipmi',
            'driver_internal_info': {'test-meow': i},
            'name': 'BenchmarkTestNode-%s' % i,
            'driver_info': {
                'ipmi_username': 'admin',
                'ipmi_password': 'admin',
                'ipmi_address': 'testhost%s.env.top.level.domain' % i},
            'resource_class': 'CUSTOM_BAREMETAL',
            'properties': {
                'cpu': 4,
                'memory': 32,
                'cats': i,
                'meowing': True}})
        new_node.create()
        now = time.perf_counter()
        delta = now - checkin
        if delta > _PROGRESS_INTERVAL:
            # Only report every so often to avoid flooding the console.
            checkin = now
            print('* At %s nodes, %0.02f seconds. Total elapsed: %s'
                  % (i, delta, now - start))
    elapse = time.perf_counter() - start
    print('Created %s nodes in %s seconds.\n' % (node_count, elapse))


def _mix_up_nodes_data():
    """Randomize indexed node columns with a fixed series of SQL UPDATEs.

    The statements use the RAND() and UUID() SQL functions, so the backing
    database has to provide them.
    """
    engine = enginefacade.writer.get_engine()

    # A list of commands to mix up indexed field data a bit to emulate what
    # a production database may somewhat look like.
    commands = [
        "UPDATE nodes set maintenance = True where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set driver = 'redfish' where RAND() < 0.5",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor01' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor02' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor03' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor04' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor05' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor06' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'active' where RAND() < 0.8",  # noqa Easier to read this way
        "UPDATE nodes set power_state = 'power on' where provision_state = 'active' and RAND() < 0.95",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'available' where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'manageable' where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'clean wait' where RAND() < 0.05",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'error' where RAND() < 0.05",  # noqa Easier to read this way
        "UPDATE nodes set owner = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
        "UPDATE nodes set lessee = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
        "UPDATE nodes set instance_uuid = (select UUID()) where RAND() < 0.95 and provision_state = 'active'",  # noqa Easier to read this way
        "UPDATE nodes set last_error = (select UUID()) where RAND() <0.05",  # noqa Easier to read this way
    ]
    # The context manager guarantees the connection is released even when
    # one of the statements fails.
    # NOTE(review): no explicit commit is issued, as before; confirm the
    # engine in use auto-commits these UPDATE statements.
    with engine.connect() as conn:
        start = time.perf_counter()
        for command in commands:
            print("Executing SQL command: \\" + command + ";\n")
            conn.execute(sql.text(command))
            print("* Completed command. %0.04f elapsed since start of "
                  "commands." % (time.perf_counter() - start))


def main():
    """Prepare the ironic service configuration and create the fake nodes."""
    service.prepare_service()
    CONF.set_override('debug', False)
    _create_test_nodes()
    # NOTE(review): _mix_up_nodes_data() is defined above but is not invoked
    # here; confirm whether that is intentional before wiring it in.


if __name__ == '__main__':
    sys.exit(main())


# ---------------------------------------------------------------------------
# Next file in this patch: tools/benchmark/generate-statistics.py
# (new file mode 100644, index 0000000000..65e8d664f0)
# ---------------------------------------------------------------------------
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Print basic sizing and timing statistics about an ironic deployment.

The script only reads data: it times how long node listings take at the
database, object and API-controller layers, and summarizes the conductors
and drivers it finds.
"""

import collections
import datetime
import json
import sys
import time
from unittest import mock

from ironic_lib import metrics_utils
import oslo_policy
from oslo_utils import timeutils

from ironic.api.controllers.v1 import node as node_api
from ironic.api.controllers.v1 import utils as api_utils
from ironic.common import context
from ironic.common import service
from ironic.conf import CONF  # noqa To Load Configuration
from ironic.db import api as db_api
from ironic.objects import conductor
from ironic.objects import node


def _calculate_delta(start, finish):
    """Return the elapsed time between two timestamps."""
    return finish - start


def _add_a_line():
    """Print a horizontal separator line."""
    print('------------------------------------------------------------')


def _assess_db_performance():
    """Time obtaining the DB API client and a plain node list query.

    :returns: The number of nodes returned by the database query.
    """
    # perf_counter() is used for all durations so that wall-clock
    # adjustments cannot skew the measurements.
    start = time.perf_counter()
    dbapi = db_api.get_instance()
    # Take the timestamp right away so that console output is not counted
    # as part of obtaining the client.
    got_connection = time.perf_counter()
    print('Phase - Assess DB performance')
    _add_a_line()
    query_start = time.perf_counter()
    nodes = dbapi.get_node_list()
    node_count = len(nodes)
    query_complete = time.perf_counter()
    delta = _calculate_delta(start, got_connection)
    print('Obtained DB client in %s seconds.' % delta)
    delta = _calculate_delta(query_start, query_complete)
    print('Returned %s nodes in python %s seconds from the DB.\n' %
          (node_count, delta))
    # return node count for future use.
    return node_count


def _assess_db_and_object_performance():
    """Time listing nodes as objects and serializing each one to JSON.

    Also reports the vendors found in the nodes' ``driver_internal_info``.
    """
    print('Phase - Assess DB & Object conversion Performance')
    _add_a_line()
    start = time.perf_counter()
    node_list = node.Node().list(context.get_admin_context())
    got_list = time.perf_counter()
    delta = _calculate_delta(start, got_list)
    print('Obtained list of node objects in %s seconds.' % delta)
    count = 0
    tbl_size = 0
    # In a sense, this helps provide a relative understanding if the
    # database is the bottleneck, or the objects post conversion.
    # converting completely to json and then measuring the size helps
    # ensure that everything is "assessed" while not revealing too
    # much detail.
    for node_obj in node_list:
        # Just looping through the entire set to count should be
        # enough to ensure that the entry is loaded from the db
        # and then converted to an object.
        # sys.getsizeof() on the dict would only measure the container, so
        # the record is serialized and the encoded length is used instead.
        # default=str covers values json cannot encode natively; only the
        # size matters here, not round-tripping.
        serialized = json.dumps(node_obj.as_dict(secure=True), default=str)
        tbl_size += len(serialized.encode('utf-8'))
        count += 1
    delta = _calculate_delta(got_list, time.perf_counter())
    print('Took %s seconds to iterate through %s node objects.' %
          (delta, count))
    print('Nodes table is roughly %s bytes of JSON.' % tbl_size)
    # Count how many nodes report each vendor, if any do.
    observed_vendors = collections.Counter()
    for node_obj in node_list:
        vendor = (node_obj.driver_internal_info or {}).get('vendor')
        if vendor:
            observed_vendors[vendor] += 1
    print('Vendors observed: %s\n' % dict(observed_vendors))


def _get_nodes_page(controller, fields, marker):
    """Request one page of nodes from the nodes API controller.

    :param controller: A NodesController instance.
    :param fields: Comma separated string of the node fields to request.
    :param marker: UUID passed as the pagination marker, or None for the
                   first page.
    :returns: Whatever ``_get_nodes_collection`` returns for the page.
    """
    return controller._get_nodes_collection(
        chassis_uuid=None,
        instance_uuid=None,
        associated=None,
        maintenance=None,
        retired=None,
        provision_state=None,
        marker=marker,
        limit=None,
        sort_key="id",
        sort_dir="asc",
        # A fresh list per request, exactly as each call did before.
        fields=fields.split(','))


@mock.patch('ironic.api.request')  # noqa patch needed for the object model
@mock.patch.object(metrics_utils, 'get_metrics_logger', lambda *_: mock.Mock)
@mock.patch.object(api_utils, 'check_list_policy', lambda *_: None)
@mock.patch.object(api_utils, 'check_allow_specify_fields', lambda *_: None)
@mock.patch.object(api_utils, 'check_allowed_fields', lambda *_: None)
@mock.patch.object(oslo_policy.policy, 'LOG', autospec=True)
def _assess_db_object_and_api_performance(mock_log, mock_request):
    """Time paging through every node via the nodes API controller.

    The request, policy checks and metrics logger are mocked so that the
    controller can be driven directly, without a running API service.

    :param mock_log: Mock replacing the oslo.policy logger (injected).
    :param mock_request: Mock replacing ``ironic.api.request`` (injected).
    """
    print('Phase - Assess DB, Object conversion & API Performance')
    _add_a_line()
    # Just mock it to silence it since getting the logger to update
    # config seems like not a thing once started. :\
    mock_log.debug = mock.Mock()
    # Internal logic requires major/minor versions and a context to
    # proceed. This is just to make the NodesController respond properly.
    mock_request.context = context.get_admin_context()
    mock_request.version.major = 1
    mock_request.version.minor = 71

    start = time.perf_counter()
    node_api_controller = node_api.NodesController()
    node_api_controller.context = context.get_admin_context()
    fields = ("uuid,power_state,target_power_state,provision_state,"
              "target_provision_state,last_error,maintenance,properties,"
              "instance_uuid,traits,resource_class")

    res = _get_nodes_page(node_api_controller, fields, marker=None)
    total_nodes = len(res['nodes'])
    # Keep asking for the next page until one comes back empty. An empty
    # first page (no nodes at all) simply skips the loop.
    while res['nodes']:
        marker = res['nodes'][-1]['uuid']
        print(" ** Getting nodes ** %s Elapsed: %s seconds." %
              (total_nodes, _calculate_delta(start, time.perf_counter())))
        res = _get_nodes_page(node_api_controller, fields, marker=marker)
        page = res['nodes']
        # Also stop when the page does not advance past the marker, so the
        # loop can never spin on the same final record.
        if not page or page[-1]['uuid'] == marker:
            break
        total_nodes += len(page)

    delta = _calculate_delta(start, time.perf_counter())
    print('Took %s seconds to return all %s nodes via '
          'nodes API call pattern.\n' % (delta, total_nodes))


def _report_conductors():
    """Print conductor, conductor group and driver statistics."""
    print('Phase - identifying conductors/drivers')
    _add_a_line()
    conductors = conductor.Conductor().list(
        context.get_admin_context(),
    )
    drivers = set()
    groups = []
    online_count = 0
    # A conductor is counted as online when it was updated within the
    # last 90 seconds.
    online_by = (timeutils.utcnow(with_timezone=True)
                 - datetime.timedelta(seconds=90))
    for conductor_obj in conductors:
        if conductor_obj.conductor_group:
            groups.append(conductor_obj.conductor_group)
        updated_at = conductor_obj.updated_at
        # Guard against a record without an update timestamp, which cannot
        # be compared against a datetime.
        if updated_at is not None and updated_at > online_by:
            online_count += 1
        drivers.update(conductor_obj.drivers or ())
    conductor_count = len(conductors)
    print('Conductor count: %s' % conductor_count)
    print('Online conductor count: %s' % online_count)
    running_with_groups = len(groups)
    print('Conductors with conductor_groups: %s' % running_with_groups)
    group_count = len(set(groups))
    print('Conductor group count: %s' % group_count)
    # Sorted so that repeated runs print the drivers in a stable order.
    driver_list = sorted(drivers)
    print('Presently supported drivers: %s' % driver_list)


def main():
    """Prepare the ironic service configuration and run every phase."""
    service.prepare_service()
    CONF.set_override('debug', False)
    _assess_db_performance()
    _assess_db_and_object_performance()
    _assess_db_object_and_api_performance()
    _report_conductors()


if __name__ == '__main__':
    sys.exit(main())