diff --git a/monitoring/collectd-extensions/PKG-INFO b/monitoring/collectd-extensions/PKG-INFO deleted file mode 100644 index 436011064..000000000 --- a/monitoring/collectd-extensions/PKG-INFO +++ /dev/null @@ -1,10 +0,0 @@ -Metadata-Version: 1.1 -Name: collectd-extensions -Version: 1.0 -Summary: collectd-extensions -Home-page: -Author: Windriver -Author-email: info@windriver.com -License: ASL 2.0 -Description: Titanium Cloud collectd extensions -Platform: UNKNOWN diff --git a/monitoring/collectd-extensions/centos/build_srpm.data b/monitoring/collectd-extensions/centos/build_srpm.data deleted file mode 100644 index d4ac143ba..000000000 --- a/monitoring/collectd-extensions/centos/build_srpm.data +++ /dev/null @@ -1,25 +0,0 @@ -SRC_DIR="$PKG_BASE" - -COPY_LIST="$PKG_BASE/src/LICENSE \ - $PKG_BASE/src/collectd.conf.pmon \ - $PKG_BASE/src/collectd.service \ - $PKG_BASE/src/fm_notifier.py \ - $PKG_BASE/src/mtce_notifier.py \ - $PKG_BASE/src/plugin_common.py \ - $PKG_BASE/src/python_plugins.conf \ - $PKG_BASE/src/cpu.py \ - $PKG_BASE/src/cpu.conf \ - $PKG_BASE/src/memory.py \ - $PKG_BASE/src/memory.conf \ - $PKG_BASE/src/df.conf \ - $PKG_BASE/src/ntpq.py \ - $PKG_BASE/src/ntpq.conf \ - $PKG_BASE/src/interface.py \ - $PKG_BASE/src/interface.conf \ - $PKG_BASE/src/remotels.py \ - $PKG_BASE/src/remotels.conf \ - $PKG_BASE/src/ptp.py \ - $PKG_BASE/src/ptp.conf \ - $PKG_BASE/src/example.py \ - $PKG_BASE/src/example.conf" -TIS_PATCH_VER=13 diff --git a/monitoring/collectd-extensions/centos/collectd-extensions.spec b/monitoring/collectd-extensions/centos/collectd-extensions.spec deleted file mode 100644 index df780ac7e..000000000 --- a/monitoring/collectd-extensions/centos/collectd-extensions.spec +++ /dev/null @@ -1,110 +0,0 @@ -Summary: Titanuim Server collectd Package -Name: collectd-extensions -Version: 1.0 -Release: 0%{?_tis_dist}.%{tis_patch_ver} -License: ASL 2.0 -Group: base -Packager: Wind River -URL: unknown - -# create the files tarball -Source0: %{name}-%{version}.tar.gz -Source1: collectd.service -Source2: collectd.conf.pmon - -# collectd python plugin files - notifiers -Source3: fm_notifier.py -Source4: mtce_notifier.py -Source5: plugin_common.py - -# collectd python plugin files - resource plugins -Source11: cpu.py -Source12: memory.py -Source14: example.py -Source15: ntpq.py -Source16: interface.py -Source17: remotels.py -Source18: ptp.py - -# collectd plugin conf files into /etc/collectd.d -Source100: python_plugins.conf -Source101: cpu.conf -Source102: memory.conf -Source103: df.conf -Source104: example.conf -Source105: ntpq.conf -Source106: interface.conf -Source107: remotels.conf -Source108: ptp.conf - -BuildRequires: systemd-devel - -Requires: systemd -Requires: collectd -Requires: fm-api -Requires: python-httplib2 -Requires: python-influxdb -Requires: python-oslo-concurrency -Requires: tsconfig -Requires: /bin/systemctl - -%description -Titanium Cloud collectd extensions - -%define debug_package %{nil} -%define local_unit_dir %{_sysconfdir}/systemd/system -%define local_plugin_dir %{_sysconfdir}/collectd.d -%define local_python_extensions_dir /opt/collectd/extensions/python -%define local_config_extensions_dir /opt/collectd/extensions/config - -%prep -%setup - -%build - -%install -install -m 755 -d %{buildroot}%{_sysconfdir} -install -m 755 -d %{buildroot}%{local_unit_dir} -install -m 755 -d %{buildroot}%{local_plugin_dir} -install -m 755 -d %{buildroot}%{local_config_extensions_dir} -install -m 755 -d %{buildroot}%{local_python_extensions_dir} - -# support files ; service and 
pmon conf -install -m 644 %{SOURCE1} %{buildroot}%{local_unit_dir} -install -m 600 %{SOURCE2} %{buildroot}%{local_config_extensions_dir} - -# collectd python plugin files - notifiers -install -m 700 %{SOURCE3} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE4} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE5} %{buildroot}%{local_python_extensions_dir} - -# collectd python plugin files - resource plugins -install -m 700 %{SOURCE11} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE12} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE14} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE15} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir} -install -m 700 %{SOURCE18} %{buildroot}%{local_python_extensions_dir} - - -# collectd plugin conf files into /etc/collectd.d -install -m 600 %{SOURCE100} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE101} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE102} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE103} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE104} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE105} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE106} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE107} %{buildroot}%{local_plugin_dir} -install -m 600 %{SOURCE108} %{buildroot}%{local_plugin_dir} - -%clean -rm -rf $RPM_BUILD_ROOT - -%files -%defattr(-,root,root,-) -%config(noreplace) %{local_unit_dir}/collectd.service -%{local_plugin_dir}/* -%{local_config_extensions_dir}/* -%{local_python_extensions_dir}/* diff --git a/monitoring/collectd-extensions/src/LICENSE b/monitoring/collectd-extensions/src/LICENSE deleted file mode 100644 index d64569567..000000000 --- a/monitoring/collectd-extensions/src/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/monitoring/collectd-extensions/src/collectd.conf.pmon b/monitoring/collectd-extensions/src/collectd.conf.pmon deleted file mode 100644 index 8d905d432..000000000 --- a/monitoring/collectd-extensions/src/collectd.conf.pmon +++ /dev/null @@ -1,18 +0,0 @@ -[process] -process = collectd -service = collectd -style = lsb -pidfile = /var/run/collectd.pid -severity = major ; minor, major, critical -restarts = 3 ; restart retries before error assertion -interval = 5 ; number of seconds to wait between restarts -debounce = 10 ; number of seconds that a process needs to remain - ; running before degrade is removed and retry count - ; is cleared. -startuptime = 3 ; Seconds to wait after process start before starting the debounce monitor -mode = passive ; Monitoring mode: passive (default) or active - ; passive: process death monitoring (default: always) - ; active : heartbeat monitoring, i.e. 
request / response messaging - ; ignore : do not monitor or stop monitoring -quorum = 0 ; process is in the host watchdog quorum - diff --git a/monitoring/collectd-extensions/src/collectd.service b/monitoring/collectd-extensions/src/collectd.service deleted file mode 100644 index 1ac7cb036..000000000 --- a/monitoring/collectd-extensions/src/collectd.service +++ /dev/null @@ -1,15 +0,0 @@ -[Unit] -Description=Collectd statistics daemon and extension services -Documentation=man:collectd(1) man:collectd.conf(5) -Before=pmon.service -After=local-fs.target network-online.target -Requires=local-fs.target network-online.target - -[Service] -Type=notify -ExecStart=/usr/sbin/collectd -ExecStartPost=/bin/bash -c 'echo $MAINPID > /var/run/collectd.pid' -ExecStopPost=/bin/rm -f /var/run/collectd.pid - -[Install] -WantedBy=multi-user.target diff --git a/monitoring/collectd-extensions/src/cpu.conf b/monitoring/collectd-extensions/src/cpu.conf deleted file mode 100644 index 75394cdb2..000000000 --- a/monitoring/collectd-extensions/src/cpu.conf +++ /dev/null @@ -1,22 +0,0 @@ -# For stock plugin only -# Uncomment to compare stock to tiS plugin readings -# --------------------- -# -# ReportByCpu false -# ReportByState false -# ValuesPercentage true -# - - - - - Instance "used" - Persist true - PersistOK true - WarningMax 90.00 - FailureMax 95.00 - Hits 2 - Invert false - - - diff --git a/monitoring/collectd-extensions/src/cpu.py b/monitoring/collectd-extensions/src/cpu.py deleted file mode 100755 index d32fd33b7..000000000 --- a/monitoring/collectd-extensions/src/cpu.py +++ /dev/null @@ -1,262 +0,0 @@ -# -# Copyright (c) 2018-2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -############################################################################ -# -# This file is the collectd 'Platform CPU Usage' Monitor. -# -# The Platform CPU Usage is calculated as an averaged percentage of -# platform core usable since the previous sample. -# -# Init Function: -# - if 'worker_reserved.conf exists then query/store PLATFORM_CPU_LIST -# -############################################################################ -import os -import time -import collectd - -debug = False - -PASS = 0 -FAIL = 1 - -PATH = '/proc/cpuinfo' -WORKER_RESERVED_CONF = '/etc/platform/worker_reserved.conf' - -PLUGIN = 'platform cpu usage plugin' - - -# CPU Control class -class CPU: - hostname = "" # hostname for sample notification message - usage = float(0.0) # float value of cpu usage - - processors = int(0) # number of processors for all cpus case - cpu_list = [] # list of CPUs to calculate combined usage for - cpu_time = [] # schedstat time for each CPU - cpu_time_last = [] # last schedstat time for each CPU - time_last = float(0.0) # float of the time the last sample was taken - - def log_error(self, err_str): - """Print an error log with plugin name prefixing the log""" - - collectd.error("%s %s" % (PLUGIN, err_str)) - - -# Instantiate the class -c = CPU() - - -# The collectd configuration interface -# collectd needs this defined ; but not used/needed. 
-def config_func(config): - collectd.info('%s config function' % PLUGIN) - - -# Get the platform cpu list and number of cpus reported by /proc/cpuinfo -def init_func(): - # get current hostname - c.hostname = os.uname()[1] - - collectd.info('%s init function for %s' % (PLUGIN, c.hostname)) - - raw_list = "" - if os.path.exists(WORKER_RESERVED_CONF): - with open(WORKER_RESERVED_CONF, 'r') as infile: - for line in infile: - if 'PLATFORM_CPU_LIST' in line: - val = line.split("=") - raw_list = val[1].strip('\n')[1:-1].strip('"') - break - if raw_list: - - # Convert the cpu list fetched from the compute - # reserved file into an integer list. - # Handle mix of number list #,# and number range #-# - split_list = raw_list.split(',') - if debug: - collectd.info('%s split list: %s' % (PLUGIN, split_list)) - for cpu in split_list: - if cpu.find('-') == -1: - # add individual cpu # with assumed ',' delimiter - c.cpu_list.append(int(cpu)) - else: - # add all in range #-# - cpu_range = cpu.split('-') - if len(cpu_range) == 2: - first = int(cpu_range[0]) - last = int(cpu_range[1]) + 1 - # add each - for i in list(range(first, last)): - c.cpu_list.append(i) - - # with the full CPU list in hand we can now just read their samples - if debug: - collectd.info('%s full cpu list: %s' % - (PLUGIN, c.cpu_list)) - - try: - f = open('/proc/cpuinfo') - except EnvironmentError as e: - collectd.error(str(e), UserWarning) - else: - - if len(c.cpu_list) == 0: - _want_all_cpus = True - else: - _want_all_cpus = False - - c.processors = 0 - for line in f: - name_value = [s.strip() for s in line.split(':', 1)] - if len(name_value) != 2: - continue - - name, value = name_value - if 'rocessor' in name: - if _want_all_cpus is True: - c.cpu_list.append(int(c.processors)) - c.processors += 1 - - collectd.info('%s has found %d cpus total' % - (PLUGIN, c.processors)) - collectd.info('%s monitoring %d cpus %s' % - (PLUGIN, len(c.cpu_list), c.cpu_list)) - f.close() - - -# Calculate the CPU usage sample -def read_func(): - try: - f = open('/proc/schedstat') - except EnvironmentError as e: - c.log_error('file open failed ; ' + str(e)) - return FAIL - else: - # schedstat time for each CPU - c.cpu_time = [] - - # Loop over each line ... - # get the output version ; only 15 is supported - # get the cpu time from each line staring with 'cpux ....' - for line in f: - - # break each line into name/value pairs - line_split = [s.strip() for s in line.split(' ', 1)] - name, value = line_split - - # get the output version. - if 'ersion' in name: - try: - c.version = int(value) - except ValueError as e: - c.log_error('got invalid schedstat version ; ' + str(e)) - - # TODO: Consider exiting here and raising alarm. - # Calling this type of exit will stop the plugin. 
- # sys._exit() - return FAIL - - # only version 15 is supported - if c.version == 15: - if 'cpu' in name: - # get the cpu number for each line - if int(name.replace('cpu', '')) in c.cpu_list: - _in_list = True - else: - _in_list = False - - # get cpu time for each cpu that is valid - if len(c.cpu_list) == 0 or _in_list is True: - _schedstat = value - value_split = value.split(' ') - c.cpu_time.append(float(value_split[6])) - if debug: - collectd.info('%s %s schedstat is %s [%s]' % - (PLUGIN, name, value_split[6], - _schedstat)) - else: - collectd.error('%s unsupported schedstat version [%d]' % - (PLUGIN, c.version)) - return 0 - - f.close() - - # Now that we have the cpu time recorded for each cpu - _time_delta = float(0) - _cpu_count = int(0) - if len(c.cpu_time_last) == 0: - c.time_last = time.time() - if c.cpu_list: - # This is a compute node. - # Do not include vswitch or pinned cpus in calculation. - for cpu in c.cpu_list: - c.cpu_time_last.append(float(c.cpu_time[_cpu_count])) - _cpu_count += 1 - if debug: - collectd.info('%s cpu time ; first pass ; %s' % - (PLUGIN, c.cpu_time)) - return PASS - else: - _time_this = time.time() - _time_delta = _time_this - c.time_last - c.total_avg_cpu = 0 - cpu_occupancy = [] - if debug: - collectd.info('%s cpu time ; this pass ; %s -> %s' % - (PLUGIN, c.cpu_time_last, c.cpu_time)) - - if c.cpu_list: - # This is a compute node. - # Do not include vswitch or pinned cpus in calculation. - for cpu in c.cpu_list: - if cpu >= c.processors: - c.log_error(' got out of range cpu number') - else: - _delta = (c.cpu_time[_cpu_count] - c.cpu_time_last[_cpu_count]) - _delta = _delta / 1000000 / _time_delta - cpu_occupancy.append(float((100 * (_delta)) / 1000)) - c.total_avg_cpu += cpu_occupancy[_cpu_count] - if debug: - collectd.info('%s cpu %d - count:%d [%s]' % - (PLUGIN, cpu, _cpu_count, cpu_occupancy)) - _cpu_count += 1 - - else: - collectd.info('%s no cpus to monitor' % PLUGIN) - return 0 - - c.usage = c.total_avg_cpu / _cpu_count - if debug: - collectd.info('%s reports %.2f %% usage (averaged)' % - (PLUGIN, c.usage)) - - # Prepare for next audit ; mode now to last - # c.cpu_time_last = [] - c.cpu_time_last = c.cpu_time - c.time_last = _time_this - - # if os.path.exists('/var/run/fit/cpu_data'): - # with open('/var/run/fit/cpu_data', 'r') as infile: - # for line in infile: - # c.usage = float(line) - # collectd.info("%s using FIT data:%.2f" % - # (PLUGIN, c.usage)) - # break - - # Dispatch usage value to collectd - val = collectd.Values(host=c.hostname) - val.plugin = 'cpu' - val.type = 'percent' - val.type_instance = 'used' - val.dispatch(values=[c.usage]) - - return 0 - - -collectd.register_config(config_func) -collectd.register_init(init_func) -collectd.register_read(read_func) diff --git a/monitoring/collectd-extensions/src/df.conf b/monitoring/collectd-extensions/src/df.conf deleted file mode 100644 index 62eef59dd..000000000 --- a/monitoring/collectd-extensions/src/df.conf +++ /dev/null @@ -1,41 +0,0 @@ - - ValuesPercentage true - IgnoreSelected false - ReportByDevice false - ReportInodes false - ValuesAbsolute false - MountPoint "/" - MountPoint "/tmp" - MountPoint "/dev" - MountPoint "/dev/shm" - MountPoint "/var/run" - MountPoint "/var/log" - MountPoint "/var/lock" - MountPoint "/boot" - MountPoint "/scratch" - MountPoint "/opt/etcd" - MountPoint "/opt/platform" - MountPoint "/opt/extension" - MountPoint "/var/lib/rabbitmq" - MountPoint "/var/lib/postgresql" - MountPoint "/var/lib/ceph/mon" - MountPoint "/var/lib/docker" - MountPoint 
"/var/lib/docker-distribution" - MountPoint "/var/lib/kubelet" - MountPoint "/var/lib/nova/instances" - MountPoint "/opt/backups" - - - - - - Instance "used" - WarningMax 80.00 - FailureMax 90.00 - Persist true - PersistOK true - Hits 2 - Invert false - - - diff --git a/monitoring/collectd-extensions/src/example.conf b/monitoring/collectd-extensions/src/example.conf deleted file mode 100644 index 574306027..000000000 --- a/monitoring/collectd-extensions/src/example.conf +++ /dev/null @@ -1,13 +0,0 @@ - - - - Instance "used" - Persist true - PersistOK true - WarningMax 49.00 - FailureMax 74.00 - Hits 1 - Invert false - - - diff --git a/monitoring/collectd-extensions/src/example.py b/monitoring/collectd-extensions/src/example.py deleted file mode 100755 index f86514582..000000000 --- a/monitoring/collectd-extensions/src/example.py +++ /dev/null @@ -1,73 +0,0 @@ -# -# Copyright (c) 2018 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - -import os -import random -import collectd - -PLUGIN = 'random number plugin' - -# static variables - - -# define a class here that will persist over read calls -class ExampleObject: - hostname = "" - plugin_data = ['1', '100'] - - -obj = ExampleObject() - - -# The config function - called once on collectd process startup -def config_func(config): - """Configure the plugin""" - - for node in config.children: - key = node.key.lower() - val = node.values[0] - - if key == 'data': - obj.plugin_data = str(val).split(' ') - collectd.info("%s configured data '%d:%d'" % - (PLUGIN, - int(obj.plugin_data[0]), - int(obj.plugin_data[1]))) - return 0 - - collectd.info('%s config function' % PLUGIN) - return 0 - - -# The init function - called once on collectd process startup -def init_func(): - - # get current hostname - obj.hostname = os.uname()[1] - return 0 - - -# The sample read function - called on every audit interval -def read_func(): - - # do the work to create the sample - low = int(obj.plugin_data[0]) - high = int(obj.plugin_data[1]) - sample = random.randint(low, high) - - # Dispatch usage value to collectd - val = collectd.Values(host=obj.hostname) - val.plugin = 'example' - val.type = 'percent' - val.type_instance = 'used' - val.dispatch(values=[sample]) - return 0 - - -# register the config, init and read functions -collectd.register_config(config_func) -collectd.register_init(init_func) -collectd.register_read(read_func) diff --git a/monitoring/collectd-extensions/src/fm_notifier.py b/monitoring/collectd-extensions/src/fm_notifier.py deleted file mode 100755 index 9aec6bcab..000000000 --- a/monitoring/collectd-extensions/src/fm_notifier.py +++ /dev/null @@ -1,1586 +0,0 @@ -# -# Copyright (c) 2018-2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Version 1.0 -# -############################################################################ -# -# This file is the collectd 'FM Alarm' Notifier. -# -# This notifier manages raising and clearing alarms based on collectd -# notifications ; i.e. automatic collectd calls to this handler/notifier. -# -# Collectd process startup automatically calls this module's init_func which -# declares and initializes a plugObject class for plugin type in preparation -# for periodic ongoing monitoring where collectd calls notify_func for each -# plugin and instance of that plugin. -# -# All other class or common member functions implemented herein exist in -# support of that aformentioned initialization and periodic monitoring. 
-#
-# Collectd provides information about each event as an object passed to the
-# notification handler ; the notification object.
-#
-#    object.host            - the hostname.
-#
-#    object.plugin          - the name of the plugin aka resource.
-#    object.plugin_instance - plugin instance string i.e. say mountpoint
-#                             for df plugin or numa node for memory.
-#    object.type            - the unit i.e. percent or absolute.
-#    object.type_instance   - the attribute i.e. free, used, etc.
-#
-#    object.severity        - an integer value ; 0=OK, 1=warning, 2=failure.
-#    object.message         - a log-able message containing the above along
-#                             with the value.
-#
-# This notifier uses the notification object to manage plugin/instance alarms.
-#
-# To avoid stuck alarms or missing alarms the plugin thresholds should be
-# configured with Persist = true and PersistOK = true. These controls tell
-# collectd to always send notifications regardless of state change, which
-# would not be the case with these controls set to false.
-#
-# Persist   = false ; only send notifications on 'okay' to 'not okay' change.
-# PersistOK = false ; only send notifications on 'not okay' to 'okay' change.
-#
-# With both of these set to true in the threshold spec for the plugin,
-# collectd will call this notifier on every plugin/instance audit.
-#
-# Collectd supports only 2 threshold severities ; warning and failure.
-# The 'failure' maps to 'critical' while 'warning' maps to 'major' in FM.
-#
-# To avoid unnecessary load on FM, this notifier maintains current alarm
-# state and only makes an FM call on alarm state changes. Current alarm state
-# is queried by the init function called by collectd on process startup.
-#
-# Current alarm state is maintained by two severity lists for each plugin ;
-# a warnings list and a failures list.
-#
-# When a failure is reported against a specific plugin then that resource's
-# entity_id is added to that plugin's alarm object's failures list. Similarly,
-# warning assertions get their entity id added to the plugin's alarm object's
-# warnings list. Any entity id should only exist in one of the lists at a
-# time, or in neither if the notification condition is 'okay' and the alarm
-# is cleared.
-#
-# Adding Plugins:
-#
-# To add new plugin support, just search for ADD_NEW_PLUGIN and add the data
-# requested in that area.
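The severity mapping just described (collectd 'failure' -> FM 'critical', collectd 'warning' -> FM 'major') can be sketched as a minimal standalone handler. This is an illustration only, not the notifier implemented in this file: the function names are hypothetical, and it assumes the FM severity constants named here exist in fm_api.constants.

    import collectd
    from fm_api import constants as fm_constants

    # collectd notification severity values (mirror the definitions below)
    NOTIF_FAILURE = 1
    NOTIF_WARNING = 2
    NOTIF_OKAY = 4

    def _map_severity(nObject):
        # map a collectd notification severity to an FM alarm severity
        if nObject.severity == NOTIF_FAILURE:
            return fm_constants.FM_ALARM_SEVERITY_CRITICAL
        if nObject.severity == NOTIF_WARNING:
            return fm_constants.FM_ALARM_SEVERITY_MAJOR
        return fm_constants.FM_ALARM_SEVERITY_CLEAR

    def _notify_sketch(nObject):
        # a real handler would raise or clear an FM alarm here
        collectd.info('would report %s at severity %s'
                      % (nObject.plugin, _map_severity(nObject)))

    collectd.register_notification(_notify_sketch)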
-# -# Example commands to read samples from the influx database -# -# SELECT * FROM df_value WHERE instance='root' AND type='percent_bytes' AND -# type_instance='used' -# SELECT * FROM cpu_value WHERE type='percent' AND type_instance='used' -# SELECT * FROM memory_value WHERE type='percent' AND type_instance='used' -# -############################################################################ -# -# Import list - -# UT imports -import os -import re -import uuid -import collectd -from threading import RLock as Lock -from fm_api import constants as fm_constants -from fm_api import fm_api -import tsconfig.tsconfig as tsc -import plugin_common as pc - -# only load influxdb on the controller -if tsc.nodetype == 'controller': - from influxdb import InfluxDBClient - -api = fm_api.FaultAPIsV2() - -# Debug control -debug = False -debug_lists = False -want_state_audit = False -want_vswitch = False - -# number of notifier loops before the state is object dumped -DEBUG_AUDIT = 2 - -# write a 'value' log on a the resource sample change of more than this amount -LOG_STEP = 10 - -# Number of back to back database update misses -MAX_NO_UPDATE_B4_ALARM = 5 - -# This plugin name -PLUGIN = 'alarm notifier' - -# Path to the plugin's drop dir -PLUGIN_PATH = '/etc/collectd.d/' - -# the name of the collectd samples database -DATABASE_NAME = 'collectd samples' - -READING_TYPE__PERCENT_USAGE = '% usage' - -# Default invalid threshold value -INVALID_THRESHOLD = float(-1) - -# collectd severity definitions ; -# Note: can't seem to pull then in symbolically with a header -NOTIF_FAILURE = 1 -NOTIF_WARNING = 2 -NOTIF_OKAY = 4 - -PASS = 0 -FAIL = 1 - - -# Some plugin_instances are mangled by collectd. -# The filesystem plugin is especially bad for this. -# For instance the "/var/log" MountPoint instance is -# reported as "var-log". -# The following is a list of mangled instances list -# that need the '-' replaced with '/'. -# -# ADD_NEW_PLUGIN if there are new file systems being added that -# have subdirectories in the name then they will need to be added -# to the mangled list -mangled_list = {"dev-shm", - "var-log", - "var-run", - "var-lock", - "var-lib-rabbitmq", - "var-lib-postgresql", - "var-lib-ceph-mon", - "var-lib-docker", - "var-lib-docker-distribution" - "var-lib-kubelet", - "var-lib-nova-instances", - "opt-platform", - "opt-cgcs", - "opt-etcd", - "opt-extension", - "opt-backups"} - -# ADD_NEW_PLUGIN: add new alarm id definition -ALARM_ID__CPU = "100.101" -ALARM_ID__MEM = "100.103" -ALARM_ID__DF = "100.104" -ALARM_ID__EXAMPLE = "100.113" - -ALARM_ID__VSWITCH_CPU = "100.102" -ALARM_ID__VSWITCH_MEM = "100.115" -ALARM_ID__VSWITCH_PORT = "300.001" -ALARM_ID__VSWITCH_IFACE = "300.002" - - -# ADD_NEW_PLUGIN: add new alarm id to the list -ALARM_ID_LIST = [ALARM_ID__CPU, - ALARM_ID__MEM, - ALARM_ID__DF, - ALARM_ID__VSWITCH_CPU, - ALARM_ID__VSWITCH_MEM, - ALARM_ID__VSWITCH_PORT, - ALARM_ID__VSWITCH_IFACE, - ALARM_ID__EXAMPLE] - -# ADD_NEW_PLUGIN: add plugin name definition -# WARNING: This must line up exactly with the plugin -# filename without the extension. 
-PLUGIN__DF = "df" -PLUGIN__CPU = "cpu" -PLUGIN__MEM = "memory" -PLUGIN__INTERFACE = "interface" -PLUGIN__NTP_QUERY = "ntpq" -PLUGIN__VSWITCH_PORT = "vswitch_port" -PLUGIN__VSWITCH_CPU = "vswitch_cpu" -PLUGIN__VSWITCH_MEM = "vswitch_mem" -PLUGIN__VSWITCH_IFACE = "vswitch_iface" -PLUGIN__EXAMPLE = "example" - -# ADD_NEW_PLUGIN: add plugin name to list -PLUGIN_NAME_LIST = [PLUGIN__CPU, - PLUGIN__MEM, - PLUGIN__DF, - PLUGIN__VSWITCH_CPU, - PLUGIN__VSWITCH_MEM, - PLUGIN__VSWITCH_PORT, - PLUGIN__VSWITCH_IFACE, - PLUGIN__EXAMPLE] - - -# PluginObject Class -class PluginObject: - - dbObj = None # shared database connection obj - host = None # saved hostname - lock = None # global lock for mread_func mutex - database_setup = False # state of database setup - database_setup_in_progress = False # connection mutex - - # Set to True once FM connectivity is verified - # Used to ensure alarms are queried on startup - fm_connectivity = False - - def __init__(self, id, plugin): - """PluginObject Class constructor""" - - # plugin specific static class members. - self.id = id # alarm id ; 100.1?? - self.plugin = plugin # name of the plugin ; df, cpu, memory ... - self.plugin_instance = "" # the instance name for the plugin - self.resource_name = "" # The top level name of the resource - self.instance_name = "" # The instance name - - # Instance specific learned static class members. - self.entity_id = "" # fm entity id host=. - self.instance = "" # _ - - # [ 'float value string','float threshold string] - self.values = [] - self.value = float(0) # float value of reading - - # This member is used to help log change values using the - # LOG_STEP threshold consant - self.last_value = float(0) - - # float value of threshold - self.threshold = float(INVALID_THRESHOLD) - - # Common static class members. - self.reason_warning = "" - self.reason_failure = "" - self.repair = "" - self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL - self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS - self.suppression = True - self.service_affecting = False - - # default most reading types are usage - self.reading_type = READING_TYPE__PERCENT_USAGE - - # Severity tracking lists. - # Maintains severity state between notifications. - # Each is a list of entity ids for severity asserted alarms. - # As alarms are cleared so is the entry in these lists. - # The entity id should only be in one lists for any given raised alarm. - self.warnings = [] - self.failures = [] - - # total notification count - self.count = 0 - - # Debug: state audit controls - self.audit_threshold = 0 - self.audit_count = 0 - - # For plugins that have multiple instances like df (filesystem plugin) - # we need to create an instance of this object for each one. - # This dictionary is used to associate an instance with its object. - self.instance_objects = {} - - def _ilog(self, string): - """Create a collectd notifier info log with the string param""" - collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string)) - - def _llog(self, string): - """Create a collectd notifier info log when debug_lists not empty""" - if debug_lists: - collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string)) - - def _elog(self, string): - """Create a collectd notifier error log with the string param""" - collectd.error('%s %s : %s' % (PLUGIN, self.plugin, string)) - - ########################################################################## - # - # Name : _state_audit - # - # Purpose : Debug Tool to log plugin object info. - # - # Not called in production code. 
- # - # Only the severity lists are dumped for now. - # Other info can be added as needed. - # Can be run as an audit or called directly. - # - ########################################################################## - - def _state_audit(self, location): - """Log the state of the specified object""" - - if self.id == ALARM_ID__CPU: - _print_state() - - self.audit_count += 1 - if self.warnings: - collectd.info("%s AUDIT %d: %s warning list %s:%s" % - (PLUGIN, - self.audit_count, - self.plugin, - location, - self.warnings)) - if self.failures: - collectd.info("%s AUDIT %d: %s failure list %s:%s" % - (PLUGIN, - self.audit_count, - self.plugin, - location, - self.failures)) - - ########################################################################## - # - # Name : _manage_change - # - # Purpose : Manage sample value change. - # - # Handle no sample update case. - # Parse the notification log. - # Handle base object instances. - # Generate a log entry if the sample value changes more than - # step value. - # - ########################################################################## - - def _manage_change(self, nObject): - """Log resource instance value on step state change""" - - # filter out messages to ignore ; notifications that have no value - if "has not been updated for" in nObject.message: - collectd.info("%s %s %s (%s)" % - (PLUGIN, - self.entity_id, - nObject.message, - nObject.severity)) - return "done" - - # Get the value from the notification message. - # The location in the message is different based on the message type ; - # normal reading or overage reading - # - # message: Host controller-0, plugin memory type percent ... [snip] - # All data sources are within range again. - # Current value of "value" is 51.412038. <------ - # - # message: Host controller-0, plugin df (instance scratch) ... [snip] - # Data source "value" is currently 97.464027. <------ - # That is above the failure threshold of 90.000000. <------ - - # recognized strings - value only value and threshold - # ------------ ------------------- - value_sig_list = ['Current value of', 'is currently'] - - # list of parsed 'string version' float values ['value','threshold'] - self.values = [] - for sig in value_sig_list: - index = nObject.message.find(sig) - if index != -1: - self.values = \ - re.findall(r"[-+]?\d*\.\d+|\d+", nObject.message[index:-1]) - - # contains string versions of the float values extracted from - # the notification message. The threshold value is included for - # readings that are out of threshold. - if len(self.values): - # validate the reading - try: - self.value = round(float(self.values[0]), 2) - # get the threshold if its there. 
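        # Worked example of the parsing above, against a hypothetical
        # threshold-crossing message:
        #
        #   msg = 'Data source "value" is currently 97.464027. '
        #         'That is above the failure threshold of 90.000000.'
        #   index = msg.find('is currently')
        #   re.findall(r"[-+]?\d*\.\d+|\d+", msg[index:-1])
        #       -> ['97.464027', '90.000000']   # [value, threshold]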
- if len(self.values) > 1: - self.threshold = float(self.values[1]) - else: - self.threshold = float(INVALID_THRESHOLD) # invalid value - - except ValueError as ex: - collectd.error("%s %s value not integer or float (%s) (%s)" % - (PLUGIN, self.entity_id, self.value, str(ex))) - return "done" - except TypeError as ex: - collectd.info("%s %s value has no type (%s)" % - (PLUGIN, self.entity_id, str(ex))) - return "done" - else: - collectd.info("%s %s reported no value (%s)" % - (PLUGIN, self.entity_id, nObject.message)) - return "done" - - # get the last reading - if self.last_value: - last = float(self.last_value) - else: - last = float(0) - - # Determine if the change is large enough to log and save the new value - logit = False - if self.count == 0 or LOG_STEP == 0: - logit = True - elif self.reading_type == "connections": - if self.value != last: - logit = True - elif self.value > last: - if (last + LOG_STEP) < self.value: - logit = True - elif last > self.value: - if (self.value + LOG_STEP) < last: - logit = True - - # Case on types. - # - # Note: only usage type so far - if logit: - resource = self.resource_name - - # setup resource name for filesystem instance usage log - if self.plugin == PLUGIN__DF: - resource = self.instance - - elif self.plugin == PLUGIN__MEM: - if self.instance_name: - if self.instance_name != 'platform': - resource += ' ' + self.instance_name - - # setup resource name for vswitch process instance name - elif self.plugin == PLUGIN__VSWITCH_MEM: - resource += ' Processor ' - resource += self.instance_name - - if self.reading_type == READING_TYPE__PERCENT_USAGE: - tmp = str(self.value).split('.') - if len(tmp[0]) == 1: - pre = ': ' - else: - pre = ': ' - collectd.info("%s reading%s%2.2f %s - %s" % - (PLUGIN, - pre, - self.value, - self.reading_type, - resource)) - - elif self.reading_type == "connections" and \ - self.instance_objects and \ - self.value != self.last_value: - if self.instance_objects: - collectd.info("%s monitor: %2d %s - %s" % - (PLUGIN, - self.value, - self.reading_type, - resource)) - - ########################################################################## - # - # Name : _update_alarm - # - # Purpose : Compare current severity to instance severity lists to - # facilitate early 'do nothing' exit from a notification. - # - # Description: Avoid clearing an already cleared alarm. - # Refresh asserted alarm data for usage reading type alarms - # - # Returns : True if the alarm needs refresh, otherwise false. - # - ########################################################################## - def _update_alarm(self, entity_id, severity, this_value, last_value): - """Check for need to update alarm data""" - - if entity_id in self.warnings: - self._llog(entity_id + " is already in warnings list") - current_severity_str = "warning" - elif entity_id in self.failures: - self._llog(entity_id + " is already in failures list") - current_severity_str = "failure" - else: - self._llog(entity_id + " is already OK") - current_severity_str = "okay" - - # Compare to current state to previous state. - # If they are the same then return done. 
- if severity == current_severity_str: - if severity == "okay": - return False - if self.reading_type != READING_TYPE__PERCENT_USAGE: - return False - elif round(last_value, 2) == round(this_value, 2): - return False - return True - - ######################################################################## - # - # Name : _manage_alarm - # - # Putpose : Alarm Severity Tracking - # - # This class member function accepts a severity level and entity id. - # It manages the content of the current alarm object's 'failures' and - # 'warnings' lists ; aka Severity Lists. - # - # These Severity Lists are used to record current alarmed state for - # each instance of a plugin. - # If an alarm is raised then its entity id is added to the appropriate - # severity list. - # - # A failure notification or critical alarm goes in the failures list. - # A warning notification or major alarm goes into the warnings list. - # - # These lists are used to avoid making unnecessary calls to FM. - # - # Startup Behavior: - # - # The collectd daemon runs the init function of every plugin on startup. - # That includes this notifier plugin. The init function queries the FM - # database for any active alarms. - # - # This member function is called for any active alarms that are found. - # The entity id for active alarms is added to the appropriate - # Severity List. This way existing alarms are maintained over collectd - # process startup. - # - # Runtime Behavior: - # - # The current severity state is first queried and compared to the - # newly reported severity level. If they are the same then a "done" - # is returned telling the caller that there is no further work to do. - # Otherwise, the lists are managed in a way that has the entity id - # of a raised alarm in the corresponding severity list. - # - # See inline comments below for each specific severity and state - # transition case. - # - ######################################################################### - - def _manage_alarm(self, entity_id, severity): - """Manage the alarm severity lists and report state change""" - - collectd.debug("%s manage alarm %s %s %s" % - (PLUGIN, - self.id, - severity, - entity_id)) - - # Get the instance's current state - if entity_id in self.warnings: - current_severity_str = "warning" - elif entity_id in self.failures: - current_severity_str = "failure" - else: - current_severity_str = "okay" - - # Compare to current state to previous state. - # If they are the same then return done. - if severity == current_severity_str: - return "done" - - # Otherwise, manage the severity lists ; case by case. - warnings_list_change = False - failures_list_change = False - - # Case 1: Handle warning to failure severity change. - if severity == "warning" and current_severity_str == "failure": - - if entity_id in self.failures: - self.failures.remove(entity_id) - failures_list_change = True - self._llog(entity_id + " is removed from failures list") - else: - self._elog(entity_id + " UNEXPECTEDLY not in failures list") - - # Error detection - if entity_id in self.warnings: - self.warnings.remove(entity_id) - self._elog(entity_id + " UNEXPECTEDLY in warnings list") - - self.warnings.append(entity_id) - warnings_list_change = True - self._llog(entity_id + " is added to warnings list") - - # Case 2: Handle failure to warning alarm severity change. 
- elif severity == "failure" and current_severity_str == "warning": - - if entity_id in self.warnings: - self.warnings.remove(entity_id) - warnings_list_change = True - self._llog(entity_id + " is removed from warnings list") - else: - self._elog(entity_id + " UNEXPECTEDLY not in warnings list") - - # Error detection - if entity_id in self.failures: - self.failures.remove(entity_id) - self._elog(entity_id + " UNEXPECTEDLY in failures list") - - self.failures.append(entity_id) - failures_list_change = True - self._llog(entity_id + " is added to failures list") - - # Case 3: Handle new alarm. - elif severity != "okay" and current_severity_str == "okay": - if severity == "warning": - self.warnings.append(entity_id) - warnings_list_change = True - self._llog(entity_id + " added to warnings list") - elif severity == "failure": - self.failures.append(entity_id) - failures_list_change = True - self._llog(entity_id + " added to failures list") - - # Case 4: Handle alarm clear. - else: - # plugin is okay, ensure this plugin's entity id - # is not in either list - if entity_id in self.warnings: - self.warnings.remove(entity_id) - warnings_list_change = True - self._llog(entity_id + " removed from warnings list") - if entity_id in self.failures: - self.failures.remove(entity_id) - failures_list_change = True - self._llog(entity_id + " removed from failures list") - - if warnings_list_change is True: - if self.warnings: - collectd.info("%s %s warnings %s" % - (PLUGIN, self.plugin, self.warnings)) - else: - collectd.info("%s %s no warnings" % - (PLUGIN, self.plugin)) - - if failures_list_change is True: - if self.failures: - collectd.info("%s %s failures %s" % - (PLUGIN, self.plugin, self.failures)) - else: - collectd.info("%s %s no failures" % - (PLUGIN, self.plugin)) - - ########################################################################## - # - # Name : _get_instance_object - # - # Purpose : Safely get an object from the self instance object list - # indexed by eid. - # - ########################################################################## - def _get_instance_object(self, eid): - """Safely get an object from the self instance object dict while locked - - :param eid: the index for the instance object dictionary - :return: object or None - """ - - try: - collectd.debug("%s %s Get Lock ..." % (PLUGIN, self.plugin)) - with PluginObject.lock: - obj = self.instance_objects[eid] - return obj - except: - collectd.error("%s failed to get instance from %s object list" % - (PLUGIN, self.plugin)) - return None - - ########################################################################## - # - # Name : _add_instance_object - # - # Purpose : Safely add an object to the self instance object list - # indexed by eid while locked. if found locked the instance - # add will be re-attempted on next sample. - # - ########################################################################## - def _add_instance_object(self, obj, eid): - """Update self instance_objects list while locked - - :param obj: the object to add - :param eid: index for instance_objects - :return: nothing - """ - try: - collectd.debug("%s %s Add Lock ..." % (PLUGIN, self.plugin)) - with PluginObject.lock: - self.instance_objects[eid] = obj - except: - collectd.error("%s failed to add instance to %s object list" % - (PLUGIN, self.plugin)) - - ########################################################################## - # - # Name : _copy_instance_object - # - # Purpose : Copy select members of self object to target object. 
- # - ########################################################################## - def _copy_instance_object(self, object): - """Copy select members of self object to target object""" - - object.resource_name = self.resource_name - object.instance_name = self.instance_name - object.reading_type = self.reading_type - - object.reason_warning = self.reason_warning - object.reason_failure = self.reason_failure - object.repair = self.repair - - object.alarm_type = self.alarm_type - object.cause = self.cause - object.suppression = self.suppression - object.service_affecting = self.service_affecting - - ########################################################################## - # - # Name : _create_instance_object - # - # Purpose : Create a new instance object and tack it on the supplied base - # object's instance object dictionary. - # - ########################################################################## - def _create_instance_object(self, instance): - - try: - # create a new plugin object - inst_obj = PluginObject(self.id, self.plugin) - self._copy_instance_object(inst_obj) - - # initialize the object with instance specific data - inst_obj.instance_name = instance - inst_obj.entity_id = _build_entity_id(self.plugin, - instance) - - self._add_instance_object(inst_obj, inst_obj.entity_id) - - collectd.debug("%s created %s instance (%s) object %s" % - (PLUGIN, inst_obj.resource_name, - inst_obj.entity_id, inst_obj)) - - collectd.info("%s monitoring %s %s %s" % - (PLUGIN, - inst_obj.resource_name, - inst_obj.instance_name, - inst_obj.reading_type)) - - return inst_obj - - except: - collectd.error("%s %s:%s inst object create failed" % - (PLUGIN, inst_obj.resource_name, instance)) - return None - - ########################################################################## - # - # Name : _create_instance_objects - # - # Purpose : Create a list of instance objects for 'self' type plugin and - # add those objects to the parent's instance_objects dictionary. - # - # Note : This is currently only used for the DF (filesystem) plugin. - # All other instance creations/allocations are done on-demand. - # - ########################################################################## - def _create_instance_objects(self): - """Create, initialize and add an instance object to this/self plugin""" - - # Create the File System subordinate instance objects. 
- if self.id == ALARM_ID__DF: - - # read the df.conf file and return/get a list of mount points - conf_file = PLUGIN_PATH + 'df.conf' - if not os.path.exists(conf_file): - collectd.error("%s cannot create filesystem " - "instance objects ; missing : %s" % - (PLUGIN, conf_file)) - return FAIL - - mountpoints = [] - with open(conf_file, 'r') as infile: - for line in infile: - if 'MountPoint ' in line: - - # get the mountpoint path from the line - try: - mountpoint = line.split('MountPoint ')[1][1:-2] - mountpoints.append(mountpoint) - except: - collectd.error("%s skipping invalid '%s' " - "mountpoint line: %s" % - (PLUGIN, conf_file, line)) - - collectd.debug("%s MountPoints: %s" % (PLUGIN, mountpoints)) - - # loop over the mount points - for mp in mountpoints: - # create a new plugin object - inst_obj = PluginObject(ALARM_ID__DF, PLUGIN__DF) - - # initialize the object with instance specific data - inst_obj.resource_name = self.resource_name - inst_obj.instance_name = mp - inst_obj.instance = mp - # build the plugin instance name from the mount point - if mp == '/': - inst_obj.plugin_instance = 'root' - else: - inst_obj.plugin_instance = mp[1:].replace('/', '-') - - inst_obj.entity_id = _build_entity_id(PLUGIN__DF, - inst_obj.plugin_instance) - - # add this subordinate object to the parent's - # instance object list - self._add_instance_object(inst_obj, inst_obj.entity_id) - - collectd.info("%s monitoring %s usage" % - (PLUGIN, inst_obj.instance)) - - -PluginObject.host = os.uname()[1] - - -# ADD_NEW_PLUGIN: add plugin to this table -# This instantiates the plugin objects -PLUGINS = { - PLUGIN__CPU: PluginObject(ALARM_ID__CPU, PLUGIN__CPU), - PLUGIN__MEM: PluginObject(ALARM_ID__MEM, PLUGIN__MEM), - PLUGIN__DF: PluginObject(ALARM_ID__DF, PLUGIN__DF), - PLUGIN__VSWITCH_CPU: PluginObject(ALARM_ID__VSWITCH_CPU, - PLUGIN__VSWITCH_CPU), - PLUGIN__VSWITCH_MEM: PluginObject(ALARM_ID__VSWITCH_MEM, - PLUGIN__VSWITCH_MEM), - PLUGIN__VSWITCH_PORT: PluginObject(ALARM_ID__VSWITCH_PORT, - PLUGIN__VSWITCH_PORT), - PLUGIN__VSWITCH_IFACE: PluginObject(ALARM_ID__VSWITCH_IFACE, - PLUGIN__VSWITCH_IFACE), - PLUGIN__EXAMPLE: PluginObject(ALARM_ID__EXAMPLE, PLUGIN__EXAMPLE)} - - -##################################################################### -# -# Name : clear_alarm -# -# Description: Clear the specified alarm with the specified entity ID. -# -# Returns : True if operation succeeded -# False if there was an error exception. -# -# Assumptions: Caller can decide to retry based on return status. 
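# Usage sketch (hypothetical entity id) ; the caller checks the return
# status and can simply retry on a later audit pass:
#
#   if not clear_alarm(ALARM_ID__DF, 'host=controller-0.filesystem=/scratch'):
#       pass  # FM unreachable ; leave alarm state as-is and retry later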
-# -##################################################################### -def clear_alarm(alarm_id, eid): - """Clear the specified alarm:eid""" - - try: - if api.clear_fault(alarm_id, eid) is True: - collectd.info("%s %s:%s alarm cleared" % - (PLUGIN, alarm_id, eid)) - else: - collectd.info("%s %s:%s alarm already cleared" % - (PLUGIN, alarm_id, eid)) - return True - - except Exception as ex: - collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % - (PLUGIN, alarm_id, eid, ex)) - return False - - -def _get_base_object(alarm_id): - """Get the alarm object for the specified alarm id""" - for plugin in PLUGIN_NAME_LIST: - if PLUGINS[plugin].id == alarm_id: - return PLUGINS[plugin] - return None - - -def _get_object(alarm_id, eid): - """Get the plugin object for the specified alarm id and eid""" - - base_obj = _get_base_object(alarm_id) - if len(base_obj.instance_objects): - try: - return(base_obj.instance_objects[eid]) - except: - collectd.debug("%s %s has no instance objects" % - (PLUGIN, base_obj.plugin)) - return base_obj - - -def _build_entity_id(plugin, plugin_instance): - """Builds an entity id string based on the collectd notification object""" - - inst_error = False - - entity_id = 'host=' - entity_id += PluginObject.host - - if plugin == PLUGIN__MEM: - if plugin_instance != 'platform': - entity_id += '.numa=' + plugin_instance - - elif plugin == PLUGIN__VSWITCH_MEM: - - # host=.processor= - if plugin_instance: - entity_id += '.processor=' + plugin_instance - else: - inst_error = True - - elif plugin == PLUGIN__VSWITCH_IFACE: - - # host=.interface= - if plugin_instance: - entity_id += '.interface=' + plugin_instance - else: - inst_error = True - - elif plugin == PLUGIN__VSWITCH_PORT: - - # host=.port= - if plugin_instance: - entity_id += '.port=' + plugin_instance - else: - inst_error = True - - elif plugin == PLUGIN__DF: - - # host=.filesystem= - if plugin_instance: - instance = plugin_instance - - # build the entity_id for this plugin - entity_id += '.filesystem=/' - - # collectd replaces the instance '/' with the word 'root' - # So skip over "root" as '/' is already part of the - # entity_id - if instance != 'root': - # Look for other instances that are in the mangled list - if instance in mangled_list: - instance = instance.replace('-', '/') - entity_id += instance - - if inst_error is True: - collectd.error("%s eid build failed ; missing instance" % plugin) - return None - - return entity_id - - -def _get_df_mountpoints(): - - conf_file = PLUGIN_PATH + 'df.conf' - if not os.path.exists(conf_file): - collectd.error("%s cannot create filesystem " - "instance objects ; missing : %s" % - (PLUGIN, conf_file)) - return FAIL - - mountpoints = [] - with open(conf_file, 'r') as infile: - for line in infile: - if 'MountPoint ' in line: - - # get the mountpoint path from the line - try: - mountpoint = line.split('MountPoint ')[1][1:-2] - mountpoints.append(mountpoint) - except: - collectd.error("%s skipping invalid '%s' " - "mountpoint line: %s" % - (PLUGIN, conf_file, line)) - - return(mountpoints) - - -def _print_obj(obj): - """Print a single object""" - base_object = False - for plugin in PLUGIN_NAME_LIST: - if PLUGINS[plugin] == obj: - base_object = True - break - - num = len(obj.instance_objects) - if num > 0 or base_object is True: - prefix = "PLUGIN " - if num: - prefix += str(num) - else: - prefix += " " - else: - prefix = "INSTANCE" - - if obj.plugin_instance: - resource = obj.plugin + ":" + obj.plugin_instance - else: - resource = obj.plugin - - collectd.info("%s %s res: %s 
name: %s\n" % - (PLUGIN, prefix, resource, obj.resource_name)) - collectd.info("%s eid : %s\n" % (PLUGIN, obj.entity_id)) - collectd.info("%s inst: %s name: %s\n" % - (PLUGIN, obj.instance, obj.instance_name)) - collectd.info("%s value:%2.1f thld:%2.1f cause:%s (%d) type:%s" % - (PLUGIN, - obj.value, - obj.threshold, - obj.cause, - obj.count, - obj.reading_type)) - collectd.info("%s warn:%s fail:%s" % - (PLUGIN, obj.warnings, obj.failures)) - collectd.info("%s repair:t: %s" % - (PLUGIN, obj.repair)) - if obj.cause != fm_constants.ALARM_PROBABLE_CAUSE_50: - collectd.info("%s reason:w: %s\n" - "%s reason:f: %s\n" % - (PLUGIN, obj.reason_warning, - PLUGIN, obj.reason_failure)) - # collectd.info(" ") - - -def _print_state(obj=None): - """Print the current object state""" - try: - objs = [] - if obj is None: - for plugin in PLUGIN_NAME_LIST: - objs.append(PLUGINS[plugin]) - else: - objs.append(obj) - - collectd.debug("%s _print_state Lock ..." % PLUGIN) - with PluginObject.lock: - for o in objs: - _print_obj(o) - if len(o.instance_objects): - for inst_obj in o.instance_objects: - _print_obj(o.instance_objects[inst_obj]) - - except Exception as ex: - collectd.error("%s _print_state exception ; %s" % - (PLUGIN, ex)) - - -def _database_setup(database): - """Setup the influx database for collectd resource samples""" - - collectd.info("%s setting up influxdb:%s database" % - (PLUGIN, database)) - - error_str = "" - - # http://influxdb-python.readthedocs.io/en/latest/examples.html - # http://influxdb-python.readthedocs.io/en/latest/api-documentation.html - PluginObject.dbObj = InfluxDBClient('127.0.0.1', '8086', database) - if PluginObject.dbObj: - try: - PluginObject.dbObj.create_database('collectd') - - ############################################################ - # - # TODO: Read current retention period from service parameter - # Make it a puppet implementation. 
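- # A minimal sketch (assumption: the influxdb-python client is
- # available) of the retention setup described below, shown with
- # keyword arguments for clarity:
- #
- #   from influxdb import InfluxDBClient
- #   client = InfluxDBClient(host='127.0.0.1', port=8086)
- #   client.create_database('collectd')
- #   client.create_retention_policy(
- #       name='collectd samples', duration='1w',
- #       replication=1, database='collectd', default=True)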
- #
- # Create a '1 week' samples retention policy
- # -----------------------------------------
- # name = 'collectd samples'
- # duration = set retention period in time
- # xm - minutes
- # xh - hours
- # xd - days
- # xw - weeks
- # xy - years
- # database = 'collectd'
- # default = True ; make it the default
- #
- ############################################################

- PluginObject.dbObj.create_retention_policy(
- DATABASE_NAME, '1w', 1, database, True)
- except Exception as ex:
- if str(ex) == 'database already exists':
- try:
- collectd.info("%s influxdb:collectd %s" %
- (PLUGIN, str(ex)))
- PluginObject.dbObj.create_retention_policy(
- DATABASE_NAME, '1w', 1, database, True)
- except Exception as ex:
- if str(ex) == 'retention policy already exists':
- collectd.info("%s influxdb:collectd %s" %
- (PLUGIN, str(ex)))
- else:
- error_str = "failure from influxdb ; "
- error_str += str(ex)
- else:
- error_str = "failed to create influxdb:" + database
- else:
- error_str = "failed to connect to influxdb:" + database

- if not error_str:
- found = False
- retention = \
- PluginObject.dbObj.get_list_retention_policies(database)
- for r in range(len(retention)):
- if retention[r]["name"] == DATABASE_NAME:
- collectd.info("%s influxdb:%s samples retention "
- "policy: %s" %
- (PLUGIN, database, retention[r]))
- found = True
- if found is True:
- collectd.info("%s influxdb:%s is setup" % (PLUGIN, database))
- PluginObject.database_setup = True
- else:
- collectd.error("%s influxdb:%s retention policy NOT setup" %
- (PLUGIN, database))


-def _clear_alarm_for_missing_filesystems():
- """Clear alarmed file systems that are no longer mounted or present"""

- # get the DF (filesystem plugin) base object.
- df_base_obj = PLUGINS[PLUGIN__DF]
- # create a single alarm list from both the warnings and failures lists
- # to avoid having to duplicate the code below for each.
- # At this point we don't care about severity, we just need to
- # determine if an alarmed filesystem, of any severity, no longer
- # exists so we can clean up by clearing its alarm.
- # Note: the 2 lists should always contain unique data between them
- alarm_list = df_base_obj.warnings + df_base_obj.failures
- if len(alarm_list):
- for eid in alarm_list:
- # search for any of them that might be alarmed.
- obj = df_base_obj._get_instance_object(eid)

- # only care about df (file system plugins)
- if obj is not None and \
- obj.plugin == PLUGIN__DF and \
- obj.entity_id == eid and \
- obj.plugin_instance != 'root':

- # For all others replace all '-' with '/'
- path = '/' + obj.plugin_instance.replace('-', '/')
- if os.path.ismount(path) is False:
- if clear_alarm(df_base_obj.id, obj.entity_id) is True:
- collectd.info("%s cleared alarm for missing %s" %
- (PLUGIN, path))
- df_base_obj._manage_alarm(obj.entity_id, "okay")
- else:
- collectd.debug("%s maintaining alarm for %s" %
- (PLUGIN, path))


-# Collectd calls this function on startup.
-# Initialize each plugin object with plugin specific data.
-# Query FM for existing alarms and run with that starting state.
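-#
-# For reference, a minimal sketch of such an FM startup query
-# (illustrative only ; '100.101' is a hypothetical alarm id):
-#
-#   api = fm_api.FaultAPIsV2()
-#   alarms = api.get_faults_by_id('100.101')
-#   for alarm in alarms or []:
-#       collectd.info("found %s:%s" %
-#                     (alarm.severity, alarm.entity_instance_id))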
-def init_func():
- """Collectd FM Notifier Initialization Function"""

- PluginObject.lock = Lock()

- PluginObject.host = os.uname()[1]
- collectd.info("%s %s:%s init function" %
- (PLUGIN, tsc.nodetype, PluginObject.host))

- # Constant CPU Plugin Object Settings
- obj = PLUGINS[PLUGIN__CPU]
- obj.resource_name = "Platform CPU"
- obj.instance_name = PLUGIN__CPU
- obj.repair = "Monitor and if condition persists, "
- obj.repair += "contact next level of support."
- collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

- ###########################################################################

- # Constant Memory Plugin Object settings
- obj = PLUGINS[PLUGIN__MEM]
- obj.resource_name = "Platform Memory"
- obj.instance_name = PLUGIN__MEM
- obj.repair = "Monitor and if condition persists, "
- obj.repair += "contact next level of support; "
- obj.repair += "may require additional memory on Host."
- collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

- ###########################################################################

- # Constant FileSystem Plugin Object settings
- obj = PLUGINS[PLUGIN__DF]
- obj.resource_name = "File System"
- obj.instance_name = PLUGIN__DF
- obj.repair = "Monitor and if condition persists, "
- obj.repair += "contact next level of support."

- # The FileSystem (DF) plugin has multiple instances
- # One instance per file system mount point being monitored.
- # Create one DF instance object per mount point
- obj._create_instance_objects()

- # vSwitch monitoring is only applicable to worker nodes
- if want_vswitch is False:
- collectd.debug("%s vSwitch monitoring disabled" % PLUGIN)
- elif tsc.nodetype == 'worker' or 'worker' in tsc.subfunctions:

- #######################################################################

- # Constant vSwitch CPU Usage Plugin Object settings
- obj = PLUGINS[PLUGIN__VSWITCH_CPU]
- obj.resource_name = "vSwitch CPU"
- obj.instance_name = PLUGIN__VSWITCH_CPU
- obj.repair = "Monitor and if condition persists, "
- obj.repair += "contact next level of support."
- collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

- #######################################################################

- # Constant vSwitch Memory Usage Plugin Object settings
- obj = PLUGINS[PLUGIN__VSWITCH_MEM]
- obj.resource_name = "vSwitch Memory"
- obj.instance_name = PLUGIN__VSWITCH_MEM
- obj.repair = "Monitor and if condition persists, "
- obj.repair += "contact next level of support."
- collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name))

- #######################################################################

- # Constant vSwitch Port State Monitor Plugin Object settings
- obj = PLUGINS[PLUGIN__VSWITCH_PORT]
- obj.resource_name = "vSwitch Port"
- obj.instance_name = PLUGIN__VSWITCH_PORT
- obj.reading_type = "state"
- obj.reason_failure = "'Data' Port failed."
- obj.reason_warning = "'Data' Port failed."
- obj.repair = "Check cabling and far-end port configuration and "
- obj.repair += "status on adjacent equipment."
- obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT - obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL - obj.service_affecting = True - collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name)) - - ####################################################################### - - # Constant vSwitch Interface State Monitor Plugin Object settings - obj = PLUGINS[PLUGIN__VSWITCH_IFACE] - obj.resource_name = "vSwitch Interface" - obj.instance_name = PLUGIN__VSWITCH_IFACE - obj.reading_type = "state" - obj.reason_failure = "'Data' Interface failed." - obj.reason_warning = "'Data' Interface degraded." - obj.repair = "Check cabling and far-end port configuration and " - obj.repair += "status on adjacent equipment." - obj.alarm_type = fm_constants.FM_ALARM_TYPE_4 # EQUIPMENT - obj.cause = fm_constants.ALARM_PROBABLE_CAUSE_29 # LOSS_OF_SIGNAL - obj.service_affecting = True - collectd.info("%s monitoring %s state" % (PLUGIN, obj.resource_name)) - - ########################################################################### - - obj = PLUGINS[PLUGIN__EXAMPLE] - obj.resource_name = "Example" - obj.instance_name = PLUGIN__EXAMPLE - obj.repair = "Not Applicable" - collectd.info("%s monitoring %s usage" % (PLUGIN, obj.resource_name)) - - # ... - # ADD_NEW_PLUGIN: Add new plugin object initialization here ... - # ... - - if tsc.nodetype == 'controller': - PluginObject.database_setup_in_progress = True - _database_setup('collectd') - PluginObject.database_setup_in_progress = False - - -# The notifier function inspects the collectd notification and determines if -# the representative alarm needs to be asserted, severity changed, or cleared. -def notifier_func(nObject): - - if PluginObject.fm_connectivity is False: - - # handle multi threading startup - with PluginObject.lock: - if PluginObject.fm_connectivity is True: - return 0 - - ################################################################## - # - # With plugin objects initialized ... - # Query FM for any resource alarms that may already be raised - # Load the queries severity state into the appropriate - # severity list for those that are. - for alarm_id in ALARM_ID_LIST: - collectd.debug("%s searching for all '%s' alarms " % - (PLUGIN, alarm_id)) - try: - alarms = api.get_faults_by_id(alarm_id) - except Exception as ex: - collectd.error("%s 'get_faults_by_id' exception ; %s" % - (PLUGIN, ex)) - return 0 - - if alarms: - for alarm in alarms: - want_alarm_clear = False - eid = alarm.entity_instance_id - # ignore alarms not for this host - if PluginObject.host not in eid: - continue - - base_obj = _get_base_object(alarm_id) - if base_obj is None: - # might be a plugin instance - clear it - want_alarm_clear = True - - collectd.info('%s found %s %s alarm [%s]' % - (PLUGIN, - alarm.severity, - alarm_id, - eid)) - - if want_alarm_clear is True: - - if clear_alarm(alarm_id, eid) is False: - collectd.error("%s %s:%s clear failed" % - (PLUGIN, - alarm_id, - eid)) - else: - collectd.info("%s clear %s %s alarm %s" % - (PLUGIN, - alarm.severity, - alarm_id, - eid)) - continue - - if alarm.severity == "critical": - sev = "failure" - elif alarm.severity == "major": - sev = "warning" - else: - sev = "okay" - continue - - # Load the alarm severity by plugin/instance lookup. 
- if base_obj is not None:
- base_obj._manage_alarm(eid, sev)

- PluginObject.fm_connectivity = True
- collectd.info("%s initialization complete" % PLUGIN)

- collectd.debug('%s notification: %s %s:%s - %s %s %s [%s]' % (
- PLUGIN,
- nObject.host,
- nObject.plugin,
- nObject.plugin_instance,
- nObject.type,
- nObject.type_instance,
- nObject.severity,
- nObject.message))

- # Load up severity variables and alarm actions based on
- # this notification's severity level.
- if nObject.severity == NOTIF_OKAY:
- severity_str = "okay"
- _severity_num = fm_constants.FM_ALARM_SEVERITY_CLEAR
- _alarm_state = fm_constants.FM_ALARM_STATE_CLEAR
- elif nObject.severity == NOTIF_FAILURE:
- severity_str = "failure"
- _severity_num = fm_constants.FM_ALARM_SEVERITY_CRITICAL
- _alarm_state = fm_constants.FM_ALARM_STATE_SET
- elif nObject.severity == NOTIF_WARNING:
- severity_str = "warning"
- _severity_num = fm_constants.FM_ALARM_SEVERITY_MAJOR
- _alarm_state = fm_constants.FM_ALARM_STATE_SET
- else:
- collectd.debug('%s with unsupported severity %d' %
- (PLUGIN, nObject.severity))
- return 0

- if tsc.nodetype == 'controller':
- if PluginObject.database_setup is False:
- if PluginObject.database_setup_in_progress is False:
- PluginObject.database_setup_in_progress = True
- _database_setup('collectd')
- PluginObject.database_setup_in_progress = False

- # get plugin object
- if nObject.plugin in PLUGINS:
- base_obj = obj = PLUGINS[nObject.plugin]

- # if this notification is for a plugin instance then get that
- # instance's object instead.
- # If that object does not yet exist then create it.
- eid = ''

- # DF instances are statically allocated
- if nObject.plugin == PLUGIN__DF:
- eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)

- # get this instance's object
- obj = base_obj._get_instance_object(eid)
- if obj is None:
- # path should never be hit since all DF instances
- # are statically allocated.
- return 0

- elif nObject.plugin_instance:
- need_instance_object_create = False
- # Build the entity_id from the parent object if needed
- eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)
- try:
- # Need lock when reading/writing any obj.instance_objects list
- with PluginObject.lock:

- # we will take an exception if this object is not
- # in the list. The exception handling code below will
- # create and add this object for success path the next
- # time around.
- inst_obj = base_obj.instance_objects[eid]

- collectd.debug("%s %s instance %s already exists %s" %
- (PLUGIN, nObject.plugin, eid, inst_obj))
- # _print_state(inst_obj)

- except KeyError:
- need_instance_object_create = True

- if need_instance_object_create is True:
- base_obj._create_instance_object(nObject.plugin_instance)
- inst_obj = base_obj._get_instance_object(eid)
- if inst_obj:
- collectd.debug("%s %s:%s inst object created" %
- (PLUGIN,
- inst_obj.plugin,
- inst_obj.instance))
- else:
- collectd.error("%s %s:%s inst object create failed" %
- (PLUGIN,
- nObject.plugin,
- nObject.plugin_instance))
- return 0

- # re-assign the object
- obj = inst_obj
- else:
- if not len(base_obj.entity_id):
- # Build the entity_id from the parent object if needed
- eid = _build_entity_id(nObject.plugin, nObject.plugin_instance)

- # update the object with the eid if it's not already set.
- if not len(obj.entity_id): - obj.entity_id = eid - - else: - collectd.debug("%s notification for unknown plugin: %s %s" % - (PLUGIN, nObject.plugin, nObject.plugin_instance)) - return 0 - - # if obj.warnings or obj.failures: - # _print_state(obj) - - # If want_state_audit is True then run the audit. - # Primarily used for debug - # default state is False - # TODO: comment out for production code. - if want_state_audit: - obj.audit_threshold += 1 - if obj.audit_threshold == DEBUG_AUDIT: - obj.audit_threshold = 0 - obj._state_audit("audit") - - # manage reading value change ; store last and log if gt obj.step - action = obj._manage_change(nObject) - if action == "done": - return 0 - - # increment just before any possible return for a valid sample - obj.count += 1 - - # audit file system presence every time we get the - # notification for the root file system ; which will - # always be there. - if obj.instance == '/': - _clear_alarm_for_missing_filesystems() - - # exit early if there is no alarm update to be made - if base_obj._update_alarm(obj.entity_id, - severity_str, - obj.value, - obj.last_value) is False: - return 0 - - obj.last_value = round(obj.value, 2) - - if _alarm_state == fm_constants.FM_ALARM_STATE_CLEAR: - if clear_alarm(obj.id, obj.entity_id) is False: - return 0 - else: - - # manage addition of the failure reason text - if obj.cause == fm_constants.ALARM_PROBABLE_CAUSE_50: - # if this is a threshold alarm then build the reason text that - # includes the threshold and the reading that caused the assertion. - reason = obj.resource_name - reason += " threshold exceeded ;" - if obj.threshold != INVALID_THRESHOLD: - reason += " threshold {:2.2f}".format(obj.threshold) + "%," - if obj.value: - reason += " actual {:2.2f}".format(obj.value) + "%" - - elif _severity_num == fm_constants.FM_ALARM_SEVERITY_CRITICAL: - reason = obj.reason_failure - - else: - reason = obj.reason_warning - - # build the alarm object - fault = fm_api.Fault( - alarm_id=obj.id, - alarm_state=_alarm_state, - entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, - entity_instance_id=obj.entity_id, - severity=_severity_num, - reason_text=reason, - alarm_type=base_obj.alarm_type, - probable_cause=base_obj.cause, - proposed_repair_action=base_obj.repair, - service_affecting=base_obj.service_affecting, - suppression=base_obj.suppression) - - try: - alarm_uuid = api.set_fault(fault) - if pc.is_uuid_like(alarm_uuid) is False: - collectd.error("%s 'set_fault' failed ; %s:%s ; %s" % - (PLUGIN, - base_obj.id, - obj.entity_id, - alarm_uuid)) - return 0 - - except Exception as ex: - collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" % - (PLUGIN, - obj.id, - obj.entity_id, - _severity_num, - ex)) - return 0 - - # update the lists now that - base_obj._manage_alarm(obj.entity_id, severity_str) - - collectd.info("%s %s alarm %s:%s %s:%s value:%2.2f" % ( - PLUGIN, - _alarm_state, - base_obj.id, - severity_str, - obj.instance, - obj.entity_id, - obj.value)) - - # Debug only: comment out for production code. 
- # obj._state_audit("change")

- return 0


-collectd.register_init(init_func)
-collectd.register_notification(notifier_func)
diff --git a/monitoring/collectd-extensions/src/interface.conf b/monitoring/collectd-extensions/src/interface.conf
deleted file mode 100644
index de3afaf23..000000000
--- a/monitoring/collectd-extensions/src/interface.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-<Plugin "threshold">
-   <Plugin "interface">
-      <Type "percent">
-        Instance "used"
-        Persist true
-        PersistOK true
-        WarningMin 51
-        FailureMin 1
-#       Hits 2
-        Invert false
-      </Type>
-   </Plugin>
-</Plugin>
diff --git a/monitoring/collectd-extensions/src/interface.py b/monitoring/collectd-extensions/src/interface.py
deleted file mode 100755
index 82049ea7a..000000000
--- a/monitoring/collectd-extensions/src/interface.py
+++ /dev/null
@@ -1,981 +0,0 @@
-#
-# Copyright (c) 2019 Wind River Systems, Inc.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-############################################################################
-#
-# This is the Host Interface Monitor plugin for collectd.
-#
-# Only mgmt, cluster-host and oam interfaces are supported with the following
-# mapping specified in /etc/platform/platform.conf
-#
-# oam - oam_interface | controller | mandatory
-# mgmt - management_interface | all hosts | mandatory
-# clstr - cluster_host_interface | any host | optional
-#
-# This plugin queries the maintenance Link Monitor daemon 'lmon'
-# for a link status summary of that host's configured networks.
-#
-# This plugin's read_func issues an http GET request to the Link Monitor
-# which responds with a json string that represents a complete summary
-# of the monitored links, state and the time of the last event or when
-# initial status was learned. An example of the Link Monitor response is
-#
-# {
-# "status" : "pass",
-# "link_info": [
-# { "network":"mgmt",
-# "type":"vlan",
-# "links": [
-# { "name":"enp0s8.1", "state":"Up", "time":"5674323454567" },
-# { "name":"enp0s8.2", "state":"Up", "time":"5674323454567" }]
-# },
-# { "network":"clstr",
-# "type":"bond",
-# "bond":"bond0",
-# "links": [
-# { "name":"enp0s9f1", "state":"Down", "time":"5674323454567" },
-# { "name":"enp0s9f0", "state":"Up" , "time":"5674323454567" }]
-# },
-# { "network":"oam",
-# "type":"single",
-# "links": [
-# { "name":"enp0s3", "state":"Up", "time":"5674323454567" }]
-# }]
-# }
-#
-# On failure
-#
-# {
-# "status" : "fail ; bad request "
-# }
-#
-# This plugin then uses this information to manage interface alarm
-# assertion and clear with appropriate severity.
-#
-# Severity: Interface and Port levels
-#
-# Alarm Level Minor Major Critical
-# ----------- ----- --------------------- ----------------------------
-# Interface N/A One of lag pair is Up All Interface ports are Down
-# Port N/A Physical Link is Down N/A
-#
-# Sample Data: represented as % of total links Up for that network interface
-#
-# 100 or 100% percent used - all links of interface are up.
-# 50 or 50% percent used - one of lag pair is Up and the other is Down -# 0 or 0% percent used - all ports for that network are Down -# -############################################################################ - -import os -import time -import datetime -import collectd -import plugin_common as pc -from fm_api import constants as fm_constants -from fm_api import fm_api - -# Fault manager API Object -api = fm_api.FaultAPIsV2() - -# name of the plugin - all logs produced by this plugin are prefixed with this -PLUGIN = 'interface plugin' - -# Interface Monitoring Interval in seconds -PLUGIN_AUDIT_INTERVAL = 10 - -# Sample Data 'type' and 'instance' database field values. -PLUGIN_TYPE = 'percent' -PLUGIN_TYPE_INSTANCE = 'usage' - -# The Link Status Query URL -PLUGIN_HTTP_URL_PREFIX = 'http://localhost:' - -# This plugin's timeout -PLUGIN_HTTP_TIMEOUT = 5 - -# Specify the link monitor as the maintenance destination service -# full path should look like ; http://localhost:2122/mtce/lmon -PLUGIN_HTTP_URL_PATH = '/mtce/lmon' - -# Port and Interface Alarm Identifiers -PLUGIN_OAM_PORT_ALARMID = '100.106' # OAM Network Port -PLUGIN_OAM_IFACE_ALARMID = '100.107' # OAM Network Interface - -PLUGIN_MGMT_PORT_ALARMID = '100.108' # Management Network Port -PLUGIN_MGMT_IFACE_ALARMID = '100.109' # Management Network Interface - -PLUGIN_CLSTR_PORT_ALARMID = '100.110' # Cluster-host Network Port -PLUGIN_CLSTR_IFACE_ALARMID = '100.111' # Cluster-host Nwk Interface - -# List of all alarm identifiers. -ALARM_ID_LIST = [PLUGIN_OAM_PORT_ALARMID, - PLUGIN_OAM_IFACE_ALARMID, - PLUGIN_MGMT_PORT_ALARMID, - PLUGIN_MGMT_IFACE_ALARMID, - PLUGIN_CLSTR_PORT_ALARMID, - PLUGIN_CLSTR_IFACE_ALARMID] - -# Monitored Network Name Strings -NETWORK_MGMT = 'mgmt' -NETWORK_CLSTR = 'cluster-host' -NETWORK_OAM = 'oam' - -# Port / Interface State strings -LINK_UP = 'Up' -LINK_DOWN = 'Down' - -# Alarm control actions -ALARM_ACTION_RAISE = 'raise' -ALARM_ACTION_CLEAR = 'clear' - -# Alarm level. -# Ports are the lowest level and represent a physical link -# Interfaces are port groupings in terms of LAG -LEVEL_PORT = 'port' -LEVEL_IFACE = 'interface' - -# Run phases -RUN_PHASE__INIT = 0 -RUN_PHASE__ALARMS_CLEARED = 1 -RUN_PHASE__HTTP_REQUEST_PASS = 2 - - -# Link Object (aka Port or Physical interface) Structure -# and member functions. -class LinkObject: - - def __init__(self, alarm_id): - - self.name = None - self.state = LINK_UP - self.timestamp = float(0) - self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR - self.alarm_id = alarm_id - self.state_change = True - - collectd.debug("%s LinkObject constructor: %s" % - (PLUGIN, alarm_id)) - - ################################################################## - # - # Name : raise_port_alarm - # - # Purpose : This link object member function is used to - # raise link/port alarms. - # - # Parameters : Network the link is part of. 
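- #
- # Example : (illustrative usage sketch, not plugin code)
- #
- #   link = LinkObject(PLUGIN_MGMT_PORT_ALARMID)
- #   link.name = 'enp0s8'
- #   link.raise_port_alarm(NETWORK_MGMT)  # asserts a major port alarm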
- # - # Returns : False on failure - # True on success - # - ################################################################## - def raise_port_alarm(self, network): - """Raise a port alarm""" - - if self.severity != fm_constants.FM_ALARM_SEVERITY_MAJOR: - - if manage_alarm(self.name, - network, - LEVEL_PORT, - ALARM_ACTION_RAISE, - fm_constants.FM_ALARM_SEVERITY_MAJOR, - self.alarm_id, - self.timestamp) is True: - - self.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR - collectd.info("%s %s %s port alarm raised" % - (PLUGIN, self.name, self.alarm_id)) - return True - else: - return False - else: - return True - - ################################################################## - # - # Name : clear_port_alarm - # - # Purpose : This link object member function is used to - # clear link/port alarms. - # - # Parameters : Network the link is part of. - # - # Returns : False on failure - # True on success. - # - ################################################################## - def clear_port_alarm(self, network): - """Clear a port alarm""" - - if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: - if manage_alarm(self.name, - network, - LEVEL_PORT, - ALARM_ACTION_CLEAR, - fm_constants.FM_ALARM_SEVERITY_CLEAR, - self.alarm_id, - self.timestamp) is True: - - collectd.info("%s %s %s port alarm cleared" % - (PLUGIN, self.name, self.alarm_id)) - self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR - return True - else: - return False - else: - return True - - -# Interface (aka Network) Level Object Structure and member functions -class NetworkObject: - - def __init__(self, name): - - self.name = name - self.sample = 0 - self.sample_last = 0 - self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR - self.degraded = False - self.timestamp = float(0) - - # add the respective alarm IDs to each object - alarm_id = None - if name == NETWORK_OAM: - alarm_id = PLUGIN_OAM_PORT_ALARMID - self.alarm_id = PLUGIN_OAM_IFACE_ALARMID - elif name == NETWORK_MGMT: - alarm_id = PLUGIN_MGMT_PORT_ALARMID - self.alarm_id = PLUGIN_MGMT_IFACE_ALARMID - elif name == NETWORK_CLSTR: - alarm_id = PLUGIN_CLSTR_PORT_ALARMID - self.alarm_id = PLUGIN_CLSTR_IFACE_ALARMID - else: - self.alarm_id = "" - collectd.error("%s unexpected network (%s)" % (PLUGIN, name)) - - collectd.debug("%s %s NetworkObject constructor: %s" % - (PLUGIN, name, self.alarm_id)) - - if alarm_id: - self.link_one = LinkObject(alarm_id) - self.link_two = LinkObject(alarm_id) - - ################################################################## - # - # Name : raise_iface_alarm - # - # Purpose : This network object member function used to - # raise interface alarms. 
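- #
- # Example : (illustrative usage sketch, not plugin code)
- #
- #   net = NetworkObject(NETWORK_OAM)
- #   net.raise_iface_alarm(fm_constants.FM_ALARM_SEVERITY_MAJOR)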
- # - # Parameters : None - # - # Returns : False on failure - # True on success - # - ################################################################## - def raise_iface_alarm(self, severity): - """Raise an interface alarm""" - - if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR: - collectd.error("%s %s raise alarm called with clear severity" % - (PLUGIN, self.name)) - return True - - if self.severity != severity: - if manage_alarm(self.name, - self.name, - LEVEL_IFACE, - ALARM_ACTION_RAISE, - severity, - self.alarm_id, - self.timestamp) is True: - - self.severity = severity - collectd.info("%s %s %s %s interface alarm raised" % - (PLUGIN, - self.name, - self.alarm_id, - pc.get_severity_str(severity))) - return True - else: - return False - else: - return True - - ################################################################## - # - # Name : clear_iface_alarm - # - # Purpose : This network object member function used to - # clear interface alarms. - # - # Parameters : None - # - # Returns : False on failure - # True on success. - # - ################################################################## - def clear_iface_alarm(self): - """Clear an interface alarm""" - - if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: - if manage_alarm(self.name, - self.name, - LEVEL_IFACE, - ALARM_ACTION_CLEAR, - fm_constants.FM_ALARM_SEVERITY_CLEAR, - self.alarm_id, - self.timestamp) is True: - - collectd.info("%s %s %s %s interface alarm cleared" % - (PLUGIN, - self.name, - self.alarm_id, - pc.get_severity_str(self.severity))) - self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR - return True - else: - return False - else: - return True - - ###################################################################### - # - # Name : manage_iface_alarm - # - # Purpose : clear or raise appropriate severity level interface alarm - # - # Returns : None - # - ###################################################################### - def manage_iface_alarm(self): - # Single Link Config - if self.link_two.name is None: - if self.link_one.state == LINK_DOWN: - if self.severity != fm_constants.FM_ALARM_SEVERITY_CRITICAL: - self.timestamp = self.link_one.timestamp - self.raise_iface_alarm( - fm_constants.FM_ALARM_SEVERITY_CRITICAL) - elif self.link_one.state == LINK_UP: - if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: - self.clear_iface_alarm() - - # Lagged Link Config - # - # The interface level timestamp is updated based on the failed - # link timestamps - elif self.link_one.state == LINK_UP and \ - self.link_two.state == LINK_DOWN: - if self.severity != fm_constants.FM_ALARM_SEVERITY_MAJOR: - self.timestamp = self.link_two.timestamp - self.raise_iface_alarm(fm_constants.FM_ALARM_SEVERITY_MAJOR) - - elif self.link_one.state == LINK_DOWN and \ - self.link_two.state == LINK_UP: - if self.severity != fm_constants.FM_ALARM_SEVERITY_MAJOR: - self.timestamp = self.link_one.timestamp - self.raise_iface_alarm(fm_constants.FM_ALARM_SEVERITY_MAJOR) - - elif self.link_one.state == LINK_UP and self.link_two.state == LINK_UP: - if self.severity != fm_constants.FM_ALARM_SEVERITY_CLEAR: - self.clear_iface_alarm() - - elif self.link_one.state == LINK_DOWN and \ - self.link_two.state == LINK_DOWN: - if self.severity != fm_constants.FM_ALARM_SEVERITY_CRITICAL: - if self.link_one.timestamp > self.link_two.timestamp: - self.timestamp = self.link_one.timestamp - else: - self.timestamp = self.link_two.timestamp - self.raise_iface_alarm(fm_constants.FM_ALARM_SEVERITY_CRITICAL) - - -# Plugin Control Object -obj = 
pc.PluginObject(PLUGIN, PLUGIN_HTTP_URL_PREFIX) - - -# Network Object List - Primary Network/Link Control Object -NETWORKS = [NetworkObject(NETWORK_MGMT), - NetworkObject(NETWORK_OAM), - NetworkObject(NETWORK_CLSTR)] - - -########################################################################## -# -# Name : get_timestamp -# -# Purpose : Convert the long long int microsecond time as string -# that accompany link info from the Link Monitor (lmond) -# and catch exceptions in doing so. -# -# Parameters: lmon_time - long long int as string -# -# Returns : float time that can be consumed by datetime.fromtimestamp -# -# Returns same unit of now time if provided lmon_time is -# invalid. -# -########################################################################## -def get_timestamp(lmon_time): - """Convert lmon time to fm timestamp time""" - - if lmon_time: - try: - return(float(float(lmon_time) / 1000000)) - except: - collectd.error("%s failed to parse timestamp ;" - " using current time" % PLUGIN) - else: - collectd.error("%s no timestamp ;" - " using current time" % PLUGIN) - - return(float(time.time())) - - -def dump_network_info(network): - """Log the specified network info""" - - link_one_event_time = datetime.datetime.fromtimestamp( - float(network.link_one.timestamp)).strftime('%Y-%m-%d %H:%M:%S') - - link_two_info = '' - if network.link_two.name is not None: - link_two_event_time = datetime.datetime.fromtimestamp( - float(network.link_two.timestamp)).strftime('%Y-%m-%d %H:%M:%S') - - link_two_info += "; link two '" - link_two_info += network.link_two.name - link_two_info += "' went " + network.link_two.state - link_two_info += " at " + link_two_event_time - - pcnt = '%' - - collectd.info("%s %5s %3d%c ; " - "link one '%s' went %s at %s %s" % - (PLUGIN, - network.name, - network.sample, - pcnt, - network.link_one.name, - network.link_one.state, - link_one_event_time, - link_two_info)) - - -######################################################################### -# -# Name : this_hosts_alarm -# -# Purpose : Determine if the supplied eid is for this host. -# -# Description: The eid formats for the alarms managed by this plugin are -# -# host=.port= -# host=.interface= -# -# Assumptions: There is no restriction preventing the system -# administrator from creating hostnames with period's ('.') -# in them. Because so the eid cannot simply be split -# around '='s and '.'s. Instead its split around this -# plugins level type '.port' or '.interface'. -# -# Returns : True if hostname is a match -# False otherwise -# -########################################################################## -def this_hosts_alarm(hostname, eid): - """Check if the specified eid is for this host""" - - if hostname: - if eid: - # 'host=controller-0.interface=mgmt' - try: - eid_host = None - eid_disected = eid.split('=') - if len(eid_disected) == 3: - # ['host', 'controller-0.interface', 'mgmt'] - if len(eid_disected[1].split('.port')) == 2: - eid_host = eid_disected[1].split('.port')[0] - if eid_host and eid_host == hostname: - return True - elif len(eid_disected[1].split('.interface')) == 2: - eid_host = eid_disected[1].split('.interface')[0] - if eid_host and eid_host == hostname: - return True - except Exception as ex: - collectd.error("%s failed to parse alarm eid (%s)" - " [eid:%s]" % (PLUGIN, str(ex), eid)) - - return False - - -########################################################################## -# -# Name : clear_alarms -# -# Purpose : Clear all interface alarms on process startup. 
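-#
-# Example : (illustrative) the alarm eids handled here take the form
-#
-#   host=controller-0.port=enp0s8
-#   host=controller-0.interface=mgmt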
-# -# Description: Called after first successful Link Status query. -# -# Loops over the provided alarm id list querying all alarms -# for each. Any that are raised are precisely cleared. -# -# Prevents stuck alarms over port and interface reconfig. -# -# If the original alarm case still exists the alarm will -# be re-raised with the original link event timestamp that -# is part of the Link Status query response. -# -# Parameters : A list of this plugin's alarm ids -# -# Returns : True on Success -# False on Failure -# -########################################################################## -def clear_alarms(alarm_id_list): - """Clear alarm state of all plugin alarms""" - found = False - for alarm_id in alarm_id_list: - - try: - alarms = api.get_faults_by_id(alarm_id) - except Exception as ex: - collectd.error("%s 'get_faults_by_id' exception ;" - " %s ; %s" % - (PLUGIN, alarm_id, ex)) - return False - - if alarms: - for alarm in alarms: - eid = alarm.entity_instance_id - if this_hosts_alarm(obj.hostname, eid) is False: - # ignore other host alarms - continue - - if alarm_id == PLUGIN_OAM_PORT_ALARMID or \ - alarm_id == PLUGIN_OAM_IFACE_ALARMID or \ - alarm_id == PLUGIN_MGMT_PORT_ALARMID or \ - alarm_id == PLUGIN_MGMT_IFACE_ALARMID or \ - alarm_id == PLUGIN_CLSTR_PORT_ALARMID or \ - alarm_id == PLUGIN_CLSTR_IFACE_ALARMID: - - try: - if api.clear_fault(alarm_id, eid) is False: - collectd.info("%s %s:%s:%s alarm already cleared" % - (PLUGIN, - alarm.severity, - alarm_id, - eid)) - else: - found = True - collectd.info("%s %s:%s:%s alarm cleared" % - (PLUGIN, - alarm.severity, - alarm_id, - eid)) - except Exception as ex: - collectd.error("%s 'clear_fault' exception ; " - "%s:%s ; %s" % - (PLUGIN, alarm_id, eid, ex)) - return False - if found is False: - collectd.info("%s found no startup alarms" % PLUGIN) - - return True - - -########################################################################## -# -# Name : manage_alarm -# -# Purpose : Raises or clears port and interface alarms based on -# calling parameters. -# -# Returns : True on success -# False on failure -# -########################################################################## -def manage_alarm(name, network, level, action, severity, alarm_id, timestamp): - """Manage raise and clear of port and interface alarms""" - - ts = datetime.datetime.fromtimestamp( - float(timestamp)).strftime('%Y-%m-%d %H:%M:%S') - collectd.debug("%s %s %s %s alarm for %s:%s [%s] %s" % (PLUGIN, - severity, level, alarm_id, network, name, action, ts)) - - if action == ALARM_ACTION_CLEAR: - alarm_state = fm_constants.FM_ALARM_STATE_CLEAR - reason = '' - repair = '' - else: - # reason ad repair strings are only needed on alarm assertion - alarm_state = fm_constants.FM_ALARM_STATE_SET - reason = "'" + network.upper() + "' " + level - repair = 'Check cabling and far-end port configuration ' \ - 'and status on adjacent equipment.' - - # build the alarm eid and name string - if level == LEVEL_PORT: - eid = 'host=' + obj.hostname + "." + level + '=' + name - reason += " failed" - else: - eid = 'host=' + obj.hostname + "." 
+ level + '=' + network - if severity == fm_constants.FM_ALARM_SEVERITY_MAJOR: - reason += " degraded" - else: - reason += " failed" - - if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR: - try: - if api.clear_fault(alarm_id, eid) is False: - collectd.info("%s %s:%s alarm already cleared" % - (PLUGIN, alarm_id, eid)) - else: - collectd.info("%s %s:%s alarm cleared" % - (PLUGIN, alarm_id, eid)) - return True - - except Exception as ex: - collectd.error("%s 'clear_fault' failed ; %s:%s ; %s" % - (PLUGIN, alarm_id, eid, ex)) - return False - - else: - fault = fm_api.Fault( - uuid="", - alarm_id=alarm_id, - alarm_state=alarm_state, - entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, - entity_instance_id=eid, - severity=severity, - reason_text=reason, - alarm_type=fm_constants.FM_ALARM_TYPE_7, - probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN, - proposed_repair_action=repair, - service_affecting=True, - timestamp=ts, - suppression=True) - - try: - alarm_uuid = api.set_fault(fault) - except Exception as ex: - collectd.error("%s 'set_fault' exception ; %s:%s ; %s" % - (PLUGIN, alarm_id, eid, ex)) - return False - - if pc.is_uuid_like(alarm_uuid) is False: - collectd.error("%s 'set_fault' failed ; %s:%s ; %s" % - (PLUGIN, alarm_id, eid, alarm_uuid)) - return False - else: - return True - - -# The config function - called once on collectd process startup -def config_func(config): - """Configure the plugin""" - - # Need to update the Link Status Query URL with the port number. - url_updated = False - - # The Link Monitor port number is first searched for in - # the /etc/mtc/lmond.conf file. - # If its not there then its taken from the plugin config. - - # /etc/mtc/lmond.conf - fn = '/etc/mtc/lmond.conf' - if (os.path.exists(fn)): - try: - with open(fn, 'r') as infile: - for line in infile: - if 'lmon_query_port' in line: - if isinstance(int(line.split()[2]), int): - - # add the port - obj.url += line.split()[2] - - # add the path /mtce/lmon - obj.url += PLUGIN_HTTP_URL_PATH - - url_updated = "config file" - break - except EnvironmentError as e: - collectd.error(str(e), UserWarning) - - if url_updated is False: - # Try the config as this might be updated by manifest - for node in config.children: - key = node.key.lower() - val = int(node.values[0]) - if key == 'port': - if isinstance(int(val), int): - - # add the port - obj.url += str(val) - - # add the path /mtce/lmon - obj.url += PLUGIN_HTTP_URL_PATH - - url_updated = "manifest" - break - - if url_updated: - collectd.info("%s configured by %s [%s]" % - (PLUGIN, url_updated, obj.url)) - obj.config_done = True - else: - collectd.error("%s config failure ; cannot monitor" % - (PLUGIN)) - return 0 - - -# The init function - called once on collectd process startup -def init_func(): - """Init the plugin""" - - if obj.config_done is False: - collectd.info("%s configuration failed" % PLUGIN) - time.sleep(300) - return False - - if obj.init_done is False: - if obj.init_ready() is False: - return 0 - - obj.hostname = obj.gethostname() - obj.init_done = True - collectd.info("%s initialization complete" % PLUGIN) - - return 0 - - -# The sample read function - called on every audit interval -def read_func(): - """collectd interface monitor plugin read function""" - - if obj.init_done is False: - init_func() - return 0 - - if obj.phase < RUN_PHASE__ALARMS_CLEARED: - - # clear all alarms on first audit - # - # block on fm availability - # - # If the existing raised alarms are still valid then - # they will be re-raised with the same timestamp the - # 
original event occurred at once auditing resumes. - if clear_alarms(ALARM_ID_LIST) is False: - collectd.error("%s failed to clear existing alarms ; " - "retry next audit" % PLUGIN) - - # Don't proceed till we can communicate with FM and - # clear all existing interface and port alarms. - return 0 - else: - obj.phase = RUN_PHASE__ALARMS_CLEARED - - # Throttle HTTP request error retries - if obj.http_retry_count != 0: - obj.http_retry_count += 1 - if obj.http_retry_count > obj.HTTP_RETRY_THROTTLE: - obj.http_retry_count = 0 - return 0 - - # Issue query and construct the monitoring object - success = obj.make_http_request(to=PLUGIN_HTTP_TIMEOUT) - - if success is False: - obj.http_retry_count += 1 - return 0 - - if len(obj.jresp) == 0: - collectd.error("%s no json response from http request" % PLUGIN) - obj.http_retry_count += 1 - return 0 - - # Check query status - try: - if obj.jresp['status'] != 'pass': - collectd.error("%s link monitor query %s" % - (PLUGIN, obj.jresp['status'])) - obj.http_retry_count += 1 - return 0 - - except Exception as ex: - collectd.error("%s http request get reason failed ; %s" % - (PLUGIN, str(ex))) - collectd.info("%s resp:%d:%s" % - (PLUGIN, len(obj.jresp), obj.jresp)) - obj.http_retry_count += 1 - return 0 - - # log the first query response - if obj.audits == 0: - collectd.info("%s Link Status Query Response:%d:\n%s" % - (PLUGIN, len(obj.jresp), obj.jresp)) - - # uncomment below for debug purposes - # - # for network in NETWORKS: - # dump_network_info(network) - - try: - link_info = obj.jresp['link_info'] - for network_link_info in link_info: - collectd.debug("%s parse link info:%s" % - (PLUGIN, network_link_info)) - for network in NETWORKS: - if network.name == network_link_info['network']: - links = network_link_info['links'] - nname = network.name - if len(links) > 0: - link_one = links[0] - - # get initial link one name - if network.link_one.name is None: - network.link_one.name = link_one['name'] - - network.link_one.timestamp =\ - float(get_timestamp(link_one['time'])) - - # load link one state - if link_one['state'] == LINK_UP: - collectd.debug("%s %s IS Up [%s]" % - (PLUGIN, network.link_one.name, - network.link_one.state)) - if network.link_one.state != LINK_UP: - network.link_one.state_change = True - network.link_one.clear_port_alarm(nname) - network.link_one.state = LINK_UP - else: - collectd.debug("%s %s IS Down [%s]" % - (PLUGIN, network.link_one.name, - network.link_one.state)) - if network.link_one.state == LINK_UP: - network.link_one.state_change = True - network.link_one.raise_port_alarm(nname) - network.link_one.state = LINK_DOWN - - if len(links) > 1: - link_two = links[1] - - # get initial link two name - if network.link_two.name is None: - network.link_two.name = link_two['name'] - - network.link_two.timestamp =\ - float(get_timestamp(link_two['time'])) - - # load link two state - if link_two['state'] == LINK_UP: - collectd.debug("%s %s IS Up [%s]" % - (PLUGIN, network.link_two.name, - network.link_two.state)) - if network.link_two.state != LINK_UP: - network.link_two.state_change = True - network.link_two.clear_port_alarm(nname) - network.link_two.state = LINK_UP - else: - collectd.debug("%s %s IS Down [%s]" % - (PLUGIN, network.link_two.name, - network.link_two.state)) - if network.link_two.state == LINK_UP: - network.link_two.state_change = True - network.link_two.raise_port_alarm(nname) - network.link_two.state = LINK_DOWN - - # manage interface alarms - network.manage_iface_alarm() - - except Exception as ex: - collectd.error("%s 
link monitor query parse exception ; %s ; %s" %
- (PLUGIN, str(ex), obj.resp))

- # handle state changes
- for network in NETWORKS:
- if network.link_two.name is not None and \
- network.link_one.state_change is True:

- if network.link_one.state == LINK_UP:
- collectd.info("%s %s link one '%s' is Up" %
- (PLUGIN,
- network.name,
- network.link_one.name))
- else:
- collectd.info("%s %s link one '%s' is Down" %
- (PLUGIN,
- network.name,
- network.link_one.name))

- if network.link_two.name is not None and \
- network.link_two.state_change is True:

- if network.link_two.state == LINK_UP:
- collectd.info("%s %s link two '%s' is Up" %
- (PLUGIN,
- network.name,
- network.link_two.name))
- else:
- collectd.info("%s %s link two '%s' is Down" %
- (PLUGIN,
- network.name,
- network.link_two.name))

- # Dispatch usage value to collectd
- val = collectd.Values(host=obj.hostname)
- val.plugin = 'interface'
- val.type = 'percent'
- val.type_instance = 'used'

- # For each interface [ mgmt, oam, cluster-host ]
- # calculate the percentage used sample
- # sample = 100 % when all its links are up
- # sample = 0 % when all its links are down
- # sample = 50 % when one of a lagged group is down
- for network in NETWORKS:

- if network.link_one.name is not None:

- val.plugin_instance = network.name

- network.sample = 0

- if network.link_two.name is not None:
- # lagged

- if network.link_one.state == LINK_UP:
- network.sample = 50
- if network.link_two.state == LINK_UP:
- network.sample += 50
- else:
- if network.link_one.state == LINK_UP:
- network.sample = 100
- val.dispatch(values=[network.sample])

- if network.link_one.state_change is True or \
- network.link_two.state_change is True:

- dump_network_info(network)

- network.link_one.state_change = False
- network.link_two.state_change = False

- network.sample_last = network.sample

- else:
- collectd.debug("%s %s network not provisioned" %
- (PLUGIN, network.name))
- obj.audits += 1

- return 0


-# register the config, init and read functions
-collectd.register_config(config_func)
-collectd.register_init(init_func)
-collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL)
diff --git a/monitoring/collectd-extensions/src/memory.conf b/monitoring/collectd-extensions/src/memory.conf
deleted file mode 100644
index 5e5195f09..000000000
--- a/monitoring/collectd-extensions/src/memory.conf
+++ /dev/null
@@ -1,21 +0,0 @@
-# For stock plugin only
-# Uncomment to compare stock to tiS plugin readings
-# ---------------------
-#
-# ValuesAbsolute false
-# ValuesPercentage true
-#
-
-<Plugin "threshold">
-   <Plugin "memory">
-      <Type "percent">
-        Instance "used"
-        Persist true
-        PersistOK true
-        WarningMax 80.00
-        FailureMax 90.00
-        Hits 2
-        Invert false
-      </Type>
-   </Plugin>
-</Plugin>
diff --git a/monitoring/collectd-extensions/src/memory.py b/monitoring/collectd-extensions/src/memory.py
deleted file mode 100755
index 933763697..000000000
--- a/monitoring/collectd-extensions/src/memory.py
+++ /dev/null
@@ -1,279 +0,0 @@
-#
-# Copyright (c) 2018-2019 Wind River Systems, Inc.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-############################################################################
-#
-# This file is the collectd 'Platform Memory Usage' Monitor.
-#
-# The Platform Memory Usage is calculated as the percentage of anonymous
-# memory in use relative to the total memory available to the platform.
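-#
-# Example : (illustrative numbers) in the default, non-strict mode
-#
-#   avail = MemFree + Buffers + Cached + SReclaimable
-#   total = avail + AnonPages
-#   usage = 100 * AnonPages / total
-#
-#   e.g. AnonPages of 2 GiB against a 6 GiB avail gives 25% usage.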
-#
-# Init Function:
-# - query/store the hostname and the kernel memory 'overcommit' setting
-#
-############################################################################
-import os
-import collectd

-debug = False

-PLUGIN = 'platform memory usage'
-PLUGIN_NUMA = 'numa memory usage'
-PLUGIN_HUGE = 'hugepage memory usage'


-# Memory usage control class
-class MEM:
- hostname = "" # hostname for sample notification message
- cmd = '/proc/meminfo' # the query command
- value = float(0.0) # float value of memory usage

- # meminfo values we care about
- memTotal_kB = 0
- memFree_kB = 0
- buffers = 0
- cached = 0
- SReclaimable = 0
- CommitLimit = 0
- Committed_AS = 0
- HugePages_Total = 0
- HugePages_Free = 0
- Hugepagesize = 0
- AnonPages = 0
- FilePages = 0

- # derived values
- avail = 0
- total = 0
- strict = 0


-# Instantiate the class
-obj = MEM()


-def log_meminfo(plugin, name, meminfo):
- """Log the supplied meminfo"""

- if debug is False:
- return

- collectd.info("%s %s" % (plugin, name))
- collectd.info("%s ---------------------------" % plugin)
- collectd.info("%s memTotal_kB : %f" % (plugin, meminfo.memTotal_kB))
- collectd.info("%s memFree_kB : %f" % (plugin, meminfo.memFree_kB))
- collectd.info("%s Buffers : %f" % (plugin, meminfo.buffers))
- collectd.info("%s Cached : %f" % (plugin, meminfo.cached))
- collectd.info("%s SReclaimable : %f" % (plugin, meminfo.SReclaimable))
- collectd.info("%s CommitLimit : %f" % (plugin, meminfo.CommitLimit))
- collectd.info("%s Committed_AS : %f" % (plugin, meminfo.Committed_AS))
- collectd.info("%s HugePages_Total: %f" % (plugin, meminfo.HugePages_Total))
- collectd.info("%s HugePages_Free : %f" % (plugin, meminfo.HugePages_Free))
- collectd.info("%s Hugepagesize : %f" % (plugin, meminfo.Hugepagesize))
- collectd.info("%s AnonPages : %f" % (plugin, meminfo.AnonPages))


-def config_func(config):
- """Configure the memory usage plugin"""

- for node in config.children:
- key = node.key.lower()
- val = node.values[0]

- if key == 'path':
- obj.cmd = str(val)
- collectd.info("%s configured query command: '%s'" %
- (PLUGIN, obj.cmd))
- return 0

- collectd.info("%s no config command provided ; "
- "defaulting to '%s'" %
- (PLUGIN, obj.cmd))


-# Load the hostname and kernel memory 'overcommit' setting.
-def init_func():
- # get current hostname
- obj.hostname = os.uname()[1]

- # get strict setting
- #
- # a value of 0 means "heuristic overcommit"
- # a value of 1 means "always overcommit"
- # a value of 2 means "don't overcommit".
- #
- # set strict=1 (true) if the value read is 2 ;
- # otherwise strict stays 0 (false, the default)

- fn = '/proc/sys/vm/overcommit_memory'
- if os.path.exists(fn):
- with open(fn, 'r') as infile:
- for line in infile:
- if int(line) == 2:
- obj.strict = 1
- break

- collectd.info("%s strict:%d" % (PLUGIN, obj.strict))


-# Calculate the memory usage samples
-def read_func():
- meminfo = {}
- try:
- with open(obj.cmd) as fd:
- for line in fd:
- meminfo[line.split(':')[0]] = line.split(':')[1].strip()

- except EnvironmentError as e:
- collectd.error("%s unable to read from %s ; %s" %
- (PLUGIN, obj.cmd, str(e)))
- return 0

- # set up the sample structure
- val = collectd.Values(host=obj.hostname)
- val.type = 'percent'
- val.type_instance = 'used'

- # fit_value = 0
- # if os.path.exists('/var/run/fit/mem_data'):
- # with open('/var/run/fit/mem_data', 'r') as infile:
- # for line in infile:
- # fit_value = float(line)
- # collectd.info("%s using FIT data:%.2f" %
- # (PLUGIN, fit_value))
- # break

- # remove the 'unit' (kB) suffix that might be on some of the lines
- for line in meminfo:
- # remove the units from the value read
- value_unit = [u.strip() for u in meminfo[line].split(' ', 1)]
- if len(value_unit) == 2:
- value, unit = value_unit
- meminfo[line] = float(value)
- else:
- meminfo[line] = float(meminfo[line])

- obj.memTotal_kB = float(meminfo['MemTotal'])
- obj.memFree_kB = float(meminfo['MemFree'])
- obj.buffers = float(meminfo['Buffers'])
- obj.cached = float(meminfo['Cached'])
- obj.SReclaimable = float(meminfo['SReclaimable'])
- obj.CommitLimit = float(meminfo['CommitLimit'])
- obj.Committed_AS = float(meminfo['Committed_AS'])
- obj.HugePages_Total = float(meminfo['HugePages_Total'])
- obj.HugePages_Free = float(meminfo['HugePages_Free'])
- obj.Hugepagesize = float(meminfo['Hugepagesize'])
- obj.AnonPages = float(meminfo['AnonPages'])

- log_meminfo(PLUGIN, "/proc/meminfo", obj)

- obj.avail = float(float(obj.memFree_kB) +
- float(obj.buffers) +
- float(obj.cached) +
- float(obj.SReclaimable))
- obj.total = float(float(obj.avail) +
- float(obj.AnonPages))

- if obj.strict == 1:
- obj.value = float(float(obj.Committed_AS) / float(obj.CommitLimit))
- else:
- obj.value = float(float(obj.AnonPages) / float(obj.total))
- obj.value = float(float(obj.value) * 100)

- # if fit_value != 0:
- # obj.value = fit_value

- if debug is True:
- collectd.info("%s ---------------------------" % PLUGIN)
- collectd.info("%s memAvail: %d" % (PLUGIN, obj.avail))
- collectd.info("%s memTotal: %d" % (PLUGIN, obj.total))
- collectd.info('%s reports %.2f %% usage' % (PLUGIN, obj.value))

- # Dispatch usage value to collectd
- val.plugin = 'memory'
- val.plugin_instance = 'platform'
- val.dispatch(values=[obj.value])

- #####################################################################
- # Now get the Numa Node Memory Usage
- #####################################################################
- numa_node_files = []
- fn = "/sys/devices/system/node/"
- files = os.listdir(fn)
- for file in files:
- if 'node' in file:
- numa_node_files.append(fn + file + '/meminfo')

- for numa_node in numa_node_files:
- meminfo = {}
- try:
- with open(numa_node) as fd:
- for line in fd:
- meminfo[line.split()[2][0:-1]] = line.split()[3].strip()

- obj.memFree_kB = float(meminfo['MemFree'])
- obj.FilePages = float(meminfo['FilePages'])
- obj.SReclaimable = float(meminfo['SReclaimable'])
- obj.AnonPages = float(meminfo['AnonPages'])
- obj.HugePages_Total = float(meminfo['HugePages_Total'])
- obj.HugePages_Free = float(meminfo['HugePages_Free'])

- log_meminfo(PLUGIN, numa_node, obj)

- avail = float(float(obj.memFree_kB) +
- float(obj.FilePages) +
- float(obj.SReclaimable))
- total = float(float(avail) +
- float(obj.AnonPages))
- obj.value = float(float(obj.AnonPages)) / float(total)
- obj.value = float(float(obj.value) * 100)

- # if fit_value != 0:
- # obj.value = fit_value

- # Dispatch usage value to collectd for this numa node
- val.plugin_instance = numa_node.split('/')[5]
- val.dispatch(values=[obj.value])

- collectd.debug('%s reports %s at %.2f %% usage (%s)' %
- (PLUGIN_NUMA,
- val.plugin,
- obj.value,
- val.plugin_instance))

- # Numa Node Huge Page Memory Monitoring
- #
- # Only monitor if there is Huge Page Memory
- if obj.HugePages_Total > 0:
- obj.value = \
- float(float(obj.HugePages_Total -
- obj.HugePages_Free)) / \
- float(obj.HugePages_Total)
- obj.value = float(float(obj.value) * 100)

- # if fit_value != 0:
- # obj.value = fit_value

- # Dispatch huge page memory usage value
- # to collectd for this numa node.
- val.plugin_instance = numa_node.split('/')[5] + '_hugepages'
- val.dispatch(values=[obj.value])

- collectd.debug('%s reports %s at %.2f %% usage (%s)' %
- (PLUGIN_HUGE,
- val.plugin,
- obj.value,
- val.plugin_instance))

- except EnvironmentError as e:
- collectd.error("%s unable to read from %s ; %s" %
- (PLUGIN_NUMA, numa_node, str(e)))

- return 0


-collectd.register_config(config_func)
-collectd.register_init(init_func)
-collectd.register_read(read_func)
diff --git a/monitoring/collectd-extensions/src/mtce_notifier.py b/monitoring/collectd-extensions/src/mtce_notifier.py
deleted file mode 100755
index 14336a381..000000000
--- a/monitoring/collectd-extensions/src/mtce_notifier.py
+++ /dev/null
@@ -1,380 +0,0 @@
-#
-# Copyright (c) 2018-2019 Wind River Systems, Inc.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-#############################################################################
-#
-# This file is the collectd 'Maintenance' Notifier.
-#
-# collectd provides information about each event as an object passed to the
-# notification handler ; the notification object.
-#
-# object.host - the hostname
-#
-# object.plugin - the name of the plugin aka resource
-# object.plugin_instance - plugin instance string i.e. say mountpoint
-# for df plugin
-# object.type, - the unit i.e. percent or absolute
-# object.type_instance - the attribute i.e. free, used, etc
-#
-# object.severity - an integer value ; 1=failure, 2=warning, 4=okay
-# object.message - a log-able message containing the above along
-# with the value
-#
-# This notifier manages requesting mtce to assert or clear its collectd
-# host-degrade-cause flag based on notification messages sent from collectd.
-#
-# Messages to maintenance are throttled to one in every ONE_EVERY
-# notifications while this state is the same as the last state.
-#
-# A message is sent on every state change
-# from clear to assert or
-# from assert to clear
-#
-# See code comments for details.
-#
-############################################################################
-#
-# Import list

-import os
-import socket
-import collectd
-import tsconfig.tsconfig as tsc

-# This plugin name
-PLUGIN = 'degrade notifier'

-# collectd severity definitions ;
-# Note: can't seem to pull them in symbolically with a header
-NOTIF_FAILURE = 1
-NOTIF_WARNING = 2
-NOTIF_OKAY = 4

-# default mtce port.
-# ... with configuration override
-MTCE_CMD_RX_PORT = 2101

-# same state message throttle count.
-# ...
only send the degrade message every 'this' number -# while the state of assert or clear remains the same. -ONE_EVERY = 10 - -PLUGIN__DF = 'df' -PLUGIN__MEM = 'memory' -PLUGIN__CPU = 'cpu' - -PLUGIN__VSWITCH_MEM = 'vswitch_mem' -PLUGIN__VSWITCH_CPU = 'vswitch_cpu' -PLUGIN__VSWITCH_PORT = "vswitch_port" -PLUGIN__VSWITCH_IFACE = "vswitch_iface" - - -PLUGIN_INTERFACE = 'interface' -PLUGIN__EXAMPLE = 'example' - - -# The collectd Maintenance Notifier Object -class collectdMtceNotifierObject: - - def __init__(self, port): - """collectdMtceNotifierObject Class constructor""" - # default maintenance port - self.port = port - self.addr = None - - # specifies the protocol family to use when messaging maintenance. - # if system is IPV6, then that is learned and this 'protocol' is - # updated with AF_INET6 - self.protocol = socket.AF_INET - - # List of plugin names that require degrade for specified severity. - self.degrade_list__failure = [PLUGIN__DF, - PLUGIN__MEM, - PLUGIN__CPU, - PLUGIN__VSWITCH_MEM, - PLUGIN__VSWITCH_CPU, - PLUGIN__VSWITCH_PORT, - PLUGIN__VSWITCH_IFACE, - PLUGIN_INTERFACE, - PLUGIN__EXAMPLE] - self.degrade_list__warning = [PLUGIN_INTERFACE] - - # the running list of resources that require degrade. - # a degrade clear message is sent whenever this list is empty. - # a degrade assert message is sent whenever this list is not empty. - self.degrade_list = [] - - # throttle down sending of duplicate degrade assert/clear messages - self.last_state = "undef" - self.msg_throttle = 0 - - -# Instantiate the mtce_notifier object -# This object persists from notificaiton to notification -obj = collectdMtceNotifierObject(MTCE_CMD_RX_PORT) - - -def _get_active_controller_ip(): - """Get the active controller host IP""" - - try: - obj.addr = socket.getaddrinfo('controller', None)[0][4][0] - collectd.info("%s controller ip: %s" % (PLUGIN, obj.addr)) - except Exception as ex: - obj.addr = None - collectd.error("%s failed to get controller ip ; %s" % - (PLUGIN, str(ex))) - return 0 - - -def _df_instance_to_path(df_inst): - """Convert a df instance name to a mountpoint""" - - # df_root is not a dynamic file system. Ignore that one. - if df_inst == 'df_root': - return '/' - else: - # For all others replace all '-' with '/' - return('/' + df_inst[3:].replace('-', '/')) - - -# This function removes degraded file systems that are no longer present. -def _clear_degrade_for_missing_filesystems(): - """Remove degraded file systems that are no longer mounted or present""" - - for df_inst in obj.degrade_list: - - # Only file system plugins are looked at. - # File system plugin instance names are prefixed with 'df_' - # as the first 3 chars in the instance name. - if df_inst[0:3] == 'df_': - path = _df_instance_to_path(df_inst) - - # check the mount point. - # if the mount point no longer exists then remove - # this instance from the degrade list. - if os.path.ismount(path) is False: - collectd.info("%s clearing degrade for missing %s ; %s" % - (PLUGIN, path, obj.degrade_list)) - obj.degrade_list.remove(df_inst) - - return 0 - - -# The collectd configuration interface -# -# Used to configure the maintenance port. 
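-#
-# e.g. via a hypothetical collectd python plugin config block:
-#
-#   <Module "mtce_notifier">
-#     port 2101
-#   </Module>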
-# key = 'port' -# val = port number -# -def config_func(config): - """Configure the maintenance degrade notifier plugin""" - - collectd.debug('%s config function' % PLUGIN) - for node in config.children: - key = node.key.lower() - val = node.values[0] - - if key == 'port': - obj.port = int(val) - collectd.info("%s configured mtce port: %d" % - (PLUGIN, obj.port)) - return 0 - - obj.port = MTCE_CMD_RX_PORT - collectd.error("%s no mtce port provided ; defaulting to %d" % - (PLUGIN, obj.port)) - - -# Collectd calls this function on startup. -def init_func(): - """Collectd Mtce Notifier Initialization Function""" - - obj.host = os.uname()[1] - collectd.info("%s %s:%s sending to mtce port %d" % - (PLUGIN, tsc.nodetype, obj.host, obj.port)) - - collectd.debug("%s init function" % PLUGIN) - - -# This is the Notifier function that is called by collectd. -# -# Handling steps are -# -# 1. build resource name from notification object. -# 2. check resource against severity lists. -# 3. manage this instance's degrade state. -# 4. send mtcAgent the degrade state message. -# -def notifier_func(nObject): - """Collectd Mtce Notifier Handler Function""" - - # Create the resource name from the notifier object. - # format: _ - resource = nObject.plugin - if nObject.plugin_instance: - resource += "_" + nObject.plugin_instance - - # This block looks at the current notification severity - # and manages the degrade_list. - # If the specified plugin name exists in each of the warnings - # or failure lists and there is a current severity match then - # add that resource instance to the degrade list. - # Conversly if this notification is OKAY then make sure this - # resource instance is not in the degrade list (remove it if it is) - if nObject.severity is NOTIF_OKAY: - if obj.degrade_list and resource in obj.degrade_list: - obj.degrade_list.remove(resource) - - elif nObject.severity is NOTIF_FAILURE: - if obj.degrade_list__failure: - if nObject.plugin in obj.degrade_list__failure: - if resource not in obj.degrade_list: - # handle dynamic filesystems going missing over a swact - # or unmount and being reported as a transient error by - # the df plugin. Don't add it to the failed list if the - # mountpoint is gone. - add = True - if nObject.plugin == PLUGIN__DF: - path = _df_instance_to_path(resource) - add = os.path.ismount(path) - if add is True: - collectd.info("%s %s added to degrade list" % - (PLUGIN, resource)) - obj.degrade_list.append(resource) - else: - # If severity is failure and no failures cause degrade - # then make sure this plugin is not in the degrade list, - # Should never occur. - if resource in obj.degrade_list: - obj.degrade_list.remove(resource) - - elif nObject.severity is NOTIF_WARNING: - if obj.degrade_list__warning: - if nObject.plugin in obj.degrade_list__warning: - if resource not in obj.degrade_list: - # handle dynamic filesystems going missing over a swact - # or unmount and being reported as a transient error by - # the df plugin. Don't add it to the failed list if the - # mountpoint is gone. - add = True - if nObject.plugin == PLUGIN__DF: - path = _df_instance_to_path(resource) - add = os.path.ismount(path) - if add is True: - collectd.info("%s %s added to degrade list" % - (PLUGIN, resource)) - obj.degrade_list.append(resource) - else: - # If severity is warning and no warnings cause degrade - # then make sure this plugin is not in the degrade list. - # Should never occur.. 
-            if resource in obj.degrade_list:
-                obj.degrade_list.remove(resource)
-    else:
-        collectd.info("%s unsupported severity %d" %
-                      (PLUGIN, nObject.severity))
-        return 0
-
-    # running counter of notifications.
-    obj.msg_throttle += 1
-
-    # Support for Dynamic File Systems
-    # --------------------------------
-    # Some active controller mounted filesystems can become
-    # unmounted under the watch of collectd. This can occur
-    # as a result of a Swact. If a 'degrade' is raised at the
-    # time an fs disappears then that state can become stuck
-    # active until the next Swact. This call handles that case.
-    #
-    # Audit file system presence every time we get the
-    # notification for the root file system.
-    # Depending on the root filesystem always being there.
-    if nObject.plugin == 'df' \
-            and nObject.plugin_instance == 'root' \
-            and len(obj.degrade_list):
-        _clear_degrade_for_missing_filesystems()
-
-    # If the degrade list is empty then a clear state is sent to
-    # maintenance. If the degrade list is NOT empty then an assert
-    # state is sent. For logging and to ease debugging, the code below
-    # creates a list of degraded resource instances to be included in
-    # the message to maintenance for mtcAgent to optionally log.
-    resources = ""
-    if obj.degrade_list:
-        # loop over the list ; limit the degraded resource list
-        # being sent to mtce to the first 5 entries
-        for r in obj.degrade_list[0:5]:
-            resources += r + ','
-        resources = resources[:-1]
-        state = "assert"
-    else:
-        state = "clear"
-
-    # Message throttling ....
-
-    # Avoid sending the same last state message for up to ONE_EVERY count.
-    # This just reduces load on mtcAgent.
-    if obj.last_state == state and obj.msg_throttle < ONE_EVERY:
-        return 0
-
-    # if the degrade state has changed then log it and proceed
-    if obj.last_state != state:
-        if obj.last_state != "undef":
-            collectd.info("%s degrade %s %s" %
-                          (PLUGIN,
-                           state,
-                           obj.degrade_list))
-
-    # Save state for next time
-    obj.last_state = state
-
-    # Clear the message throttle counter
-    obj.msg_throttle = 0
-
-    # Send the degrade state ; assert or clear message to mtcAgent.
-    # If we get a send failure then log it and set the addr to None
-    # so it forces us to refresh the controller address on the next
-    # notification.
-    try:
-        mtce_socket = socket.socket(obj.protocol, socket.SOCK_DGRAM)
-        if mtce_socket:
-            if obj.addr is None:
-                _get_active_controller_ip()
-                if obj.addr is None:
-                    return 0
-
-            # Create the Maintenance message.
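The maintenance message assembled just below is a small JSON document delivered as a single UDP datagram. For reference, an equivalent and less error-prone construction with json.dumps (a sketch; the helper name and target address are illustrative, port 2101 is the default defined above):

    import json
    import socket

    def build_degrade_message(hostname, state, resources):
        # same four fields as the hand-concatenated message below
        return json.dumps({'service': 'collectd_notifier',
                           'hostname': hostname,
                           'degrade': state,        # 'assert' or 'clear'
                           'resource': resources})  # comma-separated names

    msg = build_degrade_message('controller-0', 'assert', 'df_var-log,memory')
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.settimeout(1.0)
    sock.sendto(msg.encode(), ('127.0.0.1', 2101))
    sock.close()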
- message = "{\"service\":\"collectd_notifier\"," - message += "\"hostname\":\"" + nObject.host + "\"," - message += "\"degrade\":\"" + state + "\"," - message += "\"resource\":\"" + resources + "\"}" - collectd.debug("%s: %s" % (PLUGIN, message)) - - mtce_socket.settimeout(1.0) - mtce_socket.sendto(message, (obj.addr, obj.port)) - mtce_socket.close() - else: - collectd.error("%s %s failed to open socket (%s)" % - (PLUGIN, resource, obj.addr)) - except socket.error as e: - if e.args[0] == socket.EAI_ADDRFAMILY: - # Handle IPV4 to IPV6 switchover: - obj.protocol = socket.AF_INET6 - collectd.info("%s %s ipv6 addressing (%s)" % - (PLUGIN, resource, obj.addr)) - else: - collectd.error("%s %s socket error (%s) ; %s" % - (PLUGIN, resource, obj.addr, str(e))) - # try self correction - obj.addr = None - obj.protocol = socket.AF_INET - - return 0 - - -collectd.register_config(config_func) -collectd.register_init(init_func) -collectd.register_notification(notifier_func) diff --git a/monitoring/collectd-extensions/src/ntpq.conf b/monitoring/collectd-extensions/src/ntpq.conf deleted file mode 100644 index 02aebc127..000000000 --- a/monitoring/collectd-extensions/src/ntpq.conf +++ /dev/null @@ -1,13 +0,0 @@ - - - - Instance "reachable" - Persist true - PersistOK true - WarningMin 1 - FailureMin 0 - Hits 2 - Invert false - - - diff --git a/monitoring/collectd-extensions/src/ntpq.py b/monitoring/collectd-extensions/src/ntpq.py deleted file mode 100755 index d3ee4538c..000000000 --- a/monitoring/collectd-extensions/src/ntpq.py +++ /dev/null @@ -1,857 +0,0 @@ -############################################################################ -# Copyright (c) 2018-2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -############################################################################# -# -# This is the NTP connectivity monitor plugin for collectd. -# -# This plugin uses the industry standard ntpq exec to query NTP attributes. -# -# This plugin executes 'ntpq -np' to determined which provisioned servers -# are reachable. The ntpq output includes Tally Code. The tally Code is -# represented by the first character in each server's line item. -# -# The only ntpq output looked at by this plugin are the Tally Codes and -# associated IPs. -# -# Tally Code Summary: -# -# A server is considered reachable only when the Tally Code is a * or a +. -# A server is considered unreachable if the Tally Code is a ' ' (space) -# A server with a '*' Tally Code is the 'selected' server. -# -# Here is an example of the ntpq command output -# -# remote refid st t when poll reach delay offset jitter -# ============================================================================= -# +192.168.204.104 206.108.0.133 2 u 203 1024 377 0.226 -3.443 1.137 -# +97.107.129.217 200.98.196.212 2 u 904 1024 377 21.677 5.577 0.624 -# 192.95.27.155 24.150.203.150 2 u 226 1024 377 15.867 0.381 1.124 -# -97.107.129.217 200.98.196.212 2 u 904 1024 377 21.677 5.577 0.624 -# *182.95.27.155 24.150.203.150 2 u 226 1024 377 15.867 0.381 1.124 -# -# The local controller node is not to be considered a reachable server and is -# never alarmed if it is not reachable. 
-# -# Normal running modes with no alarms include -# -# 0 - All NTP servers are reachable and one is selected -# 1 - No NTP servers are provisioned -# -# Failure modes that warrant alarms include -# -# 2 - None of the NTP servers are reachable - major alarm -# 3 - Some NTP servers reachable and one is selected - server IP minor alarm -# 4 - Some NTP servers reachable but none is selected - major alarm -# -# None of these failures result in a host being degraded. -# -# This script will only be run on the controller nodes. -# -# This script logs to daemon.log with the 'collectd' process label -# -############################################################################### - -import os -import subprocess -import uuid -import collectd -from fm_api import constants as fm_constants -from fm_api import fm_api -import tsconfig.tsconfig as tsc -import socket - -api = fm_api.FaultAPIsV2() - -PLUGIN = 'NTP query plugin' -PLUGIN_INTERVAL = 600 # audit interval in secs -PLUGIN_CONF = '/etc/ntp.conf' -PLUGIN_EXEC = '/usr/sbin/ntpq' -PLUGIN_EXEC_OPTIONS = '-pn' -PLUGIN_ALARMID = "100.114" - - -# define a class here that will persist over read calls -class NtpqObject: - - # static variables set in init - hostname = '' # the name of this host - base_eid = '' # the eid for the major alarm - init_complete = False # set to true once config is complete - alarm_raised = False # True when the major alarm is asserted - - server_list_conf = [] # list of servers in the /etc/ntp.conf file - server_list_ntpq = [] # list of servers in the ntpq -np output - unreachable_servers = [] # list of unreachable servers - reachable_servers = [] # list of reachable servers - selected_server = 'None' # the ip address of the selected server - selected_server_save = 'None' # the last selected server ; note change - peer_selected = False # true when peer is selected - - # variables used to raise alarms to FM - suppression = True - service_affecting = False - name = "NTP" - alarm_type = fm_constants.FM_ALARM_TYPE_1 - cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN - repair = "Monitor and if condition persists, " - repair += "contact next level of support." - - -# This plugin's class object - persists over read calls -obj = NtpqObject() - - -############################################################################### -# -# Name : _add_unreachable_server -# -# Description: This private interface is used to add an ip to the -# unreachable servers list. -# -# Parameters : IP address -# -############################################################################### - -def _add_unreachable_server(ip=None): - """Add ip to unreachable_servers list""" - - if ip: - if ip not in obj.unreachable_servers: - collectd.debug("%s adding '%s' to unreachable servers list: %s" % - (PLUGIN, ip, obj.unreachable_servers)) - - obj.unreachable_servers.append(ip) - - collectd.info("%s added '%s' to unreachable servers list: %s" % - (PLUGIN, ip, obj.unreachable_servers)) - else: - collectd.debug("%s ip '%s' already in unreachable_servers list" % - (PLUGIN, ip)) - else: - collectd.error("%s _add_unreachable_server called with no IP" % PLUGIN) - - -############################################################################### -# -# Name : _raise_alarm -# -# Description: This private interface is used to raise NTP alarms. -# -# Parameters : Optional IP address -# -# If called with no or empty IP then a generic major alarm is raised. -# If called with an IP then an IP specific minor alarm is raised. -# -# Returns : Error indication. 
-# -# True : is error. FM call failed to set the -# alarm and needs to be retried. -# -# False: no error. FM call succeeds -# -############################################################################### - -def _raise_alarm(ip=None): - """Assert an NTP alarm""" - - if not ip: - # Don't re-raise the alarm if its already raised - if obj.alarm_raised is True: - return False - - if obj.peer_selected: - reason = "NTP cannot reach external time source; " \ - "syncing with peer controller only" - fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR - else: - reason = "NTP configuration does not contain any valid " - reason += "or reachable NTP servers." - fm_severity = fm_constants.FM_ALARM_SEVERITY_MAJOR - - eid = obj.base_eid - - else: - reason = "NTP address " - reason += ip - reason += " is not a valid or a reachable NTP server." - eid = obj.base_eid + '=' + ip - fm_severity = fm_constants.FM_ALARM_SEVERITY_MINOR - - try: - fault = fm_api.Fault( - alarm_id=PLUGIN_ALARMID, - alarm_state=fm_constants.FM_ALARM_STATE_SET, - entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, - entity_instance_id=eid, - severity=fm_severity, - reason_text=reason, - alarm_type=obj.alarm_type, - probable_cause=obj.cause, - proposed_repair_action=obj.repair, - service_affecting=obj.service_affecting, - suppression=obj.suppression) - - alarm_uuid = api.set_fault(fault) - if _is_uuid_like(alarm_uuid) is False: - - # Don't _add_unreachable_server list if the fm call failed. - # That way it will be retried at a later time. - collectd.error("%s 'set_fault' failed ; %s:%s ; %s" % - (PLUGIN, PLUGIN_ALARMID, eid, alarm_uuid)) - return 0 - else: - collectd.info("%s raised alarm %s:%s" % - (PLUGIN, - PLUGIN_ALARMID, - eid)) - if ip: - _add_unreachable_server(ip) - else: - obj.alarm_raised = True - - except Exception as ex: - collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" % - (PLUGIN, - PLUGIN_ALARMID, - eid, - fm_severity, - ex)) - return 0 - - -############################################################################### -# -# Name : _clear_base_alarm -# -# Description: This private interface is used to clear the NTP base alarm. -# -# Parameters : None -# -# Returns : Error indication. -# -# False: is error. FM call failed to clear the -# alarm and needs to be retried. -# -# True : no error. FM call succeeds -# -############################################################################### - -def _clear_base_alarm(): - """Clear the NTP base alarm""" - - try: - if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is False: - collectd.info("%s %s:%s alarm already cleared" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) - else: - collectd.info("%s %s:%s alarm cleared" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) - obj.alarm_raised = False - return True - - except Exception as ex: - collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % - (PLUGIN, - PLUGIN_ALARMID, - obj.base_eid, - ex)) - return False - - -############################################################################### -# -# Name : _remove_ip_from_unreachable_list -# -# Description: This private interface is used to remove the specified IP -# from the unreachable servers list and clear its alarm if raised. 
-# -# Parameters : IP address -# -############################################################################### - -def _remove_ip_from_unreachable_list(ip): - """Remove an IP address from the unreachable list and clear its NTP alarms""" - - # remove from unreachable list if its there - if ip and ip in obj.unreachable_servers: - - eid = obj.base_eid + '=' + ip - collectd.debug("%s trying to clear alarm %s" % (PLUGIN, eid)) - - try: - # clear the alarm if its asserted - if api.clear_fault(PLUGIN_ALARMID, eid) is True: - collectd.info("%s %s:%s alarm cleared " % - (PLUGIN, PLUGIN_ALARMID, eid)) - else: - # alarm does not exist - collectd.info("%s %s:%s alarm clear" % - (PLUGIN, PLUGIN_ALARMID, eid)) - - obj.unreachable_servers.remove(ip) - - except Exception as ex: - collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % - (PLUGIN, - PLUGIN_ALARMID, - eid, - ex)) - - -############################################################################### -# -# Name : _add_ip_to_ntpq_server_list -# -# Description: This private interface is used to create a list if servers -# found in the ntpq output. -# -# This list is used to detect and handle servers that might come -# and go between readings that might otherwise result in stuck -# alarms. -# -# Parameters : IP address -# -# Returns : nothing -# -############################################################################### - -def _add_ip_to_ntpq_server_list(ip): - """Add this IP to the list of servers that ntpq reports against""" - - if ip not in obj.server_list_ntpq: - obj.server_list_ntpq.append(ip) - - -############################################################################## -# -# Name : _cleanup_stale_servers -# -# Description: This private interface walks through each server tracking list -# removing any that it finds that are not in the ntpq server list. 
-# -# Alarms are cleared as needed to avoid stale alarms -# -# Parameters : None -# -# Returns : nothing -# -############################################################################### - -def _cleanup_stale_servers(): - """Cleanup the server IP tracking lists""" - - collectd.debug("%s CLEANUP REACHABLE: %s %s" % - (PLUGIN, obj.server_list_ntpq, obj.reachable_servers)) - for ip in obj.reachable_servers: - if ip not in obj.server_list_ntpq: - collectd.info("%s removing missing '%s' server from reachable " - "server list" % (PLUGIN, ip)) - obj.reachable_servers.remove(ip) - - collectd.debug("%s CLEANUP UNREACHABLE: %s %s" % - (PLUGIN, obj.server_list_ntpq, obj.unreachable_servers)) - for ip in obj.unreachable_servers: - if ip not in obj.server_list_ntpq: - collectd.info("%s removing missing '%s' server from unreachable " - "server list" % (PLUGIN, ip)) - _remove_ip_from_unreachable_list(ip) - - -############################################################################### -# -# Name : _get_ntp_servers -# -# Description: This private interface reads the list of ntp servers from the -# ntp.conf file -# -# Parameters : None -# -# Returns : nothing -# -# Updates : server_list_conf -# -############################################################################### - -def _get_ntp_servers(): - """Read the provisioned servers from the ntp conf file""" - - with open(PLUGIN_CONF, 'r') as infile: - for line in infile: - if line.startswith('server '): - ip = line.rstrip().split(' ')[1] - if ip not in obj.server_list_conf: - obj.server_list_conf.append(ip) - if len(obj.server_list_conf): - collectd.info("%s server list: %s" % - (PLUGIN, obj.server_list_conf)) - else: - ################################################################## - # - # Handle NTP_NOT_PROVISIONED (1) case - # - # There is no alarming for this case. - # Clear any that may have been raised. - # - ################################################################## - collectd.info("%s NTP Service Disabled ; no provisioned servers" % - PLUGIN) - - # clear all alarms - if obj.alarm_raised: - _clear_base_alarm() - - if obj.unreachable_servers: - for ip in obj.unreachable_servers: - _remove_ip_from_unreachable_list(ip) - - -############################################################################### -# -# Name : is_controller -# -# Description: This private interface returns a True if the specified ip is -# associated with a local controller. -# -# Parameters : IP address -# -# Returns : True or False -# -############################################################################### - -def _is_controller(ip): - """Returns True if this IP corresponds to one of the controllers""" - - collectd.debug("%s check if '%s' is a controller ip" % (PLUGIN, ip)) - with open('/etc/hosts', 'r') as infile: - for line in infile: - # skip over file comment lines prefixed with '#' - if line[0] == '#': - continue - # line format is 'ip' 'name' .... 
- split_line = line.split() - if len(split_line) >= 2: - # look for exact match ip that contains controller in its name - if split_line[0] == ip and 'controller' in line: - collectd.debug("%s %s is a controller" % (PLUGIN, ip)) - return True - return False - - -############################################################################### -# -# Name : _is_ip_address -# -# Description: This private interface returns: -# AF_INET if val is ipv4 -# AF_INET6 if val is ipv6 -# False if val is not a valid ip address -# -# Parameters : val is a uuid string -# -# Returns : socket.AF_INET for ipv4, socket.AF_INET6 for ipv6 -# or False for invalid -# -############################################################################### - -def _is_ip_address(val): - try: - socket.inet_pton(socket.AF_INET, val) - return socket.AF_INET - except socket.error: - pass - - try: - socket.inet_pton(socket.AF_INET6, val) - return socket.AF_INET6 - except socket.error: - pass - - return False - - -############################################################################### -# -# Name : is_uuid_like -# -# Description: This private interface returns a True if the specified value is -# a valid uuid. -# -# Parameters : val is a uuid string -# -# Returns : True or False -# -############################################################################### - -def _is_uuid_like(val): - """Returns validation of a value as a UUID""" - try: - return str(uuid.UUID(val)) == val - except (TypeError, ValueError, AttributeError): - return False - - -############################################################################### -# -# Name : config_func -# -# Description: The configuration interface this plugin publishes to collectd. -# -# collectd calls this interface one time on its process startup -# when it loads this plugin. -# -# There is currently no specific configuration options to parse -# for this plugin. -# -# Parameters : collectd config object -# -# Returns : zero -# -############################################################################### - -def config_func(config): - """Configure the plugin""" - - collectd.debug('%s config function' % PLUGIN) - return 0 - - -############################################################################### -# -# Name : init_func -# -# Description: The initialization interface this plugin publishes to collectd. -# -# collectd calls this interface one time on its process startup -# when it loads this plugin. -# -# 1. get hostname -# 2. build base entity id for the NTP alarm -# 3. query FM for existing NTP alarms -# - base alarm is maintained and state loaded if it exists -# - ntp ip minor alalrms are cleared on init. This is done to -# auto correct ntp server IP address changes over process -# restart ; avoid stuck alarms. -# -# Parameters : None -# -# Returns : zero -# -############################################################################### - -def init_func(): - - # ntp query is for controllers only - if tsc.nodetype != 'controller': - return 0 - - # do nothing till config is complete. - # init_func will be called again by read_func once config is complete. 
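The two validators above lean entirely on the standard library; a standalone check (addresses are illustrative):

    import socket

    def is_ip_address(val):
        # mirrors _is_ip_address: return the address family, else False
        for family in (socket.AF_INET, socket.AF_INET6):
            try:
                socket.inet_pton(family, val)
                return family
            except socket.error:
                pass
        return False

    print(is_ip_address('192.168.204.104'))   # AF_INET
    print(is_ip_address('fd00::1'))           # AF_INET6
    print(is_ip_address('controller-0'))      # False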
- if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is False: - return 0 - - # get current hostname - obj.hostname = os.uname()[1] - if not obj.hostname: - collectd.error("%s failed to get hostname" % PLUGIN) - return 1 - - obj.base_eid = 'host=' + obj.hostname + '.ntp' - collectd.debug("%s on %s with entity id '%s'" % - (PLUGIN, obj.hostname, obj.base_eid)) - - # get a list of provisioned ntp servers - _get_ntp_servers() - - # manage existing alarms. - try: - alarms = api.get_faults_by_id(PLUGIN_ALARMID) - - except Exception as ex: - collectd.error("%s 'get_faults_by_id' exception ; %s ; %s" % - (PLUGIN, PLUGIN_ALARMID, ex)) - return 0 - - if alarms: - for alarm in alarms: - eid = alarm.entity_instance_id - # ignore alarms not for this host - if obj.hostname not in eid: - continue - - # maintain only the base alarm. - if alarm.entity_instance_id != obj.base_eid: - # clear any ntp server specific alarms over process restart - # this is done to avoid the potential for stuck ntp ip alarms - collectd.info("%s clearing found startup alarm '%s'" % - (PLUGIN, alarm.entity_instance_id)) - try: - api.clear_fault(PLUGIN_ALARMID, alarm.entity_instance_id) - except Exception as ex: - collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % - (PLUGIN, - PLUGIN_ALARMID, - alarm.entity_instance_id, - ex)) - return 0 - - else: - obj.alarm_raised = True - collectd.info("%s found alarm %s:%s" % - (PLUGIN, - PLUGIN_ALARMID, - alarm.entity_instance_id)) - - # ensure the base alarm is cleared if there are no - # provisioned servers. - if not obj.server_list_conf: - _clear_base_alarm() - - else: - collectd.info("%s no major startup alarms found" % PLUGIN) - - obj.init_complete = True - - return 0 - - -############################################################################### -# -# Name : read_func -# -# Description: The sample read interface this plugin publishes to collectd. -# -# collectd calls this interface every audit interval. -# -# Runs ntpq -np to query NTP status and manages alarms based on -# the result. -# -# See file header (above) for more specific behavioral detail. 
-# -# Should only run on a controller ; both -# -# Parameters : None -# -# Returns : zero or non-zero on significant error -# -############################################################################### - -def read_func(): - - # ntp query is for controllers only - if tsc.nodetype != 'controller': - return 0 - - if obj.init_complete is False: - if os.path.exists(tsc.VOLATILE_CONTROLLER_CONFIG_COMPLETE) is True: - collectd.info("%s re-running init" % PLUGIN) - init_func() - return 0 - - # get a list if provisioned ntp servers - _get_ntp_servers() - - # nothing to do while there are no provisioned NTP servers - if len(obj.server_list_conf) == 0: - return 0 - - # Do NTP Query - data = subprocess.check_output([PLUGIN_EXEC, PLUGIN_EXEC_OPTIONS]) - - # Keep this FIT test code but make it commented out for security - # - # if os.path.exists('/var/run/fit/ntpq_data'): - # data = '' - # collectd.info("%s using ntpq FIT data" % PLUGIN) - # with open('/var/run/fit/ntpq_data', 'r') as infile: - # for line in infile: - # data += line - - if not data: - collectd.error("%s no data from query" % PLUGIN) - return 0 - - # Get the ntp query output into a list of lines - obj.ntpq = data.split('\n') - - # keep track of changes ; only log on changes - reachable_list_changed = False - unreachable_list_changed = False - - # Manage the selected server name - # - # save the old value so we can print a log if the selected server changes - if obj.selected_server: - obj.selected_server_save = obj.selected_server - # always assume no selected server ; till its learned - obj.selected_server = '' - - # start with a fresh empty list for this new run to populate - obj.server_list_ntpq = [] - - # Loop through the ntpq output. - # Ignore the first 2 lines ; just header data. - for i in range(2, len(obj.ntpq)): - - # ignore empty or lines that are not long enough - if len(obj.ntpq[i]) < 10: - continue - - # log the ntpq output ; minus the 2 lines of header - collectd.info("NTPQ: %s" % obj.ntpq[i]) - - # Unreachable servers are ones whose line start with a space - ip = '' - if obj.ntpq[i][0] == ' ': - # get the ip address - # example format of line:['', '132.163.4.102', '', '', '.INIT.', - # get ip from index [1] of the list - unreachable = obj.ntpq[i].split(' ')[1] - if unreachable: - # check to see if its a controller ip - # we skip over controller ips - if _is_controller(unreachable) is False: - _add_ip_to_ntpq_server_list(unreachable) - if unreachable not in obj.unreachable_servers: - if _raise_alarm(unreachable) is False: - unreachable_list_changed = True - # if the FM call to raise the alarm worked then - # add this ip to the unreachable list if its not - # already in it - _add_unreachable_server(unreachable) - - # Reachable servers are ones whose line start with a '+' - elif obj.ntpq[i][0] == '+': - # remove the '+' and get the ip - ip = obj.ntpq[i].split(' ')[0][1:] - - elif obj.ntpq[i][0] == '*': - # remove the '*' and get the ip - cols = obj.ntpq[i].split(' ') - ip = cols[0][1:] - if ip: - ip_family = _is_ip_address(ip) - obj.peer_selected = _is_controller(ip) - if ip != obj.selected_server and obj.alarm_raised is True: - # a new ntp server is selected, old alarm may not be - # valid - _clear_base_alarm() - obj.alarm_raised = False - if obj.peer_selected is False: - if obj.selected_server: - # done update the selected server if more selections - # are found. go with the first one found. 
- collectd.info("%s additional selected server found" - " '%s'; current selection is '%s'" % - (PLUGIN, ip, obj.selected_server)) - else: - # update the selected server list - obj.selected_server = ip - collectd.debug("%s selected server is '%s'" % - (PLUGIN, obj.selected_server)) - else: - # refer to peer - refid = '' - for i in range(1, len(cols)): - if cols[i] != '': - refid = cols[i] - break - - if refid not in ('', '127.0.0.1') and \ - not _is_controller(refid) and \ - socket.AF_INET == ip_family: - # ipv4, peer controller refer to a time source is not - # itself or a controller (this node) - obj.selected_server = ip - collectd.debug("peer controller has a reliable " - "source") - - # anything else is unreachable - else: - unreachable = obj.ntpq[i][1:].split(' ')[0] - if _is_controller(unreachable) is False: - _add_ip_to_ntpq_server_list(unreachable) - if unreachable not in obj.unreachable_servers: - if _raise_alarm(unreachable) is False: - unreachable_list_changed = True - # if the FM call to raise the alarm worked then - # add this ip to the unreachable list if its not - # already in it - _add_unreachable_server(unreachable) - - if ip: - # if the ip is valid then manage it - if _is_controller(ip) is False: - _add_ip_to_ntpq_server_list(ip) - # add the ip to the reachable servers list - # if its not already there - if ip not in obj.reachable_servers: - obj.reachable_servers.append(ip) - reachable_list_changed = True - # make sure this IP is no longer in the unreachable - # list and that alarms for it are cleared - _remove_ip_from_unreachable_list(ip) - - _cleanup_stale_servers() - - if obj.selected_server: - if obj.selected_server != obj.selected_server_save: - collectd.info("%s selected server changed from '%s' to '%s'" % - (PLUGIN, - obj.selected_server_save, - obj.selected_server)) - obj.selected_server_save = obj.selected_server - if obj.alarm_raised is True: - _clear_base_alarm() - - elif obj.alarm_raised is False: - if obj.peer_selected: - collectd.info("%s peer is selected" % PLUGIN) - else: - collectd.error("%s no selected server" % PLUGIN) - if _raise_alarm() is False: - obj.selected_server_save = 'None' - - # only log and act on changes - if reachable_list_changed is True: - if obj.reachable_servers: - collectd.info("%s reachable servers: %s" % - (PLUGIN, obj.reachable_servers)) - if obj.alarm_raised is True: - if obj.selected_server and obj.reachable_servers: - _clear_base_alarm() - else: - collectd.error("%s no reachable servers" % PLUGIN) - _raise_alarm() - - # only log changes - if unreachable_list_changed is True: - if obj.unreachable_servers: - collectd.info("%s unreachable servers: %s" % - (PLUGIN, obj.unreachable_servers)) - else: - collectd.info("%s all servers are reachable" % PLUGIN) - - # The sample published to the database is simply the number - # of reachable servers if one is selected - if not obj.selected_server: - sample = 0 - else: - sample = len(obj.reachable_servers) - - # Dispatch usage value to collectd - val = collectd.Values(host=obj.hostname) - val.plugin = 'ntpq' - val.type = 'absolute' - val.type_instance = 'reachable' - val.dispatch(values=[sample]) - - return 0 - - -# register the config, init and read functions -collectd.register_config(config_func) -collectd.register_init(init_func) -collectd.register_read(read_func, interval=PLUGIN_INTERVAL) diff --git a/monitoring/collectd-extensions/src/plugin_common.py b/monitoring/collectd-extensions/src/plugin_common.py deleted file mode 100644 index d236bb8b1..000000000 --- 
a/monitoring/collectd-extensions/src/plugin_common.py +++ /dev/null @@ -1,311 +0,0 @@ -# -# Copyright (c) 2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -############################################################################ -# -# This file contains common collectd plugin constructs and utilities -# -############################################################################ - -import collectd -import json -import uuid -import httplib2 -import socket -import os -from oslo_concurrency import processutils -from fm_api import constants as fm_constants -import tsconfig.tsconfig as tsc - -# http request constants -PLUGIN_TIMEOUT = 10 -PLUGIN_HTTP_HEADERS = {'Accept': 'application/json', 'Connection': 'close'} - -MIN_AUDITS_B4_FIRST_QUERY = 2 - - -class PluginObject(object): - - def __init__(self, plugin, url): - - # static variables set in init_func - self.plugin = plugin # the name of this plugin - self.hostname = '' # the name of this host - self.port = 0 # the port number for this plugin - self.base_eid = '' # the base entity id host= - self.controller = False # set true if node is controller - - # dynamic gate variables - self.virtual = False # set to True if host is virtual - self.config_complete = False # set to True once config is complete - self.config_done = False # set true if config_func completed ok - self.init_done = False # set true if init_func completed ok - self.fm_connectivity = False # set true when fm connectivity ok - - self.alarm_type = fm_constants.FM_ALARM_TYPE_7 # OPERATIONAL - self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS - self.suppression = True - self.service_affecting = False - - # dynamic variables set in read_func - self.usage = float(0) # last usage value recorded as float - self.value = float(0) # last read value - self.audits = 0 # number of audit since init - self.enabled = False # tracks a plugin's enabled state - self.alarmed = False # tracks the current alarmed state - self.mode = '' # mode specific to plugin - - # http and json specific variables - self.url = url # target url - self.jresp = None # used to store the json response - self.resp = '' - - self.objects = [] # list of plugin specific objects - self.cmd = '' # plugin specific command string - - # Log controls - self.config_logged = False # used to log once the plugin config - self.error_logged = False # used to prevent log flooding - self.log_throttle_count = 0 # used to count throttle logs - self.INIT_LOG_THROTTLE = 10 # the init log throttle threshold - self.http_retry_count = 0 # track http error cases - self.HTTP_RETRY_THROTTLE = 6 # http retry threshold - self.phase = 0 # tracks current phase; init, sampling - - collectd.debug("%s Common PluginObject constructor [%s]" % - (plugin, url)) - - ########################################################################### - # - # Name : init_ready - # - # Description: Test for init ready condition - # - # Parameters : plugin name - # - # Returns : False if initial config complete is not done - # True if initial config complete is done - # - ########################################################################### - - def init_ready(self): - """Test for system init ready state""" - - if os.path.exists(tsc.INITIAL_CONFIG_COMPLETE_FLAG) is False: - self.log_throttle_count += 1 - if self.log_throttle_count > self.INIT_LOG_THROTTLE: - collectd.info("%s initialization needs retry" % self.plugin) - self.log_throttle_count = 0 - return False - else: - self.log_throttle_count = 0 - - return True - - 
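init_ready() above doubles as a log limiter: while the config-complete flag is absent it emits at most one retry log per INIT_LOG_THROTTLE polls. The pattern in isolation (a sketch; names hypothetical):

    class ThrottledGate(object):
        INIT_LOG_THROTTLE = 10

        def __init__(self):
            self.log_throttle_count = 0

        def ready(self, condition_met):
            if not condition_met:
                self.log_throttle_count += 1
                if self.log_throttle_count > self.INIT_LOG_THROTTLE:
                    print("initialization needs retry")
                    self.log_throttle_count = 0
                return False
            self.log_throttle_count = 0
            return True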
########################################################################### - # - # Name : gethostname - # - # Description: load the hostname - # - # Parameters : plugin name - # - # Returns : Success - hostname - # Failure - None - # - # Updates : obj.hostname - # - ########################################################################### - def gethostname(self): - """Fetch the hostname""" - - # get current hostname - try: - hostname = socket.gethostname() - if hostname: - return hostname - except: - collectd.error("%s failed to get hostname" % self.plugin) - - return None - - ########################################################################### - # - # Name : is_virtual - # - # Description: Execute facter command with output filter on 'is_virtual' - # - # Parameters : None - # - # Returns : True if current host is virtual. - # False if current host is NOT virtual - # - ########################################################################### - def is_virtual(self): - """Check for virtual host""" - - try: - cmd = '/usr/bin/facter is_virtual' - res, err = processutils.execute(cmd, shell=True) - if err: - return False - elif res: - # remove the trailing '\n' with strip() - if res.strip() == 'true': - collectd.info("%s %s is virtual" % - (self.plugin, self.hostname)) - return True - - except Exception as ex: - collectd.info("%s failed to execute '/usr/bin/facter' ; %s" % - self.plugin, ex) - - return False - - ########################################################################### - # - # Name : check_for_fit - # - # Description: load FIT data if it is present - # - # Fit Format : unit data -> 0 89 - # - instance 0 value 89 - # - # Parameters : plugin name - # object to update with fit - # name in fit file - # unit - # - # Returns : Did a failure occur ? - # False = no - # True = yes - # - # Updates : self.usage with FIT value if FIT conditions are present - # and apply - # - ########################################################################### - def check_for_fit(self, name, unit): - """Load FIT data into usage if it exists""" - - fit_file = '/var/run/fit/' + name + '_data' - - if os.path.exists(fit_file): - valid = False - with open(fit_file, 'r') as infile: - for line in infile: - try: - inst, val = line.split(' ') - if int(unit) == int(inst): - self.usage = float(val) - valid = True - - except: - try: - val = float(line) - self.usage = float(val) - valid = True - - except: - collectd.error("%s bad FIT data; ignoring" % - self.plugin) - - if valid is True: - collectd.info("%s %.2f usage (unit %d) (FIT)" % - (self.plugin, unit, self.usage)) - return False - - return True - - ########################################################################### - # - # Name : make_http_request - # - # Description: Issue an http request to the specified URL. - # Load and return the response - # Handling execution errors - # - # Parameters : self as current context. - # - # Optional: - # - # url - override the default self url with http address to - # issue the get request to. - # to - timeout override - # hdrs - override use of the default header list - # - # Updates : self.jresp with the json string response from the request. 
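Stripped of the plugin bookkeeping, the request pattern is a plain httplib2 fetch-and-parse; a standalone sketch (URL illustrative):

    import json
    import httplib2

    def fetch_json(url, timeout=10):
        http = httplib2.Http(timeout=timeout)
        resp, body = http.request(
            url, headers={'Accept': 'application/json',
                          'Connection': 'close'})
        return json.loads(body)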
- # - # Returns : Error indication (True/False) - # True on success - # False on error - # - ########################################################################### - def make_http_request(self, url=None, to=None, hdrs=None): - """Make a blocking HTTP Request and return result""" - - try: - - # handle timeout override - if to is None: - to = PLUGIN_TIMEOUT - - # handle url override - if url is None: - url = self.url - - # handle header override - if hdrs is None: - hdrs = PLUGIN_HTTP_HEADERS - - http = httplib2.Http(timeout=to) - resp = http.request(url, headers=hdrs) - - except Exception as ex: - collectd.info("%s http request exception ; %s" % - (self.plugin, str(ex))) - return False - - try: - collectd.debug("%s Resp: %s" % - (self.plugin, resp[1])) - - self.resp = resp[1] - self.jresp = json.loads(resp[1]) - - except Exception as ex: - collectd.error("%s http response parse exception ; %s" % - (self.plugin, str(ex))) - if len(self.resp): - collectd.error("%s response: %s" % - (self.plugin, self.resp)) - return False - return True - - -def is_uuid_like(val): - """Returns validation of a value as a UUID - - For our purposes, a UUID is a canonical form string: - aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa - """ - try: - return str(uuid.UUID(val)) == val - except (TypeError, ValueError, AttributeError): - return False - - -def get_severity_str(severity): - """get string that represents the specified severity""" - - if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR: - return "clear" - elif severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL: - return "critical" - elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR: - return "major" - elif severity == fm_constants.FM_ALARM_SEVERITY_MINOR: - return "minor" - else: - return "unknown" diff --git a/monitoring/collectd-extensions/src/ptp.conf b/monitoring/collectd-extensions/src/ptp.conf deleted file mode 100644 index 2f1d15b52..000000000 --- a/monitoring/collectd-extensions/src/ptp.conf +++ /dev/null @@ -1,15 +0,0 @@ - - - - Instance "nsec" - Persist true - PersistOK true - WarningMax 1000 - FailureMax 1000000 - WarningMin -1000 - FailureMin -1000000 - Hits 2 - Invert false - - - diff --git a/monitoring/collectd-extensions/src/ptp.py b/monitoring/collectd-extensions/src/ptp.py deleted file mode 100755 index 5232675a2..000000000 --- a/monitoring/collectd-extensions/src/ptp.py +++ /dev/null @@ -1,988 +0,0 @@ -# -# Copyright (c) 2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -############################################################################ -# -# This file is the collectd 'Precision Time Protocol' Service Monitor. 
-# -# Algorithm: -# -# while not config ; check again -# while not init ; retry -# if startup -# clear all ptp alarms -# if ptp enabled -# if ptp not running -# raise 'process' alarm -# else -# read grand master and current skew -# if not controller and is grand master -# raise 'no lock' alarm -# if skew is out-of-tolerance -# raise out-of-tolerance alarm -# -# -# manage alarm state throught -# retry on alarm state change failures -# only make raise/clear alarm calls on severity state changes -# -############################################################################ -import os -import collectd -import subprocess -import tsconfig.tsconfig as tsc -import plugin_common as pc -from fm_api import constants as fm_constants -from fm_api import fm_api - -debug = False - -# Fault manager API Object -api = fm_api.FaultAPIsV2() - -PLUGIN_ALARMID = "100.119" - -# name of the plugin - all logs produced by this plugin are prefixed with this -PLUGIN = 'ptp plugin' - -# Service name -PTP = 'Precision Time Protocol (PTP)' - -# Interface Monitoring Interval in seconds -PLUGIN_AUDIT_INTERVAL = 300 - -# Sample Data 'type' and 'instance' database field values. -PLUGIN_TYPE = 'time_offset' -PLUGIN_TYPE_INSTANCE = 'nsec' - -# Primary PTP service name -PLUGIN_SERVICE = 'ptp4l.service' - -# Plugin configuration file -# -# This plugin looks for the timestamping mode in the ptp4l config file. -# time_stamping hardware -# -PLUGIN_CONF_FILE = '/etc/ptp4l.conf' -PLUGIN_CONF_TIMESTAMPING = 'time_stamping' - -# Tools used by plugin -SYSTEMCTL = '/usr/bin/systemctl' -ETHTOOL = '/usr/sbin/ethtool' -PLUGIN_STATUS_QUERY_EXEC = '/usr/sbin/pmc' - -# Query PTP service administrative (enabled/disabled) state -# -# > systemctl is-enabled ptp4l -# enabled -# > systemctl disable ptp4l -# > systemctl is-enabled ptp4l -# disabled - -SYSTEMCTL_IS_ENABLED_OPTION = 'is-enabled' -SYSTEMCTL_IS_ENABLED_RESPONSE = 'enabled' -SYSTEMCTL_IS_DISABLED_RESPONSE = 'disabled' - -# Query PTP service activity (active=running / inactive) state -# -# > systemctl is-active ptp4l -# active -# > systemctl stop ptp4l -# > systemctl is-active ptp4l -# inactive - -SYSTEMCTL_IS_ACTIVE_OPTION = 'is-active' -SYSTEMCTL_IS_ACTIVE_RESPONSE = 'active' -SYSTEMCTL_IS_INACTIVE_RESPONSE = 'inactive' - -# Alarm Cause codes ; used to specify what alarm EID to assert or clear. -ALARM_CAUSE__NONE = 0 -ALARM_CAUSE__PROCESS = 1 -ALARM_CAUSE__OOT = 2 -ALARM_CAUSE__NO_LOCK = 3 -ALARM_CAUSE__UNSUPPORTED_HW = 4 -ALARM_CAUSE__UNSUPPORTED_SW = 5 -ALARM_CAUSE__UNSUPPORTED_LEGACY = 6 - -# Run Phase -RUN_PHASE__INIT = 0 -RUN_PHASE__DISABLED = 1 -RUN_PHASE__NOT_RUNNING = 2 -RUN_PHASE__SAMPLING = 3 - -# Clock Sync Out-Of-Tolerance thresholds -OOT_MINOR_THRESHOLD = int(1000) -OOT_MAJOR_THRESHOLD = int(1000000) - -# Instantiate the common plugin control object -obj = pc.PluginObject(PLUGIN, "") - - -# Create an alarm management class -class PTP_alarm_object: - - def __init__(self, interface=None): - - self.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR - self.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 - self.alarm = ALARM_CAUSE__NONE - self.interface = interface - self.raised = False - self.reason = '' - self.repair = '' - self.eid = '' - - -# Plugin specific control class and object. 
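The _get_supported_modes routine a little further below distills 'ethtool -T' output into the three provisionable timestamping modes (hardware, software, legacy). A condensed sketch of that scan, omitting the Capabilities-section scoping the full routine performs (interface name illustrative):

    import subprocess

    def supported_ptp_modes(interface):
        caps = subprocess.check_output(
            ['/usr/sbin/ethtool', '-T', interface]).decode()
        modes = []
        if 'hardware-raw-clock' in caps:
            modes.append('legacy')
        if 'software-transmit' in caps and 'software-receive' in caps:
            modes.append('software')
        if 'hardware-transmit' in caps and 'hardware-receive' in caps:
            modes.append('hardware')
        return modes

    # e.g. supported_ptp_modes('enp0s3') -> ['software'] on a VirtualBox VM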
-class PTP_ctrl_object: - - def __init__(self): - - self.gm_log_throttle = 0 - self.nolock_alarm_object = None - self.process_alarm_object = None - self.oot_alarm_object = None - - -ctrl = PTP_ctrl_object() - - -# Alarm object list, one entry for each interface and alarm cause case -ALARM_OBJ_LIST = [] - - -# UT verification utilities -def assert_all_alarms(): - for o in ALARM_OBJ_LIST: - raise_alarm(o.alarm, o.interface, 0) - - -def clear_all_alarms(): - for o in ALARM_OBJ_LIST: - if clear_alarm(o.eid) is True: - msg = 'cleared' - else: - msg = 'clear failed' - collectd.info("%s %s:%s alarm %s" % - (PLUGIN, PLUGIN_ALARMID, o.eid, msg)) - - -def print_alarm_object(o): - collectd.info("%s Interface:%s Cause: %d Severity:%s Raised:%d" % - (PLUGIN, - o.interface, - o.alarm, - o.severity, - o.raised)) - collectd.info("%s Entity:[%s]" % (PLUGIN, o.eid)) - collectd.info("%s Reason:[%s]" % (PLUGIN, o.reason)) - collectd.info("%s Repair:[%s]" % (PLUGIN, o.repair)) - - -def print_alarm_objects(): - for o in ALARM_OBJ_LIST: - print_alarm_object(o) - - -# Interface:Supported Modes dictionary. key:value -# -# interface:modes -# -interfaces = {} - - -##################################################################### -# -# Name : _get_supported_modes -# -# Description: Invoke ethtool -T and load its -# time stamping capabilities. -# -# hardware, software or legacy. -# -# Parameters : The name of the physical interface to query the -# supported modes for. -# -# Interface Capabilities Output Examples: -# -# vbox prints this as it only supports software timestamping -# software-transmit (SOF_TIMESTAMPING_TX_SOFTWARE) -# software-receive (SOF_TIMESTAMPING_RX_SOFTWARE) -# -# full support output looks like this -# hardware-transmit (SOF_TIMESTAMPING_TX_HARDWARE) -# software-transmit (SOF_TIMESTAMPING_TX_SOFTWARE) -# hardware-receive (SOF_TIMESTAMPING_RX_HARDWARE) -# software-receive (SOF_TIMESTAMPING_RX_SOFTWARE) -# hardware-raw-clock (SOF_TIMESTAMPING_RAW_HARDWARE) -# -# Only legacy support output looks like this -# hardware-raw-clock (SOF_TIMESTAMPING_RAW_HARDWARE) -# -# Provisionable PTP Modes are -# hardware -> hardware-transmit/receive -# software -> software-transmit/receive -# legacy -> hardware-raw-clock - -TIMESTAMP_MODE__HW = 'hardware' -TIMESTAMP_MODE__SW = 'software' -TIMESTAMP_MODE__LEGACY = 'legacy' - - -# -# Returns : a list of supported modes -# -##################################################################### -def _get_supported_modes(interface): - """Get the supported modes for the specified interface""" - - hw_tx = hw_rx = sw_tx = sw_rx = False - modes = [] - data = subprocess.check_output([ETHTOOL, '-T', interface]).split('\n') - if data: - collectd.debug("%s 'ethtool -T %s' output:%s\n" % - (PLUGIN, interface, data)) - check_for_modes = False - for i in range(0, len(data)): - collectd.debug("%s data[%d]:%s\n" % (PLUGIN, i, data[i])) - if 'Capabilities' in data[i]: - - # start of capabilities list - check_for_modes = True - - elif check_for_modes is True: - - if 'PTP Hardware Clock' in data[i]: - # no more modes after this label - break - elif 'hardware-transmit' in data[i]: - hw_tx = True - elif 'hardware-receive' in data[i]: - hw_rx = True - elif 'software-transmit' in data[i]: - sw_tx = True - elif 'software-receive' in data[i]: - sw_rx = True - elif 'hardware-raw-clock' in data[i]: - modes.append(TIMESTAMP_MODE__LEGACY) - - if sw_tx is True and sw_rx is True: - modes.append(TIMESTAMP_MODE__SW) - - if hw_tx is True and hw_rx is True: - modes.append(TIMESTAMP_MODE__HW) - - if 
modes: - collectd.debug("%s %s interface PTP capabilities: %s" % - (PLUGIN, interface, modes)) - else: - collectd.info("%s no capabilities advertised for %s" % - (PLUGIN, interface)) - - else: - collectd.info("%s no ethtool output for %s" % (PLUGIN, interface)) - return None - - return modes - - -##################################################################### -# -# Name : get_alarm_object -# -# Description: Search the alarm list based on the alarm cause -# code and interface. -# -# Returns : Alarm object if found ; otherwise None -# -##################################################################### -def get_alarm_object(alarm, interface=None): - """Alarm object lookup""" - - for o in ALARM_OBJ_LIST: - # print_alarm_object(o) - if interface is None: - if o.alarm == alarm: - return o - else: - if o.interface == interface: - if o.alarm == alarm: - return o - - collectd.info("%s alarm object lookup failed ; %d:%s" % - (PLUGIN, alarm, interface)) - return None - - -##################################################################### -# -# Name : clear_alarm -# -# Description: Clear the ptp alarm with the specified entity ID. -# -# Returns : True if operation succeeded -# False if there was an error exception. -# -# Assumptions: Caller can decide to retry based on return status. -# -##################################################################### -def clear_alarm(eid): - """Clear the ptp alarm with the specified entity ID""" - - try: - if api.clear_fault(PLUGIN_ALARMID, eid) is True: - collectd.info("%s %s:%s alarm cleared" % - (PLUGIN, PLUGIN_ALARMID, eid)) - else: - collectd.info("%s %s:%s alarm already cleared" % - (PLUGIN, PLUGIN_ALARMID, eid)) - return True - - except Exception as ex: - collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % - (PLUGIN, PLUGIN_ALARMID, eid, ex)) - return False - - -##################################################################### -# -# Name : raise_alarm -# -# Description: Assert a specific PTP alarm based on the alarm cause -# code and interface. -# -# Handle special case cause codes -# Handle failure to raise fault -# -# Assumptions: Short circuited Success return if the alarm is -# already known to be asserted. -# -# Returns : False on Failure -# True on Success -# -##################################################################### -def raise_alarm(alarm_cause, interface=None, data=0): - """Assert a cause based PTP alarm""" - - collectd.debug("%s Raising Alarm %d" % (PLUGIN, alarm_cause)) - - alarm = get_alarm_object(alarm_cause, interface) - if alarm is None: - # log created for None case in the get_alarm_object util - return True - - # copy the reason as it might be updated for the OOT, - # most typical, case. - reason = alarm.reason - - # Handle some special cases - # - - if alarm_cause == ALARM_CAUSE__OOT: - # If this is an out of tolerance alarm then add the - # out of tolerance reading to the reason string before - # asserting the alarm. - # - # Keep the alarm updated with the latest sample reading - # and severity even if its already asserted. 
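The threshold ladder just below renders the raw nanosecond offset as a human-readable magnitude for the alarm text; equivalently (hypothetical helper, mirroring the integer division of the original):

    def oot_reason_suffix(nsec):
        nsec_abs = abs(float(nsec))
        if nsec_abs > 100000000000:
            return 'more than 100 seconds'
        if nsec_abs > 10000000000:
            return 'more than 10 seconds'
        if nsec_abs > 1000000000:
            return 'more than 1 second'
        if nsec_abs > 1000000:
            return '%d millisecs' % (abs(int(nsec)) // 1000000)
        if nsec_abs > 1000:
            return '%d microsecs' % (abs(int(nsec)) // 1000)
        return '%s nsec' % float(nsec)

    print(oot_reason_suffix(2500000))   # -> '2 millisecs'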
- if abs(float(data)) > 100000000000: - reason += 'more than 100 seconds' - elif abs(float(data)) > 10000000000: - reason += 'more than 10 seconds' - elif abs(float(data)) > 1000000000: - reason += 'more than 1 second' - elif abs(float(data)) > 1000000: - reason += str(abs(int(data)) / 1000000) - reason += ' millisecs' - elif abs(float(data)) > 1000: - reason += str(abs(int(data)) / 1000) - reason += ' microsecs' - else: - reason += str(float(data)) - reason += ' ' + PLUGIN_TYPE_INSTANCE - - elif alarm.raised is True: - # If alarm already raised then exit. - # - # All other alarms are a Major so there is no need to - # track a change in severity and update accordingly. - return True - - elif alarm_cause == ALARM_CAUSE__PROCESS: - reason = 'Provisioned ' + PTP + ' \'' + obj.mode - reason += '\' time stamping mode seems to be unsupported by this host' - - try: - fault = fm_api.Fault( - alarm_id=PLUGIN_ALARMID, - alarm_state=fm_constants.FM_ALARM_STATE_SET, - entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, - entity_instance_id=alarm.eid, - severity=alarm.severity, - reason_text=reason, - alarm_type=obj.alarm_type, - probable_cause=alarm.cause, - proposed_repair_action=alarm.repair, - service_affecting=False, # obj.service_affecting, - suppression=True) # obj.suppression) - - alarm_uuid = api.set_fault(fault) - if pc.is_uuid_like(alarm_uuid) is False: - - # Don't _add_unreachable_server list if the fm call failed. - # That way it will be retried at a later time. - collectd.error("%s 'set_fault' failed ; %s:%s ; %s" % - (PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm_uuid)) - return False - - else: - collectd.info("%s %s:%s:%s alarm raised" % - (PLUGIN, PLUGIN_ALARMID, alarm.eid, alarm.severity)) - alarm.raised = True - return True - - except Exception as ex: - collectd.error("%s 'set_fault' exception ; %s:%s:%s ; %s" % - (PLUGIN, - PLUGIN_ALARMID, - alarm.eid, - alarm.severity, - ex)) - return False - - -##################################################################### -# -# Name : create_interface_alarm_objects -# -# Description: Create alarm objects for specified interface -# -##################################################################### -def create_interface_alarm_objects(interface=None): - """Create alarm objects""" - - collectd.debug("%s Alarm Object Create: Interface:%s " % - (PLUGIN, interface)) - - if interface is None: - o = PTP_alarm_object() - o.alarm = ALARM_CAUSE__PROCESS - o.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR - o.reason = obj.hostname + ' does not support the provisioned ' - o.reason += PTP + ' mode ' - o.repair = 'Check host hardware reference manual ' - o.repair += 'to verify that the selected PTP mode is supported' - o.eid = obj.base_eid + '.ptp' - o.cause = fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN # 'unknown' - ALARM_OBJ_LIST.append(o) - ctrl.process_alarm_object = o - - o = PTP_alarm_object() - o.alarm = ALARM_CAUSE__OOT - o.severity = fm_constants.FM_ALARM_SEVERITY_CLEAR - o.reason = obj.hostname + ' ' - o.reason += PTP + " clocking is out of tolerance by " - o.repair = "Check quality of the clocking network" - o.eid = obj.base_eid + '.ptp=out-of-tolerance' - o.cause = fm_constants.ALARM_PROBABLE_CAUSE_50 # THRESHOLD CROSS - ALARM_OBJ_LIST.append(o) - ctrl.oot_alarm_object = o - - o = PTP_alarm_object() - # Only applies to storage and worker nodes - o.alarm = ALARM_CAUSE__NO_LOCK - o.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR - o.reason = obj.hostname - o.reason += ' is not locked to remote PTP Grand Master' - o.repair = 'Check network' - o.eid = 
obj.base_eid + '.ptp=no-lock' - o.cause = fm_constants.ALARM_PROBABLE_CAUSE_51 # timing-problem - ALARM_OBJ_LIST.append(o) - ctrl.nolock_alarm_object = o - - else: - o = PTP_alarm_object(interface) - o.alarm = ALARM_CAUSE__UNSUPPORTED_HW - o.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR - o.reason = obj.hostname + " '" + interface + "' does not support " - o.reason += PTP + ' Hardware timestamping' - o.repair = 'Check host hardware reference manual to verify PTP ' - o.repair += 'Hardware timestamping is supported by this interface' - o.eid = obj.base_eid + '.ptp=' + interface - o.eid += '.unsupported=hardware-timestamping' - o.cause = fm_constants.ALARM_PROBABLE_CAUSE_7 # 'config error' - ALARM_OBJ_LIST.append(o) - - o = PTP_alarm_object(interface) - o.alarm = ALARM_CAUSE__UNSUPPORTED_SW - o.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR - o.reason = obj.hostname + " '" + interface + "' does not support " - o.reason += PTP + ' Software timestamping' - o.repair = 'Check host hardware reference manual to verify PTP ' - o.repair += 'Software timestamping is supported by this interface' - o.eid = obj.base_eid + '.ptp=' + interface - o.eid += '.unsupported=software-timestamping' - o.cause = fm_constants.ALARM_PROBABLE_CAUSE_7 # 'config error' - ALARM_OBJ_LIST.append(o) - - o = PTP_alarm_object(interface) - o.alarm = ALARM_CAUSE__UNSUPPORTED_LEGACY - o.severity = fm_constants.FM_ALARM_SEVERITY_MAJOR - o.reason = obj.hostname + " '" + interface + "' does not support " - o.reason += PTP + " Legacy timestamping" - o.repair = 'Check host hardware reference manual to verify PTP ' - o.repair += 'Legacy or Raw Clock is supported by this host' - o.eid = obj.base_eid + '.ptp=' + interface - o.eid += '.unsupported=legacy-timestamping' - o.cause = fm_constants.ALARM_PROBABLE_CAUSE_7 # 'config error' - ALARM_OBJ_LIST.append(o) - - -##################################################################### -# -# Name : read_timestamp_mode -# -# Description: Refresh the timestamping mode if it changes -# -##################################################################### -def read_timestamp_mode(): - """Load timestamping mode""" - - if os.path.exists(PLUGIN_CONF_FILE): - current_mode = obj.mode - with open(PLUGIN_CONF_FILE, 'r') as infile: - for line in infile: - if PLUGIN_CONF_TIMESTAMPING in line: - obj.mode = line.split()[1].strip('\n') - break - - if obj.mode: - if obj.mode != current_mode: - collectd.info("%s Timestamping Mode: %s" % - (PLUGIN, obj.mode)) - else: - collectd.error("%s failed to get Timestamping Mode" % PLUGIN) - else: - collectd.error("%s failed to load ptp4l configuration" % PLUGIN) - obj.mode = None - - -##################################################################### -# -# Name : init_func -# -# Description: The collectd initialization entrypoint for -# this plugin -# -# Assumptions: called only once -# -# Algorithm : check for no -# -# -##################################################################### -def init_func(): - - if obj.init_ready() is False: - return False - - obj.hostname = obj.gethostname() - obj.base_eid = 'host=' + obj.hostname - - # Create the interface independent alarm objects. - create_interface_alarm_objects() - - # load monitored interfaces and supported modes - if os.path.exists(PLUGIN_CONF_FILE): - with open(PLUGIN_CONF_FILE, 'r') as infile: - for line in infile: - # The PTP interfaces used are specified in the ptp4l.conf - # file as [interface]. There may be more than one. 
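The [interface] section names and the time_stamping mode are the only two things read from ptp4l.conf; the scan in the lines that follow reduces to this sketch (path as defined above):

    def parse_ptp4l_conf(path='/etc/ptp4l.conf'):
        interfaces, mode = [], None
        with open(path) as infile:
            for line in infile:
                if line.startswith('['):
                    section = line.split(']')[0].split('[')[1]
                    if section != 'global':
                        interfaces.append(section)
                elif line.startswith('time_stamping'):
                    mode = line.split()[1].strip()
        return interfaces, mode

    # a conf with [global], [enp0s8] and 'time_stamping hardware'
    # returns (['enp0s8'], 'hardware')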
-                # Presently there is no need to track the function of the
-                # interface ; namely mgmt or oam.
-                if line[0] == '[':
-                    interface = line.split(']')[0].split('[')[1]
-                    if interface and interface != 'global':
-                        interfaces[interface] = _get_supported_modes(interface)
-                        create_interface_alarm_objects(interface)
-
-                if PLUGIN_CONF_TIMESTAMPING in line:
-                    obj.mode = line.split()[1].strip('\n')
-
-        if obj.mode:
-            collectd.info("%s Timestamping Mode: %s" %
-                          (PLUGIN, obj.mode))
-        else:
-            collectd.error("%s failed to get Timestamping Mode" % PLUGIN)
-    else:
-        collectd.error("%s failed to load ptp4l configuration" % PLUGIN)
-        obj.mode = None
-
-    for key, value in interfaces.items():
-        collectd.info("%s interface %s supports timestamping modes: %s" %
-                      (PLUGIN, key, value))
-
-    # remove '#' to dump alarm object data
-    # print_alarm_objects()
-
-    if tsc.nodetype == 'controller':
-        obj.controller = True
-
-    obj.virtual = obj.is_virtual()
-    obj.init_done = True
-    obj.log_throttle_count = 0
-    collectd.info("%s initialization complete" % PLUGIN)
-
-
-#####################################################################
-#
-# Name       : read_func
-#
-# Description: The collectd audit entrypoint for PTP Monitoring
-#
-# Assumptions: collectd calls init_func one time.
-#
-#
-#    retry init if needed
-#    retry fm connect if needed
-#    check service enabled state
-#    check service running state
-#       error -> alarm host=<hostname>.ptp
-#    check
-#
-#####################################################################
-def read_func():
-
-    if obj.virtual is True:
-        return 0
-
-    # check and run init until it reports init_done True
-    if obj.init_done is False:
-        if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
-            collectd.info("%s re-running init" % PLUGIN)
-        obj.log_throttle_count += 1
-        init_func()
-        return 0
-
-    if obj.fm_connectivity is False:
-
-        try:
-            # query FM for existing alarms.
-            alarms = api.get_faults_by_id(PLUGIN_ALARMID)
-        except Exception as ex:
-            collectd.error("%s 'get_faults_by_id' exception ;"
-                           " %s ; %s" %
-                           (PLUGIN, PLUGIN_ALARMID, ex))
-            return 0
-
-        if alarms:
-            for alarm in alarms:
-                collectd.debug("%s found startup alarm '%s'" %
-                               (PLUGIN, alarm.entity_instance_id))
-
-                eid = alarm.entity_instance_id
-                if eid is None:
-                    collectd.error("%s startup alarm query error ; no eid" %
-                                   PLUGIN)
-                    continue
-
-                # get the hostname host=<hostname>.stuff
-                # split over base eid and then
-                # compare that to this plugin's base eid
-                # ignore alarms not for this host
-                if eid.split('.')[0] != obj.base_eid:
-                    continue
-                else:
-                    # load the state of the specific alarm
-                    instance = eid.split('.')[1].split('=')
-                    if instance[0] == 'ptp':
-                        # clear all ptp alarms on process startup
-                        # just in case interface names have changed
-                        # since the alarm was raised.
-                        if clear_alarm(eid) is False:
-                            # if we can't clear the alarm now then error out.
- collectd.error("%s failed to clear startup " - "alarm %s:%s" % - (PLUGIN, PLUGIN_ALARMID, eid)) - # try again next time around - return 0 - else: - collectd.info("%s cleared startup alarm '%s'" % - (PLUGIN, alarm.entity_instance_id)) - else: - - if clear_alarm(eid) is False: - collectd.error("%s failed to clear invalid PTP " - "alarm %s:%s" % - (PLUGIN, PLUGIN_ALARMID, - alarm.entity_instance_id)) - return 0 - else: - collectd.info("%s cleared found invalid startup" - " alarm %s:%s" % - (PLUGIN, - PLUGIN_ALARMID, - alarm.entity_instance_id)) - else: - collectd.info("%s no startup alarms found" % PLUGIN) - - obj.config_complete = True - obj.fm_connectivity = True - # assert_all_alarms() - - # This plugin supports PTP in-service state change by checking - # service state on every audit ; every 5 minutes. - data = subprocess.check_output([SYSTEMCTL, - SYSTEMCTL_IS_ENABLED_OPTION, - PLUGIN_SERVICE]) - collectd.debug("%s PTP admin state:%s" % (PLUGIN, data.rstrip())) - - if data.rstrip() == SYSTEMCTL_IS_DISABLED_RESPONSE: - - # Manage execution phase - if obj.phase != RUN_PHASE__DISABLED: - obj.phase = RUN_PHASE__DISABLED - obj.log_throttle_count = 0 - - if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE): - collectd.info("%s PTP Service Disabled" % PLUGIN) - obj.log_throttle_count += 1 - - for o in ALARM_OBJ_LIST: - if o.raised is True: - if clear_alarm(o.eid) is True: - o.raised = False - else: - collectd.error("%s %s:%s clear alarm failed " - "; will retry" % - (PLUGIN, PLUGIN_ALARMID, o.eid)) - return 0 - - data = subprocess.check_output([SYSTEMCTL, - SYSTEMCTL_IS_ACTIVE_OPTION, - PLUGIN_SERVICE]) - - if data.rstrip() == SYSTEMCTL_IS_INACTIVE_RESPONSE: - - # Manage execution phase - if obj.phase != RUN_PHASE__NOT_RUNNING: - obj.phase = RUN_PHASE__NOT_RUNNING - obj.log_throttle_count = 0 - - if ctrl.process_alarm_object.alarm == ALARM_CAUSE__PROCESS: - if ctrl.process_alarm_object.raised is False: - collectd.error("%s PTP service enabled but not running" % - PLUGIN) - if raise_alarm(ALARM_CAUSE__PROCESS) is True: - ctrl.process_alarm_object.raised = True - - # clear all other alarms if the 'process' alarm is raised - elif ctrl.process_alarm_object.raised is True: - if clear_alarm(ctrl.process_alarm_object.eid) is True: - msg = 'cleared' - ctrl.process_alarm_object.raised = False - else: - msg = 'failed to clear' - collectd.info("%s %s %s:%s" % - (PLUGIN, msg, PLUGIN_ALARMID, - ctrl.process_alarm_object.eid)) - return 0 - - # Handle clearing the 'process' alarm if it is asserted and - # the process is now running - if ctrl.process_alarm_object.raised is True: - if clear_alarm(ctrl.process_alarm_object.eid) is True: - ctrl.process_alarm_object.raised = False - collectd.info("%s PTP service enabled and running" % PLUGIN) - - # Auto refresh the timestamping mode in case collectd runs - # before the ptp manifest or the mode changes on the fly by - # an in-service manifest. - # Every 4 audits. 
- obj.audits += 1 - if not obj.audits % 4: - read_timestamp_mode() - - # Manage execution phase - if obj.phase != RUN_PHASE__SAMPLING: - obj.phase = RUN_PHASE__SAMPLING - obj.log_throttle_count = 0 - - # Let's read the port status information - # - # sudo /usr/sbin/pmc -u -b 0 'GET PORT_DATA_SET' - # - data = subprocess.check_output([PLUGIN_STATUS_QUERY_EXEC, - '-u', '-b', '0', 'GET PORT_DATA_SET']) - - port_locked = False - obj.resp = data.split('\n') - for line in obj.resp: - if 'portState' in line: - collectd.debug("%s portState : %s" % (PLUGIN, line.split()[1])) - port_state = line.split()[1] - if port_state == 'SLAVE': - port_locked = True - - # Let's read the clock info, Grand Master sig and skew - # - # sudo /usr/sbin/pmc -u -b 0 'GET TIME_STATUS_NP' - # - data = subprocess.check_output([PLUGIN_STATUS_QUERY_EXEC, - '-u', '-b', '0', 'GET TIME_STATUS_NP']) - - got_master_offset = False - master_offset = 0 - my_identity = '' - gm_identity = '' - gm_present = False - obj.resp = data.split('\n') - for line in obj.resp: - if 'RESPONSE MANAGEMENT TIME_STATUS_NP' in line: - collectd.debug("%s key : %s" % - (PLUGIN, line.split()[0].split('-')[0])) - my_identity = line.split()[0].split('-')[0] - if 'master_offset' in line: - collectd.debug("%s Offset : %s" % (PLUGIN, line.split()[1])) - master_offset = float(line.split()[1]) - got_master_offset = True - if 'gmPresent' in line: - collectd.debug("%s gmPresent : %s" % (PLUGIN, line.split()[1])) - gm_present = line.split()[1] - if 'gmIdentity' in line: - collectd.debug("%s gmIdentity: %s" % (PLUGIN, line.split()[1])) - gm_identity = line.split()[1] - - # Handle case where this host is the Grand Master - # ... or assumes it is. - if my_identity == gm_identity or port_locked is False: - - if obj.controller is False: - - # Compute and storage nodes should not be the Grand Master - if ctrl.nolock_alarm_object.raised is False: - if raise_alarm(ALARM_CAUSE__NO_LOCK, None, 0) is True: - ctrl.nolock_alarm_object.raised = True - - # produce a throttled log while this host is not locked to the GM - if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE): - collectd.info("%s %s not locked to remote Grand Master " - "(%s)" % (PLUGIN, obj.hostname, gm_identity)) - obj.log_throttle_count += 1 - - # No samples for storage and compute nodes that are not - # locked to a Grand Master - return 0 - - else: - # Controllers can be a Grand Master ; throttle the log - if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE): - collectd.info("%s %s is Grand Master:%s" % - (PLUGIN, obj.hostname, gm_identity)) - obj.log_throttle_count += 1 - - # The Grand Master will always be 0 so there is no point - # creating a sample for it. 
-            return 0
-
-    # Handle clearing the nolock alarm for compute and storage nodes
-    elif obj.controller is False:
-        if ctrl.nolock_alarm_object.raised is True:
-            if clear_alarm(ctrl.nolock_alarm_object.eid) is True:
-                ctrl.nolock_alarm_object.raised = False
-
-    # FIT test code ; kept commented out for security
-    # if os.path.exists('/var/run/fit/ptp_data'):
-    #     master_offset = 0
-    #     with open('/var/run/fit/ptp_data', 'r') as infile:
-    #         for line in infile:
-    #             master_offset = int(line)
-    #             got_master_offset = True
-    #             collectd.info("%s using ptp FIT data skew:%d" %
-    #                           (PLUGIN, master_offset))
-    #             break
-
-    # Send sample and manage the Out-Of-Tolerance alarm
-    if got_master_offset is True:
-
-        if not (obj.log_throttle_count % obj.INIT_LOG_THROTTLE):
-            collectd.info("%s %s is collecting samples [%5d] "
-                          "with Grand Master %s" %
-                          (PLUGIN, obj.hostname,
-                           float(master_offset), gm_identity))
-
-        obj.log_throttle_count += 1
-
-        # setup the sample structure and dispatch
-        val = collectd.Values(host=obj.hostname)
-        val.type = PLUGIN_TYPE
-        val.type_instance = PLUGIN_TYPE_INSTANCE
-        val.plugin = 'ptp'
-        val.dispatch(values=[float(master_offset)])
-
-        # Manage the sample OOT alarm severity
-        severity = fm_constants.FM_ALARM_SEVERITY_CLEAR
-        if abs(master_offset) > OOT_MAJOR_THRESHOLD:
-            severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
-        elif abs(master_offset) > OOT_MINOR_THRESHOLD:
-            severity = fm_constants.FM_ALARM_SEVERITY_MINOR
-
-        # Handle clearing of the Out-Of-Tolerance alarm
-        if severity == fm_constants.FM_ALARM_SEVERITY_CLEAR:
-            if ctrl.oot_alarm_object.raised is True:
-                if clear_alarm(ctrl.oot_alarm_object.eid) is True:
-                    ctrl.oot_alarm_object.severity = \
-                        fm_constants.FM_ALARM_SEVERITY_CLEAR
-                    ctrl.oot_alarm_object.raised = False
-
-        else:
-            # Special Case:
-            # -------------
-            # Don't raise the minor alarm when in software timestamping mode.
-            # There is too much skew in software or legacy mode ; the alarm
-            # would bounce.
-            # TODO: Consider making ptp a real time process
-            if severity == fm_constants.FM_ALARM_SEVERITY_MINOR \
-                    and obj.mode != 'hardware':
-                return 0
-
-            # Handle debounce of the OOT alarm.
-            # Debounce by 1 for the same severity level.
-            if ctrl.oot_alarm_object.severity != severity:
-                ctrl.oot_alarm_object.severity = severity
-
-            # This keeps refreshing the alarm text with the current
-            # skew value while still debouncing state transitions.
-            #
-            # Precision ... (PTP) clocking is out of tolerance by 1004 nsec
-            #
-            elif severity == fm_constants.FM_ALARM_SEVERITY_MINOR:
-                # Handle raising the Minor OOT Alarm.
-                rc = raise_alarm(ALARM_CAUSE__OOT, None, master_offset)
-                if rc is True:
-                    ctrl.oot_alarm_object.raised = True
-
-            elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR:
-                # Handle raising the Major OOT Alarm.
-                rc = raise_alarm(ALARM_CAUSE__OOT, None, master_offset)
-                if rc is True:
-                    ctrl.oot_alarm_object.raised = True
-
-        # Record the value that is alarmable
-        if severity != fm_constants.FM_ALARM_SEVERITY_CLEAR:
-            collectd.info("%s Grand Master ID: %s ; "
-                          "HOST ID: %s ; "
-                          "GM Present:%s ; "
-                          "Skew:%5d" % (PLUGIN,
-                                        gm_identity,
-                                        my_identity,
-                                        gm_present,
-                                        master_offset))
-    else:
-        collectd.info("%s No Clock Sync" % PLUGIN)
-
-    return 0
-
-
-collectd.register_init(init_func)
-collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL)
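The out-of-tolerance handling above maps the pmc-reported master_offset (in nanoseconds) onto an FM severity and debounces each transition by one audit. A standalone sketch of that mapping, with placeholder thresholds since OOT_MINOR_THRESHOLD and OOT_MAJOR_THRESHOLD are defined elsewhere in the plugin:

# Sketch only: standalone version of read_func's severity mapping.
# The real thresholds are plugin constants; these values are placeholders.
OOT_MINOR_THRESHOLD = 1000    # ns, assumed for illustration
OOT_MAJOR_THRESHOLD = 10000   # ns, assumed for illustration


def oot_severity(master_offset):
    """Map a PTP master offset in nanoseconds to an alarm severity."""
    if abs(master_offset) > OOT_MAJOR_THRESHOLD:
        return 'major'
    if abs(master_offset) > OOT_MINOR_THRESHOLD:
        return 'minor'
    return 'clear'


# The plugin debounces by one audit: the first reading at a new severity
# only updates the tracked severity; the alarm is raised or cleared when
# a second consecutive reading agrees.
assert oot_severity(500) == 'clear'
assert oot_severity(-2500) == 'minor'
assert oot_severity(25000) == 'major'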
diff --git a/monitoring/collectd-extensions/src/python_plugins.conf b/monitoring/collectd-extensions/src/python_plugins.conf
deleted file mode 100644
index 22d6b3346..000000000
--- a/monitoring/collectd-extensions/src/python_plugins.conf
+++ /dev/null
@@ -1,21 +0,0 @@
-LoadPlugin python
-<Plugin python>
-    ModulePath "/opt/collectd/extensions/python"
-    Import "cpu"
-    <Module "cpu">
-        Path "/proc/cpuinfo"
-    </Module>
-    Import "memory"
-    <Module "memory">
-        Path "/proc/meminfo"
-    </Module>
-    Import "ntpq"
-    Import "ptp"
-    Import "interface"
-    <Module "interface">
-        Port 2122
-    </Module>
-    Import "remotels"
-    LogTraces = true
-    Encoding "utf-8"
-</Plugin>
diff --git a/monitoring/collectd-extensions/src/remotels.conf b/monitoring/collectd-extensions/src/remotels.conf
deleted file mode 100644
index f9e588992..000000000
--- a/monitoring/collectd-extensions/src/remotels.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-<Plugin "threshold">
-   <Plugin "remotels">
-      <Type "absolute">
-         Instance "reachable"
-         Persist true
-         PersistOK true
-         WarningMin 1
-         FailureMin 0
-         Hits 2
-         Invert false
-      </Type>
-   </Plugin>
-</Plugin>
diff --git a/monitoring/collectd-extensions/src/remotels.py b/monitoring/collectd-extensions/src/remotels.py
deleted file mode 100755
index 95c3cda7b..000000000
--- a/monitoring/collectd-extensions/src/remotels.py
+++ /dev/null
@@ -1,350 +0,0 @@
-#
-# Copyright (c) 2019 Wind River Systems, Inc.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-############################################################################
-#
-# This is the Remote Logging Server plugin for collectd.
-#
-# The Remote Logging Server is enabled if /etc/syslog-ng/syslog-ng.conf
-# contains '@include remotelogging.conf'
-#
-# There is no asynchronous notification of remote logging server
-# configuration enable/disable state changes. Therefore, each audit
-# interval needs to check whether it's enabled or not.
-#
-# every audit interval ...
-#
-# read_func:
-#     check enabled:
-#         if disabled and alarmed:
-#             clear alarm
-#         if enabled:
-#             get ip and port
-#             query status
-#             if connected and alarmed:
-#                 clear alarm
-#             if not connected and not alarmed:
-#                 raise alarm
-#
-# system remotelogging-modify --ip_address <ip address>
-#                             --transport tcp
-#                             --enabled True
-#
-############################################################################
-
-import os
-import collectd
-import tsconfig.tsconfig as tsc
-import plugin_common as pc
-from fm_api import constants as fm_constants
-from oslo_concurrency import processutils
-from fm_api import fm_api
-
-# Fault manager API Object
-api = fm_api.FaultAPIsV2()
-
-# name of the plugin
-PLUGIN_NAME = 'remotels'
-
-# all logs produced by this plugin are prefixed with this
-PLUGIN = 'remote logging server'
-
-# Interface Monitoring Interval in seconds
-PLUGIN_AUDIT_INTERVAL = 60
-
-# Sample Data 'type' and 'instance' database field values.
-PLUGIN_TYPE = 'absolute' -PLUGIN_TYPE_INSTANCE = 'reachable' - -# Remote Logging Connectivity Alarm ID -PLUGIN_ALARMID = '100.118' - -# The file where this plugin learns if remote logging is enabled -SYSLOG_CONF_FILE = '/etc/syslog-ng/syslog-ng.conf' - -# Plugin Control Object -obj = pc.PluginObject(PLUGIN, "") - - -# Raise Remote Logging Server Alarm -def raise_alarm(): - """Raise Remote Logging Server Alarm""" - - repair = 'Ensure Remote Log Server IP is reachable from ' - repair += 'Controller through OAM interface; otherwise ' - repair += 'contact next level of support.' - - reason = 'Controller cannot establish connection with ' - reason += 'remote logging server.' - - try: - fault = fm_api.Fault( - alarm_id=PLUGIN_ALARMID, - alarm_state=fm_constants.FM_ALARM_STATE_SET, - entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, - entity_instance_id=obj.base_eid, - severity=fm_constants.FM_ALARM_SEVERITY_MINOR, - reason_text=reason, - alarm_type=fm_constants.FM_ALARM_TYPE_1, - probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_6, - proposed_repair_action=repair, - service_affecting=False, - suppression=False) - - alarm_uuid = api.set_fault(fault) - if pc.is_uuid_like(alarm_uuid) is False: - collectd.error("%s 'set_fault' failed ; %s:%s ; %s" % - (PLUGIN, PLUGIN_ALARMID, - obj.base_eid, alarm_uuid)) - else: - collectd.info("%s %s:%s alarm raised" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) - obj.alarmed = True - - except Exception as ex: - collectd.error("%s 'set_fault' exception ; %s:%s ; %s " % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid, ex)) - - -# Clear remote logging server alarm -def clear_alarm(): - """Clear remote logging server alarm""" - - try: - if api.clear_fault(PLUGIN_ALARMID, obj.base_eid) is True: - collectd.info("%s %s:%s alarm cleared" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) - else: - collectd.info("%s %s:%s alarm clear" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid)) - - obj.alarmed = False - return True - - except Exception as ex: - collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % - (PLUGIN, PLUGIN_ALARMID, obj.base_eid, ex)) - return False - - -# The config function - called once on collectd process startup -def config_func(config): - """Configure the plugin""" - - # all configuration is learned during normal monitoring - obj.config_done = True - return 0 - - -# The init function - called once on collectd process startup -def init_func(): - """Init the plugin""" - - # remote logging server monitoring is for controllers only - if tsc.nodetype != 'controller': - return 0 - - if obj.init_done is False: - if obj.init_ready() is False: - return False - - obj.hostname = obj.gethostname() - obj.base_eid = 'host=' + obj.hostname - obj.init_done = True - collectd.info("%s initialization complete" % PLUGIN) - - return True - - -# The sample read function - called on every audit interval -def read_func(): - """Remote logging server connectivity plugin read function""" - - # remote logging server monitoring is for controllers only - if tsc.nodetype != 'controller': - return 0 - - if obj.init_done is False: - init_func() - return 0 - - # get current state - current_enabled_state = obj.enabled - - # check to see if remote logging is enabled - obj.enabled = False # assume disabled - if os.path.exists(SYSLOG_CONF_FILE) is True: - with open(SYSLOG_CONF_FILE, 'r') as infile: - for line in infile: - if line.startswith('@include '): - service = line.rstrip().split(' ')[1] - if service == '"remotelogging.conf"': - obj.enabled = True - break - - if current_enabled_state == obj.enabled: - 
-        logit = False
-    else:
-        if obj.enabled is False:
-            collectd.info("%s is disabled" % PLUGIN)
-        else:
-            collectd.info("%s is enabled" % PLUGIN)
-        logit = True
-
-    # Handle the startup case by clearing any existing alarm if it's raised.
-    # It's cheaper and simpler at runtime to issue a blind clear than a query.
-    if obj.audits == 0:
-        if clear_alarm() is False:
-            # if the clear fails then retry next time
-            return 0
-        if obj.enabled is False:
-            collectd.info("%s is disabled" % PLUGIN)
-        obj.audits = 1
-
-    if obj.enabled is False:
-        if obj.alarmed is True:
-            clear_alarm()
-        return 0
-
-    # If we get here then the server is enabled ...
-    # Need to query it
-
-    # Get the ip and port from a line that looks like this
-    #
-    # tag                           proto address          port
-    # ----------------------------- ---   --------------   ---
-    # destination remote_log_server {tcp("128.224.186.65" port(514));};
-    #
-    address = protocol = port = ''
-    with open(SYSLOG_CONF_FILE, 'r') as infile:
-        for line in infile:
-            if line.startswith('destination remote_log_server'):
-                try:
-                    if len(line.split('{')) > 1:
-                        protocol = line.split('{')[1][0:3]
-                        address = line.split('{')[1].split('"')[1]
-                        port = line.split('{')[1].split('(')[2].split(')')[0]
-                        if not protocol or not address or not port:
-                            collectd.error("%s remote log server credentials "
-                                           "parse error ; (%s:%s:%s)" %
-                                           (PLUGIN, protocol, address, port))
-                            return 1
-                        else:
-                            # line parsed ; move on ...
-                            break
-                    else:
-                        collectd.error("%s remote log server line parse error"
-                                       " ; %s" % (PLUGIN, line))
-                except Exception as ex:
-                    collectd.error("%s remote log server credentials "
-                                   "parse exception ; %s (%s)" %
-                                   (PLUGIN, ex, line))
-
-    if ':' in address:
-        ipv = 6
-        protocol += '6'
-
-        # Monitoring of IPv6 is not currently supported
-        return 0
-
-    else:
-        ipv = 4
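The split()-chain above pulls the protocol, address, and port out of the syslog-ng destination line. A single regular expression can express the same extraction in one step; this is an editor's sketch of an alternative, not the plugin's code:

import re

# Matches e.g.: destination remote_log_server {tcp("128.224.186.65" port(514));};
DEST_RE = re.compile(
    r'destination\s+remote_log_server\s*'
    r'\{(?P<proto>\w+)\("(?P<addr>[^"]+)"\s*port\((?P<port>\d+)\)\);\};')

line = 'destination remote_log_server {tcp("128.224.186.65" port(514));};'
m = DEST_RE.search(line)
if m:
    protocol, address, port = m.group('proto'), m.group('addr'), m.group('port')
    # -> ('tcp', '128.224.186.65', '514')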
-    # This plugin detects server connectivity through its socket status.
-    # To get that, construct the remote logging server IP string.
-    # The files being looked at (/proc/net/tcp and /proc/net/udp) use hex
-    # values, so convert the address string to a caps hex value with the
-    # "ipv4" byte values in reverse order.
-    index = 3
-    addr = [0, 0, 0, 0]
-
-    # swap order
-    for tup in address.split('.'):
-        addr[index] = int(tup)
-        index -= 1
-
-    # build the CAPs HEX address
-    UPPER_HEX_IP = ''
-    for tup in addr:
-        val = hex(int(tup)).split('x')[-1].upper()
-        if len(val) == 1:
-            UPPER_HEX_IP += '0'
-        UPPER_HEX_IP += val
-    UPPER_HEX_IP += ':'
-    tmp = hex(int(port)).split('x')[-1].upper()
-    for i in range(4 - len(tmp)):
-        UPPER_HEX_IP += '0'
-    UPPER_HEX_IP += tmp
-
-    # log example    tcp:ipv4:128.224.186.65:514 : IP:41BAE080:0202
-    collectd.debug("%s %s:ipv%d:%s:%s : IP:%s" %
-                   (PLUGIN, protocol, ipv, address, port, UPPER_HEX_IP))
-
-    cmd = "cat /proc/net/" + protocol
-    cmd += " | awk '{print $3 \" \" $4}' | grep " + UPPER_HEX_IP
-    cmd += " | awk '{print $2}'"
-    res, err = processutils.execute(cmd, shell=True)
-    if err:
-        collectd.error("%s processutils error:%s" % (PLUGIN, err))
-
-        # cmd example:
-        #    cat /proc/net/tcp | awk '{print $3 " " $4}'
-        #                      | grep 41BAE080:0202
-        #                      | awk '{print $2}'
-        collectd.debug("%s Cmd:%s" % (PLUGIN, cmd))
-        return 0
-
-    if res and res.rstrip() == '01':
-        # the connected state reads 01
-        # Example log: Res:[01]
-
-        # clear alarm if
-        #  - currently alarmed and
-        #  - debounced by 1 ; need 2 connected readings in a row
-        if obj.alarmed is True:
-            clear_alarm()
-
-        # Only log on state change
-        if obj.usage != 1:
-            logit = True
-
-        obj.usage = 1
-        conn = ''
-
-    else:
-        # res typically reads 02 when not connected
-        # Example log: Res:[02]
-        collectd.debug("%s Res:[%s] " % (PLUGIN, res.rstrip()))
-
-        # raise alarm if
-        #  - not already alarmed
-        #  - debounced by 1 ; need 2 failures in a row
-        if obj.alarmed is False and obj.usage == 0:
-            raise_alarm()
-
-        # only log on state change
-        if obj.usage == 1 or obj.audits == 1:
-            logit = True
-
-        obj.usage = 0
-        conn = 'not '
-
-    if logit is True:
-        collectd.info("%s is %sconnected [%s ipv%d %s:%s]" %
-                      (PLUGIN, conn, protocol, ipv, address, port))
-    obj.audits += 1
-
-    # Dispatch usage value to collectd
-    val = collectd.Values(host=obj.hostname)
-    val.plugin = PLUGIN_NAME
-    val.type = PLUGIN_TYPE
-    val.type_instance = PLUGIN_TYPE_INSTANCE
-    val.dispatch(values=[obj.usage])
-    return 0
-
-
-# register the config, init and read functions
-collectd.register_config(config_func)
-collectd.register_init(init_func)
-collectd.register_read(read_func, interval=PLUGIN_AUDIT_INTERVAL)
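The byte-swap loop above builds the upper-case little-endian hex ADDRESS:PORT token that /proc/net/tcp and /proc/net/udp use on x86 hosts. The standard library can produce the same token; a sketch, assuming IPv4 only as the plugin does:

import socket
import struct


def proc_net_key(address, port):
    """Return the ADDRESS:PORT token as it appears in /proc/net/tcp.

    On little-endian hosts (the plugin's assumption) /proc/net/tcp shows
    the IPv4 address as a byte-swapped 32-bit hex value, followed by the
    port in hex.
    """
    packed = socket.inet_aton(address)         # network byte order
    swapped = struct.unpack('<I', packed)[0]   # reinterpret little-endian
    return '%08X:%04X' % (swapped, int(port))


# matches the log example in the code above:
assert proc_net_key('128.224.186.65', 514) == '41BAE080:0202'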
diff --git a/monitoring/influxdb-extensions/PKG-INFO b/monitoring/influxdb-extensions/PKG-INFO
deleted file mode 100644
index fa9936ba0..000000000
--- a/monitoring/influxdb-extensions/PKG-INFO
+++ /dev/null
@@ -1,10 +0,0 @@
-Metadata-Version: 1.1
-Name: influxdb-extensions
-Version: 1.0
-Summary: influxdb-extensions
-Home-page:
-Author: Windriver
-Author-email: info@windriver.com
-License: ASL 2.0
-Description: Titanium Cloud influxdb extensions.
-Platform: UNKNOWN
diff --git a/monitoring/influxdb-extensions/centos/build_srpm.data b/monitoring/influxdb-extensions/centos/build_srpm.data
deleted file mode 100644
index 03c5dbcc8..000000000
--- a/monitoring/influxdb-extensions/centos/build_srpm.data
+++ /dev/null
@@ -1,7 +0,0 @@
-SRC_DIR="$PKG_BASE"
-
-COPY_LIST="$PKG_BASE/src/LICENSE \
-           $PKG_BASE/src/influxdb.conf.pmon \
-           $PKG_BASE/src/influxdb.service"
-
-TIS_PATCH_VER=2
diff --git a/monitoring/influxdb-extensions/centos/influxdb-extensions.spec b/monitoring/influxdb-extensions/centos/influxdb-extensions.spec
deleted file mode 100644
index ea3018846..000000000
--- a/monitoring/influxdb-extensions/centos/influxdb-extensions.spec
+++ /dev/null
@@ -1,46 +0,0 @@
-Summary: Titanium Server influxdb Extensions Package
-Name: influxdb-extensions
-Version: 1.0
-Release: 0%{?_tis_dist}.%{tis_patch_ver}
-License: ASL 2.0
-Group: base
-Packager: Wind River
-URL: unknown
-
-# create the files tarball
-Source0: %{name}-%{version}.tar.gz
-
-Source1: influxdb.service
-Source2: influxdb.conf.pmon
-
-Requires: systemd
-Requires: influxdb
-Requires: /bin/systemctl
-
-%description
-Titanium Cloud influxdb extensions
-
-%define debug_package %{nil}
-%define local_unit_dir %{_sysconfdir}/systemd/system
-
-%prep
-%setup
-
-%build
-
-%install
-install -m 755 -d %{buildroot}%{_sysconfdir}
-install -m 755 -d %{buildroot}%{_sysconfdir}/influxdb
-install -m 755 -d %{buildroot}%{local_unit_dir}
-
-install -m 644 %{SOURCE1} %{buildroot}%{local_unit_dir}
-install -m 600 %{SOURCE2} %{buildroot}%{_sysconfdir}/influxdb
-
-
-%clean
-rm -rf $RPM_BUILD_ROOT
-
-%files
-%defattr(-,root,root,-)
-%config(noreplace) %{local_unit_dir}/influxdb.service
-%{_sysconfdir}/influxdb/*
diff --git a/monitoring/influxdb-extensions/src/LICENSE b/monitoring/influxdb-extensions/src/LICENSE
deleted file mode 100644
index d64569567..000000000
diff --git a/monitoring/influxdb-extensions/src/influxdb.conf b/monitoring/influxdb-extensions/src/influxdb.conf
deleted file mode 100644
index b0a5f000f..000000000
--- a/monitoring/influxdb-extensions/src/influxdb.conf
+++ /dev/null
@@ -1,322 +0,0 @@
-### Welcome to the InfluxDB configuration file.
-
-# Once every 24 hours InfluxDB will report anonymous data to m.influxdb.com
-# The data includes raft id (random 8 bytes), os, arch, version, and metadata.
-# We don't track ip addresses of servers reporting. This is only used
-# to track the number of instances running and the versions, which
-# is very helpful for us.
-# Change this option to true to disable reporting.
-reporting-disabled = false
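This configuration file is TOML, so the settings that follow can be read back programmatically. A sketch using the third-party toml package (an assumption; it is not a dependency declared by these packages):

import toml  # third-party package, assumed available for this sketch

with open('/etc/influxdb/influxdb.conf') as f:
    conf = toml.load(f)

# top-level key, as set at the head of the file
print(conf.get('reporting-disabled', False))
# nested section, e.g. the [collectd] listener settings further below
print(conf['collectd']['bind-address'])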
-
-###
-### Enterprise registration control
-###
-
-[registration]
-# enabled = true
-# url = "https://enterprise.influxdata.com" # The Enterprise server URL
-# token = "" # Registration token for Enterprise server
-
-###
-### [meta]
-###
-### Controls the parameters for the Raft consensus group that stores metadata
-### about the InfluxDB cluster.
-###
-
-[meta]
-  dir = "/var/lib/influxdb/meta"
-  hostname = "localhost"
-  bind-address = ":8088"
-  retention-autocreate = true
-  election-timeout = "1s"
-  heartbeat-timeout = "1s"
-  leader-lease-timeout = "500ms"
-  commit-timeout = "50ms"
-  cluster-tracing = false
-
-  # If enabled, when a Raft cluster loses a peer due to a `DROP SERVER` command,
-  # the leader will automatically ask a non-raft peer node to promote to a raft
-  # peer. This only happens if there is a non-raft peer node available to promote.
-  # This setting only affects the local node, so to ensure it operates correctly,
-  # be sure to set it in the config of every node.
-  raft-promotion-enabled = true
-
-###
-### [data]
-###
-### Controls where the actual shard data for InfluxDB lives and how it is
-### flushed from the WAL. "dir" may need to be changed to a suitable place
-### for your system, but the WAL settings are an advanced configuration. The
-### defaults should work for most systems.
-###
-
-[data]
-  dir = "/var/lib/influxdb/data"
-
-  # Controls the engine type for new shards. Options are b1, bz1, or tsm1.
-  # b1 is the 0.9.2 storage engine, bz1 is the 0.9.3 and 0.9.4 engine.
-  # tsm1 is the 0.9.5 engine and is currently EXPERIMENTAL. Until 0.9.5 is
-  # actually released, data written into a tsm1 engine may need to be wiped
-  # between upgrades.
-  # engine = "bz1"
-
-  # The following WAL settings are for the b1 storage engine used in 0.9.2. They won't
-  # apply to any new shards created after upgrading to a version > 0.9.3.
-  max-wal-size = 104857600 # Maximum size the WAL can reach before a flush. Defaults to 100MB.
-  wal-flush-interval = "10m" # Maximum time data can sit in WAL before a flush.
-  wal-partition-flush-delay = "2s" # The delay time between each WAL partition being flushed.
-
-  # These are the WAL settings for the storage engine >= 0.9.3
-  wal-dir = "/var/lib/influxdb/wal"
-  wal-enable-logging = true
-
-  # When a series in the WAL in-memory cache reaches this size in bytes it is marked as ready to
-  # flush to the index
-  # wal-ready-series-size = 25600
-
-  # Flush and compact a partition once this ratio of series are over the ready size
-  # wal-compaction-threshold = 0.6
-
-  # Force a flush and compaction if any series in a partition gets above this size in bytes
-  # wal-max-series-size = 2097152
-
-  # Force a flush of all series and full compaction if there have been no writes in this
-  # amount of time. This is useful for ensuring that shards that are cold for writes don't
-  # keep a bunch of data cached in memory and in the WAL.
-  # wal-flush-cold-interval = "10m"
-
-  # Force a partition to flush its largest series if it reaches this approximate size in
-  # bytes. Remember there are 5 partitions so you'll need at least 5x this amount of memory.
-  # The more memory you have, the bigger this can be.
-  # wal-partition-size-threshold = 20971520
-
-  # Whether queries should be logged before execution. Very useful for troubleshooting, but will
-  # log any sensitive data contained within a query.
- # query-log-enabled = true - -### -### [hinted-handoff] -### -### Controls the hinted handoff feature, which allows nodes to temporarily -### store queued data when one node of a cluster is down for a short period -### of time. -### - -[hinted-handoff] - enabled = true - dir = "/var/lib/influxdb/hh" - max-size = 1073741824 - max-age = "168h" - retry-rate-limit = 0 - - # Hinted handoff will start retrying writes to down nodes at a rate of once per second. - # If any error occurs, it will backoff in an exponential manner, until the interval - # reaches retry-max-interval. Once writes to all nodes are successfully completed the - # interval will reset to retry-interval. - retry-interval = "1s" - retry-max-interval = "1m" - - # Interval between running checks for data that should be purged. Data is purged from - # hinted-handoff queues for two reasons. 1) The data is older than the max age, or - # 2) the target node has been dropped from the cluster. Data is never dropped until - # it has reached max-age however, for a dropped node or not. - purge-interval = "1h" - -### -### [cluster] -### -### Controls non-Raft cluster behavior, which generally includes how data is -### shared across shards. -### - -[cluster] - shard-writer-timeout = "10s" # The time within which a shard must respond to write. - write-timeout = "5s" # The time within which a write operation must complete on the cluster. - -### -### [retention] -### -### Controls the enforcement of retention policies for evicting old data. -### - -[retention] - enabled = true - check-interval = "30m" - -### -### [shard-precreation] -### -### Controls the precreation of shards, so they are created before data arrives. -### Only shards that will exist in the future, at time of creation, are precreated. - -[shard-precreation] - enabled = true - check-interval = "10m" - advance-period = "30m" - -### -### Controls the system self-monitoring, statistics and diagnostics. -### -### The internal database for monitoring data is created automatically if -### if it does not already exist. The target retention within this database -### is called 'monitor' and is also created with a retention period of 7 days -### and a replication factor of 1, if it does not exist. In all cases the -### this retention policy is configured as the default for the database. - -[monitor] - store-enabled = true # Whether to record statistics internally. - store-database = "_internal" # The destination database for recorded statistics - store-interval = "10s" # The interval at which to record statistics - -### -### [admin] -### -### Controls the availability of the built-in, web-based admin interface. If HTTPS is -### enabled for the admin interface, HTTPS must also be enabled on the [http] service. -### - -[admin] - enabled = true - bind-address = ":8083" - https-enabled = false - https-certificate = "/etc/ssl/influxdb.pem" - -### -### [http] -### -### Controls how the HTTP endpoints are configured. These are the primary -### mechanism for getting data into and out of InfluxDB. -### - -[http] - enabled = true - bind-address = ":8086" - auth-enabled = false - log-enabled = true - write-tracing = false - pprof-enabled = false - https-enabled = false - https-certificate = "/etc/ssl/influxdb.pem" - -### -### [[graphite]] -### -### Controls one or many listeners for Graphite data. -### - -[[graphite]] - enabled = false - # database = "graphite" - # bind-address = ":2003" - # protocol = "tcp" - # consistency-level = "one" - # name-separator = "." 
- - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Batching - # will buffer points in memory if you have many coming in. - - # batch-size = 1000 # will flush if this many points get buffered - # batch-pending = 5 # number of batches that may be pending in memory - # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit - # udp-read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. - - ## "name-schema" configures tag names for parsing the metric name from graphite protocol; - ## separated by `name-separator`. - ## The "measurement" tag is special and the corresponding field will become - ## the name of the metric. - ## e.g. "type.host.measurement.device" will parse "server.localhost.cpu.cpu0" as - ## { - ## measurement: "cpu", - ## tags: { - ## "type": "server", - ## "host": "localhost, - ## "device": "cpu0" - ## } - ## } - # name-schema = "type.host.measurement.device" - - ## If set to true, when the input metric name has more fields than `name-schema` specified, - ## the extra fields will be ignored. - ## Otherwise an error will be logged and the metric rejected. - # ignore-unnamed = true - -### -### [collectd] -### -### Controls the listener for collectd data. -### - -[collectd] - enabled = true - bind-address = "127.0.0.1:25826" - database = "collectd" - typesdb = "/usr/share/collectd/types.db" - - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Batching - # will buffer points in memory if you have many coming in. - - # batch-size = 1000 # will flush if this many points get buffered - # batch-pending = 5 # number of batches that may be pending in memory - # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit - # read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. - -### -### [opentsdb] -### -### Controls the listener for OpenTSDB data. -### - -[opentsdb] - enabled = false - # bind-address = ":4242" - # database = "opentsdb" - # retention-policy = "" - # consistency-level = "one" - # tls-enabled = false - # certificate= "" - - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Only points - # metrics received over the telnet protocol undergo batching. - - # batch-size = 1000 # will flush if this many points get buffered - # batch-pending = 5 # number of batches that may be pending in memory - # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit - -### -### [[udp]] -### -### Controls the listeners for InfluxDB line protocol data via UDP. -### - -[[udp]] - enabled = false - # bind-address = "" - # database = "udp" - # retention-policy = "" - - # These next lines control how batching works. You should have this enabled - # otherwise you could get dropped metrics or poor performance. Batching - # will buffer points in memory if you have many coming in. - - # batch-size = 1000 # will flush if this many points get buffered - # batch-pending = 5 # number of batches that may be pending in memory - # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit - # read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. 
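The name-schema scheme documented in the [[graphite]] section above maps a dotted graphite metric name onto a measurement and tags. A sketch of that mapping as the comments describe it (illustrative only; InfluxDB's actual parser is Go code):

def parse_graphite_name(name, schema='type.host.measurement.device',
                        separator='.'):
    """Split a graphite metric name per the documented name-schema.

    e.g. 'server.localhost.cpu.cpu0' with the default schema yields
    measurement 'cpu' and tags for 'type', 'host' and 'device'.
    """
    measurement = None
    tags = {}
    for field, part in zip(schema.split(separator), name.split(separator)):
        if field == 'measurement':
            measurement = part
        else:
            tags[field] = part
    return measurement, tags


assert parse_graphite_name('server.localhost.cpu.cpu0') == \
    ('cpu', {'type': 'server', 'host': 'localhost', 'device': 'cpu0'})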
-
-###
-### [continuous_queries]
-###
-### Controls how continuous queries are run within InfluxDB.
-###
-
-[continuous_queries]
-  log-enabled = true
-  enabled = true
-  recompute-previous-n = 2
-  recompute-no-older-than = "10m"
-  compute-runs-per-interval = 10
-  compute-no-more-than = "2m"
diff --git a/monitoring/influxdb-extensions/src/influxdb.conf.pmon b/monitoring/influxdb-extensions/src/influxdb.conf.pmon
deleted file mode 100644
index 0556f2ac6..000000000
--- a/monitoring/influxdb-extensions/src/influxdb.conf.pmon
+++ /dev/null
@@ -1,17 +0,0 @@
-[process]
-process     = influxdb
-service     = influxdb
-style       = lsb
-pidfile     = /var/run/influxdb/influxdb.pid
-severity    = major   ; minor, major, critical
-restarts    = 3       ; restart retries before error assertion
-interval    = 5       ; number of seconds to wait between restarts
-debounce    = 10      ; number of seconds that a process needs to remain
-                      ; running before degrade is removed and retry count
-                      ; is cleared.
-startuptime = 3       ; seconds to wait after process start before starting the debounce monitor
-mode        = passive ; monitoring mode: passive (default) or active
-                      ; passive: process death monitoring (default: always)
-                      ; active : heartbeat monitoring, i.e. request / response messaging
-                      ; ignore : do not monitor or stop monitoring
-quorum      = 0       ; process is in the host watchdog quorum
diff --git a/monitoring/influxdb-extensions/src/influxdb.logrotate b/monitoring/influxdb-extensions/src/influxdb.logrotate
deleted file mode 100644
index b2bef9a22..000000000
--- a/monitoring/influxdb-extensions/src/influxdb.logrotate
+++ /dev/null
@@ -1,16 +0,0 @@
-#daily
-nodateext
-
-/var/log/influxdb/influxdb.log
-{
-    size 20M
-    start 1
-    missingok
-    rotate 20
-    compress
-    sharedscripts
-    postrotate
-        systemctl reload syslog-ng > /dev/null 2>&1 || true
-    endscript
-}
-
diff --git a/monitoring/influxdb-extensions/src/influxdb.service b/monitoring/influxdb-extensions/src/influxdb.service
deleted file mode 100644
index 7617d2a6e..000000000
--- a/monitoring/influxdb-extensions/src/influxdb.service
+++ /dev/null
@@ -1,25 +0,0 @@
-[Unit]
-Description=InfluxDB open-source, distributed, time series database
-Documentation=https://influxdb.com/docs/
-Before=collectd.service
-Before=pmon.service
-After=local-fs.target network-online.target
-Requires=local-fs.target network-online.target
-
-[Service]
-User=influxdb
-Group=influxdb
-LimitNOFILE=65536
-Environment='STDOUT=/dev/null'
-Environment='STDERR=/var/log/influxdb/influxd.log'
-EnvironmentFile=-/etc/default/influxdb
-PermissionsStartOnly=true
-ExecStartPre=-/usr/bin/mkdir -p /var/run/influxdb
-ExecStartPre=-/usr/bin/chown influxdb:influxdb /var/run/influxdb
-ExecStart=/bin/sh -c "/usr/bin/influxd -config /etc/influxdb/influxdb.conf -pidfile /var/run/influxdb/influxdb.pid ${INFLUXD_OPTS} >> ${STDOUT} 2>> ${STDERR}"
-ExecStopPost=/bin/bash -c 'rm /var/run/influxdb/influxdb.pid'
-KillMode=control-group
-
-[Install]
-WantedBy=multi-user.target
-Alias=influxd.service
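With the [collectd] listener from the configuration above enabled, collectd samples accumulate in the 'collectd' database. A sketch of reading them back, assuming the python-influxdb client is available; the measurement name is an assumed example, since the exact name depends on collectd's type mapping:

from influxdb import InfluxDBClient

# Defaults match the [http] section: bind-address ":8086", auth disabled.
client = InfluxDBClient(host='localhost', port=8086, database='collectd')

# 'ptp_value' is an assumed measurement name for the ptp plugin's
# dispatched samples; adjust to whatever the mapping actually produces.
result = client.query('SELECT * FROM "ptp_value" ORDER BY time DESC LIMIT 5')
for point in result.get_points():
    print(point)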
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
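The appendix above describes attaching the full boilerplate notice; the scripts deleted in the hunks that follow instead carry the equivalent short-form header built around an SPDX identifier, as seen verbatim in memtop, occtop, and schedtop below:

    #!/usr/bin/perl
    ########################################################################
    #
    # Copyright (c) 2015 Wind River Systems, Inc.
    #
    # SPDX-License-Identifier: Apache-2.0
    #
    ########################################################################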
diff --git a/tools/monitor-tools/centos/build_srpm.data b/tools/monitor-tools/centos/build_srpm.data
deleted file mode 100644
index acaff6535..000000000
--- a/tools/monitor-tools/centos/build_srpm.data
+++ /dev/null
@@ -1,2 +0,0 @@
-SRC_DIR=scripts
-TIS_PATCH_VER=0
diff --git a/tools/monitor-tools/centos/monitor-tools.spec b/tools/monitor-tools/centos/monitor-tools.spec
deleted file mode 100644
index 84322e0fd..000000000
--- a/tools/monitor-tools/centos/monitor-tools.spec
+++ /dev/null
@@ -1,42 +0,0 @@
-Summary: Monitor tools package
-Name: monitor-tools
-Version: 1.0
-Release: %{tis_patch_ver}%{?_tis_dist}
-License: Apache-2.0
-Group: base
-Packager: Wind River
-URL: unknown
-BuildArch: noarch
-Source: %name-%version.tar.gz
-
-Requires: initscripts-config
-
-%description
-This package contains data collection tools to monitor host performance.
-The tools are general-purpose engineering and debugging aids. They cover
-overall memory, cpu occupancy, per-task cpu, per-task scheduling, and
-per-task io.
-
-%prep
-%autosetup
-
-%install
-rm -rf $RPM_BUILD_ROOT
-%global _buildsubdir %{_builddir}/%{name}-%{version}
-install -d %{buildroot}/usr/bin
-install %{_buildsubdir}/memtop %{buildroot}/usr/bin
-install %{_buildsubdir}/schedtop %{buildroot}/usr/bin
-install %{_buildsubdir}/occtop %{buildroot}/usr/bin
-
-%files
-%license LICENSE
-%defattr(-,root,root,-)
-/usr/bin/*
-
-%post
-grep schedstats /etc/sysctl.conf
-if [ $? -ne 0 ]; then
-    echo -e "\nkernel.sched_schedstats=1" >> /etc/sysctl.conf
-    sysctl -p &>/dev/null
-fi
-exit 0
diff --git a/tools/monitor-tools/scripts/LICENSE b/tools/monitor-tools/scripts/LICENSE
deleted file mode 100644
index d64569567..000000000
--- a/tools/monitor-tools/scripts/LICENSE
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/tools/monitor-tools/scripts/memtop b/tools/monitor-tools/scripts/memtop deleted file mode 100755 index 9bcaf6429..000000000 --- a/tools/monitor-tools/scripts/memtop +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/perl -######################################################################## -# -# Copyright (c) 2015 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -# -######################################################################## -# -# Description: -# This displays overall memory information per sample period. -# Output includes total, used, avail, per-numa node breakdown of avail -# and free hugepages memory. -# -# Usage: memtop OPTIONS -# memtop [--delay=] [--repeat=] [--period=] [--help] -# - -# Summarize high-level memory usage. 
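Before the code, a condensed sketch of how memtop derives the Avail and Used figures described above (this restates the print_memory logic further below; parse_meminfo is a hypothetical helper standing in for the /proc/meminfo parsing, values in KiB):

    # Strict accounting applies when vm.overcommit_memory == 2 (see is_strict).
    my %mem = parse_meminfo();   # hypothetical: field name => KiB value
    my $avail = $is_strict
        ? $mem{CommitLimit} - $mem{Committed_AS}
        : $mem{MemFree} + $mem{Cached} + $mem{Buffers} + $mem{SReclaimable};
    my $used  = $mem{MemTotal} - $avail;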
-use 5.10.0; -use warnings; -use strict; -use Benchmark ':hireswallclock'; -use POSIX qw(strftime); -use Data::Dumper; -use File::Basename; -use File::Spec (); -use Time::HiRes qw(time usleep); -use Carp qw(croak carp); - -# IEC and SI constants -use constant SI_k => 1.0E3; -use constant SI_M => 1.0E6; -use constant SI_G => 1.0E9; -use constant Ki => 1024.0; -use constant Mi => 1024.0*1024.0; -use constant Gi => 1024.0*1024.0*1024.0; - -# Name of this program -our $TOOLNAME = basename($0); -our $VERSION = "0.1"; - -# Argument list parameters -our ($arg_debug, - $arg_delay, - $arg_repeat, - $arg_period) = (); - -# Globals -our $t_0 = (); -our $t_1 = (); -our $t_elapsed = (); -our $t_final = (); -our $is_strict = (); -our $num_nodes = (); - -#------------------------------------------------------------------------------- -# MAIN Program -#------------------------------------------------------------------------------- -# benchmark variables -my ($bd, $b0, $b1); - -# Autoflush output -select(STDERR); -$| = 1; -select(STDOUT); # default -$| = 1; - -# Parse input arguments and print tool usage if necessary -&parse_memtop_args( - \$::arg_debug, - \$::arg_delay, - \$::arg_repeat, - \$::arg_period, -); - -# Print out some debugging information -if (defined $::arg_debug) { - $Data::Dumper::Indent = 1; -} - -# Strict vs non-strict memory accounting -$::is_strict = &is_strict(); - -# Number of numa nodes -$::num_nodes = &num_numa_nodes(); - -# Print tool header and selected options -printf "%s %s -- ". - "selected options: delay = %.3fs, repeat = %d, period = %.3fs, %s, unit = %s\n", - $::TOOLNAME, $::VERSION, - $::arg_delay, $::arg_repeat, $::arg_period, - $::is_strict ? 'strict' : 'non-strict', - 'MiB'; - -# Capture timestamp -$b0 = new Benchmark; - -# Get current hires epoc timestamp -$::t_1 = time(); -$::t_final = $::t_1 + $::arg_period; - -# Set initial delay -$::t_elapsed = $::arg_delay; - -# Main loop -my $delay = SI_M*$::arg_delay - 600.0; -REPEAT_LOOP: for (my $rep=1; $rep <= $::arg_repeat; $rep++) { - # Copy all state variables - $::t_0 = $::t_1; - - # Sleep for desired interarrival time - usleep( $delay ); - - # Current hires epoc timestamp - $::t_1 = time(); - - # Delta calculation - $::t_elapsed = $::t_1 - $::t_0; - - # Print summary - &print_memory(\$::t_1); - - # Exit if we have reached period - last if ((defined $::t_final) && ($::t_1 > $::t_final)); -} - -# Print that tool has finished -print "done\n"; - -# Capture timestamp and report delta -if (defined $::arg_debug) { - $b1 = new Benchmark; $bd = Benchmark::timediff($b1, $b0); - printf "processing time: %s\n", timestr($bd); -} -exit 0; - - -################################################################################ - -# Parse input option arguments -sub parse_memtop_args { - (local *::arg_debug, - local *::arg_delay, - local *::arg_repeat, - local *::arg_period, - ) = @_; - - # Local variables - my ($fail, $arg_help); - - # Use the Argument processing module - use Getopt::Long; - - # Process input arguments - $fail = 0; - GetOptions( - "debug:i", \$::arg_debug, - "delay=f", \$::arg_delay, - "repeat=i", \$::arg_repeat, - "period=i", \$::arg_period, - "help|h", \$arg_help - ) || GetOptionsMessage(); - - # Print help documentation if user has selected --help - &ListHelp() if (defined $arg_help); - - # Validate options - if ((defined $::arg_repeat) && (defined $::arg_period)) { - $fail = 1; - warn "$::TOOLNAME: Input error: cannot specify both --repeat and --period options.\n"; - } - if ((defined $::arg_delay) && ($::arg_delay < 
0.01)) {
-        $fail = 1;
-        warn sprintf "%s: Input error: --delay %f is less than 0.01.\n",
-            $::TOOLNAME, $::arg_delay;
-    }
-    if (@::ARGV) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: not expecting these options: '@::ARGV'.\n";
-    }
-
-    # Set reasonable defaults
-    $::arg_delay  ||= 1.0;
-    $::arg_repeat ||= 1;
-    if ($::arg_period) {
-        $::arg_repeat = $::arg_period / $::arg_delay;
-    } else {
-        $::arg_period = $::arg_delay * $::arg_repeat;
-    }
-
-    # Upon missing or invalid options, print usage
-    if ($fail == 1) {
-        &Usage();
-        exit 1;
-    }
-}
-
-# Print out a warning message and usage
-sub GetOptionsMessage {
-    warn "$::TOOLNAME: Error processing input arguments.\n";
-    &Usage();
-    exit 1;
-}
-
-# Print out program usage
-sub Usage {
-    printf "Usage: $::TOOLNAME OPTIONS\n";
-    printf " [--delay=] [--repeat=] [--period=]\n";
-    printf " [--help]\n";
-    printf "\n";
-}
-
-# Print tool help
-sub ListHelp {
-    printf "$::TOOLNAME -- displays overall memory usage at a high level\n";
-    &Usage();
-    printf " --delay= : output interval (seconds): default: 1.0\n";
-    printf " --repeat= : number of repeat samples: default: 1\n";
-    printf " --period= : overall tool duration (seconds): default: --\n";
-    printf " --help : this help\n";
-    printf "\n";
-    exit 0;
-}
-
-# Print memory summary
-sub print_memory {
-    (local *::t_1) = @_;
-
-    # counter
-    our $count;
-    $::count++; $::count %= 15;
-
-    my ($file, $n);
-    my %mem = ();
-    my %node = ();
-
-    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
-    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($::t_1);
-    my $msec = 1000.0*($::t_1 - int($::t_1));
-
-    # Process all entries of MEMINFO
-    $file = '/proc/meminfo';
-    open(FILE, $file) || die "Cannot open file: $file ($!)";
-    while($_ = <FILE>) {
-        s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
-        if (/^(\S+):\s+(\d+)\b/) {
-            $mem{$1} = $2;
-        }
-    }
-    close(FILE);
-
-    # Process all entries of per-Node MEMINFO
-    for ($n=0; $n < $::num_nodes; $n++) {
-        $file = sprintf('/sys/devices/system/node/node%d/meminfo', $n);
-        open(FILE, $file) || die "Cannot open file: $file ($!)";
-        while($_ = <FILE>) {
-            s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
-            if (/^Node\s+(\d+)\s+(\S+):\s+(\d+)\b/) {
-                $node{$1}{$2} = $3;
-            }
-        }
-        close(FILE);
-    }
-
-    # Calculate available memory
-    if ($::is_strict) {
-        $mem{'Avail'} = $mem{'CommitLimit'} - $mem{'Committed_AS'};
-    } else {
-        $mem{'Avail'} = $mem{'MemFree'} +
-            $mem{'Cached'} +
-            $mem{'Buffers'} +
-            $mem{'SReclaimable'};
-    }
-    $mem{'Used'} = $mem{'MemTotal'} - $mem{'Avail'};
-    $mem{'Anon'} = $mem{'AnonPages'};
-    for ($n=0; $n < $::num_nodes; $n++) {
-        $node{$n}{'Avail'} = $node{$n}{'MemFree'} +
-            $node{$n}{'FilePages'} +
-            $node{$n}{'SReclaimable'};
-        $node{$n}{'HFree'} = $node{$n}{'HugePages_Free'} * $mem{'Hugepagesize'};
-    }
-
-    # Print heading every so often
-    if ($::count == 1) {
-        printf "%s ".
-            "%8s %8s %8s %7s %6s %6s %8s %8s %7s %7s %8s %8s",
-            'yyyy-mm-dd hh:mm:ss.fff',
-            'Tot', 'Used', 'Free', 'Ca', 'Buf', 'Slab', 'CAS', 'CLim', 'Dirty', 'WBack', 'Anon', 'Avail';
-        for ($n=0; $n < $::num_nodes; $n++) {
-            printf " %8s %8s", sprintf('%d:Avail', $n), sprintf('%d:HFree', $n);
-        }
-        printf "\n";
-    }
-
-    # Print one line memory summary
-    printf "%4d-%02d-%02d %02d:%02d:%02d.%03d ".
-        "%8.1f %8.1f %8.1f %7.1f %6.1f %6.1f %8.1f %8.1f %7.1f %7.1f %8.1f %8.1f",
-        1900+$year, 1+$mon, $mday, $hour, $min, $sec, $msec,
-        $mem{'MemTotal'}/Ki,
-        $mem{'Used'}/Ki,
-        $mem{'MemFree'}/Ki,
-        $mem{'Cached'}/Ki,
-        $mem{'Buffers'}/Ki,
-        $mem{'Slab'}/Ki,
-        $mem{'Committed_AS'}/Ki,
-        $mem{'CommitLimit'}/Ki,
-        $mem{'Dirty'}/Ki,
-        $mem{'Writeback'}/Ki,
-        $mem{'Anon'}/Ki,
-        $mem{'Avail'}/Ki;
-    for ($n=0; $n < $::num_nodes; $n++) {
-        printf " %8.1f %8.1f", $node{$n}{'Avail'}/Ki, $node{$n}{'HFree'}/Ki;
-    }
-    printf "\n";
-
-}
-
-sub num_numa_nodes {
-    my $file = '/proc/cpuinfo';
-    my %nodes = ();
-    open(FILE, $file) || die "Cannot open file: $file ($!)";
-    while($_ = <FILE>) {
-        s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
-        if (/^physical\s+id\s+:\s+(\d+)\b/) {
-            $nodes{$1} = 1;
-        }
-    }
-    close(FILE);
-    return scalar keys %nodes;
-}
-
-sub is_strict {
-    my $value = 0;
-    my $file = '/proc/sys/vm/overcommit_memory';
-    open(FILE, $file) || die "Cannot open file: $file ($!)";
-    $_ = <FILE>;
-    ($value) = /(\d+)/; # list context: capture the digit, not the match result
-    close(FILE);
-    return ($value == 2) ? 1 : 0;
-}
-
-1;
diff --git a/tools/monitor-tools/scripts/occtop b/tools/monitor-tools/scripts/occtop
deleted file mode 100755
index d4178a1eb..000000000
--- a/tools/monitor-tools/scripts/occtop
+++ /dev/null
@@ -1,592 +0,0 @@
-#!/usr/bin/perl
-########################################################################
-#
-# Copyright (c) 2015-2016 Wind River Systems, Inc.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-#
-########################################################################
-#
-# Description:
-# This displays per-core occupancy information per sample period.
-# Output includes total occupancy, and per-core occupancy based on
-# hi-resolution timings.
-#
-# Usage: occtop OPTIONS
-# [--delay=] [--repeat=] [--period=]
-# [--header=]
-# [--help]
-
-use strict;
-use warnings;
-use Data::Dumper;
-use POSIX qw(uname strftime);
-use Time::HiRes qw(clock_gettime usleep CLOCK_MONOTONIC CLOCK_REALTIME);
-
-use Benchmark ':hireswallclock';
-use Carp qw(croak carp);
-
-# Define toolname
-our $TOOLNAME = "occtop";
-our $VERSION = "0.1";
-
-# Constants
-use constant SI_k => 1.0E3;
-use constant SI_M => 1.0E6;
-use constant SI_G => 1.0E9;
-use constant Ki => 1024.0;
-use constant Mi => 1024.0*1024.0;
-use constant Gi => 1024.0*1024.0*1024.0;
-
-# Globals
-our %percpu_0 = ();
-our %percpu_1 = ();
-our %D_percpu = ();
-our %loadavg = ();
-our $D_total = 0.0;
-our $tm_0 = 0.0;
-our $tm_1 = 0.0;
-our $tr_0 = 0.0;
-our $tr_1 = 0.0;
-our $tm_elapsed = 0.0;
-our $tm_final = 0.0;
-our $uptime = 0.0;
-our $num_cpus = 1;
-our $num_tasks = 0;
-our $num_blk = 0;
-our $print_host = 1;
-our $is_schedstat = 1;
-our $USER_HZ = 100; # no easy way to get this
-our $CLOCK_NS = SI_G / $USER_HZ;
-
-# Argument list parameters
-our ($arg_debug,
-     $arg_delay,
-     $arg_repeat,
-     $arg_period,
-     $arg_header,
-    ) = ();
-
-#-------------------------------------------------------------------------------
-# MAIN Program
-#-------------------------------------------------------------------------------
-my $MIN_DELAY = 0.001;
-my $MAX_DELAY = 0.001;
-
-# benchmark variables
-my ($bd, $b0, $b1);
-
-# Autoflush output
-select(STDERR);
-$| = 1;
-select(STDOUT); # default
-$| = 1;
-
-# Parse input arguments and print tool usage if necessary
-&parse_occtop_args(
-    \$::arg_debug,
-    \$::arg_delay,
-    \$::arg_repeat,
-    \$::arg_period,
-    \$::arg_header,
-);
-
-# Print out some debugging information
-if (defined $::arg_debug) {
-    $Data::Dumper::Indent = 1;
-}
-
-# Check for schedstat support;
fallback to stats -$is_schedstat = -e '/proc/schedstat' ? 1 : 0; - -# Print out selected options -printf "selected options: delay = %.3fs, repeat = %d, header = %d, source = %s\n", - $::arg_delay, $::arg_repeat, $::arg_header, $is_schedstat ? 'schedstat' : 'jiffie'; - -# Capture timestamp -$b0 = new Benchmark; - -# Get number of logical cpus -&get_num_logical_cpus(\$::num_cpus); - - -# Get current hires epoc timestamp -$::tm_1 = clock_gettime(CLOCK_MONOTONIC); -$::tr_1 = clock_gettime(CLOCK_REALTIME); -$::tm_final = $::tm_1 + $::arg_delay*$::arg_repeat; - -# Set initial delay -$::tm_elapsed = $::arg_delay; -$MAX_DELAY = $::arg_delay + $MIN_DELAY; - -# Get overall per-cpu stats -if ($is_schedstat) { - &read_schedstat(\%::percpu_1); -} else { - &read_stat(\%::percpu_1); -} - -# Main loop -REPEAT_LOOP: for (my $repeat=1; $repeat <= $::arg_repeat; $repeat++) { - - # copy all state variables - %::tm_0 = (); %::tr_0 = (); %::percpu_0 = (); - $::tm_0 = $::tm_1; $::tr_0 = $::tr_1; - foreach my $cpu (keys %::percpu_1) { $::percpu_0{$cpu} = $::percpu_1{$cpu}; } - - # estimate sleep delay to achieve desired interarrival by subtracting out - # the measured cpu runtime of the tool. - my $delay = $::arg_delay; - $delay = $MIN_DELAY if ($delay < $MIN_DELAY); - $delay = $MAX_DELAY if ($delay > $MAX_DELAY); - usleep( SI_M*$delay ); - - # Collect current state - $::tm_1 = (); $::tr_1 = (); %::percpu_1 = (); - # Get current hires epoc timestamp - $::tm_1 = clock_gettime(CLOCK_MONOTONIC); - $::tr_1 = clock_gettime(CLOCK_REALTIME); - # Get overall per-cpu stats - if ($is_schedstat) { - &read_schedstat(\%::percpu_1); - } else { - &read_stat(\%::percpu_1); - } - - # Get current uptime - &get_uptime(\$::uptime); - # Get current loadavg - &get_loadavg(\%::loadavg, \$::runq, \$::num_tasks); - # Get current processes blocked - &get_blocked(\$::num_blk); - - # Delta calculation - %::D_percpu = (); - $::tm_elapsed = $tm_1 - $tm_0; - foreach my $cpu (keys %::percpu_1) { - $::D_percpu{$cpu}{'runtime'} = ($::percpu_1{$cpu} - $::percpu_0{$cpu})/1.0E6; - if ($::tm_elapsed > 0.0) { - $::D_percpu{$cpu}{'occ'} = 100.0*$D_percpu{$cpu}{'runtime'}/1.0E3/$::tm_elapsed; - } else { - $::D_percpu{$cpu}{'occ'} = 0.0; - } - } - - # Print tool header - if ($repeat == 1) { - &occtop_header( - \$::tr_1, - \$::uptime, - \%::loadavg, - \$::runq, - \$::num_blk, - \$::num_tasks, - \$::print_host, - ); - } - - # Print one-liner summary - &print_occtop( - \$::tr_1, - \$::num_cpus, - \%::D_percpu, - \$::arg_header, - ); - - # exit repeat loop if we have exceeded overall time - last if ($::tm_1 > $::tm_final); - -} # REPEAT LOOP - -# Print that tool has finished -print "done\n"; - -# Capture timestamp and report delta -$b1 = new Benchmark; $bd = Benchmark::timediff($b1, $b0); -printf "processing time: %s\n", timestr($bd); -exit 0; - - -#------------------------------------------------------------------------------- - -# Parse per-cpu hi-resolution scheduling stats -sub read_schedstat -{ - (local *::percpu) = @_; - my ($version, $timestamp); - my ($cpu, $cputime); - my ($fh, $file); - - %::percpu = (); - - # parse /proc/schedstat - $file = '/proc/schedstat'; - open($fh, $file) || croak "Cannot open file: $file ($!)"; - $_ = <$fh>; ($version) = /^version\s+(\d+)/; - $_ = <$fh>; ($timestamp) = /^timestamp\s+(\d+)/; - - if ($version == 15) { - LOOP_SCHEDSTAT: while (<$fh>) { - # version 15: cputime is 7th field - if (/^cpu(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+/) { - $cpu = $1; $cputime = $2; - $::percpu{$cpu} = $cputime; - } - } - } 
else {
-        croak "schedstat version: $version method not implemented.";
-    }
-    close($fh);
-}
-
-# Parse per-cpu jiffie stats; cputime excludes idle time.
-sub read_stat
-{
-    (local *::percpu) = @_;
-    my ($cpu, $cputime);
-    my ($user, $sys, $nice, $idle, $iowt, $hirq, $sirq);
-    my ($fh, $file);
-
-    %::percpu = ();
-
-    # parse /proc/stat
-    $file = '/proc/stat';
-    open($fh, $file) || croak "Cannot open file: $file ($!)";
-    LOOP_STAT: while (<$fh>) {
-        if (/^cpu(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+/) {
-            $cpu = $1; $user = $2; $sys = $3; $nice = $4; $idle = $5; $iowt = $6; $hirq = $7; $sirq = $8;
-            $cputime = $CLOCK_NS * ($user + $sys + $nice + $iowt + $hirq + $sirq);
-            $::percpu{$cpu} = $cputime;
-        }
-    }
-    close($fh);
-}
-
-# Parse load-average from /proc/loadavg
-sub get_loadavg
-{
-    (local *::loadavg, local *::runq, local *::num_tasks) = @_;
-
-    $::loadavg{'1'} = 0.0;
-    $::loadavg{'5'} = 0.0;
-    $::loadavg{'15'} = 0.0;
-    $::runq = 0;
-    $::num_tasks = 0;
-
-    my $file = '/proc/loadavg';
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    $_ = <$fh>;
-    if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\/(\d+)\s+\d+/) {
-        $::loadavg{'1'} = $1;
-        $::loadavg{'5'} = $2;
-        $::loadavg{'15'} = $3;
-        $::runq = $4;
-        $::num_tasks = $5;
-    }
-    close($fh);
-}
-
-# Parse blocked from /proc/stat
-sub get_blocked
-{
-    (local *::num_blk) = @_;
-
-    $::num_blk = 0;
-
-    my $file = '/proc/stat';
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    while ($_ = <$fh>) {
-        if (/^procs_blocked\s+(\d+)/) {
-            $::num_blk = $1;
-        }
-    }
-    close($fh);
-}
-
-# Parse uptime from /proc/uptime
-sub get_uptime
-{
-    (local *::uptime) = @_;
-    $::uptime = 0.0;
-
-    my $file = '/proc/uptime';
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    $_ = <$fh>;
-    if (/^(\S+)\s+\S+/) {
-        $::uptime = $1;
-    }
-    close($fh);
-}
-
-# Get number of online logical cpus
-sub get_num_logical_cpus {
-    (local *::num_cpus) = @_;
-    $::num_cpus = 0;
-
-    my $file = "/proc/cpuinfo";
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    LOOP_CPUINFO: while (<$fh>) {
-        if (/^[Pp]rocessor\s+:\s\d+/) {
-            $::num_cpus++;
-        }
-    }
-    close($fh);
-}
-
-# Print occupancy summary
-sub print_occtop {
-    (local *::tr_1,
-     local *::num_cpus,
-     local *::D_percpu,
-     local *::arg_header,
-    ) = @_;
-
-    # counter
-    our $count;
-    $::count++; $::count %= $::arg_header;
-    $::count = 1 if ($::arg_header == 1);
-
-    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
-    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($::tr_1);
-    my $msec = 1000.0*($::tr_1 - int($::tr_1));
-
-    # Print heading every so often
-    if ($::count == 1) {
-        printf "%s ".
-            "%7s ",
-            'yyyy-mm-dd hh:mm:ss.fff',
-            'total';
-        for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
-            printf "%5s ", $cpu;
-        }
-        print "\n";
-    }
-
-    # Print one summary
-    my $occ_total = 0.0;
-    for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
-        $occ_total += $::D_percpu{$cpu}{'occ'};
-    }
-    printf "%4d-%02d-%02d %02d:%02d:%02d.%03d ".
-        "%7.1f ",
-        1900+$year, 1+$mon, $mday, $hour, $min, $sec, $msec,
-        $occ_total;
-    for (my $cpu=0; $cpu < $::num_cpus; $cpu++) {
-        printf "%5.1f ", $::D_percpu{$cpu}{'occ'};
-    }
-    print "\n";
-}
-
-# Print header
-sub occtop_header {
-    (local *::tr_1,
-     local *::uptime,
-     local *::loadavg,
-     local *::runq,
-     local *::num_blk,
-     local *::num_tasks,
-     local *::print_host,
-    ) = @_;
-
-    # process epoch to get current timestamp
-    my $mm_in_s = 60;
-    my $hh_in_s = 60*60;
-    my $dd_in_s = 24*60*60;
-    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
-    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($::tr_1);
-    my $msec = 1000.0*($::tr_1 - int($::tr_1));
-
-    # convert uptime to elapsed :::
-    my ($up, $up_dd, $up_hh, $up_mm, $up_ss);
-    $up = int($::uptime);
-    $up_dd = int($up/$dd_in_s);
-    $up -= $dd_in_s*$up_dd;
-    $up_hh = int($up/$hh_in_s);
-    $up -= $hh_in_s*$up_hh;
-    $up_mm = int($up/$mm_in_s);
-    $up -= $mm_in_s*$up_mm;
-    $up_ss = $up;
-
-    #occtop -- 2014/03/03 02:00:21.357 ldavg:0.07, 0.09, 0.08 runq:1 nproc:440 up:6:13:00:56
-    printf "%s %s -- ".
-        "%4d-%02d-%02d %02d:%02d:%02d.%03d ".
-        "ldavg:%.2f, %.2f, %.2f runq:%d blk:%d nproc:%d ".
-        "up:%d:%02d:%02d:%02d\n",
-        $::TOOLNAME, $::VERSION,
-        1900+$year, 1+$mon, $mday, $hour, $min, $sec, $msec,
-        $::loadavg{'1'}, $::loadavg{'5'}, $::loadavg{'15'},
-        $::runq, $::num_blk, $::num_tasks,
-        $up_dd, $up_hh, $up_mm, $up_ss;
-
-    return if (!($::print_host));
-
-    # After first print, disable print host information
-    $::print_host = 0;
-
-    # Get host specific information
-    my ($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE);
-    ($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE) = POSIX::uname();
-    my ($NODETYPE, $SUBFUNCTION, $BUILDINFO) = ('-', '-', '-');
-    my ($SW_VERSION, $BUILD_ID) = ('-', '-');
-
-    # Get platform nodetype and subfunction
-    PLATFORM: {
-        my $file = "/etc/platform/platform.conf";
-        open(FILE, $file) || next;
-        while($_ = <FILE>) {
-            s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
-            if (/^nodetype=(\S+)/) {
-                $NODETYPE = $1;
-            }
-            if (/^subfunction=(\S+)/) {
-                $SUBFUNCTION = $1;
-            }
-        }
-        close(FILE);
-    }
-
-    # Get loadbuild info
-    BUILD: {
-        my $file = "/etc/build.info";
-        open(FILE, $file) || next;
-        while($_ = <FILE>) {
-            s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
-            if (/^SW_VERSION=\"([^"]+)\"/) {
-                $SW_VERSION = $1;
-            }
-            if (/^BUILD_ID=\"([^"]+)\"/) {
-                $BUILD_ID = $1;
-            }
-        }
-        close(FILE);
-    }
-    $BUILDINFO = join(' ', $SW_VERSION, $BUILD_ID);
-
-    # Parse /proc/cpuinfo to get specific processor info
-    my ($n_cpu, $model_name, $cpu_MHz) = (0, '-', 0);
-    CPUINFO: {
-        my $file = "/proc/cpuinfo";
-        open(FILE, $file) || croak "Cannot open file: $file ($!)";
-        while($_ = <FILE>) {
-            s/[\0\e\f\r\a]//g; chomp; # strip control characters if any
-            if (/^[Pp]rocessor\s+:\s+\d+/) {
-                $n_cpu++;
-            } elsif (/^model name\s+:\s+(.*)$/) {
-                $_ = $1; s/\s+/ /g;
-                $model_name = $_;
-            } elsif (/^cpu MHz\s+:\s+(\S+)/) {
-                $cpu_MHz = $1;
-            } elsif (/^bogomips\s+:\s+(\S+)/) {
-                $cpu_MHz = $1 if ($cpu_MHz == 0);
-            }
-        }
-        close(FILE);
-    }
-
-    printf " host:%s nodetype:%s subfunction:%s\n",
-        $NODENAME, $NODETYPE, $SUBFUNCTION;
-    printf " arch:%s processor:%s speed:%.0f #CPUs:%d\n",
-        $MACHINE, $model_name, $cpu_MHz, $n_cpu;
-    printf " %s %s build:%s\n", $OSTYPE, $OSRELEASE, $BUILDINFO;
-
-}
-
-# Parse and validate command line arguments
-sub parse_occtop_args {
-    (local *::arg_debug,
-     local *::arg_delay,
-     local *::arg_repeat,
-     local *::arg_period,
-     local *::arg_header,
-    ) = @_;
-
-    # Local variables
-    my ($fail, $arg_help);
-
-    # Use the Argument processing module
-    use Getopt::Long;
-
-    # Print usage if no arguments
-    if (!@::ARGV) {
-        &Usage();
-        exit 0;
-    }
-
-    # Process input arguments
-    $fail = 0;
-    GetOptions(
-        "debug:i", \$::arg_debug,
-        "delay=f", \$::arg_delay,
-        "period=i", \$::arg_period,
-        "repeat=i", \$::arg_repeat,
-        "header:i", \$::arg_header,
-        "help|h", \$arg_help
-    ) || GetOptionsMessage();
-
-    # Print help documentation if user has selected --help
-    &ListHelp() if (defined $arg_help);
-
-    # Validate options
-    if ((defined $::arg_repeat) && (defined $::arg_period)) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: cannot specify both --repeat and --period options.\n";
-    }
-    if ((defined $::arg_delay) && ($::arg_delay < 0.01)) {
-        $fail = 1;
-        warn sprintf "%s: Input error: --delay %f is less than 0.01.\n",
-            $::TOOLNAME, $::arg_delay;
-    }
-    if (@::ARGV) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: not expecting these options: '@::ARGV'.\n";
-    }
-
-    # Set reasonable defaults
-    $::arg_header ||= 15;
-    $::arg_delay  ||= 1.0;
-    $::arg_repeat ||= 1;
-    if ($::arg_period) {
-        $::arg_repeat = $::arg_period / $::arg_delay;
-    } else {
-        $::arg_period = $::arg_delay * $::arg_repeat;
-    }
-
-    # Upon missing or invalid options, print usage
-    if ($fail == 1) {
-        &Usage();
-        exit 1;
-    }
-}
-
-# Print out a warning message and usage
-sub GetOptionsMessage {
-    warn "$::TOOLNAME: Error processing input arguments.\n";
-    &Usage();
-    exit 1;
-}
-
-# Print out program usage
-sub Usage {
-    printf "Usage: $::TOOLNAME OPTIONS\n";
-    printf " [--delay=] [--repeat=] [--period=]\n";
-    printf " [--header=]\n";
-    printf " [--help]\n";
-
-    printf "\n";
-}
-
-# Print tool help
-sub ListHelp {
-    printf "$::TOOLNAME -- display hi-resolution per-cpu occupancy\n";
-    &Usage();
-    printf "Options: miscellaneous\n";
-    printf " --delay= : output interval (seconds): default: 1.0\n";
-    printf " --repeat= : number of repeat samples: default: 1\n";
-    printf " --period= : overall tool duration (seconds): default: --\n";
-    printf " --header= : print header every num samples: default: 15\n";
-    printf " --help : this help\n";
-    exit 0;
}
-
-1;
diff --git a/tools/monitor-tools/scripts/schedtop b/tools/monitor-tools/scripts/schedtop
deleted file mode 100755
index a170040b9..000000000
--- a/tools/monitor-tools/scripts/schedtop
+++ /dev/null
@@ -1,1312 +0,0 @@
-#!/usr/bin/perl
-########################################################################
-#
-# Copyright (c) 2015-2016 Wind River Systems, Inc.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-#
-########################################################################
-#
-# Description:
-# This displays occupancy and scheduling information per sample period.
-# Output includes total occupancy, per-core occupancy, loadavg, per-task cpu,
-# per-task scheduling, per-task io-wait.
-# -# Usage: schedtop OPTIONS -# [--delay=] [--repeat=] [--period=] -# [--reset-hwm] [--idle] [--sort=] -# [--help] - -use strict; -use warnings; -use Data::Dumper; -use POSIX qw(uname strftime); -use Time::HiRes qw(clock_gettime usleep CLOCK_MONOTONIC CLOCK_REALTIME); -use Benchmark ':hireswallclock'; -use Carp qw(croak carp); -use Math::BigInt; - -# Define toolname -our $TOOLNAME = "schedtop"; -our $VERSION = "0.1"; - -# Constants -use constant SI_k => 1.0E3; -use constant SI_M => 1.0E6; -use constant SI_G => 1.0E9; -use constant Ki => 1024.0; -use constant Mi => 1024.0*1024.0; -use constant Gi => 1024.0*1024.0*1024.0; - -# Globals -our %opt_V = (); -our %opt_P = (); -our %percpu_0 = (); -our %percpu_1 = (); -our %task_0 = (); -our %task_1 = (); -our %tids_0 = (); -our %tids_1 = (); -our %D_task = (); -our %D_percpu = (); -our %loadavg = (); -our $tm_0 = (); -our $tm_1 = (); -our $tr_0 = (); -our $tr_1 = (); -our $tm_elapsed = (); -our $tr_elapsed = (); -our $tm_final = (); -our $uptime = (); -our $num_cpus = 1; -our $affinity_mask = Math::BigInt->new('0'); -our $w_aff = 10; -our $num_tasks = 0; -our $num_blk = 0; -our $is_schedstat = 1; -our $USER_HZ = 100; # no easy way to get this -our $CLOCK_NS = SI_G / $USER_HZ; -our $print_host = 1; - -# Print options -our ($P_none, $P_lite, $P_brief, $P_full) = (0, 1, 2, 3); -our ($P_ps, $P_cpu, $P_del, $P_io, $P_id, $P_cmd) = (0, 1, 2, 3, 4, 5); -our @P_list = ($::P_ps, $::P_cpu, $::P_del, $::P_io, $::P_id, $::P_cmd); - -# Argument list parameters -our ($arg_debug, - $arg_delay, - $arg_repeat, - $arg_period, - $arg_reset_hwm, - $arg_idle, - $arg_sort, - $arg_print) = (); - -#------------------------------------------------------------------------------- -# MAIN Program -#------------------------------------------------------------------------------- -my $ONE_BILLION = 1.0E9; -my $MIN_DELAY = 0.001; -my $MAX_DELAY = 0.001; - -# benchmark variables -my ($bd, $b0, $b1); -my @policies = ('OT', 'FF', 'RR', 'BA', 'ID', 'UN', 'UN'); -my @delta_list = ( - 'nr_switches', - 'nr_migrations', - 'exec_runtime', - 'wait_sum', - 'wait_count', - 'iowait_sum', - 'iowait_count', - 'syscr', - 'syscw', - 'read_bytes', - 'write_bytes', - 'cancelled_write_bytes', -); - -my @state_list = ( - 'exec_max', 'wait_max', - 'pid', 'ppid', 'state', 'comm', 'cmdline', 'wchan', 'affinity', - 'VmSize', 'VmRSS', 'start_time', - 'nice', 'policy', 'priority', 'rt_priority', 'task_cpu' -); - -# Autoflush output -select(STDERR); -$| = 1; -select(STDOUT); # default -$| = 1; - -# Parse input arguments and print tool usage if necessary -&parse_schedtop_args( - \$::arg_debug, - \$::arg_delay, - \$::arg_repeat, - \$::arg_period, - \$::arg_reset_hwm, - \$::arg_idle, - \$::arg_sort, - \$::arg_print, -); - -# Set default print options -if ($::arg_print eq 'full') { - for my $P (@::P_list) { $::opt_P{$P} = $::P_full; } -} elsif ($::arg_print eq 'brief') { - for my $P (@::P_list) { $::opt_P{$P} = $::P_brief; } -} else { - for my $P (@::P_list) { $::opt_P{$P} = $::P_none; } -} -# Disable some options if data not present -$::opt_V{'sched'} = &is_sched(); -$::opt_V{'io'} = &is_io(); -if ($::opt_V{'sched'} == 0) { - $::opt_P{$::P_cpu} = $::P_none; - $::opt_P{$::P_del} = $::P_none; - $::opt_P{$::P_io} = $::P_none; - undef $::arg_reset_hwm; -} -if ($::opt_V{'io'} == 0) { - if ($::opt_V{'sched'} == 0) { - $::opt_P{$::P_io} = $::P_none; - $::arg_sort = 'cpu'; - } else { - if ($::opt_P{$::P_io} != $::P_none) { - $::opt_P{$::P_io} = $::P_lite; - } - } -} - -# Check for root user -if ($>) { - warn 
"$::TOOLNAME: requires root/sudo.\n"; - exit 1; -} - -# Print out some debugging information -if (defined $::arg_debug) { - $Data::Dumper::Indent = 1; -} - -# Check for schedstat support; fallback to stats -$is_schedstat = -e '/proc/schedstat' ? 1 : 0; - -# Print out selected options -printf "selected options: ". - "delay = %.3fs, repeat = %d, idle=%s, hwm=%s, sort=%s, print=%s\n", - $::arg_delay, $::arg_repeat, - (defined $::arg_idle ? 'idle_tasks' : 'no_idle_tasks'), - (defined $::arg_reset_hwm ? 'reset-hwm' : 'unchanged'), - $::arg_sort, $::arg_print; - -# Capture timestamp -$b0 = new Benchmark; - -# Get number of logical cpus -&get_num_logical_cpus(\$::num_cpus); -$::affinity_mask = Math::BigInt->new('0'); -for (my $i=0; $i < $::num_cpus; $i++) { - my $y = Math::BigInt->new('1'); - $y->blsft($i); - $::affinity_mask->bior($y); -} -$w_aff = &max(length 'AFF', length $::affinity_mask->as_hex()); - -# Reset scheduling hi-water marks -if (defined $::arg_reset_hwm) { - &get_tids(\%::tids_1); - &reset_sched_hwm(\%::tids_1); - sleep(0.001); -} - -# Get current hires epoc timestamp -$::tm_1 = clock_gettime(CLOCK_MONOTONIC); -$::tr_1 = clock_gettime(CLOCK_REALTIME); -$::tm_final = $::tm_1 + $::arg_delay*$::arg_repeat; - -# Set initial delay -$::tm_elapsed = $::arg_delay; -$MAX_DELAY = $::arg_delay + $MIN_DELAY; - - -# Get overall per-cpu stats -if ($is_schedstat) { - &read_schedstat(\%::percpu_1); -} else { - &read_stat(\%::percpu_1); -} -# Get list of pids and tids -&get_tids(\%::tids_1); -# Get current scheduling and io info for all tids -&read_sched(\%::tids_1, \%::task_1); - -# determine column sort order -my ($s_key1, $s_key2, $s_key3) = (); -if ($::arg_sort eq 'cpu') { - ($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', 'pid'); -} elsif ($::arg_sort eq 'io') { - ($s_key1, $s_key2, $s_key3) = ('io', 'ios', 'exec_runtime'); -} else { - ($s_key1, $s_key2, $s_key3) = ('exec_runtime', 'nr_switches', , 'pid'); -} - -# Main loop -REPEAT_LOOP: for (my $repeat=1; $repeat <= $::arg_repeat; $repeat++) { - - # copy all state variables - $::tm_0 = (); $::tr_0 = (); %::percpu_0 = (); %::tids_0 = (); %::task_0 = (); - $::tm_0 = $::tm_1; $::tr_0 = $::tr_1; - foreach my $cpu (keys %::percpu_1) { $::percpu_0{$cpu} = $::percpu_1{$cpu}; } - foreach my $tid (keys %::tids_1) { $::tids_0{$tid} = $::tids_1{$tid}; } - foreach my $tid (keys %::task_1) { - foreach my $var (keys $::task_1{$tid}) { - $::task_0{$tid}{$var} = $::task_1{$tid}{$var}; - } - } - - # estimate sleep delay to achieve desired interarrival by subtracting out - # the measured cpu runtime of the tool. 
- my $delay = $::arg_delay; - if (defined $::D_task{$$}{'exec_runtime'}) { - $delay -= ($::D_task{$$}{'exec_runtime'}/SI_k); - } - $delay = $MIN_DELAY if ($delay < $MIN_DELAY); - $delay = $MAX_DELAY if ($delay > $MAX_DELAY); - usleep( SI_M*$delay ); - - # Collect current state - $::tm_1 = (); $::tr_1 = (); %::percpu_1 = (); %::tids_1 = (); %::task_1 = (); - # Get current hires epoc timestamp - $::tm_1 = clock_gettime(CLOCK_MONOTONIC); - $::tr_1 = clock_gettime(CLOCK_REALTIME); - # Get overall per-cpu stats - if ($is_schedstat) { - &read_schedstat(\%::percpu_1); - } else { - &read_stat(\%::percpu_1); - } - # Get list of pids and tids - &get_tids(\%::tids_1); - # Get current scheduling and io info for all tids - &read_sched(\%::tids_1, \%::task_1); - # Get current uptime - &get_uptime(\$::uptime); - # Get current loadavg - &get_loadavg(\%::loadavg, \$::runq, \$::num_tasks); - # Get current processes blocked - &get_blocked(\$::num_blk); - - # Delta calculation - %::D_task = (); %::D_percpu = (); - $::tm_elapsed = $::tm_1 - $::tm_0; - $::tr_elapsed = $::tr_1 - $::tr_0; - foreach my $tid (keys %::task_1) { - next if ( !(exists $::task_0{$tid}) ); - - # simple delta - foreach my $var (@delta_list) { - $::D_task{$tid}{$var} = ($::task_1{$tid}{$var} - $::task_0{$tid}{$var}); - } - # state information - foreach my $state (@state_list) { - $::D_task{$tid}{$state} = $::task_1{$tid}{$state}; - } - - # derived calculations - my $exec_runtime = $::D_task{$tid}{'exec_runtime'}; - my $nr_switches = $::D_task{$tid}{'nr_switches'}; - my $iowait_sum = $::D_task{$tid}{'iowait_sum'}; - if ($nr_switches > 0.0) { - $::D_task{$tid}{'tlen'} = $exec_runtime / $nr_switches; - } else { - $::D_task{$tid}{'tlen'} = 0.0; - } - if ($::tm_elapsed > 0.0) { - $::D_task{$tid}{'occ'} = 100.0*$exec_runtime/1.0E3/$::tm_elapsed; - $::D_task{$tid}{'iowait'} = 100.0*$iowait_sum/1.0E3/$::tm_elapsed; - } else { - $::D_task{$tid}{'occ'} = 0.0; - $::D_task{$tid}{'iowait'} = 0.0; - } - $::D_task{$tid}{'io'} = $::D_task{$tid}{'read_bytes'} - + $::D_task{$tid}{'write_bytes'} - + $::D_task{$tid}{'cancelled_write_bytes'}; - $::D_task{$tid}{'ios'} = $::D_task{$tid}{'syscw'} - + $::D_task{$tid}{'iowait_count'}; - } - - foreach my $cpu (keys %::percpu_1) { - $::D_percpu{$cpu}{'runtime'} = ($::percpu_1{$cpu} - $::percpu_0{$cpu})/1.0E6; - if ($::tm_elapsed > 0.0) { - $::D_percpu{$cpu}{'occ'} = 100.0*$D_percpu{$cpu}{'runtime'}/1.0E3/$::tm_elapsed; - } else { - $::D_percpu{$cpu}{'occ'} = 0.0; - } - } - my $occ_total = 0.0; - for (my $cpu=0; $cpu < $::num_cpus; $cpu++) { - $occ_total += $::D_percpu{$cpu}{'occ'}; - } - - # Print summary - &schedtop_header( - \$::tr_1, - \$::tm_elapsed, - \$::tr_elapsed, - \$::uptime, - \$::loadavg, - \$::runq, - \$::num_blk, - \$::num_tasks, - \$::print_host - ); - - printf "%-5s %7s ", 'core:', 'total'; - for (my $cpu=0; $cpu < $::num_cpus; $cpu++) { - printf "%5s ", $cpu; - } - print "\n"; - printf "%-5s %7.1f ", 'occ:', $occ_total; - for (my $cpu=0; $cpu < $::num_cpus; $cpu++) { - printf "%5.1f ", $::D_percpu{$cpu}{'occ'}; - } - print "\n"; - print "\n"; - - # Build up output line by specific area - my $L = (); - $L = ''; - $L .= sprintf "%6s %6s %6s ", "TID", "PID", "PPID"; - if ($::opt_P{$::P_ps} != $::P_none) { - $L .= sprintf "%1s %2s %*s %2s %3s %4s ", - "S", "P", $w_aff, "AFF", "PO", "NI", "PR"; - } - if ($::opt_P{$::P_cpu} == $::P_brief) { - $L .= sprintf "%6s %7s ", "ctxt", "occ"; - } elsif ($::opt_P{$::P_cpu} == $::P_full) { - $L .= sprintf "%6s %6s %7s ", "ctxt", "migr", "occ"; - } - if 
($::opt_P{$::P_del} != $::P_none) { - $L .= sprintf "%7s %7s %7s %7s ", "tlen", "tmax", "delay", "dmax"; - } - if ($::opt_P{$::P_io} == $::P_lite) { - $L .= sprintf "%7s %6s ", "iowt", "iocnt"; - } elsif ($::opt_P{$::P_io} == $::P_brief) { - $L .= sprintf "%7s %8s %8s ", "iowt", "read", "write"; - } elsif ($::opt_P{$::P_io} == $::P_full) { - $L .= sprintf "%7s %8s %8s %8s %8s %8s ", - "iowt", "read", "write", "wcncl", "rsysc", "wsysc"; - } - if ($::opt_P{$::P_id} != $::P_none) { - $L .= sprintf "%-22s ", "wchan"; - } - if ($::opt_P{$::P_cmd} == $::P_brief) { - $L .= sprintf "%s", "cmdline"; - } elsif ($::opt_P{$::P_cmd} == $::P_full) { - $L .= sprintf "%-15s %s", "comm", "cmdline"; - } - print $L, "\n"; - - foreach my $tid (sort {($D_task{$b}{$s_key1} <=> $D_task{$a}{$s_key1}) or - ($D_task{$b}{$s_key2} <=> $D_task{$a}{$s_key2}) or - ($D_task{$b}{$s_key3} <=> $D_task{$a}{$s_key3})} keys %D_task) { - my $exec_runtime = $::D_task{$tid}{'exec_runtime'}; - my $nr_switches = $::D_task{$tid}{'nr_switches'}; - my $aff = $::D_task{$tid}{'affinity'}->as_hex(); - - # skip printing if there is no actual delta - if ( !(defined $::arg_idle) ) { - next if (($exec_runtime == 0.0) && ($nr_switches == 0)); - } - - # Build up output line by specific area - $L = ''; - $L .= sprintf "%6d %6d %6d ", - $tid, $::D_task{$tid}{'pid'}, $::D_task{$tid}{'ppid'}; - if ($::opt_P{$::P_ps} != $::P_none) { - $L .= sprintf "%1s %2d %*s %2s %3d %4d ", - $::D_task{$tid}{'state'}, $::D_task{$tid}{'task_cpu'}, $w_aff, $aff, - $policies[$::D_task{$tid}{'policy'}], $::D_task{$tid}{'nice'}, - $::D_task{$tid}{'priority'}; - } - if ($::opt_P{$::P_cpu} == $::P_brief) { - $L .= sprintf "%6d %7.2f ", - $::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'occ'}; - } elsif ($::opt_P{$::P_cpu} == $::P_full) { - $L .= sprintf "%6d %6d %7.2f ", - $::D_task{$tid}{'nr_switches'}, $::D_task{$tid}{'nr_migrations'}, - $::D_task{$tid}{'occ'}, - } - if ($::opt_P{$::P_del} != $::P_none) { - $L .= sprintf "%7.3f %7.1f %7.3f %7.1f ", - $::D_task{$tid}{'tlen'}, $::D_task{$tid}{'exec_max'}, - $::D_task{$tid}{'wait_sum'}, $::D_task{$tid}{'wait_max'}; - } - if ($::opt_P{$::P_io} == $::P_lite) { - $L .= sprintf "%7.2f %6d ", - $::D_task{$tid}{'iowait'}, $::D_task{$tid}{'iowait_count'}; - } elsif ($::opt_P{$::P_io} == $::P_brief) { - $L .= sprintf "%7.2f %8s %8s ", - $::D_task{$tid}{'iowait'}, - &format_SI($::D_task{$tid}{'read_bytes'}), - &format_SI($::D_task{$tid}{'write_bytes'}); - } elsif ($::opt_P{$::P_io} == $::P_full) { - $L .= sprintf "%7.2f %8s %8s %8s %8s %8s ", - $::D_task{$tid}{'iowait'}, - &format_SI($::D_task{$tid}{'read_bytes'}), - &format_SI($::D_task{$tid}{'write_bytes'}), - &format_SI($::D_task{$tid}{'cancelled_write_bytes'}), - &format_SI($::D_task{$tid}{'syscr'}), - &format_SI($::D_task{$tid}{'syscw'}); - } - if ($::opt_P{$::P_id} != $::P_none) { - $L .= sprintf "%-22s ", substr($::D_task{$tid}{'wchan'}, 0, 22); - } - if ($::opt_P{$::P_cmd} == $::P_brief) { - $L .= sprintf "%s", $::D_task{$tid}{'cmdline'}; - } elsif ($::opt_P{$::P_cmd} == $::P_full) { - $L .= sprintf "%-15s %s", - $::D_task{$tid}{'comm'}, $::D_task{$tid}{'cmdline'}; - } - print $L, "\n"; - } - print "\n"; - - # exit repeat loop if we have exceeded overall time - last if ($::tm_1 > $::tm_final); - -} # REPEAT LOOP - -# Print that tool has finished -print "done\n"; - -# Capture timestamp and report delta -$b1 = new Benchmark; $bd = Benchmark::timediff($b1, $b0); -printf "processing time: %s\n", timestr($bd); -exit 0; - - 
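The occupancy figures printed by the loop above follow directly from the millisecond runtime deltas; as a worked example of the same arithmetic:

    # occ% = 100 * (runtime delta in ms / 1000) / elapsed seconds,
    # so a task that accumulated 250 ms of runtime over a 1.0 s sample
    # interval reports an occupancy of 25.0%:
    my ($delta_ms, $elapsed_s) = (250.0, 1.0);          # example values
    my $occ = 100.0 * $delta_ms / 1.0E3 / $elapsed_s;   # 25.0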
-#------------------------------------------------------------------------------- -# Convert a number to SI unit xxx.yyyG -sub format_SI -{ - (my $value) = @_; - if ($value >= SI_G) { - return sprintf("%.3fG", $value/SI_G); - } elsif ($value >= SI_M) { - return sprintf("%.3fM", $value/SI_M); - } elsif ($value >= SI_k) { - return sprintf("%.3fk", $value/SI_k); - } else { - return sprintf("%.0f", $value); - } -} - -# Convert to IEC binary unit xxx.yyyGi -# Since underlying memory units are in pages, don't need decimals for Ki -sub format_IEC -{ - (my $value) = @_; - if ($value >= Gi) { - return sprintf("%.3fGi", $value/Gi); - } elsif ($value >= Mi) { - return sprintf("%.3fMi", $value/Mi); - } elsif ($value >= Ki) { - return sprintf("%.0fKi", $value/Ki); - } else { - return sprintf("%.0f", $value); - } -} - -# Determine whether scheduler stats are available -sub is_sched -{ - return (-e '/proc/1/task/1/sched') ? 1 : 0; -} - -# Determine whether IO stats are available -sub is_io -{ - return (-e '/proc/1/task/1/io') ? 1 : 0; -} - -# Determine max of array -sub max { - my ($max, @vars) = @_; - for (@vars) { - $max = $_ if $_ > $max; - } - return $max; -} - -# Determine tids and pid mapping by walking /proc//task/ -sub get_tids -{ - (local *::tids) = @_; - my (@pids_, @tids_) = (); - my ($dh, $pid, $tid); - - # get pid list - my $dir = '/proc'; - opendir($dh, $dir) || croak "Cannot open directory: $dir ($!)"; - @pids_ = grep { /^\d+$/ && -d "$dir/$_" } readdir($dh); - closedir $dh; - - # get tid list - foreach $pid (@pids_) { - $dir = '/proc/' . $pid . '/task'; - opendir(my $dh, $dir) || next; - @tids_ = grep { /^\d+$/ && -d "$dir/$_" } readdir($dh); - closedir $dh; - foreach $tid (@tids_) { $::tids{$tid} = $pid; } - } -} - -# Reset scheduling hi-water-marks -sub reset_sched_hwm -{ - (local *::tids) = @_; - - # reset scheduling hi-water-marks by writing '0' to each task - foreach my $tid (keys %::tids) { - my $file = '/proc/' . $tid . '/sched'; - open(my $fh, "> $file") || next; - print $fh "0\n"; - close($fh); - } -} - -# Parse cpu and scheduling info for each tid -# - ignore the specific tid if there is incomplete data, -# (i.e., cannot obtain info because task has died, -# eg. 
missing ./stat, ./status, ./cmdline, ./wchan) -# -sub read_sched -{ - (local *::tids, local *::task) = @_; - - %::task = (); - foreach my $tid (keys %::tids) { - my ($fh, $file, $pid, $comm, $cmdline, $wchan, $id) = (); - my ($tpid, $tcomm, $state, $ppid, $pgrp, $sid, - $tty_nr, $tty_pgrp, $flags, - $min_flt, $cmin_flt, $maj_flt, $cmaj_flt, - $utime, $stime, $cutime, $cstime, - $priority, $nice, $num_threads, - $it_real_value, $start_time, - $vsize, $rss, $rsslim, - $start_code, $end_code, $start_stack, $esp, $eip, - $pending, $blocked, $sigign, $sigcatch, $wchan_addr, - $dum1, $dum2, $exit_signal, $task_cpu, - $rt_priority, $policy, $blkio_ticks, - $gtime, $cgtime, - $start_data, $end_data, $start_brk, $arg_start, $arg_end, - $env_start, $env_end, $exit_code) = (); - - my ($nr_switches, $nr_migrations) = (0,0); - my ($exec_runtime, $exec_max) = (0.0, 0.0); - my ($wait_max, $wait_sum, $wait_count) = (0.0, 0.0, 0); - my ($iowait_sum, $iowait_count) = (0.0, 0); - my ($VmSize, $VmRSS) = (); - my $Cpus_allowed = Math::BigInt->new('0'); - my $affinity = Math::BigInt->new('0'); - my ($rchar, $wchar, $syscr, $syscw, $read_bytes, $write_bytes, - $cancelled_write_bytes) = (0,0,0,0,0,0,0); - - my ($sched_valid, $io_valid, $status_valid, $cmdline_valid, - $wchan_valid, $stat_valid) = (); - - $pid = $::tids{$tid}; - - # NOTE: Format change over time: OLD: se.statistics.X, NEW: se.statistics->X - #cat /proc/1/sched - #systemd (1, #threads: 1) - #------------------------------------------------------------------- - #se.exec_start : 33792676.285222 - #se.vruntime : 28019997.693224 - #se.sum_exec_runtime : 21918.207287 - #se.nr_migrations : 5413 - #se.statistics->sum_sleep_runtime : 1166561.198533 - #se.statistics->wait_start : 0.000000 - #se.statistics->sleep_start : 33792676.285222 - #se.statistics->block_start : 0.000000 - #se.statistics->sleep_max : 18951.679990 - #se.statistics->block_max : 0.000000 - #se.statistics->exec_max : 0.909747 - #se.statistics->slice_max : 1.790123 - #se.statistics->wait_max : 4.026544 - #se.statistics->wait_sum : 507.245963 - #se.statistics->wait_count : 2540 - #se.statistics->iowait_sum : 0.000000 - #se.statistics->iowait_count : 0 - #se.statistics->nr_migrations_cold : 0 - #se.statistics->nr_failed_migrations_affine : 67 - #se.statistics->nr_failed_migrations_running : 1 - #se.statistics->nr_failed_migrations_hot : 1 - #se.statistics->nr_forced_migrations : 0 - #se.statistics->nr_wakeups : 2472 - #se.statistics->nr_wakeups_sync : 34 - #se.statistics->nr_wakeups_migrate : 176 - #se.statistics->nr_wakeups_local : 1442 - #se.statistics->nr_wakeups_remote : 1030 - #se.statistics->nr_wakeups_affine : 155 - #se.statistics->nr_wakeups_affine_attempts : 969 - #se.statistics->nr_wakeups_passive : 0 - #se.statistics->nr_wakeups_idle : 0 - #avg_atom : 0.286970 - #avg_per_cpu : 4.049179 - #nr_switches : 76378 - #nr_voluntary_switches : 72308 - #nr_involuntary_switches : 4070 - #se.load.weight : 1024 - #policy : 0 - #prio : 120 - #clock-delta : 28 - - # parse /proc//task//sched - $file = '/proc/' . $pid . '/task/' . $tid . 
-        # parse /proc/<pid>/task/<tid>/sched
-        $file = '/proc/' . $pid . '/task/' . $tid . '/sched';
-        open($fh, $file) || goto SKIP_SCHED;
-        $_ = <$fh>;
-        if (/^(.*)\s+\((\d+),\s+#threads:/) {
-            $comm = $1; $id = $2;
-        }
-        my ($k, $v, $c0);
-        LOOP_SCHED: while (<$fh>) {
-            if (/^se\.statistics.{1,2}wait_max\s+:\s+(\S+)/) {
-                $wait_max = $1;
-            } elsif (/^se\.statistics.{1,2}wait_sum\s+:\s+(\S+)/) {
-                $wait_sum = $1;
-            } elsif (/^se\.statistics.{1,2}wait_count\s+:\s+(\S+)/) {
-                $wait_count = $1;
-            } elsif (/^se\.statistics.{1,2}exec_max\s+:\s+(\S+)/) {
-                $exec_max = $1;
-            } elsif (/^se\.statistics.{1,2}iowait_sum\s+:\s+(\S+)/) {
-                $iowait_sum = $1;
-            } elsif (/^se\.statistics.{1,2}iowait_count\s+:\s+(\S+)/) {
-                $iowait_count = $1;
-            } elsif (/^se\.sum_exec_runtime\s+:\s+(\S+)/) {
-                $exec_runtime = $1;
-            } elsif (/^se\.nr_migrations\s+:\s+(\S+)/) {
-                $nr_migrations = $1;
-            } elsif (/^nr_switches\s+:\s+(\S+)/) {
-                $nr_switches = $1;
-                $sched_valid = 1;
-                last LOOP_SCHED;
-            }
-        }
-        close($fh);
-        SKIP_SCHED:;
-
-        #cat /proc/1/io
-        #rchar: 3432590242
-        #wchar: 438665986
-        #syscr: 316595
-        #syscw: 104722
-        #read_bytes: 1586438144
-        #write_bytes: 246829056
-        #cancelled_write_bytes: 7798784
-
-        # parse /proc/<pid>/task/<tid>/io
-        $file = '/proc/' . $pid . '/task/' . $tid . '/io';
-        open($fh, $file) || goto SKIP_IO;
-        LOOP_IO: while (<$fh>) {
-            if (/^rchar:\s+(\S+)/) {
-                $rchar = $1;
-            } elsif (/^wchar:\s+(\S+)/) {
-                $wchar = $1;
-            } elsif (/^syscr:\s+(\S+)/) {
-                $syscr = $1;
-            } elsif (/^syscw:\s+(\S+)/) {
-                $syscw = $1;
-            } elsif (/^read_bytes:\s+(\S+)/) {
-                $read_bytes = $1;
-            } elsif (/^write_bytes:\s+(\S+)/) {
-                $write_bytes = $1;
-            } elsif (/^cancelled_write_bytes:\s+(\S+)/) {
-                $cancelled_write_bytes = $1;
-                $io_valid = 1;
-                last LOOP_IO;
-            }
-        }
-        close($fh);
-        SKIP_IO:;
-
-        # parse /proc/<pid>/task/<tid>/status
-        $file = '/proc/' . $pid . '/task/' . $tid . '/status';
-        open($fh, $file) || next;
-        LOOP_STATUS: while (<$fh>) {
-            if (/^Name:\s+(.*)/) {
-                $comm = $1;
-            } elsif (/^State:\s+(\S+)/) {
-                $state = $1;
-            } elsif (/^PPid:\s+(\S+)/) {
-                $ppid = $1;
-            } elsif (/^VmSize:\s+(\S+)/) {
-                $VmSize = $1;
-            } elsif (/^VmRSS:\s+(\S+)/) {
-                $VmRSS = $1;
-            } elsif (/^Cpus_allowed:\s+([0]+,)*(\S+)/) {
-                my $h = $2; $h =~ tr/,/_/;
-                $Cpus_allowed = Math::BigInt->from_hex($h);
-                $affinity = $Cpus_allowed->band($::affinity_mask);
-                $status_valid = 1;
-                last LOOP_STATUS;
-            }
-        }
-        close($fh);
-
-        # parse /proc/<pid>/task/<tid>/cmdline
-        $file = '/proc/' . $pid . '/task/' . $tid . '/cmdline';
-        open($fh, $file) || next;
-        LOOP_CMDLINE: while (<$fh>) {
-            if (/^(.*)$/) {
-                $cmdline = $1;
-                $cmdline =~ s/\000/ /g;
-                $cmdline_valid = 1;
-                last LOOP_CMDLINE;
-            }
-        }
-        if (!$cmdline_valid) {
-            $cmdline_valid = 1;
-            $cmdline = $comm;
-        }
-        close($fh);
-
-        # parse /proc/<pid>/task/<tid>/wchan
-        $file = '/proc/' . $pid . '/task/' . $tid . '/wchan';
-        open($fh, $file) || next;
-        LOOP_WCHAN: while (<$fh>) {
-            if (/^(.*)$/) {
-                $wchan = $1;
-                $wchan_valid = 1;
-                last LOOP_WCHAN;
-            }
-        }
-        close($fh);
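
The Cpus_allowed handling above goes through Math::BigInt because affinity masks on large machines exceed 64 bits. Python integers are arbitrary precision, so a rough equivalent (the helper name is hypothetical) is simply:

    def parse_cpus_allowed(hexmask):
        """Convert a Cpus_allowed value such as '00000000,0000000f' to a cpu list."""
        mask = int(hexmask.replace(',', ''), 16)
        return [cpu for cpu in range(mask.bit_length()) if (mask >> cpu) & 1]

    # e.g. parse_cpus_allowed('00000000,0000000f') -> [0, 1, 2, 3]
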
-        #Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
-        #..............................................................................
-        # Field          Content
-        #  tpid           process id (or tid, if /proc/<pid>/task/<tid>/stat)
-        #  tcomm          filename of the executable
-        #  state          state (R is running, S is sleeping, D is sleeping in an
-        #                 uninterruptible wait, Z is zombie, T is traced or stopped)
-        #  ppid           process id of the parent process
-        #  pgrp           pgrp of the process
-        #  sid            session id
-        #  tty_nr         tty the process uses
-        #  tty_pgrp       pgrp of the tty
-        #  flags          task flags
-        #  min_flt        number of minor faults
-        #  cmin_flt       number of minor faults with child's
-        #  maj_flt        number of major faults
-        #  cmaj_flt       number of major faults with child's
-        #  utime          user mode jiffies
-        #  stime          kernel mode jiffies
-        #  cutime         user mode jiffies with child's
-        #  cstime         kernel mode jiffies with child's
-        #  priority       priority level
-        #  nice           nice level
-        #  num_threads    number of threads
-        #  it_real_value  (obsolete, always 0)
-        #  start_time     time the process started after system boot
-        #  vsize          virtual memory size
-        #  rss            resident set memory size
-        #  rsslim         current limit in bytes on the rss
-        #  start_code     address above which program text can run
-        #  end_code       address below which program text can run
-        #  start_stack    address of the start of the main process stack
-        #  esp            current value of ESP
-        #  eip            current value of EIP
-        #  pending        bitmap of pending signals
-        #  blocked        bitmap of blocked signals
-        #  sigign         bitmap of ignored signals
-        #  sigcatch       bitmap of caught signals
-        #  wchan          address where process went to sleep
-        #  0              (place holder)
-        #  0              (place holder)
-        #  exit_signal    signal to send to parent thread on exit
-        #  task_cpu       which CPU the task is scheduled on
-        #  rt_priority    realtime priority
-        #  policy         scheduling policy (man sched_setscheduler)
-        #  blkio_ticks    time spent waiting for block IO
-        #  gtime          guest time of the task in jiffies
-        #  cgtime         guest time of the task children in jiffies
-        #  start_data     address above which program data+bss is placed
-        #  end_data       address below which program data+bss is placed
-        #  start_brk      address above which program heap can be expanded with brk()
-        #  arg_start      address above which program command line is placed
-        #  arg_end        address below which program command line is placed
-        #  env_start      address above which program environment is placed
-        #  env_end        address below which program environment is placed
-        #  exit_code      the thread's exit_code in the form reported by the waitpid system call
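
One subtlety in the stat parse below: the greedy (.*) inside the parentheses pairs with the last ')', because tcomm may itself contain spaces or parentheses. A Python sketch of the same defensive split (helper name is hypothetical):

    def read_stat_fields(pid, tid):
        """Parse /proc/<pid>/task/<tid>/stat into (tpid, tcomm, remaining fields)."""
        with open('/proc/%d/task/%d/stat' % (pid, tid)) as f:
            data = f.read()
        lparen = data.index('(')
        rparen = data.rindex(')')          # last ')' ends the comm field
        tpid = int(data[:lparen])
        tcomm = data[lparen + 1:rparen]
        rest = data[rparen + 1:].split()   # state, ppid, pgrp, sid, ...
        return tpid, tcomm, rest
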
-        # parse /proc/<pid>/task/<tid>/stat
-        $file = '/proc/' . $pid . '/task/' . $tid . '/stat';
-        my $dummy;
-        open($fh, $file) || next;
-        $_ = <$fh>;
-        ($tpid, $tcomm, $dummy) = /^(\d+)\s+\((.*)\)\s+(.*)/;
-        ($state, $ppid, $pgrp, $sid,
-            $tty_nr, $tty_pgrp, $flags,
-            $min_flt, $cmin_flt, $maj_flt, $cmaj_flt,
-            $utime, $stime, $cutime, $cstime,
-            $priority, $nice, $num_threads,
-            $it_real_value, $start_time,
-            $vsize, $rss, $rsslim,
-            $start_code, $end_code, $start_stack, $esp, $eip,
-            $pending, $blocked, $sigign, $sigcatch, $wchan_addr,
-            $dum1, $dum2, $exit_signal, $task_cpu,
-            $rt_priority, $policy, $blkio_ticks, $gtime, $cgtime,
-            $start_data, $end_data, $start_brk, $arg_start, $arg_end,
-            $env_start, $env_end, $exit_code) = split(/\s+/, $dummy);
-        $stat_valid = 1;
-        close($fh);
-
-        # sched
-        if (defined $sched_valid) {
-            $::task{$tid}{'exec_runtime'} = $exec_runtime;
-            $::task{$tid}{'exec_max'} = $exec_max;
-            $::task{$tid}{'wait_max'} = $wait_max;
-            $::task{$tid}{'wait_sum'} = $wait_sum;
-            $::task{$tid}{'wait_count'} = $wait_count;
-            $::task{$tid}{'iowait_sum'} = $iowait_sum;
-            $::task{$tid}{'iowait_count'} = $iowait_count;
-            $::task{$tid}{'nr_migrations'} = $nr_migrations;
-            $::task{$tid}{'nr_switches'} = $nr_switches;
-        } else {
-            $::task{$tid}{'exec_runtime'} = 0;
-            $::task{$tid}{'exec_max'} = 0;
-            $::task{$tid}{'wait_max'} = 0;
-            $::task{$tid}{'wait_sum'} = 0;
-            $::task{$tid}{'wait_count'} = 0;
-            $::task{$tid}{'iowait_sum'} = 0;
-            $::task{$tid}{'iowait_count'} = 0;
-            $::task{$tid}{'nr_migrations'} = 0;
-            $::task{$tid}{'nr_switches'} = 0;
-        }
-
-        # io
-        if (defined $io_valid) {
-            $::task{$tid}{'rchar'} = $rchar;
-            $::task{$tid}{'wchar'} = $wchar;
-            $::task{$tid}{'syscr'} = $syscr;
-            $::task{$tid}{'syscw'} = $syscw;
-            $::task{$tid}{'read_bytes'} = $read_bytes;
-            $::task{$tid}{'write_bytes'} = $write_bytes;
-            $::task{$tid}{'cancelled_write_bytes'} = $cancelled_write_bytes;
-        } else {
-            $::task{$tid}{'rchar'} = 0;
-            $::task{$tid}{'wchar'} = 0;
-            $::task{$tid}{'syscr'} = 0;
-            $::task{$tid}{'syscw'} = 0;
-            $::task{$tid}{'read_bytes'} = 0;
-            $::task{$tid}{'write_bytes'} = 0;
-            $::task{$tid}{'cancelled_write_bytes'} = 0;
-        }
-
-        # status
-        if (defined $status_valid) {
-            $::task{$tid}{'pid'} = $pid;
-            $::task{$tid}{'comm'} = $comm;
-            $::task{$tid}{'state'} = $state;
-            $::task{$tid}{'ppid'} = $ppid;
-            $::task{$tid}{'VmSize'} = $VmSize;
-            $::task{$tid}{'VmRSS'} = $VmRSS;
-            $::task{$tid}{'affinity'} = $affinity;
-        } else {
-            $::task{$tid}{'pid'} = 0;
-            $::task{$tid}{'comm'} = '-';
-            $::task{$tid}{'state'} = '-';
-            $::task{$tid}{'ppid'} = 0;
-            $::task{$tid}{'VmSize'} = 0;
-            $::task{$tid}{'VmRSS'} = 0;
-            $::task{$tid}{'affinity'} = Math::BigInt->new('0');
-        }
-
-        # cmdline
-        if (defined $cmdline_valid) {
-            $::task{$tid}{'cmdline'} = $cmdline;
-        } else {
-            $::task{$tid}{'cmdline'} = $comm;
-        }
-
-        # wchan
-        if (defined $wchan_valid) {
-            $::task{$tid}{'wchan'} = $wchan;
-        } else {
-            $::task{$tid}{'wchan'} = '-';
-        }
-
-        # stat
-        if (defined $stat_valid) {
-            $::task{$tid}{'nice'} = $nice;
-            $::task{$tid}{'policy'} = $policy;
-            $::task{$tid}{'priority'} = $priority;
-            $::task{$tid}{'rt_priority'} = $rt_priority;
-            $::task{$tid}{'start_time'} = $start_time;
-            $::task{$tid}{'task_cpu'} = $task_cpu;
-        } else {
-            $::task{$tid}{'nice'} = 0;
-            $::task{$tid}{'policy'} = '-';
-            $::task{$tid}{'priority'} = 0;
-            $::task{$tid}{'rt_priority'} = 0;
-            $::task{$tid}{'start_time'} = '';
-            $::task{$tid}{'task_cpu'} = 0;
-        }
-    }
-}
-
-# Parse per-cpu hi-resolution scheduling stats
-sub read_schedstat
-{
-    (local *::percpu) = @_;
-    my ($version, $timestamp);
-    my ($cpu, $cputime);
-    my ($fh, $file);
-
-    %::percpu = ();
-
-    # parse /proc/schedstat
-    $file = '/proc/schedstat';
-    open($fh, $file) || croak "Cannot open file: $file ($!)";
-    $_ = <$fh>; ($version) = /^version\s+(\d+)/;
-    $_ = <$fh>; ($timestamp) = /^timestamp\s+(\d+)/;
-
-    if ($version == 15) {
-        LOOP_SCHEDSTAT: while (<$fh>) {
-            # version 15: cputime is 7th field
-            if (/^cpu(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+/) {
-                $cpu = $1; $cputime = $2;
-                $::percpu{$cpu} = $cputime;
-            }
-        }
-    } else {
-        croak "schedstat version: $version method not implemented.";
-    }
-    close($fh);
-}
-
-# Parse per-cpu jiffie stats; cputime excludes idle.
-sub read_stat
-{
-    (local *::percpu) = @_;
-    my ($cpu, $cputime);
-    my ($user, $sys, $nice, $idle, $iowt, $hirq, $sirq);
-    my ($fh, $file);
-
-    %::percpu = ();
-
-    # parse /proc/stat
-    $file = '/proc/stat';
-    open($fh, $file) || croak "Cannot open file: $file ($!)";
-    LOOP_STAT: while (<$fh>) {
-        if (/^cpu(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+/) {
-            $cpu = $1; $user = $2; $sys = $3; $nice = $4; $idle = $5; $iowt = $6; $hirq = $7; $sirq = $8;
-            $cputime = $CLOCK_NS * ($user + $sys + $nice + $iowt + $hirq + $sirq);
-            $::percpu{$cpu} = $cputime;
-        }
-    }
-    close($fh);
-}
-
-# Parse load-average from /proc/loadavg
-sub get_loadavg
-{
-    (local *::loadavg, local *::runq, local *::num_tasks) = @_;
-
-    $::loadavg{'1'} = 0.0;
-    $::loadavg{'5'} = 0.0;
-    $::loadavg{'15'} = 0.0;
-    $::runq = 0;
-    $::num_tasks = 0;
-
-    my $file = '/proc/loadavg';
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    $_ = <$fh>;
-    if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\/(\d+)\s+\d+/) {
-        $::loadavg{'1'} = $1;
-        $::loadavg{'5'} = $2;
-        $::loadavg{'15'} = $3;
-        $::runq = $4;
-        $::num_tasks = $5;
-    }
-    close($fh);
-}
-
-# Parse blocked from /proc/stat
-sub get_blocked
-{
-    (local *::num_blk) = @_;
-
-    $::num_blk = 0;
-
-    my $file = '/proc/stat';
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    while ($_ = <$fh>) {
-        if (/^procs_blocked\s+(\d+)/) {
-            $::num_blk = $1;
-        }
-    }
-    close($fh);
-}
-
-# Parse uptime from /proc/uptime
-sub get_uptime
-{
-    (local *::uptime) = @_;
-    $::uptime = 0.0;
-
-    my $file = '/proc/uptime';
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    $_ = <$fh>;
-    if (/^(\S+)\s+\S+/) {
-        $::uptime = $1;
-    }
-    close($fh);
-}
-
-# Get number of online logical cpus
-sub get_num_logical_cpus {
-    (local *::num_cpus) = @_;
-    $::num_cpus = 0;
-
-    my $file = "/proc/cpuinfo";
-    open(my $fh, $file) || croak "Cannot open file: $file ($!)";
-    LOOP_CPUINFO: while (<$fh>) {
-        if (/^[Pp]rocessor\s+:\s\d+/) {
-            $::num_cpus++;
-        }
-    }
-    close($fh);
-}
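
A compact Python rendering of the read_stat parse above, with the /proc/stat fields named in the kernel's documented order (the helper is illustrative only):

    def read_percpu_busy():
        """Return {cpu: busy_jiffies}; busy is everything except idle."""
        busy = {}
        with open('/proc/stat') as f:
            for line in f:
                fields = line.split()
                if fields[0].startswith('cpu') and fields[0] != 'cpu':
                    user, nice, system, idle, iowait, irq, softirq = (
                        int(x) for x in fields[1:8])
                    busy[int(fields[0][3:])] = (
                        user + nice + system + iowait + irq + softirq)
        return busy
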
-
-# Print header
-sub schedtop_header {
-    (local *::tr_1,
-     local *::tm_elapsed,
-     local *::tr_elapsed,
-     local *::uptime,
-     local *::loadavg,
-     local *::runq,
-     local *::num_blk,
-     local *::num_tasks,
-     local *::print_host,
-    ) = @_;
-
-    # process epoch to get current timestamp
-    my $mm_in_s = 60;
-    my $hh_in_s = 60*60;
-    my $dd_in_s = 24*60*60;
-    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst);
-    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($::tr_1);
-    my $msec = 1000.0*($::tr_1 - int($::tr_1));
-
-    # convert uptime to elapsed <dd>:<hh>:<mm>:<ss>
-    my ($up, $up_dd, $up_hh, $up_mm, $up_ss);
-    $up = int($::uptime);
-    $up_dd = int($up/$dd_in_s);
-    $up -= $dd_in_s*$up_dd;
-    $up_hh = int($up/$hh_in_s);
-    $up -= $hh_in_s*$up_hh;
-    $up_mm = int($up/$mm_in_s);
-    $up -= $mm_in_s*$up_mm;
-    $up_ss = $up;
-
-    # Calculate skew of CLOCK_REALTIME vs CLOCK_MONOTONIC,
-    # and display skew if > 5% relative difference.
-    my $skew_ms = ($::tr_elapsed - $::tm_elapsed)*1000.0;
-    my $skew = "";
-    if (abs($skew_ms)/$::tm_elapsed > 50.0) {
-        $skew = sprintf " skew:%.3f ms", $skew_ms;
-    }
-
-    #schedtop -- 2014/03/03 02:00:21.357 dt:2050.003 ms ldavg:0.07, 0.09, 0.08 runq:1 blk:0 nproc:440 up:6:13:00:56 skew:0.001 ms
-    printf "%s %s -- ".
-           "%4d-%02d-%02d %02d:%02d:%02d.%03d ".
-           "dt:%.3f ms ".
-           "ldavg:%.2f, %.2f, %.2f runq:%d blk:%d nproc:%d ".
-           "up:%d:%02d:%02d:%02d %s\n",
-           $::TOOLNAME, $::VERSION,
-           1900+$year, 1+$mon, $mday, $hour, $min, $sec, $msec,
-           $::tm_elapsed*1000.0,
-           $::loadavg{'1'}, $::loadavg{'5'}, $::loadavg{'15'},
-           $::runq, $::num_blk, $::num_tasks,
-           $up_dd, $up_hh, $up_mm, $up_ss,
-           $skew;
-
-    return if (!($::print_host));
-
-    # After first print, disable print host information
-    $::print_host = 0;
-
-    # Get host specific information
-    my ($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE);
-    ($OSTYPE, $NODENAME, $OSRELEASE, $version, $MACHINE) = POSIX::uname();
-    my ($NODETYPE, $SUBFUNCTION, $BUILDINFO) = ('-', '-', '-');
-    my ($SW_VERSION, $BUILD_ID) = ('-', '-');
-
-    # Get platform nodetype and subfunction
-    PLATFORM: {
-        my $file = "/etc/platform/platform.conf";
-        open(FILE, $file) || next;
-        while ($_ = <FILE>) {
-            s/[\0\e\f\r\a]//g; chomp;  # strip control characters if any
-            if (/^nodetype=(\S+)/) {
-                $NODETYPE = $1;
-            }
-            if (/^subfunction=(\S+)/) {
-                $SUBFUNCTION = $1;
-            }
-        }
-        close(FILE);
-    }
-
-    # Get loadbuild info
-    BUILD: {
-        my $file = "/etc/build.info";
-        open(FILE, $file) || next;
-        while ($_ = <FILE>) {
-            s/[\0\e\f\r\a]//g; chomp;  # strip control characters if any
-            if (/^SW_VERSION=\"([^"]+)\"/) {
-                $SW_VERSION = $1;
-            }
-            if (/^BUILD_ID=\"([^"]+)\"/) {
-                $BUILD_ID = $1;
-            }
-        }
-        close(FILE);
-    }
-    $BUILDINFO = join(' ', $SW_VERSION, $BUILD_ID);
-
-    # Parse /proc/cpuinfo to get specific processor info
-    my ($n_cpu, $model_name, $cpu_MHz) = (0, '-', 0);
-    CPUINFO: {
-        my $file = "/proc/cpuinfo";
-        open(FILE, $file) || croak "Cannot open file: $file ($!)";
-        while ($_ = <FILE>) {
-            s/[\0\e\f\r\a]//g; chomp;  # strip control characters if any
-            if (/^[Pp]rocessor\s+:\s+\d+/) {
-                $n_cpu++;
-            } elsif (/^model name\s+:\s+(.*)$/) {
-                $_ = $1; s/\s+/ /g;
-                $model_name = $_;
-            } elsif (/^cpu MHz\s+:\s+(\S+)/) {
-                $cpu_MHz = $1;
-            } elsif (/^bogomips\s+:\s+(\S+)/) {
-                $cpu_MHz = $1 if ($cpu_MHz == 0);
-            }
-        }
-        close(FILE);
-    }
-
-    printf " host:%s nodetype:%s subfunction:%s\n",
-           $NODENAME, $NODETYPE, $SUBFUNCTION;
-    printf " arch:%s processor:%s speed:%.0f #CPUs:%d\n",
-           $MACHINE, $model_name, $cpu_MHz, $n_cpu;
-    printf " %s %s build:%s\n", $OSTYPE, $OSRELEASE, $BUILDINFO;
-}
-
-# Parse and validate command line arguments
-sub parse_schedtop_args {
-    (local *::arg_debug,
-     local *::arg_delay,
-     local *::arg_repeat,
-     local *::arg_period,
-     local *::arg_reset_hwm,
-     local *::arg_idle,
-     local *::arg_sort,
-     local *::arg_print,
-    ) = @_;
-
-    # Local variables
-    my ($fail, $arg_help);
-
-    # Use the Argument processing module
-    use Getopt::Long;
-
-    # Print usage if no arguments
-    if (!@::ARGV) {
-        &Usage();
-        exit 0;
-    }
-
-    # Process input arguments
-    $fail = 0;
-    GetOptions(
-        "debug:i",   \$::arg_debug,
-        "delay=f",   \$::arg_delay,
-        "period=i",  \$::arg_period,
-        "repeat=i",  \$::arg_repeat,
-        "reset-hwm", \$::arg_reset_hwm,
-        "idle",      \$::arg_idle,
-        "sort=s",    \$::arg_sort,
-        "print=s",   \$::arg_print,
-        "help|h",    \$arg_help
-    ) || GetOptionsMessage();
-
-    # Print help documentation if user has selected --help
-    &ListHelp() if (defined $arg_help);
-
-    # Validate options
-    if ((defined $::arg_repeat) && (defined $::arg_period)) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: cannot specify both --repeat and --period options.\n";
-    }
-    if ((defined $::arg_delay) && ($::arg_delay < 0.01)) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: --delay $::arg_delay is less than 0.01.\n";
-    }
-    if ((defined $::arg_sort) && !(($::arg_sort eq 'cpu') || ($::arg_sort eq 'io'))) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: --sort=$::arg_sort invalid; valid options are: cpu, io.\n";
-    }
-    if ((defined $::arg_print) && !(($::arg_print eq 'brief') || ($::arg_print eq 'full'))) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: --print=$::arg_print invalid; valid options are: brief, full.\n";
-    }
-    if (@::ARGV) {
-        $fail = 1;
-        warn "$::TOOLNAME: Input error: not expecting these options: '@::ARGV'.\n";
-    }
-
-    # Set reasonable defaults
-    $::arg_delay ||= 1.0;
-    $::arg_repeat ||= 1;
-    if ($::arg_period) {
-        $::arg_repeat = $::arg_period / $::arg_delay;
-    } else {
-        $::arg_period = $::arg_delay * $::arg_repeat;
-    }
-    $::arg_sort ||= 'cpu';
-    $::arg_print ||= 'full';
-
-    # Upon missing or invalid options, print usage
-    if ($fail == 1) {
-        &Usage();
-        exit 1;
-    }
-}
-
-# Print out a warning message and usage
-sub GetOptionsMessage {
-    warn "$::TOOLNAME: Error processing input arguments.\n";
-    &Usage();
-    exit 1;
-}
-
-# Print out program usage
-sub Usage {
-    printf "Usage: $::TOOLNAME OPTIONS\n";
-    printf " [--delay=<secs>] [--repeat=<num>] [--period=<secs>]\n";
-    printf " [--reset-hwm] [--idle] [--sort=<cpu|io>] [--print=<brief|full>]\n";
-    printf " [--help]\n";
-    printf "\n";
-}
-
-# Print tool help
-sub ListHelp {
-    printf "$::TOOLNAME -- display per-task scheduling occupancy\n";
-    &Usage();
-    printf "Options: miscellaneous\n";
-    printf " --delay=<secs>  : output interval (seconds): default: 1.0\n";
-    printf " --repeat=<num>  : number of repeat samples: default: 1\n";
-    printf " --period=<secs> : overall tool duration (seconds): default: --\n";
-    printf " --reset-hwm     : reset scheduling delay hi-water marks\n";
-    printf " --idle          : specify printing of idle tasks\n";
-    printf " --sort=<cpu|io> : sort order, select from 'cpu' or 'io'\n";
-    printf " --print=<mode>  : select 'brief' or 'full' fields to display\n";
-    printf " --help          : this help\n";
-    exit 0;
-}
-
-1;
diff --git a/tools/vm-topology/centos/build_srpm.data b/tools/vm-topology/centos/build_srpm.data
deleted file mode 100644
index 0b28ab0c3..000000000
--- a/tools/vm-topology/centos/build_srpm.data
+++ /dev/null
@@ -1,4 +0,0 @@
-PACKAGE_NAME=vm-topology
-VERSION=1.0
-SRC_DIR=$PKG_BASE/$PACKAGE_NAME
-TIS_PATCH_VER=1
diff --git a/tools/vm-topology/centos/vm-topology.spec b/tools/vm-topology/centos/vm-topology.spec
deleted file mode 100644
index d844a4db7..000000000
--- a/tools/vm-topology/centos/vm-topology.spec
+++ /dev/null
@@ -1,61 +0,0 @@
-%global pypi_name vm-topology
-
-Summary: vm_topology
-Name: vm-topology
-Version: 1.0
-Release: %{tis_patch_ver}%{?_tis_dist}
-License: Apache-2.0
-Group: base
-Packager: Wind River
-
-URL: unknown
-Source0: %{pypi_name}-%{version}.tar.gz
-
-BuildArch: noarch
-
-BuildRequires: python
-BuildRequires: python-setuptools
-BuildRequires: python2-pip
-BuildRequires: python2-wheel
-BuildRequires: python-keyring
-BuildRequires: libvirt
-
-Requires: python
-Requires: python-keyring
-Requires: /usr/bin/env
-Requires: libvirt
-
-%description
-Show compute resources and VM topology
-
-%prep
-%autosetup -p 1 -n %{pypi_name}-%{version}
-# Remove bundled egg-info
-rm -rf %{pypi_name}.egg-info
-# Let RPM handle the dependencies
-rm -f requirements.txt
-
-%build
-%{__python2} setup.py build -%py2_build_wheel - -%install -%{__python2} setup.py install --skip-build --root %{buildroot} -mkdir -p $RPM_BUILD_ROOT/wheels -install -m 644 dist/*.whl $RPM_BUILD_ROOT/wheels/ - -%files -%defattr(-,root,root,-) -%license LICENSE -%{_bindir}/vm-topology -%{python2_sitelib}/vm_topology -%{python2_sitelib}/*.egg-info - -%package wheels -Summary: %{name} wheels - -%description wheels -Contains python wheels for %{name} - -%files wheels -/wheels/* diff --git a/tools/vm-topology/vm-topology/LICENSE b/tools/vm-topology/vm-topology/LICENSE deleted file mode 100644 index d64569567..000000000 --- a/tools/vm-topology/vm-topology/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/tools/vm-topology/vm-topology/setup.py b/tools/vm-topology/vm-topology/setup.py deleted file mode 100644 index 0b0a5a26f..000000000 --- a/tools/vm-topology/vm-topology/setup.py +++ /dev/null @@ -1,19 +0,0 @@ -# -# Copyright (c) 2013-2014 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - -import setuptools - -setuptools.setup( - name='vm_topology', - description='Show compute resources and VM topology', - version='1.0.0', - license='Apache-2.0', - packages=['vm_topology', 'vm_topology.exec'], - entry_points={ - 'console_scripts': [ - 'vm-topology = vm_topology.exec.vm_topology:main', - ]} -) diff --git a/tools/vm-topology/vm-topology/vm_topology/__init__.py b/tools/vm-topology/vm-topology/vm_topology/__init__.py deleted file mode 100644 index 147b74f99..000000000 --- a/tools/vm-topology/vm-topology/vm_topology/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -# Copyright (c) 2014 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# diff --git a/tools/vm-topology/vm-topology/vm_topology/exec/__init__.py b/tools/vm-topology/vm-topology/vm_topology/exec/__init__.py deleted file mode 100644 index 147b74f99..000000000 --- a/tools/vm-topology/vm-topology/vm_topology/exec/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# -# Copyright (c) 2014 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# diff --git a/tools/vm-topology/vm-topology/vm_topology/exec/vm_topology.py b/tools/vm-topology/vm-topology/vm_topology/exec/vm_topology.py deleted file mode 100644 index 256c338ef..000000000 --- a/tools/vm-topology/vm-topology/vm_topology/exec/vm_topology.py +++ /dev/null @@ -1,2165 +0,0 @@ -# -# Copyright (c) 2014-2018 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - -""" -usage: vm-topology [-h] - [-s ] - -Tool to summarize server resource usage and vcpu placement -related attributes for nova and libvirt. 
- -Details: -- shows nova view of server attributes including extended resources: - - project, compute host, server name, libvirt name, image name, flavor - - vm status, task state, power state, uptime - - pinning, numa nodes, cpuset, cpulists, server groups -- shows nova view of compute resource usage, aggregates -- shows libvirt view of servers, running state -- shows migrations in-progress -- shows flavors used -- shows images used -""" - -import argparse -import datetime -import hashlib -import copy -import libvirt -import logging -from itertools import groupby -import multiprocessing -import os -import pprint -from prettytable import PrettyTable -import psutil -import re -import sys -import signal -import textwrap -import time - -from cinderclient import client as cinder_client -from glanceclient import client as glance_client -from keystoneclient.auth.identity import v3 as keystone_identity -from keystoneclient.v3 import client as keystone_client -from keystoneauth1 import loading as keystone -from keystoneauth1 import session -from novaclient import client as nova_client -from novaclient.v2 import migrations - -from oslo_serialization import jsonutils - -from sqlalchemy.ext.automap import automap_base -from sqlalchemy import create_engine -from sqlalchemy import MetaData -from sqlalchemy.sql import select - -from xml.dom import minidom -from xml.etree import ElementTree - -NOVACONF = '/etc/nova/nova.conf' -AUTHTOKEN_GROUP = 'keystone_authtoken' -NOVACLIENT_VERSION = '2.25' -CINDERCLIENT_VERSION = '2' - -# NOTE: Old glanceclient version 1 gives access to image properties -GLANCECLIENT_VERSION = '1' - -from keystonemiddleware.auth_token import _opts as keystone_auth_token_opts -from oslo_config import cfg -from oslo_config import types - -CONF = cfg.CONF - -"""---------------------------------------------------------------------------- -Global definitions -----------------------------------------------------------------------------""" - -# logger -logger = logging.getLogger(__name__) -logging.getLogger('multiprocessing').setLevel(logging.CRITICAL) -logging.getLogger('sqlalchemy.engine').setLevel(logging.CRITICAL) - -# debug and show options -debug = {} -show = {} - -# Constants -Ki = 1024 -Mi = Ki * Ki - -# Active worker pids -active_pids = multiprocessing.Manager().dict() - -# libvirt timeout parameters -LIBVIRT_TIMEOUT_SEC = 5.0 -LIBVIRT_REAP_SEC = LIBVIRT_TIMEOUT_SEC + 2.0 - -############################################################################### -## Subroutines -############################################################################### - - -# Define a context manager to suppress stdout and stderr. -class suppress_stdout_stderr(object): - """Context manager for doing a "deep suppression" of stdout and stderr - - i.e. will suppress all print, even if the print originates in a - compiled C/Fortran sub-function. - - This will not suppress raised exceptions, since exceptions are printed - to stderr just before a script exits, and after the context manager has - exited (at least, I think that is why it lets exceptions through). - """ - def __init__(self): - # Open a pair of null files - self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] - # Save the actual stdout (1) and stderr (2) file descriptors. - self.save_fds = (os.dup(1), os.dup(2)) - - def __enter__(self): - # Assign the null pointers to stdout and stderr. 
- os.dup2(self.null_fds[0], 1) - os.dup2(self.null_fds[1], 2) - - def __exit__(self, *_): - # Re-assign the real stdout/stderr back to (1) and (2) - os.dup2(self.save_fds[0], 1) - os.dup2(self.save_fds[1], 2) - # Close the null files - os.close(self.null_fds[0]) - os.close(self.null_fds[1]) - - -def atoi(text): - return int(text) if text.isdigit() else text - - -def natural_keys(text): - """alist.sort(key=natural_keys) sorts in human order""" - return [atoi(c) for c in re.split('(\d+)', text)] - - -def help_text_epilog(): - text = textwrap.dedent('''\ - Tables and Field descriptions: - ------------------------------ - - COMPUTE HOSTS: Legend: U = Used, A = Avail - Host - compute host name - status - host operational status - model - processor model - topology - processor cpu topology (sockets, cores, threads) - servers - number of servers - node - physical processor node (a.k.a., numa node) - pcpus - physical vcpus per numa node (avail to libvirt) - U:dedicated - used dedicated vcpus (a.k.a., pinned) - U:shared - used shared vcpus (a.k.a., float) - memory - host memory (MiB) available for libvirt - U:memory - used memory for servers (MiB) - A:mem_4K - available 4K host memory for servers (MiB) - A:mem_2M - available 2M host memory for servers (MiB) - A:mem_1G - available 1G host memory for servers (MiB) - Aggregate - list of host aggregate names - - Note: - - rows similar to 'nova hypervisor-show ' - - last row similar to 'nova hypervisor-stats' - - LOGICAL CPU TOPOLOGY (compute hosts): - cpu_id - logical cpu id - socket_id - socket id (a.k.a., processor node, numa node) - core_id - physical core id on a given socket_id - thread_id - hyperthread (SMT) index of a given core_id - sibling_id - hyperthread sibling cpu_id(s) (excludes cpu_id) - - SERVERS (nova view): - tenant - server tenant name (a.k.a. 
project)
-        ID                - server uuid
-        instance_name     - server libvirt name
-        name              - server name
-        host              - server host
-        vm_state          - server vm state
-        task_state        - server task state
-        power_state       - server power state
-        image             - server image name (or image volume booted from)
-        flavor            - server flavor name
-        vcpus             - server number of vcpus (scaling: min, cur, max)
-        memory            - server memory (MiB)
-        instance_topology - server numa topology
-                            (dedicated vs shared, pgsize,
-                             mapping of vcpus, pcpus, shared_vcpu, siblings)
-        in_libvirt        - indicates server also seen in libvirt
-
-      SERVERS (libvirt view):
-        uuid          - server uuid
-        instance_name - server libvirt name
-        host          - server host
-        id            - server libvirt id
-        state         - server libvirt state
-        vcpus         - server number of vcpus
-        memory        - server memory (MiB)
-        nodelist      - server list of numa nodes
-        cpulist       - server list of pcpu[i] for each vcpu i
-        in_nova       - indicates server also seen in nova
-
-      MIGRATIONS (in progress): Legend: S=Source, D=Destination
-        ID             - server uuid
-        status         - migration status
-        S:node         - source node
-        D:node         - destination node
-        S:compute      - source compute
-        D:compute      - destination compute
-        S:flavor[PKey] - source flavor primary key id
-        D:flavor[PKey] - destination flavor primary key id
-        created_at     - timestamp of migration
-
-      FLAVORS (in use):
-        id          - flavor_id
-        name        - flavor name
-        vcpus       - number of vcpus
-        ram         - memory (MiB)
-        disk        - disk storage (GB)
-        ephemeral   - ephemeral storage (GB)
-        swap        - swap size (MiB)
-        rxtx_factor - RX/TX factor (default 1)
-        is_public   - make flavor accessible to the public (default true)
-        extra_specs - metadata containing key=value pairs
-
-      IMAGES (in use):
-        id         - image id
-        name       - image name
-        minDisk    - minimum size of disk to boot image (GB)
-        minRam     - minimum size of ram to boot image (MB)
-        size       - image data size (MB)
-        status     - image status
-        properties - metadata containing key=value pairs
-
-      SERVER GROUPS (in use):
-        tenant   - server tenant name (a.k.a., project)
-        id       - server group uuid
-        name     - server group name
-        policies - server group policies
-        metadata - metadata containing key=value pairs
-    ''')
-    return text
-
-
-class ChoiceOpt(cfg.Opt):
-    r"""Option with List(String) type
-
-    Option with ``type`` :class:`oslo_config.types.List`
-    :param name: the option's name
-    :param choices: Optional sequence of either valid values or tuples of valid
-                    values with descriptions.
-    :param bounds: if True the value should be inside "[" and "]" pair
-    :param \*\*kwargs: arbitrary keyword arguments passed to :class:`Opt`
-    .. versionchanged:: 2.5
-       Added *item_type* and *bounds* parameters.
-    """
-
-    def __init__(self, name, choices=None, bounds=None, **kwargs):
-        type = types.List(item_type=types.String(choices=choices), bounds=bounds)
-        super(ChoiceOpt, self).__init__(name, type=type, **kwargs)
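
As a usage sketch, a ChoiceOpt registered with oslo.config accepts a comma-separated value and validates each item against the declared choices. The option name and values below are made up for illustration, and the exact invocation may vary with the oslo.config version:

    from oslo_config import cfg

    conf = cfg.ConfigOpts()
    conf.register_cli_opts([ChoiceOpt('colors',
                                      default=['red'],
                                      choices=['red', 'green', 'blue'])])
    conf(args=['--colors', 'red,blue'])
    print(conf.colors)   # expected: ['red', 'blue']
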
-
-
-def parse_arguments(debug, show):
-    """Parse command line arguments"""
-
-    # Initialize all debug flags to False
-    define_debug_flags(debug)
-
-    # Initialized show option lists
-    (L_opts, L_brief, L_details, L_other) = define_options()
-
-    # Select potentially multiple values from the following options
-    options = set([])
-    options.update(L_brief)
-    options.update(L_details)
-    options.update(L_other)
-    sorted_options = sorted(options)
-    sorted_options[0:0] = L_opts
-
-    # Enable debug option, but its usage/help is hidden.
-    debug_options = list(debug.keys())
-    debug_options.sort()
-    debug_options.insert(0, 'all')
-
-    # Parse arguments
-    cli_opts = [
-        ChoiceOpt('show',
-                  default=['brief'],
-                  choices=sorted(list(set(sorted_options))),
-                  metavar='<' + ','.join(str(x) for x in sorted_options) + '>',
-                  help='Show summary of selected tables'),
-        ChoiceOpt('dbg',
-                  default=[],
-                  choices=sorted(list(set(debug_options))),
-                  metavar='<' + ','.join(str(x) for x in debug_options) + '>',
-                  help='Print debugging information for selected tables'),
-    ]
-
-    CONF.register_cli_opts(cli_opts)
-    CONF.formatter_class = argparse.RawTextHelpFormatter
-    CONF(sys.argv[1:],
-         default_config_files=[NOVACONF],
-         prog=os.path.basename(sys.argv[0]),
-         description=(
-             'Tool to summarize server resource usage and vcpu placement '
-             'related attributes for nova and libvirt.'),
-         # NOTE: oslo_config implementation of _CachedArgumentParser does not
-         # configure argparse formatter_class. The resulting epilog text is
-         # automatically text-wrapped which is not desired. Manually adding
-         # newlines does not work either. The epilog text is disabled for now.
-         #epilog=help_text_epilog(),
-         )
-
-    # Configure logging to appropriate level
-    level = logging.INFO
-    if CONF.dbg:
-        level = logging.DEBUG
-    configure_logging(logger, level=level)
-
-    if CONF.dbg:
-        logger.debug('parse_args: debug=%r, show=%r' % (CONF.dbg, CONF.show))
-
-    # Flatten debug options list
-    L = list(set(CONF.dbg))
-
-    # Update debug options based on parsed options
-    {debug.update({e: True}) for e in L}
-
-    # Enable all debug flags (except libvirt_xml) if 'all' is specified
-    x = debug['libvirt_xml']
-    if debug['all']:
-        {debug.update({e: True}) for e in debug}
-        debug['libvirt_xml'] = x
-
-    # Flatten show options list
-    L = list(set(CONF.show))
-    if CONF.dbg:
-        L = []
-
-    # Update show options based on parsed options
-    define_option_flags(show,
-                        options=L,
-                        L_opts=L_opts,
-                        L_brief=L_brief,
-                        L_details=L_details,
-                        L_other=L_other)
-
-
-def configure_logging(logger, level=logging.DEBUG):
-    """Configure logger streams and format"""
-    logger.setLevel(level)
-    ch = logging.StreamHandler(sys.stdout)
-    ch.setLevel(level)
-    formatter = logging.Formatter(
-        '%(asctime)s %(process)s %(levelname)s %(module)s: %(message)s')
-    ch.setFormatter(formatter)
-    logger.addHandler(ch)
-
-
-def _translate_keys(collection, convert):
-    """translate elements _info field names into human-readable names
-
-    :param collection: dictionary containing the elements to be translated
-    :param convert: list of conversion tuples
-    """
-    for k, item in collection.items():
-        keys = list(item.__dict__.keys())
-        for from_key, to_key in convert:
-            if from_key in keys and to_key not in keys:
-                try:
-                    setattr(item, to_key, item._info[from_key])
-                except AttributeError:
-                    logger.error('_translate_keys: from_key:%r to_key:%r, '
-                                 'item._info[from_key]:%r'
-                                 % (from_key, to_key, item._info[from_key]))
-
-
-def _translate_extended_states(collection):
-    """Return human readable power-state string"""
-    power_states = [
-        'NOSTATE',    # 0x00
-        'Running',    # 0x01
-        '',           # 0x02
-        'Paused',     # 0x03
-        'Shutdown',   # 0x04
-        '',           # 0x05
-        'Crashed',    # 0x06
-        'Suspended'   # 0x07
-    ]
-    for k, item in collection.items():
-        try:
-            setattr(item, 'power_state',
-                    power_states[getattr(item, 'power_state')])
-        except AttributeError:
-            setattr(item, 'power_state', "N/A")
-        try:
-            getattr(item, 'task_state')
-        except AttributeError:
-            setattr(item, 'task_state', "N/A")
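
A toy illustration of what _translate_keys above does; the class and field name are fabricated for the example, though real novaclient resources carry a similar _info dict:

    class _Item(object):
        def __init__(self, info):
            self._info = info
            self.__dict__.update(info)

    servers = {0: _Item({'OS-EXT-SRV-ATTR:host': 'compute-0'})}
    _translate_keys(servers, [('OS-EXT-SRV-ATTR:host', 'host')])
    assert servers[0].host == 'compute-0'
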
-
-
-def _translate_virDomainState(state):
-    """Return human readable virtual domain state string"""
-    states = {}
-    states[0] = 'NOSTATE'
-    states[1] = 'Running'
-    states[2] = 'Blocked'
-    states[3] = 'Paused'
-    states[4] = 'Shutdown'
-    states[5] = 'Shutoff'
-    states[6] = 'Crashed'
-    states[7] = 'pmSuspended'
-    states[8] = 'Last'
-    return states[state]
-
-
-def _translate_virVcpuState(state):
-    """Return human readable virtual vcpu state string"""
-    states = {}
-    states[0] = 'Offline'
-    states[1] = 'Running'
-    states[2] = 'Blocked'
-    states[3] = 'Last'
-    return states[state]
-
-
-def _mask_to_cpulist(mask=0):
-    """Create cpulist from mask, list in socket-core-thread enumerated order
-
-    :param extended: extended info
-    :param mask: cpuset mask
-    :returns cpulist: list of cpus in socket-core-thread enumerated order
-    """
-    cpulist = []
-    if mask is None or mask <= 0:
-        return cpulist
-
-    # Assume max number of cpus for now...
-    max_cpus = 128
-    for cpu in range(max_cpus):
-        if ((1 << cpu) & mask):
-            cpulist.append(cpu)
-    return cpulist
-
-
-def string_to_cpulist(cpus_str=''):
-    """Convert a string representation to cpulist
-
-    :param cpus_str: string containing a list of cpus, e.g., 1,2,6-7
-    :returns cpulist
-    """
-
-    # Create list of excluded cpus by parsing excluded_cpulist_str,
-    # example: 1,2,6-7
-    cpulist = []
-    re_digit = re.compile(r'^(\d+)$')
-    re_range = re.compile(r'^(\d+)-(\d+)$')
-    s = cpus_str.strip()
-    for ele in s.split(','):
-        match = re_digit.search(ele)
-        if match:
-            cpu = int(match.group(1))
-            cpulist.append(cpu)
-
-        match = re_range.search(ele)
-        if match:
-            cpu0 = int(match.group(1))
-            cpu1 = int(match.group(2))
-            if cpu1 > cpu0:
-                cpulist.extend(list(range(cpu0, cpu1 + 1)))
-    return cpulist
-
-
-def list_to_range(L=None):
-    """Convert a list into a string of comma-separated ranges
-
-    E.g., [1,2,3,8,9,15] is converted to '1-3,8-9,15'
-    """
-    if L is None:
-        L = []
-    G = (list(x) for _, x in groupby(enumerate(L), lambda i_x: i_x[0] - i_x[1]))
-    return ",".join(
-        "-".join(map(str, (g[0][1], g[-1][1])[:len(g)])) for g in G)
-
-
-def range_to_list(csv_range=None):
-    """Convert a string of comma-separated ranges into an expanded list of ints
-
-    E.g., '1-3,8-9,15' is converted to [1,2,3,8,9,15]
-    """
-    if not csv_range:
-        return []
-    ranges = [(lambda L: range(L[0], L[-1] + 1))([int(x) for x in r.split('-')])
-              for r in csv_range.split(',')]
-    return [y for x in ranges for y in x]
-
-
-class TimeoutError(Exception):
-    pass
-
-
-def timeout_handler(signum, frame):
-    raise TimeoutError('timeout')
-
-
-def libvirt_domain_info_worker(tuple_hosts):
-    (host) = tuple_hosts
-    pid = os.getpid()
-    active_pids.update({pid: (host, time.time())})
-    error = None
-    try:
-        (domain, topology) = do_libvirt_domain_info((host))
-    except Exception as e:
-        domain = {}
-        topology = {}
-        error = 'cannot connect to libvirt: %s; %s' % (host, e)
-    del active_pids[pid]
-    return (host, domain, topology, time.time(), error)
-
-
-def do_libvirt_domain_info(tuple_hosts):
-    """Connect to libvirt for specified host, and retrieve per-domain information
-
-    The information includes cpu affinity per vcpu.
- """ - (host) = tuple_hosts - domains = {} - topology = {} - if not host: - return (domains, topology) - - # Connect to remote libvirt hypervisor - transport = 'tcp' - duri = "qemu+%s://%s/system" % (transport, host) - try: - signal.signal(signal.SIGALRM, timeout_handler) - signal.setitimer(signal.ITIMER_REAL, LIBVIRT_TIMEOUT_SEC) - with suppress_stdout_stderr(): - conn = libvirt.openReadOnly(duri) - signal.alarm(0) - except TimeoutError: - conn = None - raise - except Exception as e: - conn = None - raise - finally: - signal.alarm(0) - if conn is None: - return (domains, topology) - - # Get host capabilities (contains host topology) - caps_str = conn.getCapabilities() - doc = ElementTree.fromstring(caps_str) - caps = minidom.parseString(caps_str) - caps_host = caps.getElementsByTagName('host')[0] - caps_cells = caps_host.getElementsByTagName('cells')[0] - total_cpus = caps_cells.getElementsByTagName('cpu').length - - # Enumerate logical cpu topology using socket_id, core_id, thread_id - # indices. This generates the following dictionary: - # topology[socket_id][core_id][thread_id] = cpu_id - Thread_cnt = {} - topology = {} - cells = doc.findall('./host/topology/cells/cell') - for cell in cells: - for cpu in cell.findall('./cpus/cpu'): - # obtain core_id, cpu_id, and socket_id; ignore 'siblings' since - # that can be inferred by enumeration of thread_id. - core_id = int(cpu.get('core_id')) - cpu_id = int(cpu.get('id')) - socket_id = int(cpu.get('socket_id')) - - # thread_id's are enumerated assuming cpu_id is already sorted - if socket_id not in Thread_cnt: - Thread_cnt[socket_id] = {} - if core_id not in Thread_cnt[socket_id]: - Thread_cnt[socket_id][core_id] = 0 - else: - Thread_cnt[socket_id][core_id] += 1 - thread_id = Thread_cnt[socket_id][core_id] - - # save topology[socket_id][core_id][thread_id] - if socket_id not in topology: - topology[socket_id] = {} - if core_id not in topology[socket_id]: - topology[socket_id][core_id] = {} - topology[socket_id][core_id][thread_id] = cpu_id - - # Get domains (i.e., one per VM) - for dom in conn.listAllDomains(flags=0): - # Get overall domain info - d_name = dom.name() - d_id = dom.ID() - d_uuid = dom.UUIDString() - d_ostype = dom.OSType() - d_state, d_maxMem_KiB, d_memory_KiB, \ - d_nrVirtCpu, d_cpuTime = dom.info() - try: - with suppress_stdout_stderr(): - d_vcpus = dom.vcpus() - except Exception as e: - d_vcpus = tuple([d_nrVirtCpu * [], - d_nrVirtCpu * [tuple(total_cpus * [False])]]) - - # Obtain cpulist of pcpus in the order of vcpus. This applies to either - # pinned or floating vcpus, Note that the cpuinfo pcpu value can be - # stale if we scale down cpus since it reports cpu-last-run. - # For this reason use cpumap = d_vcpus[1][vcpu], instead of cpuinfo - # (i.e., vcpu, state, cpuTime, pcpu = d_vcpus[0][vcpu]). 
- cpulist_p = [] - cpulist_d = {} - cpuset_total = 0 - up_total = 0 - for vcpu in range(d_nrVirtCpu): - cpuset_b = d_vcpus[1][vcpu] - cpuset = 0 - for cpu, up in enumerate(cpuset_b): - if up: - cpulist_d[vcpu] = cpu - aff = 1 << cpu - cpuset |= aff - up_total += 1 - cpuset_total |= cpuset - cpulist_f = _mask_to_cpulist(mask=cpuset_total) - for key in sorted(cpulist_d.keys()): - cpulist_p.append(cpulist_d[key]) - - # Determine if floating or pinned, display appropriate cpulist - d_cpuset = cpuset_total - if up_total > d_nrVirtCpu: - d_cpulist = cpulist_f - else: - d_cpulist = cpulist_p - - # Determine list of numa nodes (the hard way) - dom_xml = ElementTree.fromstring(dom.XMLDesc(0)) - nodeset = set([]) - for elem in dom_xml.findall('./numatune/memnode'): - nodes = range_to_list(elem.get('nodeset')) - nodeset.update(nodes) - d_nodelist = list(sorted(nodeset)) - - # Update dictionary with per-domain information - domains[d_uuid] = { - 'name': d_name, - 'id': d_id, - 'uuid': d_uuid, - 'ostype': d_ostype, - 'state': _translate_virDomainState(d_state), - 'maxMem': int(d_maxMem_KiB / 1024.0), - 'memory': int(d_memory_KiB / 1024.0), - 'vcpus': d_nrVirtCpu, - 'cputime': d_cpuTime, - 'cpuset': d_cpuset, - 'nodelist': d_nodelist, - 'cpulist': d_cpulist, - } - - # Dump XML string - if debug['libvirt_xml']: - dom_xml = ElementTree.fromstring(dom.XMLDesc(0)) - xml_str = ElementTree.tostring(dom_xml) - logger.debug('DOM[%s] : XML =\n%s' % (d_name, xml_str)) - - conn.close() - return (domains, topology) - - -def print_debug_info(tenants=None, regions=None, - endpoints=None, services=None, - hypervisors=None, statistics=None, - servers=None, server_groups=None, - migrations=None, flavors=None, extra_specs=None, - images=None, volumes=None, - aggregates=None, domains=None, - topologies=None, topologies_idx=None, topologies_sib=None, - computes_cell=None, - debug=None, show=None): - """Print debug information - pretty formatting of various data structures""" - pp = pprint.PrettyPrinter(indent=2) - - if True in debug.values(): - print() - logger.debug('OPTIONS:') - logger.debug('debug=\n%s' % (pp.pformat(debug))) - logger.debug('show=\n%s' % (pp.pformat(show))) - - if debug['creds']: - print() - logger.debug('CREDENTIALS:') - logger.debug('regions:\n%s' % (pp.pformat(regions))) - logger.debug('tenants:\n%s' % (pp.pformat(tenants))) - logger.debug('services:\n%s' % (pp.pformat(services))) - logger.debug('endpoints:\n%s' % (pp.pformat(endpoints))) - - if debug['hypervisors']: - print() - logger.debug('HYPERVISORS:') - for H in hypervisors.values(): - logger.debug('hypervisor:\n%s' % (pp.pformat(vars(H)))) - - print() - logger.debug('HYPERVISORS: numa cells') - logger.debug('computes_cell:\n%s' % (pp.pformat(computes_cell))) - - if debug['statistics']: - print() - logger.debug('STATISTICS:') - logger.debug('statistic:\n%s' % (pp.pformat(vars(statistics)))) - - if debug['images']: - print() - logger.debug('IMAGES:') - for I in images.values(): - logger.debug('image: id=%r\n%s' % (I.id, pp.pformat(vars(I)))) - - if debug['volumes']: - print() - logger.debug('VOLUMES:') - for V in volumes.values(): - logger.debug('volume: id=%r\n%s' % (V['volume_id'], pp.pformat(V))) - - if debug['servers']: - print() - logger.debug('SERVERS:') - for S in servers.values(): - logger.debug('server: id=%r\n%s' % (S.id, pp.pformat(vars(S)))) - - if debug['server_groups']: - print() - logger.debug('SERVER GROUPS:') - for S in server_groups.values(): - logger.debug( - 'server_group: id=%r\n%s' % (S.id, pp.pformat(vars(S)))) - - if 
debug['migrations']:
-        print()
-        logger.debug('MIGRATIONS:')
-        for M in migrations.values():
-            logger.debug('MIG: id=%r\n%s' % (M.id, pp.pformat(vars(M))))
-
-    if debug['flavors']:
-        print()
-        logger.debug('FLAVORS:')
-        for F in flavors.values():
-            logger.debug(
-                'FLAVOR: id=%r\n%s\nextra_specs=%s'
-                % (F.id, pp.pformat(vars(F)), pp.pformat(extra_specs[F.id])))
-
-    if debug['aggregates']:
-        print()
-        logger.debug('AGGREGATES:')
-        for A in aggregates.values():
-            logger.debug('aggregate: %s' % (pp.pformat(vars(A))))
-
-    if debug['libvirt']:
-        print()
-        logger.debug('LIBVIRT:')
-        logger.debug('domain:\n%s' % (pp.pformat(domains)))
-
-    if debug['topology']:
-        print()
-        logger.debug('TOPOLOGY:')
-        logger.debug('topologies:\n%s' % (pp.pformat(topologies)))
-        logger.debug('topologies_idx:\n%s' % (pp.pformat(topologies_idx)))
-        logger.debug('topologies_sib:\n%s' % (pp.pformat(topologies_sib)))
-
-    if debug:
-        print()
-
-
-def define_debug_flags(debug):
-    """Define dictionary of debug flags"""
-    opts = ['all',
-            'creds',
-            'hypervisors',
-            'statistics',
-            'servers',
-            'server_groups',
-            'migrations',
-            'flavors',
-            'images',
-            'volumes',
-            'aggregates',
-            'libvirt',
-            'libvirt_xml',
-            'topology',
-            'mismatch',
-            ]
-    {debug.update({e: False}) for e in opts}
-
-
-def define_options():
-    """Define several groupings with lists of show options"""
-    L_opts = ['brief',
-              'all',
-              ]
-    L_brief = ['computes',
-               'servers',
-               'server_groups',
-               'migrations',
-               'flavors',
-               'images',
-               ]
-    L_details = ['computes',
-                 'servers',
-                 'server_groups',
-                 'libvirt',
-                 'migrations',
-                 'flavors',
-                 'images',
-                 'volumes',
-                 ]
-    L_other = ['aggregates', 'topology', 'topology-long']
-    return (L_opts, L_brief, L_details, L_other)
-
-
-def define_option_flags(show, options=None,
-                        L_opts=None, L_brief=None, L_details=None, L_other=None):
-    """Define dictionary of option flags"""
-    if options is None:
-        options = []
-    if L_opts is None:
-        L_opts = []
-    if L_brief is None:
-        L_brief = []
-    if L_details is None:
-        L_details = []
-    if L_other is None:
-        L_other = []
-    # Set all options to False
-    {show.update({e: False}) for e in L_opts + L_brief + L_details + L_other}
-
-    # Enable specific options
-    show.update({'show': options})
-    if 'brief' in options:
-        {show.update({e: True}) for e in L_brief}
-    if 'all' in options:
-        {show.update({e: True}) for e in L_brief + L_details}
-    for e in options:
-        if e in show:
-            show.update({e: True})
-
-
-def print_all_tables(tenants=None,
-                     hypervisors=None, statistics=None,
-                     servers=None, server_groups=None,
-                     migrations=None, flavors=None, extra_specs=None,
-                     images=None, volumes=None,
-                     aggregates=None, domains=None,
-                     topologies=None, topologies_idx=None, topologies_sib=None,
-                     computes_cell=None,
-                     agg_h=None,
-                     flavors_in_use=None,
-                     images_in_use=None,
-                     server_groups_in_use=None,
-                     debug=None, show=None):
-    """Print all summary tables using PrettyTable"""
-    # Print list of aggregates
-    if show['aggregates']:
-        print()
-        print("AGGREGATES:")
-        pt = PrettyTable(
-            ['Name',
-             'Avail Zone',
-             'Hosts',
-             'Metadata',
-             ], caching=False)
-        pt.align = 'l'
-        for name, A in sorted(aggregates.items()):
-            pt.add_row(
-                [A.name,
-                 str(A.availability_zone),
-                 ", ".join([str(x) for x in A.hosts]),
-                 str(A.metadata)
-                 ])
-        print(pt)
-
-    # Print list of compute host hypervisors, showing per numa details
-    if show['computes']:
-        print()
-        print('COMPUTE HOSTS: '
-              'Legend: U = Used, A = Avail')
-        pt = PrettyTable(
-            ['Host',
-             'status',
-             'model',
-             'topology',
-             'servers',
-             'node',
-             'pcpus',
- 'U:dedicated', - 'U:shared', - 'memory', - 'U:memory', - 'A:mem_4K', - 'A:mem_2M', - 'A:mem_1G', - 'Aggregate', - ]) - pt.align = 'l' - for C in ['servers', 'pcpus', 'U:dedicated', 'U:shared', - 'memory', 'U:memory', 'A:mem_4K', 'A:mem_2M', 'A:mem_1G']: - pt.align[C] = 'r' - for host_name, H in sorted(hypervisors.items(), - key=lambda k_v1: (natural_keys(k_v1[0]))): - A = list(agg_h[host_name].keys()) - - try: - topology_idx = topologies_idx[host_name] - cpu_ids = sorted(topology_idx.keys()) - except Exception: - topology_idx = {} - cpu_ids = [] - if len(cpu_ids) > 0: - # determine number of sockets, cores/socket, threads/core - topology = topologies[host_name] - cpu_id = 0 - socket_id = topology_idx[cpu_id]['s'] - core_id = topology_idx[cpu_id]['c'] - n_sockets = len(list(topology.keys())) - n_cores = len(list(topology[socket_id].keys())) - n_threads = len(list(topology[socket_id][core_id].keys())) - else: - if 'topology' in H.cpu_info: - topology = H.cpu_info['topology'] - n_sockets = topology['sockets'] - n_cores = topology['cores'] - n_threads = topology['threads'] - else: - n_sockets = 0 - n_cores = 0 - n_threads = 0 - if 'model' not in H.cpu_info: - H.cpu_info['model'] = None - - first = True - for cell in computes_cell[host_name]: - if first: - pt.add_row( - [host_name, - H.status, - H.cpu_info['model'], - "%ss,%sc,%st" % (n_sockets, - n_cores, - n_threads), - H.running_vms, - cell['id'], - cell['pcpus'], - cell['pinned_used'], - cell['shared_used'], - cell['memory'], - cell['memory_usage'], - cell['memory_avail_4K'], - cell['memory_avail_2M'], - cell['memory_avail_1G'], - textwrap.fill(", ".join([str(x) for x in A]), - width=75), - ]) - else: - pt.add_row( - ['', # host - '', # H.status, - '', # model - '', # topology - '', # H.running_vms, - cell['id'], - cell['pcpus'], - cell['pinned_used'], - cell['shared_used'], - cell['memory'], - cell['memory_usage'], - cell['memory_avail_4K'], - cell['memory_avail_2M'], - cell['memory_avail_1G'], - '', # agg - ]) - - first = False - if len(computes_cell[host_name]) < 1: - pt.add_row( - [host_name, - H.status, - H.cpu_info['model'], - "%ss,%sc,%st" % (n_sockets, - n_cores, - n_threads), - H.running_vms, - '-', # cell.id - '-', # pcpus - '-', # U:dedicated - '-', # U:shared - '-', # memory - '-', # memory_usage - '-', # memory_avail_4K - '-', # memory_avail_2M - '-', # memory_avail_1G - ", ".join([str(x) for x in A]), - ]) - - # Add row with statistics - Y = statistics - pt.add_row( - ['count: %s' % (Y.count), - '-', # status - '-', # model - '-', # topology - Y.running_vms, - '-', # node - Y.vcpus, # pcpus - '-', # U:dedicated - '-', # U:shared - Y.memory_mb, # memory - Y.memory_mb_used, # memory_usage - '-', # memory_avail_4K - '-', # memory_avail_2M - '-', # memory_avail_1G - '-', # agg - ]) - print(pt) - - # Print list of compute hosts topology - if show['topology']: - print() - print('LOGICAL CPU TOPOLOGY (compute hosts):') - for host_name, topology in sorted(topologies.items(), - key=lambda k_v2: (natural_keys(k_v2[0]))): - H = hypervisors[host_name] - try: - topology_idx = topologies_idx[host_name] - cpu_ids = sorted(topology_idx.keys()) - siblings = topologies_sib[host_name] - except Exception: - topology_idx = {} - siblings = {} - cpu_ids = [] - if len(cpu_ids) < 1: - logger.info('%s libvirt info not available\n' % (host_name)) - continue - - # determine number of sockets, cores/socket, threads/core - cpu_id = 0 - socket_id = topology_idx[cpu_id]['s'] - core_id = topology_idx[cpu_id]['c'] - n_sockets = len(list(topology.keys())) - 
n_cores = len(list(topology[socket_id].keys()))
-            n_threads = len(list(topology[socket_id][core_id].keys()))
-
-            print('%s: Model:%s, Arch:%s, Vendor:%s, '
-                  'Sockets=%d, Cores/Socket=%d, Threads/Core=%d, Logical=%d'
-                  % (host_name,
-                     H.cpu_info['model'],
-                     H.cpu_info['arch'],
-                     H.cpu_info['vendor'],
-                     n_sockets, n_cores, n_threads, len(cpu_ids)))
-
-            # cpu_id row
-            L = ['cpu_id']
-            for i in cpu_ids:
-                L.append(i)
-            pt = PrettyTable(L)
-            pt.align = 'r'
-
-            # socket_id row
-            L = ['socket_id']
-            for i in cpu_ids:
-                L.append(topology_idx[i]['s'])
-            pt.add_row(L)
-
-            # core_id row
-            L = ['core_id']
-            for i in cpu_ids:
-                L.append(topology_idx[i]['c'])
-            pt.add_row(L)
-
-            # thread_id row
-            L = ['thread_id']
-            for i in cpu_ids:
-                L.append(topology_idx[i]['t'])
-            pt.add_row(L)
-
-            # sibling_id row
-            L = ['sibling_id']
-            for i in cpu_ids:
-                L.append(','.join(str(s) for s in siblings[i]) or '-')
-            pt.add_row(L)
-            print(pt)
-            print()
-
-    # Print long-form list of compute hosts topology, one row per logical cpu
-    if show['topology-long']:
-        print()
-        print('LOGICAL CPU TOPOLOGY (compute hosts):')
-        for host_name, topology in sorted(topologies.items(),
-                                          key=lambda k_v3: (natural_keys(k_v3[0]))):
-            H = hypervisors[host_name]
-            try:
-                topology_idx = topologies_idx[host_name]
-                cpu_ids = sorted(topology_idx.keys())
-                siblings = topologies_sib[host_name]
-            except Exception:
-                topology_idx = {}
-                siblings = {}
-                cpu_ids = []
-            if len(cpu_ids) < 1:
-                logger.info('%s libvirt info not available\n' % (host_name))
-                continue
-
-            # determine number of sockets, cores/socket, threads/core
-            cpu_id = 0
-            socket_id = topology_idx[cpu_id]['s']
-            core_id = topology_idx[cpu_id]['c']
-            n_sockets = len(list(topology.keys()))
-            n_cores = len(list(topology[socket_id].keys()))
-            n_threads = len(list(topology[socket_id][core_id].keys()))
-
-            print('%s: Model:%s, Arch:%s, Vendor:%s, '
-                  'Sockets=%d, Cores/Socket=%d, Threads/Core=%d, Logical=%d'
-                  % (host_name,
-                     H.cpu_info['model'],
-                     H.cpu_info['arch'],
-                     H.cpu_info['vendor'],
-                     n_sockets, n_cores, n_threads, len(cpu_ids)))
-            pt = PrettyTable(
-                ['cpu_id',
-                 'socket_id',
-                 'core_id',
-                 'thread_id',
-                 'sibling_id',
-                 'affinity'
-                 ])
-            pt.align = 'r'
-            pt.align['affinity'] = 'l'
-            for i in cpu_ids:
-                pt.add_row(
-                    [i,
-                     topology_idx[i]['s'],
-                     topology_idx[i]['c'],
-                     topology_idx[i]['t'],
-                     list_to_range(siblings[i]) or '-',
-                     '0x%x' % (1 << i)
-                     ])
-            print(pt)
-            print()
-
-    # Print list of servers
-    if show['servers']:
-        re_server_group = re.compile(r'^(\S+)\s+\((\S+)\)$')
-        print()
-        print('SERVERS (nova view):')
-        pt = PrettyTable(
-            ['tenant',
-             'ID',
-             'instance_name',
-             'name',
-             'host',
-             'state (vm, task, power)',
-             'server_group',
-             'image',
-             'flavor',
-             'vcpus',
-             'memory',
-             'instance_topology',
-             'in_libvirt',
-             ])
-        pt.align = 'l'
-        for C in ['vcpus', 'memory']:
-            pt.align[C] = 'r'
-        for C in ['in_libvirt']:
-            pt.align[C] = 'c'
-        # sort key substitutes '' for missing fields so the key is always a
-        # comparable tuple (mixing tuples and strings raises TypeError)
-        for _, S in sorted(servers.items(),
-                           key=lambda k_v4: (natural_keys(k_v4[1].host or ''),
-                                             k_v4[1].server_group or '',
-                                             k_v4[1].instance_name or '')):
-            if S.server_group is not None and S.server_group:
-                match = re_server_group.search(S.server_group)
-                if match:
-                    server_group = match.group(1)
-                else:
-                    server_group = '-'
-            else:
-                server_group = '-'
-
-            # Determine image name based on glance image id if it exists,
-            # or deduce from attached volume metadata.
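-            # fallback order: nova's S.image['id'] first, then the image_id
-            # recorded on the boot volume, else '-' when neither resolves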
- try: - image_id = S.image['id'] - except Exception: - try: - image_id = volumes[S.id]['image_id'] - except Exception: - image_id = None - try: - image_name = images[image_id].name - except Exception: - image_name = '-' - - # Determine flavor name - flavor_id = S.flavor['id'] - try: - flavor_name = flavors[flavor_id].name - except Exception: - flavor_name = 'DELETED (%s)' % (flavor_id) - try: - flavor_vcpus = flavors[flavor_id].vcpus - flavor_ram = flavors[flavor_id].ram - except Exception: - flavor_vcpus = '-' - flavor_ram = '-' - - try: - vcpus_scale = ','.join(str(x) for x in S.vcpus_scale) - except Exception: - vcpus_scale = flavor_vcpus - - in_libvirt = False - for h, D in domains.items(): - if S.id in D: - in_libvirt = True - break - tenant = tenants[S.tenant_id].name - - pt.add_row( - [tenant, - S.id, - S.instance_name, - S.name, - S.host, - '%7s, %s, %s' % (S.vm_state, S.task_state, S.power_state), - server_group, - image_name, - flavor_name, - vcpus_scale, - flavor_ram, - S.topology, - 'yes' if in_libvirt else 'NO', - ]) - print(pt) - - # Print each libvirt domain info - if show['libvirt']: - print() - print('SERVERS (libvirt view): ' - 'Legend: cpulist = [pcpu[i], ...]') - pt = PrettyTable( - ['uuid', - 'instance_name', - 'host', - 'id', - 'state', - 'vcpus', - 'memory', - 'nodelist', - 'cpulist', - 'in_nova', - ]) - pt.align = 'l' - for C in ['id', 'vcpus', 'memory', 'nodelist']: - pt.align[C] = 'r' - for C in ['in_nova']: - pt.align[C] = 'c' - for host, D in sorted(domains.items(), - key=lambda k_v5: (natural_keys(k_v5[0]))): - for _, S in sorted(D.items(), - key=lambda k_v: (k_v[1]['name'])): - in_nova = True if S['uuid'] in servers else False - pt.add_row( - [S['uuid'], - S['name'], - host, - S['id'], - S['state'], - S['vcpus'], - S['memory'], - list_to_range(S['nodelist']) or '-', - list_to_range(S['cpulist']) or '-', - 'yes' if in_nova else 'NO', - ]) - print(pt) - - # Print list of in-progress migrations - if show['migrations']: - print() - print("MIGRATIONS (in progress): Legend: S=Source, D=Destination") - pt = PrettyTable( - ['ID', - 'status', - 'S:node', - 'D:node', - 'S:compute', - 'D:compute', - 'S:flavor[PKey]', - 'D:flavor[PKey]', - 'created_at', - ]) - pt.align = 'l' - for _, M in sorted(migrations.items(), - key=lambda k_v6: (k_v6[0])): - pt.add_row( - [M.instance_uuid, - M.status, - M.source_node, - M.dest_node, - M.source_compute, - M.dest_compute, - M.new_instance_type_id, - M.old_instance_type_id, - M.created_at, - ]) - print(pt) - - # Print flavors for instances currently in use - pp = pprint.PrettyPrinter(indent=1, width=40) - if show['flavors']: - print() - print("FLAVORS (in use):") - pt = PrettyTable( - ['id', - 'name', - 'vcpus', - 'ram', - 'disk', - 'ephemeral', - 'swap', - 'rxtx_factor', - 'is_public', - 'extra_specs', - ]) - pt.align = 'l' - for C in ['id', 'vcpus', 'ram', 'disk', 'ephemeral', 'swap', - 'rxtx_factor']: - pt.align[C] = 'r' - for _, F in sorted(flavors.items(), - key=lambda k_v7: (k_v7[0])): - if F.id in flavors_in_use: - pt.add_row( - [F.id, - F.name, - F.vcpus, - F.ram, - F.disk, - F.ephemeral or '-', - F.swap or '-', - F.rxtx_factor, - F.is_public, - pp.pformat(extra_specs[F.id]), - ]) - print(pt) - - # Print images for instances currently in use - pp = pprint.PrettyPrinter(indent=1, width=40) - if show['images']: - print() - print("IMAGES (in use):") - pt = PrettyTable( - ['id', - 'name', - 'min_disk', - 'min_ram', - 'size(MB)', - 'status', - 'properties', - ]) - pt.align = 'l' - for C in ['id', 'min_disk', 'min_ram', 'status']: - 
pt.align[C] = 'r' - for _, I in sorted(images.items(), - key=lambda k_v8: (k_v8[0])): - if I.id in images_in_use: - pt.add_row( - [I.id, - I.name, - I.min_disk, - I.min_ram, - '%.2f' % (I.size / 1024.0 / 1024.0), - I.status, - I.properties, - ]) - print(pt) - - # Print server groups for instances currently in use (exclude members data) - if show['server_groups']: - print() - print("SERVER GROUPS (in use):") - pt = PrettyTable( - ['Tenant', - 'Id', - 'Name', - 'Policies', - 'Metadata', - ]) - pt.align = 'l' - for _, S in sorted(server_groups.items(), - key=lambda k_v9: (k_v9[0])): - if S.id in server_groups_in_use: - tenant = tenants[S.project_id].name - pt.add_row( - [tenant, - S.id, - S.name, - str(S.policies), - str(S.metadata), - ]) - print(pt) - - -def _get_host_id(tenant_id=None, host_name=None): - """Routine defined in nova/api/openstack/compute/views/servers.py""" - sha_hash = hashlib.sha224(tenant_id + host_name) - return sha_hash.hexdigest() - - -def start_process(): - logger.debug('Starting: %s, %d' - % (multiprocessing.current_process().name, os.getpid())) - - -def get_info_and_display(show=None): - """Get information from various sources (keystone, nova, libvirt) - - Display the following information in table format. - - nova view of hypervisors and servers - - libvirt view of servers - - nova view of in-progress migrations - - nova view of flavors in-use - - nova view of volumes and images in-use - - nova view of server-groups in-use - """ - - # Keep track of mismatches found when validating data sources - warnings = [] - - # Define list of server field conversions - convert = [ - ('OS-EXT-SRV-ATTR:host', 'host'), - ('OS-EXT-SRV-ATTR:hypervisor_hostname', 'nodename'), - ('OS-EXT-STS:task_state', 'task_state'), - ('OS-EXT-STS:vm_state', 'vm_state'), - ('OS-EXT-SRV-ATTR:instance_name', 'instance_name'), - ('OS-EXT-STS:power_state', 'power_state'), - ('OS-SRV-USG:launched_at', 'launched_at'), - ('OS-FLV-DISABLED:disabled', 'disabled'), - ('OS-FLV-EXT-DATA:ephemeral', '_ephemeral'), - ('os-flavor-access:is_public', '_is_public'), - ('os-extended-volumes:volumes_attached', 'volumes_attached'), - ('wrs-res:vcpus', 'vcpus_scale'), - ('OS-EXT-IMG-SIZE:size', 'size'), - ('wrs-res:topology', 'topology'), - ('wrs-sg:server_group', 'server_group'), - ('wrs-sg:project_id', 'project_id'), - ] - - # Define list of migration status that imply completed migration - migration_completed_list = [ - # live migration - 'live-post', 'live-rollback', - # cold migration - 'confirmed', 'reverted', 'finished', - # drop_resize_claim - 'drop-claim', - # error - 'error' - ] - - # Get keystone credentials from nova.conf - auth = keystone.load_auth_from_conf_options(CONF, AUTHTOKEN_GROUP) - keystone_session = session.Session(auth=auth) - - # Define primary region_name (should be the same as keystone) - regions = {} - primary = 'primary' - regions[primary] = CONF.keystone_authtoken.region_name - - # Query sysinv database for region_name data. This is done directly from - # sysinv database, as that information is not exported via sysinv APIs. - # We have sufficient postgres credentials since we are on the same - # localhost as the DB and may use a local socket. We also execute as root. - engine = create_engine( - '{driver}://{user}:{passwd}@{host}:{port}/{dbname}'. 
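-        # assembled URL, e.g.: postgresql://admin:admin@controller:5432/sysinv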
-        format(
-            driver='postgresql',
-            user='admin',
-            passwd='admin',
-            host='controller',
-            dbname='sysinv',
-            port='5432',
-        ), client_encoding='utf8')
-    conn = engine.connect()
-
-    # Get sysinv i_system
-    metadata = MetaData()
-    metadata.reflect(engine, only=['i_system'])
-    Base = automap_base(metadata=metadata)
-    Base.prepare(engine)
-    S = Base.classes.i_system
-    q = select([S.name,
-                S.region_name,
-                S.deleted_at]
-               ).where(S.deleted_at.is_(None))
-    result = conn.execute(q)
-    for row in result:
-        field = 'region_name'
-        if row[field] is None:
-            continue
-        regions[primary] = str(row[field])
-
-    # Get sysinv services
-    metadata = MetaData()
-    metadata.reflect(engine, only=['services'])
-    Base = automap_base(metadata=metadata)
-    Base.prepare(engine)
-    S = Base.classes.services
-    q = select([S.name,
-                S.region_name,
-                S.deleted_at]
-               ).where(S.deleted_at.is_(None))
-    result = conn.execute(q)
-    for row in result:
-        name = str(row['name'])
-        field = 'region_name'
-        if row[field] is None:
-            region = regions[primary]
-        else:
-            region = str(row[field])
-        regions[name] = region
-
-    # Connect keystone client
-    region_keystone = CONF.keystone_authtoken.region_name
-    try:
-        kc = keystone_client.Client(session=keystone_session,
-                                    endpoint_type='internalURL',
-                                    region_name=region_keystone)
-    except Exception as e:
-        logger.error('cannot connect keystone client, %s', e)
-        sys.exit(1)
-
-    # Connect nova client as admin
-    region_nova = regions.get('nova', CONF.keystone_authtoken.region_name)
-    try:
-        nc_admin = nova_client.Client(NOVACLIENT_VERSION,
-                                      session=keystone_session,
-                                      endpoint_type='internalURL',
-                                      region_name=region_nova)
-    except Exception as e:
-        logger.error('cannot connect nova client, %s', e)
-        sys.exit(1)
-
-    # Get list of services, then transform into dictionary with 'name' as key
-    try:
-        services_ = kc.services.list()
-    except Exception as e:
-        logger.error('cannot list services', exc_info=1)
-        sys.exit(1)
-    services = dict((e.name, e) for e in services_)
-    del services_
-
-    # Get list of endpoints, then transform into dictionary with 'id' as key
-    try:
-        endpoints_ = kc.endpoints.list()
-    except Exception as e:
-        logger.error('cannot list endpoints', exc_info=1)
-        sys.exit(1)
-    endpoints = dict((e.id, e) for e in endpoints_)
-    del endpoints_
-
-    # Get list of tenants, then transform into dictionary with 'id' as key
-    try:
-        tenants_ = kc.projects.list()
-    except Exception as e:
-        logger.error('cannot list tenants', exc_info=1)
-        sys.exit(1)
-    tenants = dict((e.id, e) for e in tenants_)
-    del tenants_
-
-    # Connect cinder client as admin to access block storage volumes
-    region_cinder = regions.get('cinder', CONF.keystone_authtoken.region_name)
-    try:
-        cv_admin = cinder_client.Client(CINDERCLIENT_VERSION,
-                                        session=keystone_session,
-                                        endpoint_type='internalURL',
-                                        region_name=region_cinder)
-    except Exception as e:
-        logger.error('cannot connect cinder client, %s', e)
-        sys.exit(1)
-
-    # Connect glanceclient as admin to access images
-    region_glance = regions.get('glance', CONF.keystone_authtoken.region_name)
-    try:
-        gc_admin = glance_client.Client(GLANCECLIENT_VERSION,
-                                        session=keystone_session,
-                                        interface='internalURL',
-                                        region_name=region_glance)
-    except Exception as e:
-        logger.error('cannot connect glance client, %s', e)
-        sys.exit(1)
-
-    # Get list of images
-    try:
-        images_ = gc_admin.images.list(detailed=True)
-    except Exception as e:
-        if True in debug.values():
-            logger.error('cannot list images', exc_info=1)
-        else:
-            logger.error('cannot list images, %s' % (e))
-        
images_ = [] - try: - images = dict((e.id, e) for e in images_) - except Exception as e: - if True in debug.values(): - logger.error('cannot list images', exc_info=1) - else: - logger.error('cannot list images, %s' % (e)) - images = {} - - # translate fields into human-readable names - _translate_keys(images, convert) - - for I_id, I in images.items(): - meta = copy.deepcopy(I.properties) - I.properties = {} - for k, v in meta.items(): - I.properties[str(k)] = str(v) - - # Get list of servers for all tenants - try: - servers_ = nc_admin.servers.list(detailed=True, - search_opts={'all_tenants': True}) - except Exception as e: - logger.error('cannot list servers', exc_info=1) - sys.exit(1) - - servers = dict((e.id, e) for e in servers_) - del servers_ - # translate fields into human-readable names - _translate_keys(servers, convert) - _translate_extended_states(servers) - for S in servers.values(): - if S.host != S.nodename: - warnings.append( - 'Server ID=%s, instance_name=%s, name=%s, host=%s ' - 'does not match nodename=%s.' - % (S.id, S.instance_name, S.name, S.host, S.nodename)) - - # Get list of volumes attached to servers for all tenants - if show['volumes']: - try: - volumes_ = cv_admin.volumes.list(detailed=True, - search_opts={'all_tenants': True}) - except Exception as e: - if True in debug.values(): - logger.error('cannot list volumes', exc_info=1) - else: - logger.error('cannot list volumes, %s' % (e)) - volumes_ = [] - else: - volumes_ = [] - volumes = {} - # keep all fields for debug even though we do not display details. - for V in volumes_: - # image metadata (not always available) - try: - image_id = V.volume_image_metadata['image_id'] - image_name = V.volume_image_metadata['image_name'] - except Exception: - image_id = None - image_name = None - for A in V.attachments: - server_id = A['server_id'] - volume_id = A['volume_id'] - volumes[server_id] = {'volume_id': volume_id, - 'image_id': image_id, - 'image_name': image_name, - 'vars': vars(V), - } - del volumes_ - - # Get list of migrations, sort-by id which puts them in time order. - # Transform into dictionary with 'instance_uuid' as key. Keep only the - # most current, and only in-progress migrations. 
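-    # e.g. several records may exist per instance; sorting by id and keying
-    # the dict on instance_uuid below keeps only the newest record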
-    try:
-        migrations_ = nc_admin.migrations.list()
-    except Exception as e:
-        logger.error('cannot list migrations', exc_info=1)
-        migrations_ = []
-    migrations = {}
-    if migrations_:
-        migrations_.sort(key=lambda x: (x.id))
-        for M in migrations_:
-            if M.instance_uuid in servers:
-                migrations.update({M.instance_uuid: M})
-        # iterate over a copy so entries can be deleted during iteration
-        for _, M in list(migrations.items()):
-            S = servers[M.instance_uuid]
-            if S.task_state is None or M.status in migration_completed_list:
-                del migrations[M.instance_uuid]
-    del migrations_
-
-    # Get list of flavors, then transform into dictionary with 'id' as key
-    try:
-        flavors_ = nc_admin.flavors.list(detailed=True)
-    except Exception as e:
-        logger.error('cannot list flavors', exc_info=1)
-        sys.exit(1)
-    flavors = dict((e.id, e) for e in flavors_)
-    del flavors_
-
-    # translate fields into human-readable names
-    _translate_keys(flavors, convert)
-
-    # Get extra_specs
-    extra_specs = {}
-    for f_id, F in flavors.items():
-        try:
-            specs = F.get_keys()
-        except Exception as e:
-            specs = {}
-            logger.error('cannot get extra_specs for flavor:%s, error=%s'
-                         % (f_id, e))
-        extra_specs[f_id] = {}
-        for k, v in specs.items():
-            extra_specs[f_id][str(k)] = str(v)
-
-    # Get list of server groups, then transform into dictionary with 'id'
-    # as key
-    try:
-        server_groups_ = nc_admin.server_groups.list()
-    except Exception as e:
-        logger.error('cannot list server_groups', exc_info=1)
-        sys.exit(1)
-    server_groups = dict((e.id, e) for e in server_groups_)
-    del server_groups_
-
-    # translate fields into human-readable names
-    _translate_keys(server_groups, convert)
-
-    # Generate server_groups_in_use, flavors in-use, images in-use
-    re_server_group = re.compile(r'^(\S+)\s+\((\S+)\)$')
-    server_groups_in_use = {}
-    flavors_in_use = {}
-    images_in_use = {}
-    for S in servers.values():
-        if S.server_group is not None and S.server_group:
-            match = re_server_group.search(S.server_group)
-            if match:
-                server_group_id = match.group(2)
-                server_groups_in_use[server_group_id] = True
-
-        # Save flavors in use
-        flavor_id = S.flavor['id']
-        flavors_in_use[flavor_id] = True
-
-        # Save images in use. Look for glance image id. If glance image not
-        # available, then check for attached volume and store image name from
-        # volume metadata.
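-        # boot-from-volume servers have no glance id in S.image, hence the
-        # fallback to the image_id recorded on the attached volume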
- try: - image_id = S.image['id'] - except Exception: - try: - image_id = volumes[S.id]['image_id'] - images_in_use[image_id] = True - except Exception: - image_id = None - if image_id is not None: - images_in_use[image_id] = True - - # Get list of hypervisors, then transform into dictionary with - # 'hypervisor_hostname' as key - try: - hypervisors_ = nc_admin.hypervisors.list(detailed=True) - except Exception as e: - logger.error('cannot list hypervisors', exc_info=1) - sys.exit(1) - hypervisors = dict((e.hypervisor_hostname, e) for e in hypervisors_) - del hypervisors_ - for H in hypervisors.values(): - H.cpu_info = jsonutils.loads(H.cpu_info) if H.cpu_info else {} - del H._info, H._loaded, H.manager - - # Get hypervisor statisics (over all computes) - try: - statistics = nc_admin.hypervisors.statistics() - except Exception as e: - logger.error('cannot get overall hypervisors statistics', exc_info=1) - sys.exit(1) - - # Get list of aggregates, then transform into dictionary with 'id' as key - try: - aggregates_ = nc_admin.aggregates.list() - except Exception as e: - logger.error('cannot list aggregates', exc_info=1) - sys.exit(1) - aggregates = dict((e.id, e) for e in aggregates_) - del aggregates_ - - # Build up aggregate list per compute host - agg_h = {} - for H in hypervisors: - agg_h[H] = {} - for A in aggregates.values(): - for H in A.hosts: - agg_h[H] = {} - for A in aggregates.values(): - for H in A.hosts: - agg_h[H][str(A.name)] = A.metadata - - # Calculate number of workers we can handle - process = psutil.Process(os.getpid()) - avail_MiB = psutil.virtual_memory().available / float(Mi) - try: - process_MiB = process.get_memory_info().rss / float(Mi) - except Exception as e1: - try: - process_MiB = process.memory_info().rss / float(Mi) - except Exception as e2: - logger.error('WORKERS: psutil.memory_info(), error=%s' % (e2)) - process_MiB = 50.0 - pool_size = \ - max(1, - min(len(hypervisors), - max(1, - min(multiprocessing.cpu_count(), - int(0.6 * (avail_MiB - 100.0) / process_MiB) - ) - ) - ) - ) - logger.debug('WORKERS: avail=%.2f MiB, process=%.2f MiB, pool_size=%d' - % (avail_MiB, process_MiB, pool_size)) - - # Create pool of workers that connect to libvirt hypervisor. - try: - pool = multiprocessing.Pool(processes=pool_size, - initializer=start_process, - maxtasksperchild=2) - except Exception as e: - logger.error('Cannot create worker pool, %s' % (e)) - sys.exit(1) - - hosts = [] - for h in hypervisors: - hosts.append(h) - - # Launch tasks - results = [pool.apply_async(libvirt_domain_info_worker, - args=(x,)) for x in hosts] - pool.close() - - # Wait for active workers to complete - time.sleep(0.15) - while len(active_pids) > 0: - # Reap aged workers that exceed hang timeout - now = time.time() - reap = [] - for pid in active_pids: - if pid == 0: - continue - try: - host, age = active_pids[pid] - except: - continue - dt = now - age - if dt > LIBVIRT_REAP_SEC: - reap.append(pid) - logger.error('REAP: pid=%d, host=%s, age=%.2f s' - % (pid, host, dt)) - for pid in reap: - os.kill(pid, signal.SIGKILL) - del active_pids[pid] - time.sleep(0.25) - - # Collect outputs - # Since we have already waited, set timeout small. - outputs = [] - for p in results: - try: - outputs.append(p.get(timeout=0.005)) - except: - pass - - # Cleanup workers - pool.terminate() - pool.join() - - # Summarize per-domain and cpu topology per host. 
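-    # each worker result unpacks below as (host, domain_info, topology, tm1,
-    # error); on error the host still gets empty domain and topology dicts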
- domains = {} - topologies = {} - topologies_idx = {} - topologies_sib = {} - topologies_lib = {} - for (h, domain_lib, topology_lib, tm1, error) in outputs: - if error is None: - domains[h] = domain_lib - topologies_lib[h] = topology_lib - else: - domains[h] = {} - topologies_lib[h] = {} - logger.error('%s' % error) - topology = copy.deepcopy(topologies_lib[h]) - topologies[h] = copy.deepcopy(topology) - - # Define topology indices for each logical cpu - topology_idx = {} - for socket_id in topology: - for core_id in topology[socket_id]: - for thread_id in topology[socket_id][core_id]: - cpu_id = topology[socket_id][core_id][thread_id] - topology_idx[cpu_id] = {'s': socket_id, - 'c': core_id, - 't': thread_id} - topologies_idx[h] = copy.deepcopy(topology_idx) - - # Define siblings for each logical cpu - siblings = {} - for socket_id in topology: - for core_id in topology[socket_id]: - for thread_id in topology[socket_id][core_id]: - cpu_id = topology[socket_id][core_id][thread_id] - siblings[cpu_id] = [] - for sibling_id in topology[socket_id][core_id]: - if thread_id != sibling_id: - sibling_cpu_id = topology[socket_id][core_id][sibling_id] - siblings[cpu_id].append(sibling_cpu_id) - topologies_sib[h] = copy.deepcopy(siblings) - del outputs - - # Query nova database for compute_nodes table, which contains per NUMA cell - # data (i.e., numa_topology). This is done directly from nova database, - # as that information is not exported via nova APIs. - # We have sufficient postgres credentials since we are on the same - # localhost as the DB and may use a local socket. We also execute as root. - computes_cell = {} - engine = create_engine( - '{driver}://{user}:{passwd}@{host}:{port}/{dbname}'. - format( - driver='postgresql', - user='admin', - passwd='admin', - host='controller', - dbname='nova', - port='5432', - ), client_encoding='utf8') - conn = engine.connect() - metadata = MetaData() - metadata.reflect(engine, only=['compute_nodes']) - Base = automap_base(metadata=metadata) - Base.prepare(engine) - CN = Base.classes.compute_nodes - q = select([CN.hypervisor_hostname, - CN.numa_topology, - CN.deleted] - ).where(CN.deleted == 0) - result = conn.execute(q) - for row in result: - host = row['hypervisor_hostname'] - computes_cell[host] = [] - - # We need libvirt topology information to make sense of cpusets. 
- have_topology = True - try: - if len(list(topologies_idx[host].keys())) < 1: - have_topology = False - except: - have_topology = False - - field = 'numa_topology' - if field not in row or row[field] is None: - continue - try: - T = jsonutils.loads(row[field]) - except Exception as e: - T = {} - logger.warning('cannot json.loads(%s), error=%s' % (field, e)) - continue - try: - cells = T['nova_object.data']['cells'] - for C in cells: - cell = C['nova_object.data'] - cell_id = cell['id'] - cpu_usage = cell['cpu_usage'] - cpuset = cell['cpuset'] - pinned_cpus = cell['pinned_cpus'] - shared_pcpu = cell['shared_pcpu'] - siblings = cell['siblings'] - memory = cell['memory'] - memory_usage = cell['memory_usage'] - MP = cell['mempages'] - mempages = [] - for M in MP: - MS = M['nova_object.data'] - mempages.append(MS) - - pcpuset = [] - if have_topology: - for cpu in cpuset: - if topologies_idx[host][cpu]['s'] == cell_id: - pcpuset.append(cpu) - - # Store data for compute node numa cell - Cell = {} - Cell['id'] = cell_id - Cell['memory'] = memory - Cell['memory_usage'] = memory_usage - Cell['mempages'] = mempages - Cell['pinned_cpus'] = pinned_cpus - Cell['pcpuset'] = pcpuset - if have_topology: - Cell['pcpus'] = len(pcpuset) - else: - Cell['pcpus'] = '-' - Cell['shared_pcpu'] = shared_pcpu - Cell['siblings'] = siblings - Cell['pinned_used'] = len(pinned_cpus) - if have_topology: - Cell['pinned_avail'] = len(pcpuset) - len(pinned_cpus) - else: - Cell['pinned_avail'] = '-' - Cell['shared_used'] = cpu_usage - len(pinned_cpus) - for suf in ['4K', '2M', '1G']: - Cell['memory_total_' + suf] = 0 - Cell['memory_used_' + suf] = 0 - Cell['memory_avail_' + suf] = 0 - for pages in mempages: - suf = '' - if pages['size_kb'] == 4: - suf = '4K' - if pages['size_kb'] == 2048: - suf = '2M' - if pages['size_kb'] == 1048576: - suf = '1G' - Cell['memory_total_' + suf] = pages['size_kb'] * pages['total'] / Ki - Cell['memory_used_' + suf] = pages['size_kb'] * pages['used'] / Ki - Cell['memory_avail_' + suf] = pages['size_kb'] * (pages['total'] - pages['used']) / Ki - - computes_cell[host].append(Cell) - - except Exception as e: - logger.warning('cannot print numa_topology.cells, error=%s' % (e)) - - conn.close() - - # Detect mismatch where server is in nova but not in libvirt - server_mismatch = False - for S in servers.values(): - in_libvirt = False - for h, D in domains.items(): - if S.id in D and S.host == h: - in_libvirt = True - break - if not in_libvirt: - server_mismatch = True - warnings.append('Server ID=%s, instance_name=%s, name=%s, ' - 'host=%s is in nova but not libvirt.' - % (S.id, S.instance_name, S.name, S.host)) - - # Detect mismatch where server is in libvirt but not in nova - for host, D in domains.items(): - for k, S in D.items(): - in_nova = False - uuid = S['uuid'] - if uuid in servers and servers[uuid].host == host: - in_nova = True - if not in_nova: - server_mismatch = True - warnings.append('Server ID=%s, instance_name=%s, host=%s ' - 'is in libvirt but not nova.' - % (S['uuid'], S['name'], host)) - - # Print out more details if we detect a mismatch, but only if we meant - # to display servers. 
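-    # turning on both tables lets the nova and libvirt views be compared
-    # side by side whenever a mismatch warning was recorded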
-    if server_mismatch and (show['servers'] or show['libvirt']):
-        show['servers'] = True
-        show['libvirt'] = True
-
-    # Print debug information
-    if True in debug.values():
-        print_debug_info(tenants=tenants, regions=regions,
-                         endpoints=endpoints, services=services,
-                         hypervisors=hypervisors, statistics=statistics,
-                         servers=servers, server_groups=server_groups,
-                         migrations=migrations, flavors=flavors,
-                         extra_specs=extra_specs,
-                         images=images, volumes=volumes,
-                         aggregates=aggregates, domains=domains,
-                         topologies=topologies,
-                         topologies_idx=topologies_idx,
-                         topologies_sib=topologies_sib,
-                         computes_cell=computes_cell,
-                         debug=debug, show=show)
-
-    # Print all summary tables
-    print_all_tables(tenants=tenants,
-                     hypervisors=hypervisors, statistics=statistics,
-                     servers=servers, server_groups=server_groups,
-                     migrations=migrations, flavors=flavors,
-                     extra_specs=extra_specs,
-                     images=images, volumes=volumes,
-                     aggregates=aggregates, domains=domains,
-                     topologies=topologies,
-                     topologies_idx=topologies_idx,
-                     topologies_sib=topologies_sib,
-                     computes_cell=computes_cell,
-                     agg_h=agg_h,
-                     flavors_in_use=flavors_in_use,
-                     images_in_use=images_in_use,
-                     server_groups_in_use=server_groups_in_use,
-                     debug=debug, show=show)
-
-    # Print out warnings if we detect mismatches between nova and libvirt
-    if warnings:
-        print()
-        print("WARNINGS (mismatch):")
-        pt = PrettyTable(['Message'])
-        pt.align = 'l'
-        for W in warnings:
-            pt.add_row([W])
-        print(pt)
-
-    if True in debug.values():
-        logger.debug('done.')
-
-    # Cleanup
-    del nc_admin, kc
-
-
-def main():
-    try:
-        # Enforce 'root' access since we need to read nova.conf.
-        if os.geteuid() != 0:
-            print('Require sudo/root.')
-            os.execvp('sudo', ['sudo'] + sys.argv)
-
-        # Process command line options and arguments, configure logging,
-        # configure debug and show options
-        parse_arguments(debug, show)
-
-        # Print selected options, and timestamp
-        prog = os.path.basename(sys.argv[0])
-        ts = datetime.datetime.now()
-        print("%s: %s options: show:%s" % (prog, ts.isoformat(), show['show']))
-        if show['volumes']:
-            logger.info('volumes selected: displaying will take some time')
-
-        if debug['creds']:
-            CONF.log_opt_values(logger, logging.INFO)
-
-        # Get all info and display in table format
-        get_info_and_display(show)
-        sys.exit(0)
-
-    except KeyboardInterrupt as e:
-        logger.info('caught: %r, shutting down', e)
-        sys.exit(0)
-
-    except IOError as e:
-        sys.exit(0)
-
-    except Exception as e:
-        logger.error('exception: %r', e, exc_info=1)
-        sys.exit(-4)
diff --git a/tox.ini b/tox.ini
index 55352075b..1582be921 100644
--- a/tox.ini
+++ b/tox.ini
@@ -102,10 +102,9 @@ deps = -r{toxinidir}/test-requirements.txt
        python-daemon==2.1.2
        pylint
-# There are currenrly 2 python modules with a setup.py file
+# There is currently one python module with a setup.py file
 commands = pylint --rcfile=./pylint.rc \
-           tools/storage-topology/storage-topology/storage_topology \
-           tools/vm-topology/vm-topology/vm_topology
+           tools/storage-topology/storage-topology/storage_topology
 
 [testenv:venv]
 basepython = python3