openstack-helm/monasca/values_overrides/nvidia.yaml
---
conf:
  check_scripts:
    nvidia_vgpu: |
      # Copyright (c) 2018 StackHPC Ltd.
      #
      # Licensed under the Apache License, Version 2.0 (the "License"); you may
      # not use this file except in compliance with the License. You may obtain
      # a copy of the License at
      #
      # http://www.apache.org/licenses/LICENSE-2.0
      #
      # Unless required by applicable law or agreed to in writing, software
      # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
      # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
      # License for the specific language governing permissions and limitations
      # under the License.

      import logging

      import monasca_agent.collector.checks as checks
      from py3nvml import py3nvml as pynvml

      log = logging.getLogger(__name__)

      _METRIC_NAME_PREFIX = "nvidia"
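      # Monasca collector check plugin: queries every GPU visible to NVML via
      # py3nvml and reports the readings as gauges prefixed with "nvidia.".
      # Static device properties (driver version, UUID, inforom image version,
      # power state, VBIOS version) are attached as dimensions on each metric.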
      class Nvidia(checks.AgentCheck):
          def __init__(self, name, init_config, agent_config):
              super(Nvidia, self).__init__(name, init_config, agent_config)

          def handle_not_supported(f):
              def wrapper(*args, **kw):
                  try:
                      return f(*args, **kw)
                  except pynvml.NVMLError as err:
                      if err == pynvml.NVMLError(pynvml.NVML_ERROR_NOT_SUPPORTED):
                          log.info('Not supported: {}'.format(f.__name__))
                          return {}
                      else:
                          raise
              return wrapper
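          # Each _get_* helper below returns a single-key dict so results can
          # be merged with dict.update(); queries that the device or driver
          # does not support raise NVML_ERROR_NOT_SUPPORTED, which the
          # decorator above swallows so the helper contributes nothing.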
          @staticmethod
          @handle_not_supported
          def _get_driver_version():
              return {'driver_version': pynvml.nvmlSystemGetDriverVersion()}

          @staticmethod
          @handle_not_supported
          def _get_fan_speed_percent(gpu):
              return {'fan_speed_percent': pynvml.nvmlDeviceGetFanSpeed(gpu)}

          @staticmethod
          @handle_not_supported
          def _get_device_name(gpu):
              return {'name': pynvml.nvmlDeviceGetName(gpu)}

          @staticmethod
          @handle_not_supported
          def _get_device_serial(gpu):
              return {'serial': pynvml.nvmlDeviceGetSerial(gpu)}

          @staticmethod
          @handle_not_supported
          def _get_device_uuid(gpu):
              return {'uuid': pynvml.nvmlDeviceGetUUID(gpu)}

          @staticmethod
          @handle_not_supported
          def _get_device_vbios_version(gpu):
              return {'vbios_version': pynvml.nvmlDeviceGetVbiosVersion(gpu)}

          @staticmethod
          @handle_not_supported
          def _get_info_rom_image_version(gpu):
              return {'info_rom_image_version':
                      pynvml.nvmlDeviceGetInforomImageVersion(gpu)}

          @staticmethod
          @handle_not_supported
          def _get_device_power_state(gpu):
              power_state = "P{}".format(pynvml.nvmlDeviceGetPowerState(gpu))
              return {'power_state': power_state}

          @staticmethod
          @handle_not_supported
          def _get_framebuffer_memory_stats(gpu):
              mem_info = pynvml.nvmlDeviceGetMemoryInfo(gpu)
              return {
                  'memory_fb_total_bytes': mem_info.total,
                  'memory_fb_used_bytes': mem_info.used,
                  'memory_fb_free_bytes': (mem_info.total - mem_info.used)
              }

          @staticmethod
          @handle_not_supported
          def _get_bar1_memory_stats(gpu):
              mem_info = pynvml.nvmlDeviceGetBAR1MemoryInfo(gpu)
              return {
                  'memory_bar1_total_bytes': mem_info.bar1Total,
                  'memory_bar1_used_bytes': mem_info.bar1Used,
                  'memory_bar1_free_bytes': (mem_info.bar1Total - mem_info.bar1Used)
              }

          @staticmethod
          @handle_not_supported
          def _get_utilisation_stats(gpu):
              util = pynvml.nvmlDeviceGetUtilizationRates(gpu)
              return {
                  'utilisation_gpu_percent': util.gpu,
                  'utilisation_memory_percent': util.memory
              }

          @staticmethod
          @handle_not_supported
          def _get_device_temperature(gpu):
              return {'temperature_deg_c':
                      pynvml.nvmlDeviceGetTemperature(
                          gpu, pynvml.NVML_TEMPERATURE_GPU)}

          @staticmethod
          @handle_not_supported
          def _get_device_shutdown_temp(gpu):
              return {'temperature_shutdown_deg_c':
                      pynvml.nvmlDeviceGetTemperatureThreshold(
                          gpu, pynvml.NVML_TEMPERATURE_THRESHOLD_SHUTDOWN)}

          @staticmethod
          @handle_not_supported
          def _get_device_slowdown_temp(gpu):
              return {'temperature_slowdown_deg_c':
                      pynvml.nvmlDeviceGetTemperatureThreshold(
                          gpu, pynvml.NVML_TEMPERATURE_THRESHOLD_SLOWDOWN)}

          @staticmethod
          @handle_not_supported
          def _get_power_usage_watts(gpu):
              return {'power_watts': (pynvml.nvmlDeviceGetPowerUsage(gpu) / 1000.0)}

          @staticmethod
          @handle_not_supported
          def _get_power_limit_watts(gpu):
              return {'power_limit_watts': (
                  pynvml.nvmlDeviceGetPowerManagementLimit(gpu) / 1000.0)}

          @staticmethod
          @handle_not_supported
          def _get_clock_info(gpu):
              return {
                  'clock_freq_gpu_mhz':
                      pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_GRAPHICS),
                  'clock_freq_sm_mhz':
                      pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_SM),
                  'clock_freq_memory_mhz':
                      pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_MEM),
                  'clock_freq_video_mhz':
                      pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_VIDEO)
              }

          @staticmethod
          @handle_not_supported
          def _get_clock_max_info(gpu):
              return {
                  'clock_max_freq_gpu_mhz':
                      pynvml.nvmlDeviceGetMaxClockInfo(
                          gpu, pynvml.NVML_CLOCK_GRAPHICS),
                  'clock_max_freq_sm_mhz':
                      pynvml.nvmlDeviceGetMaxClockInfo(gpu, pynvml.NVML_CLOCK_SM),
                  'clock_max_freq_memory_mhz':
                      pynvml.nvmlDeviceGetMaxClockInfo(gpu, pynvml.NVML_CLOCK_MEM),
                  'clock_max_freq_video_mhz':
                      pynvml.nvmlDeviceGetMaxClockInfo(gpu, pynvml.NVML_CLOCK_VIDEO)
              }

          @staticmethod
          def _get_gpu_info():
              pynvml.nvmlInit()
              deviceCount = pynvml.nvmlDeviceGetCount()
              all_info = []
              for i in range(0, deviceCount):
                  gpu = pynvml.nvmlDeviceGetHandleByIndex(i)
                  dimensions = {}
                  dimensions.update(Nvidia._get_driver_version())
                  dimensions.update(Nvidia._get_device_uuid(gpu))
                  dimensions.update(Nvidia._get_info_rom_image_version(gpu))
                  dimensions.update(Nvidia._get_device_power_state(gpu))
                  dimensions.update(Nvidia._get_device_vbios_version(gpu))
                  measurements = {}
                  measurements.update(Nvidia._get_fan_speed_percent(gpu))
                  measurements.update(Nvidia._get_framebuffer_memory_stats(gpu))
                  measurements.update(Nvidia._get_bar1_memory_stats(gpu))
                  measurements.update(Nvidia._get_utilisation_stats(gpu))
                  measurements.update(Nvidia._get_device_temperature(gpu))
                  measurements.update(Nvidia._get_device_shutdown_temp(gpu))
                  measurements.update(Nvidia._get_device_slowdown_temp(gpu))
                  measurements.update(Nvidia._get_power_usage_watts(gpu))
                  measurements.update(Nvidia._get_power_limit_watts(gpu))
                  measurements.update(Nvidia._get_clock_info(gpu))
                  measurements.update(Nvidia._get_clock_max_info(gpu))
                  gpu_name = "{}_{}".format(
                      Nvidia._get_device_name(gpu).get('name'),
                      Nvidia._get_device_serial(gpu).get('serial'))
                  gpu_info = {
                      'name': gpu_name,
                      'dimensions': dimensions,
                      'measurements': measurements
                  }
                  all_info.append(gpu_info)
              pynvml.nvmlShutdown()
              return all_info
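          # Entry point invoked by the collector for each configured instance:
          # every measurement is emitted as a gauge named
          # "nvidia.<measurement>", with the GPU name/serial as device_name
          # and the static properties gathered above as dimensions.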
          def check(self, instance):
              for gpu_metrics in Nvidia._get_gpu_info():
                  for measurement, value in gpu_metrics['measurements'].items():
                      metric_name = '{0}.{1}'.format(
                          _METRIC_NAME_PREFIX, measurement)
                      self.gauge(metric_name,
                                 value,
                                 device_name=gpu_metrics.get('name'),
                                 dimensions=gpu_metrics.get('dimensions'),
                                 value_meta=None)
                  log.debug('Collected info for GPU {}'.format(
                      gpu_metrics.get('name')))
  detection_scripts:
    nvidia_vgpu: |
      # Copyright (c) 2018 StackHPC Ltd.
      #
      # Licensed under the Apache License, Version 2.0 (the "License"); you may
      # not use this file except in compliance with the License. You may obtain
      # a copy of the License at
      #
      # http://www.apache.org/licenses/LICENSE-2.0
      #
      # Unless required by applicable law or agreed to in writing, software
      # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
      # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
      # License for the specific language governing permissions and limitations
      # under the License.

      import logging
      import subprocess

      import monasca_setup.agent_config
      import monasca_setup.detection

      LOG = logging.getLogger(__name__)


      class NvidiaDetect(monasca_setup.detection.Plugin):
          """Detects and configures nVidia plugin."""

          def _detect(self):
              self.available = False
              if b'nvidia' not in subprocess.check_output(
                      ["lshw", "-C", "display"]).lower():
                  LOG.info('No nVidia hardware detected.')
                  return
              self.available = True

          def build_config(self):
              config = monasca_setup.agent_config.Plugins()
              config['nvidia'] = {
                  'init_config': None,
                  'instances': [{'name': 'nvidia_stats'}]}
              return config
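  # Auto-detection below is driven by the NvidiaDetect plugin above reporting
  # the hardware as available; the values under config provide the plugin's
  # configuration settings.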
  agent_plugins:
    nvidia_vgpu:
      auto_detect: true
      config:
        cache_dir: /dev/shm
        nova_refresh: "14400"
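# The collector container runs privileged as root and mounts /proc,
# /lib/modules, /var/lib/libvirt and /var/lib/nova from the host so the NVML
# check can reach the host driver and inspect libvirt/nova state.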
pod:
  security_context:
    agent:
      container:
        monasca_collector:
          runAsUser: 0
          privileged: true
          allowPrivilegeEscalation: true
  mounts:
    monasca_agent:
      monasca_collector:
        volumeMounts:
          - name: varliblibvirt
            mountPath: /var/lib/libvirt
            readOnly: true
          - mountPath: /lib/modules
            name: libmodules
            readOnly: true
          - name: varlibnova
            mountPath: /var/lib/nova
          - name: hostproc
            mountPath: /proc
        volumes:
          - name: libmodules
            hostPath:
              path: /lib/modules
          - name: varliblibvirt
            hostPath:
              path: /var/lib/libvirt
          - name: varlibnova
            hostPath:
              path: /var/lib/nova
          - name: hostproc
            hostPath:
              path: /proc
...
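# A minimal sketch of applying this override with Helm; the release name,
# chart path and namespace below are assumptions, not taken from this file:
#
#   helm upgrade --install monasca ./monasca \
#     --namespace openstack \
#     --values values_overrides/nvidia.yaml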