node: add dcos node metrics command (#909)

This commit is contained in:
Philip Norman
2017-03-01 16:21:26 -08:00
committed by tamarrow
parent 6fdffadb86
commit 532710742c
10 changed files with 411 additions and 13 deletions

View File

@@ -13,6 +13,8 @@ Usage:
dcos node list-components [--leader --mesos-id=<mesos-id> --json]
dcos node log [--follow --lines=N --leader --master --mesos-id=<mesos-id> --slave=<agent-id>]
[--component=<component-name> --filter=<filter>...]
dcos node metrics details <mesos-id> [--json]
dcos node metrics summary <mesos-id> [--json]
dcos node ssh (--leader | --master | --mesos-id=<mesos-id> | --private-ip=<private-ip> | --slave=<agent-id>)
[--config-file=<path>]
[--user=<user>]
@@ -34,6 +36,11 @@ Commands:
Print a list of available DC/OS components on specified node.
log
Print the Mesos logs for the leading master node, agent nodes, or both.
metrics details
Print a table of all metrics for the agent node specified by <mesos-id>.
metrics summary
Print CPU, memory and disk metrics for the agent node specified by
<mesos-id>.
ssh
Establish an SSH connection to the master or agent nodes of your DC/OS
cluster.

173
cli/dcoscli/metrics.py Normal file
View File

@@ -0,0 +1,173 @@
import contextlib
import json
from dcos import emitting, http, util
from dcos.errors import DCOSException, DCOSHTTPException
from dcoscli import tables
logger = util.get_logger(__name__)
emitter = emitting.FlatEmitter()
def _gib(n):
return n * pow(2, -30)
def _fetch_node_metrics(url):
    """Retrieve the metrics data from `dcos-metrics`' `node` endpoint.

    :param url: `dcos-metrics` `node` endpoint
    :type url: str
    :returns: List of metrics datapoints
    :rtype: [dict]
    """

    response = http.get(url)
    with contextlib.closing(response):
        # 204 means the endpoint is up but has no datapoints yet.
        if response.status_code == 204:
            raise DCOSException('No metrics found')
        if response.status_code != 200:
            raise DCOSHTTPException(response)
        return response.json().get('datapoints', [])
def _get_datapoint(datapoints, name, tags=None):
"""Find a specific datapoint by name and tags
:param datapoints: a list of datapoints
:type datapoints: [dict]
:param name: the name of the required datapoint
:type name: str
:param tags: required tags by key and value
:type tags: dict
:return: a matching datapoint
:rtype: dict
"""
for datapoint in datapoints:
if datapoint['name'] == name:
if tags is None:
return datapoint
dtags = datapoint.get('tags', {})
tag_match = True
for k, v in tags.items():
tag_match = tag_match and dtags.get(k) == v
if tag_match:
return datapoint
def _node_summary_json(datapoints):
    """Filters datapoints down to CPU, memory and root disk space fields.

    :param datapoints: a list of datapoints
    :type datapoints: [dict]
    :return: JSON data
    :rtype: str
    """

    # The three headline metrics, in display order; the disk figure is
    # restricted to the root filesystem via its tag.
    wanted = (
        ('cpu.total', None),
        ('memory.total', None),
        ('filesystem.capacity.used', {'path': '/'}),
    )
    return json.dumps(
        [_get_datapoint(datapoints, name, tags) for name, tags in wanted])
def _node_summary_data(datapoints):
    """Extracts CPU, memory and root disk space fields from node datapoints.

    :param datapoints: a list of raw datapoints
    :type datapoints: [dict]
    :return: a dictionary of summary fields
    :rtype: dict
    """

    def _percentage(dividend, divisor):
        # Guard against divide-by-zero when a total is missing or zero.
        if divisor > 0:
            return dividend / divisor * 100
        return 0

    # 1-minute load average shown alongside the total CPU utilisation %.
    cpu_used = _get_datapoint(datapoints, 'load.1min')['value']
    cpu_used_pc = _get_datapoint(datapoints, 'cpu.total')['value']

    mem_total = _get_datapoint(datapoints, 'memory.total')['value']
    mem_free = _get_datapoint(datapoints, 'memory.free')['value']
    mem_used = mem_total - mem_free
    mem_used_pc = _percentage(mem_used, mem_total)

    disk_total = _get_datapoint(
        datapoints, 'filesystem.capacity.total', {'path': '/'})['value']
    # Bug fix: `filesystem.capacity.used` already *is* the used capacity.
    # The previous code bound it to a variable named `disk_free` and
    # subtracted it from the total, which reported free space as used.
    disk_used = _get_datapoint(
        datapoints, 'filesystem.capacity.used', {'path': '/'})['value']
    disk_used_pc = _percentage(disk_used, disk_total)

    return {
        'cpu': '{:0.2f} ({:0.2f}%)'.format(cpu_used, cpu_used_pc),
        'mem': '{:0.2f}GiB ({:0.2f}%)'.format(_gib(mem_used), mem_used_pc),
        'disk': '{:0.2f}GiB ({:0.2f}%)'.format(_gib(disk_used), disk_used_pc)
    }
def _format_datapoints(datapoints):
"""Format raw datapoints for output by making values human-readable
according to their unit and formatting tags.
:param datapoints: a list of datapoints
:type datapoints: [dict]
:return: a list of formatted datapoints
:rtype: [dict]
"""
def _format_tags(tags):
if tags is None:
return ''
pairs = []
for k, v in tags.items():
pairs.append('{}: {}'.format(k, v))
return ', '.join(pairs)
def _format_value(v, u):
if u == 'bytes':
return '{:0.2f}GiB'.format(_gib(v))
if u == 'percent':
return '{:0.2f}%'.format(v)
return v
formatted_datapoints = []
for d in datapoints:
formatted_datapoints.append({
'name': d['name'],
'value': _format_value(d['value'], d['unit']),
'tags': _format_tags(d.get('tags'))
})
return formatted_datapoints
def print_node_metrics(url, summary, json_):
    """Retrieve and pretty-print key fields from the `dcos-metrics`' `node`
    endpoint.

    :param url: `dcos-metrics` `node` endpoint
    :type url: str
    :param summary: print summary if true, or all fields if false
    :type summary: bool
    :param json_: print json list if true
    :type json_: bool
    :returns: Process status
    :rtype: int
    """

    datapoints = _fetch_node_metrics(url)

    # JSON output short-circuits table rendering entirely.
    if json_:
        payload = _node_summary_json(datapoints) if summary else datapoints
        return emitter.publish(payload)

    if summary:
        table = tables.metrics_summary_table(_node_summary_data(datapoints))
    else:
        table = tables.metrics_details_table(_format_datapoints(datapoints))
    return emitter.publish(table)

View File

@@ -1,5 +1,5 @@
import functools
import os
from functools import partial, wraps
import docopt
import six
@@ -10,7 +10,7 @@ from dcos import (cmds, config, emitting, errors,
http, mesos, packagemanager, subprocess, util)
from dcos.cosmos import get_cosmos_url
from dcos.errors import DCOSException, DefaultError
from dcoscli import log, tables
from dcoscli import log, metrics, tables
from dcoscli.package.main import confirm
from dcoscli.subcommand import default_command_info, default_doc
from dcoscli.util import decorate_docopt_usage
@@ -71,6 +71,16 @@ def _cmds():
'--component', '--filter'],
function=_log),
cmds.Command(
hierarchy=['node', 'metrics', 'details'],
arg_keys=['<mesos-id>', '--json'],
function=partial(_metrics, False)),
cmds.Command(
hierarchy=['node', 'metrics', 'summary'],
arg_keys=['<mesos-id>', '--json'],
function=partial(_metrics, True)),
cmds.Command(
hierarchy=['node', 'list-components'],
arg_keys=['--leader', '--mesos-id', '--json'],
@@ -111,7 +121,7 @@ def _cmds():
def diagnostics_error(fn):
@functools.wraps(fn)
@wraps(fn)
def check_for_diagnostics_error(*args, **kwargs):
response = fn(*args, **kwargs)
if response.status_code != 200:
@@ -520,6 +530,29 @@ def _log(follow, lines, leader, slave, component, filters):
return 0
def _metrics(summary, mesos_id, json_):
    """ Get metrics from the specified agent.

    :param summary: summarise output if true, output all if false
    :type summary: bool
    :param mesos_id: mesos node id
    :type mesos_id: str
    :param json_: print raw JSON
    :type json_: bool
    :returns: Process status
    :rtype: int
    """

    endpoint = '/system/v1/agent/{}/metrics/v0/node'.format(mesos_id)

    dcos_url = config.get_config_val('core.dcos_url')
    # Bug fix: check for a missing value *before* calling .rstrip().
    # The previous order raised AttributeError on None instead of the
    # intended missing-config error.
    if not dcos_url:
        raise config.missing_config_exception(['core.dcos_url'])

    url = dcos_url.rstrip('/') + endpoint
    return metrics.print_node_metrics(url, summary, json_)
def _get_slave_ip(slave):
""" Get an agent IP address based on mesos id.
If slave parameter is empty, the function will return

View File

@@ -886,6 +886,47 @@ def ls_long_table(files):
return tb
def metrics_summary_table(data):
    """Builds a single-row table of CPU, memory and disk summary values.

    :param data: A dictionary of formatted summary values.
    :type data: dict
    :return: the rendered summary table
    :rtype: PrettyTable
    """

    fields = OrderedDict([
        ('CPU', lambda d: d['cpu']),
        ('MEM', lambda d: d['mem']),
        ('DISK', lambda d: d['disk'])
    ])
    # The table has a single row: the summary dict itself.
    metrics_table = table(fields, [data])
    for column in fields:
        metrics_table.align[column] = 'l'
    return metrics_table
def metrics_details_table(datapoints):
    """Builds a table listing every formatted datapoint.

    :param datapoints: A raw list of datapoints
    :type datapoints: [dict]
    :return: the rendered details table
    :rtype: PrettyTable
    """

    fields = OrderedDict([
        ('NAME', lambda d: d['name']),
        ('VALUE', lambda d: d['value']),
        ('TAGS', lambda d: d['tags'])
    ])
    metrics_table = table(fields, datapoints)
    for column in fields:
        metrics_table.align[column] = 'l'
    return metrics_table
def truncate_table(fields, objs, limits, **kwargs):
"""Returns a PrettyTable. `fields` represents the header schema of
the table. `objs` represents the objects to be rendered into

48
cli/tests/fixtures/metrics.py vendored Normal file
View File

@@ -0,0 +1,48 @@
def agent_metrics_node_details_fixture():
    """Agent metrics /node fixture

    :rtype: [dict]
    """

    # (name, formatted value, formatted tags) triples, expanded into the
    # dict shape produced by dcoscli.metrics._format_datapoints.
    rows = [
        ('uptime', 1245, ''),
        ('cpu.cores', 4, ''),
        ('cpu.total', '74.94%', ''),
        ('cpu.user', '15.67%', ''),
        ('cpu.system', '59.27%', ''),
        ('cpu.idle', '24.38%', ''),
        ('cpu.wait', '0.03%', ''),
        ('load.1min', 2.85, ''),
        ('load.5min', 2.92, ''),
        ('load.15min', 2.74, ''),
        ('filesystem.capacity.total', '5.44GiB', 'path: /'),
        ('filesystem.capacity.used', '1.65GiB', 'path: /'),
        ('filesystem.capacity.free', '3.53GiB', 'path: /'),
        ('memory.total', '14.69GiB', ''),
        ('memory.free', '12.20GiB', ''),
        ('memory.buffers', '0.09GiB', ''),
        ('memory.cached', '1.72GiB', ''),
        ('swap.total', '0.00GiB', ''),
        ('swap.free', '0.00GiB', ''),
        ('swap.used', '0.00GiB', '')
    ]
    return [
        {'name': name, 'value': value, 'tags': tags}
        for name, value, tags in rows
    ]
def agent_metrics_node_summary_fixture():
    """Fixture for summary information for node

    :rtype: dict
    """

    # Matches the shape produced by dcoscli.metrics._node_summary_data.
    return dict(
        cpu='2.85 (74.94%)',
        mem='2.49GiB (16.98%)',
        disk='1.65GiB (30.30%)',
    )

View File

@@ -465,15 +465,15 @@ def delete_zk_node(znode):
http.delete(znode_url)
def assert_lines(cmd, num_lines, great_then=False):
def assert_lines(cmd, num_lines, greater_than=False):
""" Assert stdout contains the expected number of lines
:param cmd: program and arguments
:type cmd: [str]
:param num_lines: expected number of lines for stdout
:type num_lines: int
:param great_then: if True assume there may be at least num_lines or more
:type great_then: bool
:param greater_than: if True assert that there are at least num_lines
:type greater_than: bool
:rtype: None
"""
@@ -482,12 +482,30 @@ def assert_lines(cmd, num_lines, great_then=False):
assert returncode == 0
assert stderr == b''
lines = len(stdout.decode('utf-8').split('\n')) - 1
if great_then:
if greater_than:
assert lines >= num_lines
return
assert lines == num_lines
def fetch_valid_json(cmd):
    """Run *cmd*, assert it succeeded cleanly, and parse its stdout as JSON.

    :param cmd: program and arguments
    :type cmd: [str]
    :returns: parsed JSON AST
    """

    returncode, stdout, stderr = exec_command(cmd)
    assert returncode == 0
    assert stderr == b''
    try:
        return json.loads(stdout.decode('utf-8'))
    except json.JSONDecodeError as e:
        error_text = 'Command "{}" returned invalid JSON'.format(' '.join(cmd))
        # Chain the decode error so the offending payload/position stays
        # visible in the failure traceback.
        raise Exception(error_text) from e
def file_json_ast(path):
"""Returns the JSON AST parsed from file
:param path: path to file

View File

@@ -9,7 +9,8 @@ import dcos.util as util
from dcos import mesos
from dcos.util import create_schema
from .common import assert_command, assert_lines, exec_command, ssh_output
from .common import assert_command, assert_lines, exec_command, \
fetch_valid_json, ssh_output
from ..fixtures.node import slave_fixture
@@ -50,7 +51,7 @@ def test_node_log_empty():
def test_node_log_leader():
assert_lines(['dcos', 'node', 'log', '--leader'], 10, great_then=True)
assert_lines(['dcos', 'node', 'log', '--leader'], 10, greater_than=True)
def test_node_log_slave():
@@ -58,7 +59,7 @@ def test_node_log_slave():
assert_lines(
['dcos', 'node', 'log', '--mesos-id={}'.format(slave_id)],
10,
great_then=True)
greater_than=True)
def test_node_log_missing_slave():
@@ -77,7 +78,7 @@ def test_node_log_lines():
assert_lines(
['dcos', 'node', 'log', '--leader', '--lines=4'],
4,
great_then=True)
greater_than=True)
def test_node_log_invalid_lines():
@@ -87,6 +88,46 @@ def test_node_log_invalid_lines():
returncode=1)
def test_node_metrics_agent_summary():
    # Summary output is a header plus exactly one data row.
    node_id = _node()[0]['id']
    cmd = ['dcos', 'node', 'metrics', 'summary', node_id]
    assert_lines(cmd, 2)
def test_node_metrics_agent_summary_json():
    node_id = _node()[0]['id']
    summary = fetch_valid_json(
        ['dcos', 'node', 'metrics', 'summary', node_id, '--json'])
    # The summary is exactly these three datapoints, in this order.
    assert [d['name'] for d in summary] == [
        'cpu.total', 'memory.total', 'filesystem.capacity.used']
def test_node_metrics_agent_details():
    # The details table should list at least 100 lines of metrics.
    node_id = _node()[0]['id']
    cmd = ['dcos', 'node', 'metrics', 'details', node_id]
    assert_lines(cmd, 100, greater_than=True)
def test_node_metrics_agent_details_json():
    node_id = _node()[0]['id']
    details = fetch_valid_json(
        ['dcos', 'node', 'metrics', 'details', node_id, '--json'])
    names = {d['name'] for d in details}
    # Spot-check a couple of metrics that are always reported.
    assert 'uptime' in names
    assert 'cpu.cores' in names
@pytest.mark.skipif(sys.platform == 'win32',
reason='No pseudo terminal on windows')
def test_node_ssh_leader():
@@ -198,8 +239,8 @@ def _node_ssh_output(args):
cmd = ('ssh-agent /bin/bash -c "ssh-add {} 2> /dev/null && ' +
'dcos node ssh --option StrictHostKeyChecking=no {}"').format(
cli_test_ssh_key_path,
' '.join(args))
cli_test_ssh_key_path,
' '.join(args))
return ssh_output(cmd)

View File

@@ -0,0 +1,21 @@
NAME VALUE TAGS
uptime 1245
cpu.cores 4
cpu.total 74.94%
cpu.user 15.67%
cpu.system 59.27%
cpu.idle 24.38%
cpu.wait 0.03%
load.1min 2.85
load.5min 2.92
load.15min 2.74
filesystem.capacity.total 5.44GiB path: /
filesystem.capacity.used 1.65GiB path: /
filesystem.capacity.free 3.53GiB path: /
memory.total 14.69GiB
memory.free 12.20GiB
memory.buffers 0.09GiB
memory.cached 1.72GiB
swap.total 0.00GiB
swap.free 0.00GiB
swap.used 0.00GiB

View File

@@ -0,0 +1,2 @@
CPU MEM DISK
2.85 (74.94%) 2.49GiB (16.98%) 1.65GiB (30.30%)

View File

@@ -14,6 +14,8 @@ from ..fixtures.marathon import (app_fixture, app_task_fixture,
group_fixture, pod_list_fixture,
pod_list_without_instances_fixture,
pod_list_without_spec_version_fixture)
from ..fixtures.metrics import (agent_metrics_node_details_fixture,
agent_metrics_node_summary_fixture)
from ..fixtures.node import slave_fixture
from ..fixtures.package import package_fixture, search_result_fixture
from ..fixtures.service import framework_fixture
@@ -125,6 +127,18 @@ def test_ls_long_table():
'tests/unit/data/ls_long.txt')
def test_metrics_summary_table():
    # Render the summary fixture and compare against the golden file.
    fixture = agent_metrics_node_summary_fixture()
    golden = 'tests/unit/data/metrics_summary.txt'
    _test_table(tables.metrics_summary_table, fixture, golden)
def test_metrics_details_table():
    # Render the details fixture and compare against the golden file.
    fixture = agent_metrics_node_details_fixture()
    golden = 'tests/unit/data/metrics_details.txt'
    _test_table(tables.metrics_details_table, fixture, golden)
def _test_table(table_fn, fixture_fn, path):
table = table_fn(fixture_fn)
with open(path) as f: