node: add dcos node metrics command (#909)

This commit is contained in:
Philip Norman
2017-03-01 16:21:26 -08:00
committed by tamarrow
parent 6fdffadb86
commit 532710742c
10 changed files with 411 additions and 13 deletions

View File

@@ -13,6 +13,8 @@ Usage:
dcos node list-components [--leader --mesos-id=<mesos-id> --json]
dcos node log [--follow --lines=N --leader --master --mesos-id=<mesos-id> --slave=<agent-id>]
[--component=<component-name> --filter=<filter>...]
dcos node metrics details <mesos-id> [--json]
dcos node metrics summary <mesos-id> [--json]
dcos node ssh (--leader | --master | --mesos-id=<mesos-id> | --private-ip=<private-ip> | --slave=<agent-id>)
[--config-file=<path>]
[--user=<user>]
@@ -34,6 +36,11 @@ Commands:
Print a list of available DC/OS components on specified node.
log
Print the Mesos logs for the leading master node, agent nodes, or both.
metrics details
Print a table of all metrics for the agent node specified by <mesos-id>.
metrics summary
Print CPU, memory and disk metrics for the agent node specified by
<mesos-id>.
ssh
Establish an SSH connection to the master or agent nodes of your DC/OS
cluster.

173
cli/dcoscli/metrics.py Normal file
View File

@@ -0,0 +1,173 @@
import contextlib
import json
from dcos import emitting, http, util
from dcos.errors import DCOSException, DCOSHTTPException
from dcoscli import tables
logger = util.get_logger(__name__)
emitter = emitting.FlatEmitter()
def _gib(n):
return n * pow(2, -30)
def _fetch_node_metrics(url):
    """Retrieve the metrics data from `dcos-metrics`' `node` endpoint.

    :param url: `dcos-metrics` `node` endpoint
    :type url: str
    :returns: List of metrics datapoints
    :rtype: [dict]
    """

    response = http.get(url)
    with contextlib.closing(response):
        # 204 means the endpoint is up but has no datapoints yet.
        if response.status_code == 204:
            raise DCOSException('No metrics found')
        if response.status_code != 200:
            raise DCOSHTTPException(response)
        return response.json().get('datapoints', [])
def _get_datapoint(datapoints, name, tags=None):
"""Find a specific datapoint by name and tags
:param datapoints: a list of datapoints
:type datapoints: [dict]
:param name: the name of the required datapoint
:type name: str
:param tags: required tags by key and value
:type tags: dict
:return: a matching datapoint
:rtype: dict
"""
for datapoint in datapoints:
if datapoint['name'] == name:
if tags is None:
return datapoint
dtags = datapoint.get('tags', {})
tag_match = True
for k, v in tags.items():
tag_match = tag_match and dtags.get(k) == v
if tag_match:
return datapoint
def _node_summary_json(datapoints):
    """Filters datapoints down to CPU, memory and root disk space fields.

    :param datapoints: a list of datapoints
    :type datapoints: [dict]
    :return: JSON data
    :rtype: str
    """

    # The three headline metrics, in display order; the disk figure is
    # restricted to the root filesystem via its tag.
    wanted = (
        ('cpu.total', None),
        ('memory.total', None),
        ('filesystem.capacity.used', {'path': '/'}),
    )
    return json.dumps(
        [_get_datapoint(datapoints, name, tags) for name, tags in wanted])
def _node_summary_data(datapoints):
    """Extracts CPU, memory and root disk space fields from node datapoints.

    :param datapoints: a list of raw datapoints
    :type datapoints: [dict]
    :return: a dictionary of summary fields
    :rtype: dict
    """

    def _percentage(dividend, divisor):
        # Guard against divide-by-zero when a total is missing or zero.
        if divisor > 0:
            return dividend / divisor * 100
        return 0

    # 1-minute load average shown alongside the total CPU utilisation %.
    cpu_used = _get_datapoint(datapoints, 'load.1min')['value']
    cpu_used_pc = _get_datapoint(datapoints, 'cpu.total')['value']

    mem_total = _get_datapoint(datapoints, 'memory.total')['value']
    mem_free = _get_datapoint(datapoints, 'memory.free')['value']
    mem_used = mem_total - mem_free
    mem_used_pc = _percentage(mem_used, mem_total)

    disk_total = _get_datapoint(
        datapoints, 'filesystem.capacity.total', {'path': '/'})['value']
    # Bug fix: `filesystem.capacity.used` already *is* the used capacity.
    # The previous code bound it to a variable named `disk_free` and
    # subtracted it from the total, which reported free space as used.
    disk_used = _get_datapoint(
        datapoints, 'filesystem.capacity.used', {'path': '/'})['value']
    disk_used_pc = _percentage(disk_used, disk_total)

    return {
        'cpu': '{:0.2f} ({:0.2f}%)'.format(cpu_used, cpu_used_pc),
        'mem': '{:0.2f}GiB ({:0.2f}%)'.format(_gib(mem_used), mem_used_pc),
        'disk': '{:0.2f}GiB ({:0.2f}%)'.format(_gib(disk_used), disk_used_pc)
    }
def _format_datapoints(datapoints):
"""Format raw datapoints for output by making values human-readable
according to their unit and formatting tags.
:param datapoints: a list of datapoints
:type datapoints: [dict]
:return: a list of formatted datapoints
:rtype: [dict]
"""
def _format_tags(tags):
if tags is None:
return ''
pairs = []
for k, v in tags.items():
pairs.append('{}: {}'.format(k, v))
return ', '.join(pairs)
def _format_value(v, u):
if u == 'bytes':
return '{:0.2f}GiB'.format(_gib(v))
if u == 'percent':
return '{:0.2f}%'.format(v)
return v
formatted_datapoints = []
for d in datapoints:
formatted_datapoints.append({
'name': d['name'],
'value': _format_value(d['value'], d['unit']),
'tags': _format_tags(d.get('tags'))
})
return formatted_datapoints
def print_node_metrics(url, summary, json_):
    """Retrieve and pretty-print key fields from the `dcos-metrics`' `node`
    endpoint.

    :param url: `dcos-metrics` `node` endpoint
    :type url: str
    :param summary: print summary if true, or all fields if false
    :type summary: bool
    :param json_: print json list if true
    :type json_: bool
    :returns: Process status
    :rtype: int
    """

    datapoints = _fetch_node_metrics(url)

    # JSON output short-circuits table rendering entirely.
    if json_:
        payload = _node_summary_json(datapoints) if summary else datapoints
        return emitter.publish(payload)

    if summary:
        table = tables.metrics_summary_table(_node_summary_data(datapoints))
    else:
        table = tables.metrics_details_table(_format_datapoints(datapoints))
    return emitter.publish(table)

View File

@@ -1,5 +1,5 @@
import functools
import os
from functools import partial, wraps
import docopt
import six
@@ -10,7 +10,7 @@ from dcos import (cmds, config, emitting, errors,
http, mesos, packagemanager, subprocess, util)
from dcos.cosmos import get_cosmos_url
from dcos.errors import DCOSException, DefaultError
from dcoscli import log, tables
from dcoscli import log, metrics, tables
from dcoscli.package.main import confirm
from dcoscli.subcommand import default_command_info, default_doc
from dcoscli.util import decorate_docopt_usage
@@ -71,6 +71,16 @@ def _cmds():
'--component', '--filter'],
function=_log),
cmds.Command(
hierarchy=['node', 'metrics', 'details'],
arg_keys=['<mesos-id>', '--json'],
function=partial(_metrics, False)),
cmds.Command(
hierarchy=['node', 'metrics', 'summary'],
arg_keys=['<mesos-id>', '--json'],
function=partial(_metrics, True)),
cmds.Command(
hierarchy=['node', 'list-components'],
arg_keys=['--leader', '--mesos-id', '--json'],
@@ -111,7 +121,7 @@ def _cmds():
def diagnostics_error(fn):
@functools.wraps(fn)
@wraps(fn)
def check_for_diagnostics_error(*args, **kwargs):
response = fn(*args, **kwargs)
if response.status_code != 200:
@@ -520,6 +530,29 @@ def _log(follow, lines, leader, slave, component, filters):
return 0
def _metrics(summary, mesos_id, json_):
    """ Get metrics from the specified agent.

    :param summary: summarise output if true, output all if false
    :type summary: bool
    :param mesos_id: mesos node id
    :type mesos_id: str
    :param json_: print raw JSON
    :type json_: bool
    :returns: Process status
    :rtype: int
    """

    endpoint = '/system/v1/agent/{}/metrics/v0/node'.format(mesos_id)

    dcos_url = config.get_config_val('core.dcos_url')
    # Bug fix: check for a missing value *before* calling .rstrip().
    # The previous order raised AttributeError on None instead of the
    # intended missing-config error.
    if not dcos_url:
        raise config.missing_config_exception(['core.dcos_url'])

    url = dcos_url.rstrip('/') + endpoint
    return metrics.print_node_metrics(url, summary, json_)
def _get_slave_ip(slave):
""" Get an agent IP address based on mesos id.
If slave parameter is empty, the function will return

View File

@@ -886,6 +886,47 @@ def ls_long_table(files):
return tb
def metrics_summary_table(data):
    """Builds a single-row table of CPU, memory and disk summary values.

    :param data: A dictionary of formatted summary values.
    :type data: dict
    :return: the rendered summary table
    :rtype: PrettyTable
    """

    fields = OrderedDict([
        ('CPU', lambda d: d['cpu']),
        ('MEM', lambda d: d['mem']),
        ('DISK', lambda d: d['disk'])
    ])
    # The table has a single row: the summary dict itself.
    metrics_table = table(fields, [data])
    for column in fields:
        metrics_table.align[column] = 'l'
    return metrics_table
def metrics_details_table(datapoints):
    """Builds a table listing every formatted datapoint.

    :param datapoints: A raw list of datapoints
    :type datapoints: [dict]
    :return: the rendered details table
    :rtype: PrettyTable
    """

    fields = OrderedDict([
        ('NAME', lambda d: d['name']),
        ('VALUE', lambda d: d['value']),
        ('TAGS', lambda d: d['tags'])
    ])
    metrics_table = table(fields, datapoints)
    for column in fields:
        metrics_table.align[column] = 'l'
    return metrics_table
def truncate_table(fields, objs, limits, **kwargs):
"""Returns a PrettyTable. `fields` represents the header schema of
the table. `objs` represents the objects to be rendered into

48
cli/tests/fixtures/metrics.py vendored Normal file
View File

@@ -0,0 +1,48 @@
def agent_metrics_node_details_fixture():
    """Agent metrics /node fixture

    :rtype: [dict]
    """

    # (name, formatted value, formatted tags) triples, expanded into the
    # dict shape produced by dcoscli.metrics._format_datapoints.
    rows = [
        ('uptime', 1245, ''),
        ('cpu.cores', 4, ''),
        ('cpu.total', '74.94%', ''),
        ('cpu.user', '15.67%', ''),
        ('cpu.system', '59.27%', ''),
        ('cpu.idle', '24.38%', ''),
        ('cpu.wait', '0.03%', ''),
        ('load.1min', 2.85, ''),
        ('load.5min', 2.92, ''),
        ('load.15min', 2.74, ''),
        ('filesystem.capacity.total', '5.44GiB', 'path: /'),
        ('filesystem.capacity.used', '1.65GiB', 'path: /'),
        ('filesystem.capacity.free', '3.53GiB', 'path: /'),
        ('memory.total', '14.69GiB', ''),
        ('memory.free', '12.20GiB', ''),
        ('memory.buffers', '0.09GiB', ''),
        ('memory.cached', '1.72GiB', ''),
        ('swap.total', '0.00GiB', ''),
        ('swap.free', '0.00GiB', ''),
        ('swap.used', '0.00GiB', '')
    ]
    return [
        {'name': name, 'value': value, 'tags': tags}
        for name, value, tags in rows
    ]
def agent_metrics_node_summary_fixture():
    """Fixture for summary information for node

    :rtype: dict
    """

    # Matches the shape produced by dcoscli.metrics._node_summary_data.
    return dict(
        cpu='2.85 (74.94%)',
        mem='2.49GiB (16.98%)',
        disk='1.65GiB (30.30%)',
    )

View File

@@ -465,15 +465,15 @@ def delete_zk_node(znode):
http.delete(znode_url)
def assert_lines(cmd, num_lines, great_then=False):
def assert_lines(cmd, num_lines, greater_than=False):
""" Assert stdout contains the expected number of lines
:param cmd: program and arguments
:type cmd: [str]
:param num_lines: expected number of lines for stdout
:type num_lines: int
:param great_then: if True assume there may be at least num_lines or more
:type great_then: bool
:param greater_than: if True assert that there are at least num_lines
:type greater_than: bool
:rtype: None
"""
@@ -482,12 +482,30 @@ def assert_lines(cmd, num_lines, great_then=False):
assert returncode == 0
assert stderr == b''
lines = len(stdout.decode('utf-8').split('\n')) - 1
if great_then:
if greater_than:
assert lines >= num_lines
return
assert lines == num_lines
def fetch_valid_json(cmd):
    """Run *cmd*, assert it succeeded cleanly, and parse its stdout as JSON.

    :param cmd: program and arguments
    :type cmd: [str]
    :returns: parsed JSON AST
    """

    returncode, stdout, stderr = exec_command(cmd)
    assert returncode == 0
    assert stderr == b''
    try:
        return json.loads(stdout.decode('utf-8'))
    except json.JSONDecodeError as e:
        error_text = 'Command "{}" returned invalid JSON'.format(' '.join(cmd))
        # Chain the decode error so the offending payload/position stays
        # visible in the failure traceback.
        raise Exception(error_text) from e
def file_json_ast(path):
"""Returns the JSON AST parsed from file
:param path: path to file

View File

@@ -9,7 +9,8 @@ import dcos.util as util
from dcos import mesos
from dcos.util import create_schema
from .common import assert_command, assert_lines, exec_command, ssh_output
from .common import assert_command, assert_lines, exec_command, \
fetch_valid_json, ssh_output
from ..fixtures.node import slave_fixture
@@ -50,7 +51,7 @@ def test_node_log_empty():
def test_node_log_leader():
assert_lines(['dcos', 'node', 'log', '--leader'], 10, great_then=True)
assert_lines(['dcos', 'node', 'log', '--leader'], 10, greater_than=True)
def test_node_log_slave():
@@ -58,7 +59,7 @@ def test_node_log_slave():
assert_lines(
['dcos', 'node', 'log', '--mesos-id={}'.format(slave_id)],
10,
great_then=True)
greater_than=True)
def test_node_log_missing_slave():
@@ -77,7 +78,7 @@ def test_node_log_lines():
assert_lines(
['dcos', 'node', 'log', '--leader', '--lines=4'],
4,
great_then=True)
greater_than=True)
def test_node_log_invalid_lines():
@@ -87,6 +88,46 @@ def test_node_log_invalid_lines():
returncode=1)
def test_node_metrics_agent_summary():
    # Summary output is a header plus exactly one data row.
    node_id = _node()[0]['id']
    cmd = ['dcos', 'node', 'metrics', 'summary', node_id]
    assert_lines(cmd, 2)
def test_node_metrics_agent_summary_json():
    node_id = _node()[0]['id']
    summary = fetch_valid_json(
        ['dcos', 'node', 'metrics', 'summary', node_id, '--json'])
    # The summary is exactly these three datapoints, in this order.
    assert [d['name'] for d in summary] == [
        'cpu.total', 'memory.total', 'filesystem.capacity.used']
def test_node_metrics_agent_details():
    # The details table should list at least 100 lines of metrics.
    node_id = _node()[0]['id']
    cmd = ['dcos', 'node', 'metrics', 'details', node_id]
    assert_lines(cmd, 100, greater_than=True)
def test_node_metrics_agent_details_json():
    node_id = _node()[0]['id']
    details = fetch_valid_json(
        ['dcos', 'node', 'metrics', 'details', node_id, '--json'])
    names = {d['name'] for d in details}
    # Spot-check a couple of metrics that are always reported.
    assert 'uptime' in names
    assert 'cpu.cores' in names
@pytest.mark.skipif(sys.platform == 'win32',
reason='No pseudo terminal on windows')
def test_node_ssh_leader():
@@ -198,8 +239,8 @@ def _node_ssh_output(args):
cmd = ('ssh-agent /bin/bash -c "ssh-add {} 2> /dev/null && ' +
'dcos node ssh --option StrictHostKeyChecking=no {}"').format(
cli_test_ssh_key_path,
' '.join(args))
cli_test_ssh_key_path,
' '.join(args))
return ssh_output(cmd)

View File

@@ -0,0 +1,21 @@
NAME VALUE TAGS
uptime 1245
cpu.cores 4
cpu.total 74.94%
cpu.user 15.67%
cpu.system 59.27%
cpu.idle 24.38%
cpu.wait 0.03%
load.1min 2.85
load.5min 2.92
load.15min 2.74
filesystem.capacity.total 5.44GiB path: /
filesystem.capacity.used 1.65GiB path: /
filesystem.capacity.free 3.53GiB path: /
memory.total 14.69GiB
memory.free 12.20GiB
memory.buffers 0.09GiB
memory.cached 1.72GiB
swap.total 0.00GiB
swap.free 0.00GiB
swap.used 0.00GiB

View File

@@ -0,0 +1,2 @@
CPU MEM DISK
2.85 (74.94%) 2.49GiB (16.98%) 1.65GiB (30.30%)

View File

@@ -14,6 +14,8 @@ from ..fixtures.marathon import (app_fixture, app_task_fixture,
group_fixture, pod_list_fixture,
pod_list_without_instances_fixture,
pod_list_without_spec_version_fixture)
from ..fixtures.metrics import (agent_metrics_node_details_fixture,
agent_metrics_node_summary_fixture)
from ..fixtures.node import slave_fixture
from ..fixtures.package import package_fixture, search_result_fixture
from ..fixtures.service import framework_fixture
@@ -125,6 +127,18 @@ def test_ls_long_table():
'tests/unit/data/ls_long.txt')
def test_metrics_summary_table():
    # Render the summary fixture and compare against the golden file.
    fixture = agent_metrics_node_summary_fixture()
    golden = 'tests/unit/data/metrics_summary.txt'
    _test_table(tables.metrics_summary_table, fixture, golden)
def test_metrics_details_table():
    # Render the details fixture and compare against the golden file.
    fixture = agent_metrics_node_details_fixture()
    golden = 'tests/unit/data/metrics_details.txt'
    _test_table(tables.metrics_details_table, fixture, golden)
def _test_table(table_fn, fixture_fn, path):
table = table_fn(fixture_fn)
with open(path) as f: