Instrument metrics in BrokerConnection

This commit is contained in:
Dana Powers
2016-08-04 12:21:40 -07:00
parent af08b54875
commit 460f0784a3
4 changed files with 127 additions and 17 deletions

View File

@@ -222,6 +222,7 @@ class KafkaClient(object):
cb = functools.partial(self._conn_state_change, 'bootstrap') cb = functools.partial(self._conn_state_change, 'bootstrap')
bootstrap = BrokerConnection(host, port, afi, bootstrap = BrokerConnection(host, port, afi,
state_change_callback=cb, state_change_callback=cb,
node_id='bootstrap',
**self.config) **self.config)
bootstrap.connect() bootstrap.connect()
while bootstrap.connecting(): while bootstrap.connecting():
@@ -313,6 +314,7 @@ class KafkaClient(object):
cb = functools.partial(self._conn_state_change, node_id) cb = functools.partial(self._conn_state_change, node_id)
self._conns[node_id] = BrokerConnection(host, broker.port, afi, self._conns[node_id] = BrokerConnection(host, broker.port, afi,
state_change_callback=cb, state_change_callback=cb,
node_id=node_id,
**self.config) **self.config)
conn = self._conns[node_id] conn = self._conns[node_id]
if conn.connected(): if conn.connected():

View File

@@ -14,6 +14,7 @@ from kafka.vendor import six
import kafka.errors as Errors import kafka.errors as Errors
from kafka.future import Future from kafka.future import Future
from kafka.metrics.stats import Avg, Count, Max, Rate
from kafka.protocol.api import RequestHeader from kafka.protocol.api import RequestHeader
from kafka.protocol.admin import SaslHandShakeRequest from kafka.protocol.admin import SaslHandShakeRequest
from kafka.protocol.commit import GroupCoordinatorResponse from kafka.protocol.commit import GroupCoordinatorResponse
@@ -58,6 +59,7 @@ InFlightRequest = collections.namedtuple('InFlightRequest',
class BrokerConnection(object): class BrokerConnection(object):
DEFAULT_CONFIG = { DEFAULT_CONFIG = {
'client_id': 'kafka-python-' + __version__, 'client_id': 'kafka-python-' + __version__,
'node_id': 0,
'request_timeout_ms': 40000, 'request_timeout_ms': 40000,
'reconnect_backoff_ms': 50, 'reconnect_backoff_ms': 50,
'max_in_flight_requests_per_connection': 5, 'max_in_flight_requests_per_connection': 5,
@@ -74,6 +76,8 @@ class BrokerConnection(object):
'ssl_password': None, 'ssl_password': None,
'api_version': (0, 8, 2), # default to most restrictive 'api_version': (0, 8, 2), # default to most restrictive
'state_change_callback': lambda conn: True, 'state_change_callback': lambda conn: True,
'metrics': None,
'metric_group_prefix': '',
'sasl_mechanism': 'PLAIN', 'sasl_mechanism': 'PLAIN',
'sasl_plain_username': None, 'sasl_plain_username': None,
'sasl_plain_password': None 'sasl_plain_password': None
@@ -138,6 +142,9 @@ class BrokerConnection(object):
api version. Only applies if api_version is None api version. Only applies if api_version is None
state_change_callback (callable): function to be called when the state_change_callback (callable): function to be called when the
connection state changes from CONNECTING to CONNECTED etc. connection state changes from CONNECTING to CONNECTED etc.
metrics (kafka.metrics.Metrics): Optionally provide a metrics
instance for capturing network IO stats. Default: None.
metric_group_prefix (str): Prefix for metric names. Default: ''
sasl_mechanism (str): string picking sasl mechanism when security_protocol sasl_mechanism (str): string picking sasl mechanism when security_protocol
is SASL_PLAINTEXT or SASL_SSL. Currently only PLAIN is supported. is SASL_PLAINTEXT or SASL_SSL. Currently only PLAIN is supported.
Default: None Default: None
@@ -188,6 +195,11 @@ class BrokerConnection(object):
self._correlation_id = 0 self._correlation_id = 0
self._gai = None self._gai = None
self._gai_index = 0 self._gai_index = 0
self._sensors = None
if self.config['metrics']:
self._sensors = BrokerConnectionMetrics(self.config['metrics'],
self.config['metric_group_prefix'],
self.config['node_id'])
def connect(self): def connect(self):
"""Attempt to connect and return ConnectionState""" """Attempt to connect and return ConnectionState"""
@@ -518,6 +530,8 @@ class BrokerConnection(object):
sent_bytes = self._sock.send(data[total_sent:]) sent_bytes = self._sock.send(data[total_sent:])
total_sent += sent_bytes total_sent += sent_bytes
assert total_sent == len(data) assert total_sent == len(data)
if self._sensors:
self._sensors.bytes_sent.record(total_sent)
self._sock.setblocking(False) self._sock.setblocking(False)
except (AssertionError, ConnectionError) as e: except (AssertionError, ConnectionError) as e:
log.exception("Error sending %s to %s", request, self) log.exception("Error sending %s to %s", request, self)
@@ -648,6 +662,8 @@ class BrokerConnection(object):
self._receiving = False self._receiving = False
self._next_payload_bytes = 0 self._next_payload_bytes = 0
if self._sensors:
self._sensors.bytes_received.record(4 + self._rbuffer.tell())
self._rbuffer.seek(0) self._rbuffer.seek(0)
response = self._process_response(self._rbuffer) response = self._process_response(self._rbuffer)
self._rbuffer.seek(0) self._rbuffer.seek(0)
@@ -658,6 +674,8 @@ class BrokerConnection(object):
assert not self._processing, 'Recursion not supported' assert not self._processing, 'Recursion not supported'
self._processing = True self._processing = True
ifr = self.in_flight_requests.popleft() ifr = self.in_flight_requests.popleft()
if self._sensors:
self._sensors.request_time.record((time.time() - ifr.timestamp) * 1000)
# verify send/recv correlation ids match # verify send/recv correlation ids match
recv_correlation_id = Int32.decode(read_buffer) recv_correlation_id = Int32.decode(read_buffer)
@@ -827,6 +845,111 @@ class BrokerConnection(object):
self.port) self.port)
class BrokerConnectionMetrics(object):
    """Network IO metrics for a single BrokerConnection.

    Registers two tiers of sensors on the supplied metrics registry:

    * Aggregate sensors shared by ALL connections ('bytes-sent-received',
      'bytes-sent', 'bytes-received', 'request-latency').  These are
      created once by whichever connection is instrumented first; later
      connections reuse them.
    * Per-node sensors ('node-<id>.bytes-sent', 'node-<id>.bytes-received',
      'node-<id>.latency') that feed the aggregate sensors as parents, so
      recording on a node sensor also updates the totals.

    Attributes:
        bytes_sent: sensor recording request sizes sent to this node
        bytes_received: sensor recording response sizes read from this node
        request_time: sensor recording request latency (ms) for this node
    """

    def __init__(self, metrics, metric_group_prefix, node_id):
        """
        Arguments:
            metrics (kafka.metrics.Metrics): metrics registry to record into
            metric_group_prefix (str): prefix for metric group names
            node_id (int or str): broker node id, used in per-node sensor
                and group names
        """
        self.metrics = metrics

        # Any broker may have registered summary metrics already
        # but if not, we need to create them so we can set as parents below
        all_conns_transferred = metrics.get_sensor('bytes-sent-received')
        if not all_conns_transferred:
            metric_group_name = metric_group_prefix + '-metrics'

            bytes_transferred = metrics.sensor('bytes-sent-received')
            bytes_transferred.add(metrics.metric_name(
                'network-io-rate', metric_group_name,
                'The average number of network operations (reads or writes) on all'
                ' connections per second.'), Rate(sampled_stat=Count()))

            bytes_sent = metrics.sensor('bytes-sent',
                                        parents=[bytes_transferred])
            bytes_sent.add(metrics.metric_name(
                'outgoing-byte-rate', metric_group_name,
                'The average number of outgoing bytes sent per second to all'
                ' servers.'), Rate())
            bytes_sent.add(metrics.metric_name(
                'request-rate', metric_group_name,
                'The average number of requests sent per second.'),
                Rate(sampled_stat=Count()))
            bytes_sent.add(metrics.metric_name(
                'request-size-avg', metric_group_name,
                'The average size of all requests in the window.'), Avg())
            bytes_sent.add(metrics.metric_name(
                'request-size-max', metric_group_name,
                'The maximum size of any request sent in the window.'), Max())

            bytes_received = metrics.sensor('bytes-received',
                                            parents=[bytes_transferred])
            bytes_received.add(metrics.metric_name(
                'incoming-byte-rate', metric_group_name,
                'Bytes/second read off all sockets'), Rate())
            bytes_received.add(metrics.metric_name(
                'response-rate', metric_group_name,
                # fixed garbled description (was 'Responses received sent
                # per second.')
                'The average number of responses received per second.'),
                Rate(sampled_stat=Count()))

            request_latency = metrics.sensor('request-latency')
            request_latency.add(metrics.metric_name(
                'request-latency-avg', metric_group_name,
                'The average request latency in ms.'),
                Avg())
            request_latency.add(metrics.metric_name(
                'request-latency-max', metric_group_name,
                'The maximum request latency in ms.'),
                Max())

        # if one sensor of the metrics has been registered for the connection,
        # then all other sensors should have been registered; and vice versa
        node_str = 'node-{0}'.format(node_id)
        node_sensor = metrics.get_sensor(node_str + '.bytes-sent')
        if not node_sensor:
            metric_group_name = metric_group_prefix + '-node-metrics.' + node_str

            self.bytes_sent = metrics.sensor(
                node_str + '.bytes-sent',
                parents=[metrics.get_sensor('bytes-sent')])
            self.bytes_sent.add(metrics.metric_name(
                'outgoing-byte-rate', metric_group_name,
                'The average number of outgoing bytes sent per second.'),
                Rate())
            self.bytes_sent.add(metrics.metric_name(
                'request-rate', metric_group_name,
                'The average number of requests sent per second.'),
                Rate(sampled_stat=Count()))
            self.bytes_sent.add(metrics.metric_name(
                'request-size-avg', metric_group_name,
                'The average size of all requests in the window.'),
                Avg())
            self.bytes_sent.add(metrics.metric_name(
                'request-size-max', metric_group_name,
                'The maximum size of any request sent in the window.'),
                Max())

            self.bytes_received = metrics.sensor(
                node_str + '.bytes-received',
                parents=[metrics.get_sensor('bytes-received')])
            self.bytes_received.add(metrics.metric_name(
                'incoming-byte-rate', metric_group_name,
                'Bytes/second read off node-connection socket'),
                Rate())
            self.bytes_received.add(metrics.metric_name(
                'response-rate', metric_group_name,
                'The average number of responses received per second.'),
                Rate(sampled_stat=Count()))

            self.request_time = self.metrics.sensor(
                node_str + '.latency',
                parents=[metrics.get_sensor('request-latency')])
            self.request_time.add(metrics.metric_name(
                'request-latency-avg', metric_group_name,
                'The average request latency in ms.'),
                Avg())
            self.request_time.add(metrics.metric_name(
                'request-latency-max', metric_group_name,
                'The maximum request latency in ms.'),
                Max())
        else:
            # BUG FIX: the original only assigned self.bytes_sent /
            # self.bytes_received / self.request_time inside the
            # `if not node_sensor:` branch, so a second connection to the
            # same node left those attributes unset and any
            # record() call via this instance raised AttributeError.
            # Reuse the already-registered sensors instead.
            self.bytes_sent = node_sensor
            self.bytes_received = metrics.get_sensor(
                node_str + '.bytes-received')
            self.request_time = metrics.get_sensor(node_str + '.latency')
def _address_family(address): def _address_family(address):
""" """
Attempt to determine the family of an address (or hostname) Attempt to determine the family of an address (or hostname)

View File

@@ -204,7 +204,6 @@ class Sender(threading.Thread):
batch = batches_by_partition[tp] batch = batches_by_partition[tp]
self._complete_batch(batch, error, offset, ts) self._complete_batch(batch, error, offset, ts)
self._sensors.record_latency((time.time() - send_time) * 1000, node=node_id)
if response.API_VERSION > 0: if response.API_VERSION > 0:
self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id) self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id)
@@ -343,15 +342,6 @@ class SenderMetrics(object):
sensor_name=sensor_name, sensor_name=sensor_name,
description='The maximum time in ms record batches spent in the record accumulator.') description='The maximum time in ms record batches spent in the record accumulator.')
sensor_name = 'request-time'
self.request_time_sensor = self.metrics.sensor(sensor_name)
self.add_metric('request-latency-avg', Avg(),
sensor_name=sensor_name,
description='The average request latency in ms')
self.add_metric('request-latency-max', Max(),
sensor_name=sensor_name,
description='The maximum request latency in ms')
sensor_name = 'produce-throttle-time' sensor_name = 'produce-throttle-time'
self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name) self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name)
self.add_metric('produce-throttle-time-avg', Avg(), self.add_metric('produce-throttle-time-avg', Avg(),
@@ -498,12 +488,5 @@ class SenderMetrics(object):
if sensor: if sensor:
sensor.record(count) sensor.record(count)
def record_latency(self, latency, node=None):
self.request_time_sensor.record(latency)
if node is not None:
sensor = self.metrics.get_sensor('node-' + str(node) + '.latency')
if sensor:
sensor.record(latency)
def record_throttle_time(self, throttle_time_ms, node=None): def record_throttle_time(self, throttle_time_ms, node=None):
self.produce_throttle_time_sensor.record(throttle_time_ms) self.produce_throttle_time_sensor.record(throttle_time_ms)

View File

@@ -49,6 +49,7 @@ def test_bootstrap_success(conn):
args, kwargs = conn.call_args args, kwargs = conn.call_args
assert args == ('localhost', 9092, socket.AF_UNSPEC) assert args == ('localhost', 9092, socket.AF_UNSPEC)
kwargs.pop('state_change_callback') kwargs.pop('state_change_callback')
kwargs.pop('node_id')
assert kwargs == cli.config assert kwargs == cli.config
conn.connect.assert_called_with() conn.connect.assert_called_with()
conn.send.assert_called_once_with(MetadataRequest[0]([])) conn.send.assert_called_once_with(MetadataRequest[0]([]))
@@ -62,6 +63,7 @@ def test_bootstrap_failure(conn):
args, kwargs = conn.call_args args, kwargs = conn.call_args
assert args == ('localhost', 9092, socket.AF_UNSPEC) assert args == ('localhost', 9092, socket.AF_UNSPEC)
kwargs.pop('state_change_callback') kwargs.pop('state_change_callback')
kwargs.pop('node_id')
assert kwargs == cli.config assert kwargs == cli.config
conn.connect.assert_called_with() conn.connect.assert_called_with()
conn.close.assert_called_with() conn.close.assert_called_with()