Instrument metrics in BrokerConnection
This commit is contained in:
@@ -222,6 +222,7 @@ class KafkaClient(object):
|
|||||||
cb = functools.partial(self._conn_state_change, 'bootstrap')
|
cb = functools.partial(self._conn_state_change, 'bootstrap')
|
||||||
bootstrap = BrokerConnection(host, port, afi,
|
bootstrap = BrokerConnection(host, port, afi,
|
||||||
state_change_callback=cb,
|
state_change_callback=cb,
|
||||||
|
node_id='bootstrap',
|
||||||
**self.config)
|
**self.config)
|
||||||
bootstrap.connect()
|
bootstrap.connect()
|
||||||
while bootstrap.connecting():
|
while bootstrap.connecting():
|
||||||
@@ -313,6 +314,7 @@ class KafkaClient(object):
|
|||||||
cb = functools.partial(self._conn_state_change, node_id)
|
cb = functools.partial(self._conn_state_change, node_id)
|
||||||
self._conns[node_id] = BrokerConnection(host, broker.port, afi,
|
self._conns[node_id] = BrokerConnection(host, broker.port, afi,
|
||||||
state_change_callback=cb,
|
state_change_callback=cb,
|
||||||
|
node_id=node_id,
|
||||||
**self.config)
|
**self.config)
|
||||||
conn = self._conns[node_id]
|
conn = self._conns[node_id]
|
||||||
if conn.connected():
|
if conn.connected():
|
||||||
|
123
kafka/conn.py
123
kafka/conn.py
@@ -14,6 +14,7 @@ from kafka.vendor import six
|
|||||||
|
|
||||||
import kafka.errors as Errors
|
import kafka.errors as Errors
|
||||||
from kafka.future import Future
|
from kafka.future import Future
|
||||||
|
from kafka.metrics.stats import Avg, Count, Max, Rate
|
||||||
from kafka.protocol.api import RequestHeader
|
from kafka.protocol.api import RequestHeader
|
||||||
from kafka.protocol.admin import SaslHandShakeRequest
|
from kafka.protocol.admin import SaslHandShakeRequest
|
||||||
from kafka.protocol.commit import GroupCoordinatorResponse
|
from kafka.protocol.commit import GroupCoordinatorResponse
|
||||||
@@ -58,6 +59,7 @@ InFlightRequest = collections.namedtuple('InFlightRequest',
|
|||||||
class BrokerConnection(object):
|
class BrokerConnection(object):
|
||||||
DEFAULT_CONFIG = {
|
DEFAULT_CONFIG = {
|
||||||
'client_id': 'kafka-python-' + __version__,
|
'client_id': 'kafka-python-' + __version__,
|
||||||
|
'node_id': 0,
|
||||||
'request_timeout_ms': 40000,
|
'request_timeout_ms': 40000,
|
||||||
'reconnect_backoff_ms': 50,
|
'reconnect_backoff_ms': 50,
|
||||||
'max_in_flight_requests_per_connection': 5,
|
'max_in_flight_requests_per_connection': 5,
|
||||||
@@ -74,6 +76,8 @@ class BrokerConnection(object):
|
|||||||
'ssl_password': None,
|
'ssl_password': None,
|
||||||
'api_version': (0, 8, 2), # default to most restrictive
|
'api_version': (0, 8, 2), # default to most restrictive
|
||||||
'state_change_callback': lambda conn: True,
|
'state_change_callback': lambda conn: True,
|
||||||
|
'metrics': None,
|
||||||
|
'metric_group_prefix': '',
|
||||||
'sasl_mechanism': 'PLAIN',
|
'sasl_mechanism': 'PLAIN',
|
||||||
'sasl_plain_username': None,
|
'sasl_plain_username': None,
|
||||||
'sasl_plain_password': None
|
'sasl_plain_password': None
|
||||||
@@ -138,6 +142,9 @@ class BrokerConnection(object):
|
|||||||
api version. Only applies if api_version is None
|
api version. Only applies if api_version is None
|
||||||
state_chance_callback (callable): function to be called when the
|
state_chance_callback (callable): function to be called when the
|
||||||
connection state changes from CONNECTING to CONNECTED etc.
|
connection state changes from CONNECTING to CONNECTED etc.
|
||||||
|
metrics (kafka.metrics.Metrics): Optionally provide a metrics
|
||||||
|
instance for capturing network IO stats. Default: None.
|
||||||
|
metric_group_prefix (str): Prefix for metric names. Default: ''
|
||||||
sasl_mechanism (str): string picking sasl mechanism when security_protocol
|
sasl_mechanism (str): string picking sasl mechanism when security_protocol
|
||||||
is SASL_PLAINTEXT or SASL_SSL. Currently only PLAIN is supported.
|
is SASL_PLAINTEXT or SASL_SSL. Currently only PLAIN is supported.
|
||||||
Default: None
|
Default: None
|
||||||
@@ -188,6 +195,11 @@ class BrokerConnection(object):
|
|||||||
self._correlation_id = 0
|
self._correlation_id = 0
|
||||||
self._gai = None
|
self._gai = None
|
||||||
self._gai_index = 0
|
self._gai_index = 0
|
||||||
|
self._sensors = None
|
||||||
|
if self.config['metrics']:
|
||||||
|
self._sensors = BrokerConnectionMetrics(self.config['metrics'],
|
||||||
|
self.config['metric_group_prefix'],
|
||||||
|
self.config['node_id'])
|
||||||
|
|
||||||
def connect(self):
|
def connect(self):
|
||||||
"""Attempt to connect and return ConnectionState"""
|
"""Attempt to connect and return ConnectionState"""
|
||||||
@@ -518,6 +530,8 @@ class BrokerConnection(object):
|
|||||||
sent_bytes = self._sock.send(data[total_sent:])
|
sent_bytes = self._sock.send(data[total_sent:])
|
||||||
total_sent += sent_bytes
|
total_sent += sent_bytes
|
||||||
assert total_sent == len(data)
|
assert total_sent == len(data)
|
||||||
|
if self._sensors:
|
||||||
|
self._sensors.bytes_sent.record(total_sent)
|
||||||
self._sock.setblocking(False)
|
self._sock.setblocking(False)
|
||||||
except (AssertionError, ConnectionError) as e:
|
except (AssertionError, ConnectionError) as e:
|
||||||
log.exception("Error sending %s to %s", request, self)
|
log.exception("Error sending %s to %s", request, self)
|
||||||
@@ -648,6 +662,8 @@ class BrokerConnection(object):
|
|||||||
|
|
||||||
self._receiving = False
|
self._receiving = False
|
||||||
self._next_payload_bytes = 0
|
self._next_payload_bytes = 0
|
||||||
|
if self._sensors:
|
||||||
|
self._sensors.bytes_received.record(4 + self._rbuffer.tell())
|
||||||
self._rbuffer.seek(0)
|
self._rbuffer.seek(0)
|
||||||
response = self._process_response(self._rbuffer)
|
response = self._process_response(self._rbuffer)
|
||||||
self._rbuffer.seek(0)
|
self._rbuffer.seek(0)
|
||||||
@@ -658,6 +674,8 @@ class BrokerConnection(object):
|
|||||||
assert not self._processing, 'Recursion not supported'
|
assert not self._processing, 'Recursion not supported'
|
||||||
self._processing = True
|
self._processing = True
|
||||||
ifr = self.in_flight_requests.popleft()
|
ifr = self.in_flight_requests.popleft()
|
||||||
|
if self._sensors:
|
||||||
|
self._sensors.request_time.record((time.time() - ifr.timestamp) * 1000)
|
||||||
|
|
||||||
# verify send/recv correlation ids match
|
# verify send/recv correlation ids match
|
||||||
recv_correlation_id = Int32.decode(read_buffer)
|
recv_correlation_id = Int32.decode(read_buffer)
|
||||||
@@ -827,6 +845,111 @@ class BrokerConnection(object):
|
|||||||
self.port)
|
self.port)
|
||||||
|
|
||||||
|
|
||||||
|
class BrokerConnectionMetrics(object):
|
||||||
|
def __init__(self, metrics, metric_group_prefix, node_id):
|
||||||
|
self.metrics = metrics
|
||||||
|
|
||||||
|
# Any broker may have registered summary metrics already
|
||||||
|
# but if not, we need to create them so we can set as parents below
|
||||||
|
all_conns_transferred = metrics.get_sensor('bytes-sent-received')
|
||||||
|
if not all_conns_transferred:
|
||||||
|
metric_group_name = metric_group_prefix + '-metrics'
|
||||||
|
|
||||||
|
bytes_transferred = metrics.sensor('bytes-sent-received')
|
||||||
|
bytes_transferred.add(metrics.metric_name(
|
||||||
|
'network-io-rate', metric_group_name,
|
||||||
|
'The average number of network operations (reads or writes) on all'
|
||||||
|
' connections per second.'), Rate(sampled_stat=Count()))
|
||||||
|
|
||||||
|
bytes_sent = metrics.sensor('bytes-sent',
|
||||||
|
parents=[bytes_transferred])
|
||||||
|
bytes_sent.add(metrics.metric_name(
|
||||||
|
'outgoing-byte-rate', metric_group_name,
|
||||||
|
'The average number of outgoing bytes sent per second to all'
|
||||||
|
' servers.'), Rate())
|
||||||
|
bytes_sent.add(metrics.metric_name(
|
||||||
|
'request-rate', metric_group_name,
|
||||||
|
'The average number of requests sent per second.'),
|
||||||
|
Rate(sampled_stat=Count()))
|
||||||
|
bytes_sent.add(metrics.metric_name(
|
||||||
|
'request-size-avg', metric_group_name,
|
||||||
|
'The average size of all requests in the window.'), Avg())
|
||||||
|
bytes_sent.add(metrics.metric_name(
|
||||||
|
'request-size-max', metric_group_name,
|
||||||
|
'The maximum size of any request sent in the window.'), Max())
|
||||||
|
|
||||||
|
bytes_received = metrics.sensor('bytes-received',
|
||||||
|
parents=[bytes_transferred])
|
||||||
|
bytes_received.add(metrics.metric_name(
|
||||||
|
'incoming-byte-rate', metric_group_name,
|
||||||
|
'Bytes/second read off all sockets'), Rate())
|
||||||
|
bytes_received.add(metrics.metric_name(
|
||||||
|
'response-rate', metric_group_name,
|
||||||
|
'Responses received sent per second.'),
|
||||||
|
Rate(sampled_stat=Count()))
|
||||||
|
|
||||||
|
request_latency = metrics.sensor('request-latency')
|
||||||
|
request_latency.add(metrics.metric_name(
|
||||||
|
'request-latency-avg', metric_group_name,
|
||||||
|
'The average request latency in ms.'),
|
||||||
|
Avg())
|
||||||
|
request_latency.add(metrics.metric_name(
|
||||||
|
'request-latency-max', metric_group_name,
|
||||||
|
'The maximum request latency in ms.'),
|
||||||
|
Max())
|
||||||
|
|
||||||
|
# if one sensor of the metrics has been registered for the connection,
|
||||||
|
# then all other sensors should have been registered; and vice versa
|
||||||
|
node_str = 'node-{0}'.format(node_id)
|
||||||
|
node_sensor = metrics.get_sensor(node_str + '.bytes-sent')
|
||||||
|
if not node_sensor:
|
||||||
|
metric_group_name = metric_group_prefix + '-node-metrics.' + node_str
|
||||||
|
|
||||||
|
self.bytes_sent = metrics.sensor(
|
||||||
|
node_str + '.bytes-sent',
|
||||||
|
parents=[metrics.get_sensor('bytes-sent')])
|
||||||
|
self.bytes_sent.add(metrics.metric_name(
|
||||||
|
'outgoing-byte-rate', metric_group_name,
|
||||||
|
'The average number of outgoing bytes sent per second.'),
|
||||||
|
Rate())
|
||||||
|
self.bytes_sent.add(metrics.metric_name(
|
||||||
|
'request-rate', metric_group_name,
|
||||||
|
'The average number of requests sent per second.'),
|
||||||
|
Rate(sampled_stat=Count()))
|
||||||
|
self.bytes_sent.add(metrics.metric_name(
|
||||||
|
'request-size-avg', metric_group_name,
|
||||||
|
'The average size of all requests in the window.'),
|
||||||
|
Avg())
|
||||||
|
self.bytes_sent.add(metrics.metric_name(
|
||||||
|
'request-size-max', metric_group_name,
|
||||||
|
'The maximum size of any request sent in the window.'),
|
||||||
|
Max())
|
||||||
|
|
||||||
|
self.bytes_received = metrics.sensor(
|
||||||
|
node_str + '.bytes-received',
|
||||||
|
parents=[metrics.get_sensor('bytes-received')])
|
||||||
|
self.bytes_received.add(metrics.metric_name(
|
||||||
|
'incoming-byte-rate', metric_group_name,
|
||||||
|
'Bytes/second read off node-connection socket'),
|
||||||
|
Rate())
|
||||||
|
self.bytes_received.add(metrics.metric_name(
|
||||||
|
'response-rate', metric_group_name,
|
||||||
|
'The average number of responses received per second.'),
|
||||||
|
Rate(sampled_stat=Count()))
|
||||||
|
|
||||||
|
self.request_time = self.metrics.sensor(
|
||||||
|
node_str + '.latency',
|
||||||
|
parents=[metrics.get_sensor('request-latency')])
|
||||||
|
self.request_time.add(metrics.metric_name(
|
||||||
|
'request-latency-avg', metric_group_name,
|
||||||
|
'The average request latency in ms.'),
|
||||||
|
Avg())
|
||||||
|
self.request_time.add(metrics.metric_name(
|
||||||
|
'request-latency-max', metric_group_name,
|
||||||
|
'The maximum request latency in ms.'),
|
||||||
|
Max())
|
||||||
|
|
||||||
|
|
||||||
def _address_family(address):
|
def _address_family(address):
|
||||||
"""
|
"""
|
||||||
Attempt to determine the family of an address (or hostname)
|
Attempt to determine the family of an address (or hostname)
|
||||||
|
@@ -204,7 +204,6 @@ class Sender(threading.Thread):
|
|||||||
batch = batches_by_partition[tp]
|
batch = batches_by_partition[tp]
|
||||||
self._complete_batch(batch, error, offset, ts)
|
self._complete_batch(batch, error, offset, ts)
|
||||||
|
|
||||||
self._sensors.record_latency((time.time() - send_time) * 1000, node=node_id)
|
|
||||||
if response.API_VERSION > 0:
|
if response.API_VERSION > 0:
|
||||||
self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id)
|
self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id)
|
||||||
|
|
||||||
@@ -343,15 +342,6 @@ class SenderMetrics(object):
|
|||||||
sensor_name=sensor_name,
|
sensor_name=sensor_name,
|
||||||
description='The maximum time in ms record batches spent in the record accumulator.')
|
description='The maximum time in ms record batches spent in the record accumulator.')
|
||||||
|
|
||||||
sensor_name = 'request-time'
|
|
||||||
self.request_time_sensor = self.metrics.sensor(sensor_name)
|
|
||||||
self.add_metric('request-latency-avg', Avg(),
|
|
||||||
sensor_name=sensor_name,
|
|
||||||
description='The average request latency in ms')
|
|
||||||
self.add_metric('request-latency-max', Max(),
|
|
||||||
sensor_name=sensor_name,
|
|
||||||
description='The maximum request latency in ms')
|
|
||||||
|
|
||||||
sensor_name = 'produce-throttle-time'
|
sensor_name = 'produce-throttle-time'
|
||||||
self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name)
|
self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name)
|
||||||
self.add_metric('produce-throttle-time-avg', Avg(),
|
self.add_metric('produce-throttle-time-avg', Avg(),
|
||||||
@@ -498,12 +488,5 @@ class SenderMetrics(object):
|
|||||||
if sensor:
|
if sensor:
|
||||||
sensor.record(count)
|
sensor.record(count)
|
||||||
|
|
||||||
def record_latency(self, latency, node=None):
|
|
||||||
self.request_time_sensor.record(latency)
|
|
||||||
if node is not None:
|
|
||||||
sensor = self.metrics.get_sensor('node-' + str(node) + '.latency')
|
|
||||||
if sensor:
|
|
||||||
sensor.record(latency)
|
|
||||||
|
|
||||||
def record_throttle_time(self, throttle_time_ms, node=None):
|
def record_throttle_time(self, throttle_time_ms, node=None):
|
||||||
self.produce_throttle_time_sensor.record(throttle_time_ms)
|
self.produce_throttle_time_sensor.record(throttle_time_ms)
|
||||||
|
@@ -49,6 +49,7 @@ def test_bootstrap_success(conn):
|
|||||||
args, kwargs = conn.call_args
|
args, kwargs = conn.call_args
|
||||||
assert args == ('localhost', 9092, socket.AF_UNSPEC)
|
assert args == ('localhost', 9092, socket.AF_UNSPEC)
|
||||||
kwargs.pop('state_change_callback')
|
kwargs.pop('state_change_callback')
|
||||||
|
kwargs.pop('node_id')
|
||||||
assert kwargs == cli.config
|
assert kwargs == cli.config
|
||||||
conn.connect.assert_called_with()
|
conn.connect.assert_called_with()
|
||||||
conn.send.assert_called_once_with(MetadataRequest[0]([]))
|
conn.send.assert_called_once_with(MetadataRequest[0]([]))
|
||||||
@@ -62,6 +63,7 @@ def test_bootstrap_failure(conn):
|
|||||||
args, kwargs = conn.call_args
|
args, kwargs = conn.call_args
|
||||||
assert args == ('localhost', 9092, socket.AF_UNSPEC)
|
assert args == ('localhost', 9092, socket.AF_UNSPEC)
|
||||||
kwargs.pop('state_change_callback')
|
kwargs.pop('state_change_callback')
|
||||||
|
kwargs.pop('node_id')
|
||||||
assert kwargs == cli.config
|
assert kwargs == cli.config
|
||||||
conn.connect.assert_called_with()
|
conn.connect.assert_called_with()
|
||||||
conn.close.assert_called_with()
|
conn.close.assert_called_with()
|
||||||
|
Reference in New Issue
Block a user