Merge "[HBase] get_resource optimization"

Jenkins 2014-06-15 21:40:02 +00:00 committed by Gerrit Code Review
commit 3efe03b60a


@@ -20,12 +20,13 @@
import copy
import datetime
import hashlib
import itertools
import json
import operator
import os
import re
import six
import six.moves.urllib.parse as urlparse
import time
import bson.json_util
import happybase
@@ -34,7 +35,6 @@ from ceilometer.openstack.common.gettextutils import _
from ceilometer.openstack.common import log
from ceilometer.openstack.common import network_utils
from ceilometer.openstack.common import timeutils
from ceilometer import storage
from ceilometer.storage import base
from ceilometer.storage import models
from ceilometer import utils
@@ -107,8 +107,8 @@ class Connection(base.Connection):
f:r_metadata.display_name or f:r_metadata.tag
-sources for all corresponding meters with prefix 's'
-all meters for this resource in format
"%s!%s!%s+%s" % (counter_name, counter_type, counter_unit,
source)
"%s+%s+%s!%s!%s" % (rts, source, counter_name, counter_type,
counter_unit)
- alarm
- row_key: uuid of alarm
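To illustrate the new meter-qualifier layout described above (all values hypothetical): because the reversed timestamp now leads the qualifier, the meter columns of a resource sort newest-first lexicographically.

    # A newer sample gets a smaller reversed timestamp, so its qualifier
    # sorts ahead of older ones.
    newer = '9222030811134775807+openstack+cpu!gauge!ns'
    older = '9222030811134775808+openstack+cpu!gauge!ns'
    assert sorted([older, newer]) == [newer, older]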
@@ -308,16 +308,25 @@ class Connection(base.Connection):
resource_metadata = data.get('resource_metadata', {})
        # Determine the name of the new meter
+        rts = timestamp(data['timestamp'])
new_meter = _format_meter_reference(
data['counter_name'], data['counter_type'],
-            data['counter_unit'], data['source'])
+            data['counter_unit'], rts, data['source'])
#TODO(nprivalova): try not to store resource_id
resource = serialize_entry(**{
-            'source': data['source'], 'meter': new_meter,
+            'source': data['source'],
+            'meter': {new_meter: data['timestamp']},
'resource_metadata': resource_metadata,
'resource_id': data['resource_id'],
'project_id': data['project_id'], 'user_id': data['user_id']})
-        resource_table.put(data['resource_id'], resource)
+        # Here we put the entry into HBase with our own timestamp. This is
+        # needed when samples arrive out-of-order: using a timestamp derived
+        # from data['timestamp'] means the newest data automatically ends up
+        # 'on top', which keeps the metadata up-to-date because metadata from
+        # the newest sample is considered the actual one.
+        ts = int(time.mktime(data['timestamp'].timetuple()) * 1000)
+        resource_table.put(data['resource_id'], resource, ts)
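The versioned put relies on ordinary HBase cell semantics. A minimal sketch of the idea, assuming a happybase table handle named table whose column family keeps a single version:

    # A late-arriving old sample cannot overwrite metadata written by a
    # newer one, because every put carries the sample's own timestamp.
    table.put('res-1', {'f:r_metadata.name': 'new'}, timestamp=2000)
    table.put('res-1', {'f:r_metadata.name': 'old'}, timestamp=1000)
    table.row('res-1')  # -> {'f:r_metadata.name': 'new'}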
#TODO(nprivalova): improve uniqueness
# Rowkey consists of reversed timestamp, meter and an md5 of
@@ -325,10 +334,6 @@ class Connection(base.Connection):
m = hashlib.md5()
m.update("%s%s%s" % (data['user_id'], data['resource_id'],
data['project_id']))
-        # We use reverse timestamps in rowkeys as they are sorted
-        # alphabetically.
-        rts = timestamp(data['timestamp'])
row = "%s_%d_%s" % (data['counter_name'], rts, m.hexdigest())
record = serialize_entry(data, **{'source': data['source'],
'rts': rts,
@@ -357,47 +362,38 @@ class Connection(base.Connection):
if pagination:
raise NotImplementedError('Pagination not implemented')
metaquery = metaquery or {}
-        sample_filter = storage.SampleFilter(
-            user=user, project=project,
-            start=start_timestamp, start_timestamp_op=start_timestamp_op,
-            end=end_timestamp, end_timestamp_op=end_timestamp_op,
-            resource=resource, source=source, metaquery=metaquery)
-        q, start_row, stop_row, __ = make_sample_query_from_filter(
-            sample_filter, require_meter=False)
+        q = make_query(metaquery=metaquery, user_id=user, project_id=project,
+                       resource_id=resource, source=source)
+        q = make_meter_query_for_resource(start_timestamp, start_timestamp_op,
+                                          end_timestamp, end_timestamp_op,
+                                          source, q)
with self.conn_pool.connection() as conn:
-            meter_table = conn.table(self.METER_TABLE)
-            LOG.debug(_("Query Meter table: %s") % q)
-            meters = meter_table.scan(filter=q, row_start=start_row,
-                                      row_stop=stop_row)
-            d_meters = []
-            for i, m in meters:
-                d_meters.append(deserialize_entry(m))
-            # We have to sort on resource_id before we can group by it.
-            # According to the itertools documentation a new group is
-            # generated when the value of the key function changes
-            # (it breaks there).
-            meters = sorted(d_meters, key=_resource_id_from_record_tuple)
-            for resource_id, r_meters in itertools.groupby(
-                    meters, key=_resource_id_from_record_tuple):
-                # We need deserialized entry(data[0]), sources (data[1]) and
-                # metadata(data[3])
-                meter_rows = [(data[0], data[1], data[3]) for data in sorted(
-                    r_meters, key=_timestamp_from_record_tuple)]
-                latest_data = meter_rows[-1]
-                min_ts = meter_rows[0][0]['timestamp']
-                max_ts = latest_data[0]['timestamp']
+            resource_table = conn.table(self.RESOURCE_TABLE)
+            LOG.debug(_("Query Resource table: %s") % q)
+            for resource_id, data in resource_table.scan(filter=q):
+                f_res, sources, meters, md = deserialize_entry(data)
+                # Unfortunately happybase doesn't preserve the ordering of
+                # the result it gets from HBase, so we have to find the min
+                # and max timestamps manually.
+                first_ts = min(meters, key=operator.itemgetter(1))[1]
+                last_ts = max(meters, key=operator.itemgetter(1))[1]
+                source = meters[0][0].split('+')[1]
+                # With a QualifierFilter HBase returns only the qualifiers
+                # that matched the filter, not the whole entry. That's why we
+                # need to request the additional qualifiers manually.
+                if 'project_id' not in f_res and 'user_id' not in f_res:
+                    row = resource_table.row(
+                        resource_id, columns=['f:project_id', 'f:user_id',
+                                              'f:resource_metadata'])
+                    f_res, _s, _m, md = deserialize_entry(row)
yield models.Resource(
resource_id=resource_id,
-                    first_sample_timestamp=min_ts,
-                    last_sample_timestamp=max_ts,
-                    project_id=latest_data[0]['project_id'],
-                    source=latest_data[1][0],
-                    user_id=latest_data[0]['user_id'],
-                    metadata=latest_data[2],
-                )
+                    first_sample_timestamp=first_ts,
+                    last_sample_timestamp=last_ts,
+                    project_id=f_res['project_id'],
+                    source=source,
+                    user_id=f_res['user_id'],
+                    metadata=md)
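For clarity, this is how the new (qualifier, timestamp) meter tuples are consumed above; the values are hypothetical:

    meters = [('9222030811134775808+openstack+cpu!gauge!ns', 1402861200.0),
              ('9222030811134775807+openstack+cpu!gauge!ns', 1402861260.0)]
    first_ts = min(meters, key=operator.itemgetter(1))[1]  # 1402861200.0
    last_ts = max(meters, key=operator.itemgetter(1))[1]   # 1402861260.0
    source = meters[0][0].split('+')[1]                    # 'openstack'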
def get_meters(self, user=None, project=None, resource=None, source=None,
metaquery=None, pagination=None):
@@ -430,8 +426,8 @@ class Connection(base.Connection):
for ignored, data in gen:
flatten_result, s, meters, md = deserialize_entry(data)
for m in meters:
meter_raw, m_source = m.split("+")
name, type, unit = meter_raw.split('!')
_m_rts, m_source, m_raw = m[0].split("+")
name, type, unit = m_raw.split('!')
meter_dict = {'name': name,
'type': type,
'unit': unit,
@@ -453,13 +449,15 @@ class Connection(base.Connection):
:param sample_filter: Filter.
:param limit: Maximum number of results to return.
"""
+        if limit == 0:
+            return
with self.conn_pool.connection() as conn:
meter_table = conn.table(self.METER_TABLE)
q, start, stop, columns = make_sample_query_from_filter(
sample_filter, require_meter=False)
LOG.debug(_("Query Meter Table: %s") % q)
gen = meter_table.scan(filter=q, row_start=start, row_stop=stop,
-                                   columns=columns, limit=limit)
+                                   limit=limit)
for ignored, meter in gen:
d_meter = deserialize_entry(meter)[0]
d_meter['message']['recorded_at'] = d_meter['recorded_at']
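With the added guard, limit=0 now short-circuits to an empty result instead of reaching the scan. A usage sketch, assuming conn is a driver Connection instance:

    assert list(conn.get_samples(sample_filter, limit=0)) == []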
@@ -707,6 +705,10 @@ class Connection(base.Connection):
yield trait
+def _QualifierFilter(op, qualifier):
+    return "QualifierFilter (%s, 'binaryprefix:m_%s')" % (op, qualifier)
###############
# This is a very crude version of "in-memory HBase", which implements just
# enough functionality of the HappyBase API to support testing of our driver.
@@ -717,27 +719,56 @@ class MTable(object):
def __init__(self, name, families):
self.name = name
self.families = families
-        self._rows = {}
+        self._rows_with_ts = {}
-    def row(self, key):
-        return self._rows.get(key, {})
+    def row(self, key, columns=None):
+        if key not in self._rows_with_ts:
+            return {}
+        res = copy.copy(sorted(six.iteritems(
+            self._rows_with_ts.get(key)))[-1][1])
+        if columns:
+            keys = res.keys()
+            for key in keys:
+                if key not in columns:
+                    res.pop(key)
+        return res
def rows(self, keys):
return ((k, self.row(k)) for k in keys)
-    def put(self, key, data):
-        if key not in self._rows:
-            self._rows[key] = data
+    def put(self, key, data, ts=None):
+        # Note: we use timestamped puts for only one table (Resource), so we
+        # may store ts='0' when ts is None. If both kinds of put were ever
+        # needed in one table, ts=0 could not be used this way.
+        if ts is None:
+            ts = "0"
+        if key not in self._rows_with_ts:
+            self._rows_with_ts[key] = {ts: data}
        else:
-            self._rows[key].update(data)
+            if ts in self._rows_with_ts[key]:
+                self._rows_with_ts[key][ts].update(data)
+            else:
+                self._rows_with_ts[key].update({ts: data})
def delete(self, key):
-        del self._rows[key]
+        del self._rows_with_ts[key]
+    def _get_latest_dict(self, row):
+        # The idea here is to return the latest versions of the columns.
+        # In _rows_with_ts we store {row: {ts_1: {data}, ts_2: {data}}}.
+        # Sorted by ts this gives a list of tuples [(ts_1, {data}),
+        # (ts_2, {data})], i.e. ts_2 is the latest one.
+        # To get the same result HBase would provide, we iterate in ts order
+        # and let newer data overwrite key-values coming from older data.
+        data = {}
+        for i in sorted(six.iteritems(self._rows_with_ts[row])):
+            data.update(i[1])
+        return data
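A sketch of the merge semantics on hypothetical data: newer cell versions shadow older ones column by column.

    rows_with_ts = {'res-1': {1000: {'f:a': '1', 'f:b': '1'},
                              2000: {'f:a': '2'}}}
    data = {}
    for _ts, cells in sorted(six.iteritems(rows_with_ts['res-1'])):
        data.update(cells)
    # data == {'f:a': '2', 'f:b': '1'}  (latest 'f:a' wins, 'f:b' survives)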
def scan(self, filter=None, columns=None, row_start=None, row_stop=None,
limit=None):
columns = columns or []
-        sorted_keys = sorted(self._rows)
+        sorted_keys = sorted(self._rows_with_ts)
# copy data between row_start and row_stop into a dict
rows = {}
for row in sorted_keys:
@@ -745,11 +776,11 @@ class MTable(object):
continue
if row_stop and row > row_stop:
break
-            rows[row] = copy.copy(self._rows[row])
+            rows[row] = self._get_latest_dict(row)
if columns:
ret = {}
-            for row in rows.keys():
-                data = rows[row]
+            for row, data in six.iteritems(rows):
for key in data:
if key in columns:
ret[row] = data
@@ -854,6 +885,33 @@ class MTable(object):
pass
return r
+    @staticmethod
+    def QualifierFilter(args, rows):
+        """This method is called from scan() when 'QualifierFilter'
+        is found in the 'filter' argument.
+        """
+        op = args[0]
+        value = args[1]
+        if value.startswith('binaryprefix:'):
+            value = value[len('binaryprefix:'):]
+        column = 'f:' + value
+        r = {}
+        for row in rows:
+            data = rows[row]
+            r_data = {}
+            for key in data:
+                if op == '=':
+                    if key.startswith(column):
+                        r_data[key] = data[key]
+                elif op == '>=':
+                    if key >= column:
+                        r_data[key] = data[key]
+                elif op == '<=':
+                    if key <= column:
+                        r_data[key] = data[key]
+                else:
+                    raise NotImplementedError("In-memory QualifierFilter "
+                                              "doesn't support the %s "
+                                              "operation yet" % op)
+            if r_data:
+                r[row] = r_data
+        return r
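Applying the in-memory filter directly on hypothetical rows also shows why make_meter_query_for_resource adds the extra m_ prefix filter:

    rows = {'res-1': {'f:m_9222+src+cpu!gauge!ns': '1', 'f:s_test': '1'}}
    MTable.QualifierFilter(('=', 'binaryprefix:m_'), rows)
    # -> {'res-1': {'f:m_9222+src+cpu!gauge!ns': '1'}}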
class MConnectionPool(object):
def __init__(self):
@@ -898,14 +956,14 @@ class MConnection(object):
def timestamp(dt, reverse=True):
"""Timestamp is count of milliseconds since start of epoch.
-    Timestamps is a technique used in HBase rowkey design. When period
-    queries are required the HBase rowkeys must include timestamps, but
-    rowkeys in HBase are ordered lexicographically.
-    Same for the reversed timestamps, but the order will be opposite.
+    If reverse=True then the timestamp will be reversed. Such a technique is
+    used in HBase rowkey design when period queries are required. Because
+    rows are sorted lexicographically, it's possible to choose whether the
+    'oldest' entries will be on top of the table or the newest ones
+    (the reversed timestamp case).
-    :param: dt: datetime which is translated to the (reversed or not)
-    timestamp
-    :param: reverse: is a boolean parameter for reverse or straight count of
+    :param: dt: datetime which is translated to a timestamp
+    :param: reverse: a boolean parameter for reverse or straight count of
timestamp in milliseconds
:return: count or reversed count of milliseconds since start of epoch
"""
@@ -986,7 +1044,7 @@ def get_start_end_rts(start, start_op, end, end_op):
rts_start = str(timestamp(start) + 1) if start else ""
rts_end = str(timestamp(end) + 1) if end else ""
-    #By default, we are using ge for lower bound and lt for upper bound
+    # By default, we are using ge for lower bound and lt for upper bound
if start_op == 'gt':
rts_start = str(long(rts_start) - 2)
if end_op == 'le':
@@ -1006,9 +1064,9 @@ def make_query(metaquery=None, trait_query=None, **kwargs):
q = []
res_q = None
-    # Query for traits if a little differ from others it is constructed with
-    # SingleColumnValueFilter with the possibility to choose an operator for
-    # value
+    # A query for traits differs a little from the others: it is constructed
+    # with SingleColumnValueFilter and allows choosing the comparison
+    # operator for the value.
if trait_query:
trait_name = kwargs.pop('key')
op = kwargs.pop('op', 'eq')
@@ -1105,6 +1163,43 @@ def make_sample_query_from_filter(sample_filter, require_meter=True):
return res_q, start_row, end_row, columns
+def make_meter_query_for_resource(start_timestamp, start_timestamp_op,
+                                  end_timestamp, end_timestamp_op, source,
+                                  query=None):
+    """This method is used when the Resource table should be filtered by
+    meters. In this method we are looking into all qualifiers with the
+    m_ prefix.
+    :param start_timestamp: meter's timestamp start range.
+    :param start_timestamp_op: meter's start time operator, like ge, gt.
+    :param end_timestamp: meter's timestamp end range.
+    :param end_timestamp_op: meter's end time operator, like lt, le.
+    :param source: source filter.
+    :param query: a query string to concatenate with.
+    """
+    start_rts, end_rts = get_start_end_rts(start_timestamp,
+                                           start_timestamp_op,
+                                           end_timestamp, end_timestamp_op)
+    mq = []
+    if start_rts:
+        filter_value = start_rts + '+' + source if source else start_rts
+        mq.append(_QualifierFilter("<=", filter_value))
+    if end_rts:
+        filter_value = end_rts + '+' + source if source else end_rts
+        mq.append(_QualifierFilter(">=", filter_value))
+    if mq:
+        meter_q = " AND ".join(mq)
+        # If we filter on a time range, we need to state that the qualifiers
+        # should start with m_. Otherwise a qualifier like 's_test' would
+        # also satisfy e.g.
+        # QualifierFilter (>=, 'binaryprefix:m_9222030811134775808')
+        # and be returned.
+        meter_q = _QualifierFilter("=", '') + " AND " + meter_q
+        query = meter_q if not query else query + " AND " + meter_q
+    return query
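Putting it together, a time-range query over the Resource table comes out roughly like this (reversed timestamp values hypothetical):

    q = make_meter_query_for_resource(start, 'ge', end, 'lt', 'openstack',
                                      query=None)
    # q == "QualifierFilter (=, 'binaryprefix:m_') AND "
    #      "QualifierFilter (<=, 'binaryprefix:m_9222030811134779000+openstack') AND "
    #      "QualifierFilter (>=, 'binaryprefix:m_9222030811134775000+openstack')"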
def _make_general_rowkey_scan(rts_start=None, rts_end=None, some_id=None):
"""If it's filter on some_id without start and end,
start_row = some_id while end_row = some_id + MAX_BYTE
@@ -1119,10 +1214,10 @@ def _make_general_rowkey_scan(rts_start=None, rts_end=None, some_id=None):
return start_row, end_row
-def _format_meter_reference(counter_name, counter_type, counter_unit, source):
+def _format_meter_reference(c_name, c_type, c_unit, rts, source):
"""Format reference to meter data.
"""
return "%s!%s!%s+%s" % (counter_name, counter_type, counter_unit, source)
return "%s+%s+%s!%s!%s" % (rts, source, c_name, c_type, c_unit)
def _timestamp_from_record_tuple(record):
@@ -1158,8 +1253,9 @@ def deserialize_entry(entry, get_raw_meta=True):
sources.append(k[4:])
elif k.startswith('f:r_metadata.'):
metadata_flattened[k[len('f:r_metadata.'):]] = load(v)
-        elif k.startswith('f:m_'):
-            meters.append(k[4:])
+        elif k.startswith("f:m_"):
+            meter = (k[4:], load(v))
+            meters.append(meter)
else:
flatten_result[k[2:]] = load(v)
if get_raw_meta:
@@ -1189,9 +1285,9 @@ def serialize_entry(data=None, **kwargs):
# a separate cell. For this purpose s_ and m_ prefixes are
# introduced.
result['f:s_%s' % v] = dump('1')
elif k == 'meter':
-            result['f:m_%s' % v] = dump('1')
+            for meter, ts in v.items():
+                result['f:m_%s' % meter] = dump(ts)
elif k == 'resource_metadata':
            # keep raw metadata as well as flattened to provide
            # compatibility with API v2. It will be flattened in another
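End to end, the meter cell now stores the sample's timestamp instead of the constant '1', and deserialize_entry (above) hands it back as a (qualifier, timestamp) tuple; a hypothetical round trip:

    entry = serialize_entry(**{'meter': {'9222+openstack+cpu!gauge!ns':
                                         1402861200.0}})
    # entry contains {'f:m_9222+openstack+cpu!gauge!ns': dump(1402861200.0)}
    # deserialize_entry(entry) then yields
    #     meters == [('9222+openstack+cpu!gauge!ns', 1402861200.0)]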