Merge "Collector rpc datasource works at 200k entities."
This commit is contained in:
commit
4b7dac42a6
@ -8,6 +8,7 @@ General
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
high-scale
|
||||
resource-state-config
|
||||
alarm-severity-config
|
||||
profiler-config
|
||||
|
57
doc/source/contributor/high-scale.rst
Normal file
57
doc/source/contributor/high-scale.rst
Normal file
@ -0,0 +1,57 @@
|
||||
================================
|
||||
Configure Vitrage for high-scale
|
||||
================================
|
||||
In a production environment with > 50,000 entities, the following configuration changes are suggested:
|
||||
|
||||
|
||||
Tune RPC
|
||||
--------
|
||||
|
||||
Vitrage-graph uses RPC to request data from vitrage-collector. At high scale these requests take longer, so the timeout needs to be increased.
|
||||
The following should be set in ``/etc/vitrage/vitrage.conf``, under ``[DEFAULT]`` section:
|
||||
|
||||
+----------------------+---------------------------------------------------------+-----------------+-----------------+
|
||||
| Name | Description | Default Value | Suggested Value |
|
||||
+======================+=========================================================+=================+=================+
|
||||
| rpc_response_timeout | Seconds to wait for a response from a call | 60 | 300 |
|
||||
+----------------------+---------------------------------------------------------+-----------------+-----------------+
|
||||
|
||||
To apply, restart these:
|
||||
|
||||
``sudo service vitrage-graph restart``
|
||||
|
||||
``sudo service vitrage-collector restart``
|
||||
|
||||
Restart the Vitrage API (either vitrage-api or apache)
|
||||
|
||||
|
||||
Tune Memory
|
||||
-----------
|
||||
|
||||
Most of the data is held in-memory. To conserve memory usage, the number of evaluator workers should be decreased.
|
||||
If many Vitrage templates are used, the number of evaluator workers can be increased, but it should be kept to the minimum needed.
|
||||
|
||||
The following should be set in ``/etc/vitrage/vitrage.conf``, under ``[evaluator]`` section:
|
||||
|
||||
+----------------------+---------------------------------------------------------+-----------------+-----------------+
|
||||
| Name | Description | Default Value | Suggested Value |
|
||||
+======================+=========================================================+=================+=================+
|
||||
| workers | Number of workers for template evaluator | number of cores | 1 |
|
||||
+----------------------+---------------------------------------------------------+-----------------+-----------------+
|
||||
|
||||
To apply, run ``sudo service vitrage-graph restart``
|
||||
|
||||
|
||||
Tune MySQL
|
||||
----------
|
||||
Vitrage periodically persists the graph to MySQL as a blob. As the graph size increases, it is recommended to increase the MySQL ``max_allowed_packet`` setting.
|
||||
|
||||
The following should be set in ``/etc/mysql/my.cnf``, under ``[mysqld]`` section:
|
||||
|
||||
+----------------------+---------------------------------------------------------+-----------------+-----------------+
|
||||
| Name | Description | Default Value | Suggested Value |
|
||||
+======================+=========================================================+=================+=================+
|
||||
| max_allowed_packet | The maximum size of one packet or any string | 4M-64M | 100M |
|
||||
+----------------------+---------------------------------------------------------+-----------------+-----------------+
|
||||
|
||||
To apply, run ``sudo service mysql restart``
|
@ -0,0 +1,3 @@
|
||||
features:
|
||||
- Support for graphs with more than 100,000 vertices has been added and
|
||||
tested. See the high-scale configuration document.
|
@ -52,8 +52,8 @@ class DriverBase(object):
|
||||
|
||||
@classmethod
|
||||
def make_pickleable(cls, entities, entity_type, datasource_action, *args):
|
||||
pickleable_entities = cls.prepare_entities(entities, entity_type,
|
||||
datasource_action, args)
|
||||
pickleable_entities = cls.make_pickleable_without_end_msg(
|
||||
entities, entity_type, datasource_action, *args)
|
||||
|
||||
if datasource_action == DatasourceAction.INIT_SNAPSHOT:
|
||||
pickleable_entities.append(cls._get_end_message(entity_type))
|
||||
@ -63,12 +63,6 @@ class DriverBase(object):
|
||||
@classmethod
|
||||
def make_pickleable_without_end_msg(cls, entities, entity_type,
|
||||
datasource_action, *args):
|
||||
pickleable_entities = cls.prepare_entities(entities, entity_type,
|
||||
datasource_action, args)
|
||||
return pickleable_entities
|
||||
|
||||
@classmethod
|
||||
def prepare_entities(cls, entities, entity_type, datasource_action, args):
|
||||
pickleable_entities = []
|
||||
for entity in entities:
|
||||
for field in args:
|
||||
@ -80,6 +74,18 @@ class DriverBase(object):
|
||||
pickleable_entities.append(entity)
|
||||
return pickleable_entities
|
||||
|
||||
@classmethod
|
||||
def make_pickleable_iter(cls, entities, entity_type,
|
||||
datasource_action, *args):
|
||||
for entity in entities:
|
||||
for field in args:
|
||||
entity.pop(field, None)
|
||||
|
||||
cls._add_entity_type(entity, entity_type)
|
||||
cls._add_datasource_action(entity, datasource_action)
|
||||
cls._add_sampling_time(entity)
|
||||
yield entity
|
||||
|
||||
@staticmethod
|
||||
def _add_entity_type(entity, entity_type):
|
||||
if DSProps.ENTITY_TYPE not in entity:
|
||||
|
@ -43,4 +43,5 @@ class NetworkDriver(NeutronBase):
|
||||
return self.make_pickleable(
|
||||
self.client.list_networks()['networks'],
|
||||
NEUTRON_NETWORK_DATASOURCE,
|
||||
datasource_action)
|
||||
datasource_action,
|
||||
*self.properties_to_filter_out())
|
||||
|
@ -44,4 +44,5 @@ class PortDriver(NeutronBase):
|
||||
return self.make_pickleable(
|
||||
ports,
|
||||
NEUTRON_PORT_DATASOURCE,
|
||||
datasource_action)
|
||||
datasource_action,
|
||||
*self.properties_to_filter_out())
|
||||
|
@ -11,9 +11,11 @@
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import base64
|
||||
from concurrent import futures
|
||||
from six.moves import cPickle
|
||||
import time
|
||||
import zlib
|
||||
|
||||
from oslo_log import log
|
||||
|
||||
@ -44,6 +46,11 @@ class CollectorRpcHandlerService(object):
|
||||
LOG.info("Collector Rpc Handler Service - Stopped!")
|
||||
|
||||
|
||||
def compress_events(events):
|
||||
str_data = cPickle.dumps(events, cPickle.HIGHEST_PROTOCOL)
|
||||
return base64.b64encode(zlib.compress(str_data))
|
||||
|
||||
|
||||
class DriversEndpoint(object):
|
||||
|
||||
def __init__(self, conf):
|
||||
@ -71,7 +78,8 @@ class DriversEndpoint(object):
|
||||
time.sleep(fault_interval)
|
||||
result.extend(list(self.pool.map(run_driver, failed_drivers)))
|
||||
|
||||
events = [e for success, events in result if success for e in events]
|
||||
events = compress_events([e for success, events in result if success
|
||||
for e in events])
|
||||
LOG.debug("run drivers get_all done.")
|
||||
return events
|
||||
|
||||
|
@ -11,7 +11,10 @@
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
from base64 import standard_b64decode
|
||||
from six.moves import cPickle
|
||||
import time
|
||||
import zlib
|
||||
|
||||
from oslo_log import log
|
||||
import oslo_messaging
|
||||
@ -35,12 +38,15 @@ def get_all(rpc_client, events_coordination, driver_names, action,
|
||||
t1 = time.time()
|
||||
|
||||
def _call():
|
||||
return rpc_client.call(
|
||||
result = rpc_client.call(
|
||||
{},
|
||||
'driver_get_all',
|
||||
driver_names=driver_names,
|
||||
action=action,
|
||||
retry_on_fault=retry_on_fault)
|
||||
events = cPickle.loads(zlib.decompress(standard_b64decode(result)))
|
||||
for e in events:
|
||||
yield e
|
||||
|
||||
try:
|
||||
events = _call()
|
||||
@ -48,10 +54,10 @@ def get_all(rpc_client, events_coordination, driver_names, action,
|
||||
LOG.exception('Got MessagingTimeout')
|
||||
events = _call() if retry_on_fault else []
|
||||
t2 = time.time()
|
||||
events_coordination.handle_multiple_low_priority(events)
|
||||
count = events_coordination.handle_multiple_low_priority(events)
|
||||
t3 = time.time()
|
||||
LOG.info('get_all took %s, processing took %s for %s events',
|
||||
t2 - t1, t3 - t2, len(events))
|
||||
t2 - t1, t3 - t2, count)
|
||||
|
||||
|
||||
def get_changes(rpc_client, events_coordination, driver_name):
|
||||
|
@ -173,8 +173,10 @@ class EventsCoordination(object):
|
||||
self._lock.release()
|
||||
|
||||
def handle_multiple_low_priority(self, events):
|
||||
for e in events:
|
||||
index = 0
|
||||
for index, e in enumerate(events):
|
||||
self._do_low_priority_work(e)
|
||||
return index
|
||||
|
||||
def _init_listener(self, topic, callback):
|
||||
if not topic:
|
||||
|
@ -322,7 +322,10 @@ class NXGraph(Graph):
|
||||
node_link_data['links'][i]['target'] = vitrage_id_to_index[
|
||||
node_link_data['links'][i]['target']]
|
||||
|
||||
return json.dumps(node_link_data)
|
||||
if kwargs.get('raw', False):
|
||||
return node_link_data
|
||||
else:
|
||||
return json.dumps(node_link_data)
|
||||
|
||||
def write_gpickle(self):
|
||||
return cPickle.dumps(self._g, cPickle.HIGHEST_PROTOCOL)
|
||||
|
@ -11,8 +11,6 @@
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
import json
|
||||
|
||||
from oslo_log import log
|
||||
|
||||
|
||||
@ -30,7 +28,7 @@ class MockDriver(StaticDriver):
|
||||
def __init__(self, conf):
|
||||
super(StaticDriver, self).__init__()
|
||||
mock_cfg = conf.mock_graph_datasource
|
||||
self.e_graph = GraphGenerator(
|
||||
e_graph = GraphGenerator(
|
||||
networks=mock_cfg.networks,
|
||||
zones_per_cluster=mock_cfg.zones_per_cluster,
|
||||
hosts_per_zone=mock_cfg.hosts_per_zone,
|
||||
@ -42,19 +40,20 @@ class MockDriver(StaticDriver):
|
||||
tripleo_controllers=mock_cfg.tripleo_controllers,
|
||||
zabbix_alarms_per_controller=mock_cfg.zabbix_alarms_per_controller
|
||||
).create_graph()
|
||||
definitions = e_graph.json_output_graph(raw=True)
|
||||
self.mock_entities = self._get_mock_entities(definitions)
|
||||
|
||||
def get_all(self, datasource_action):
|
||||
return self.make_pickleable(self._get_mock_entities(),
|
||||
MOCK_DATASOURCE,
|
||||
datasource_action)
|
||||
return self.make_pickleable_iter(self.mock_entities,
|
||||
MOCK_DATASOURCE,
|
||||
datasource_action)
|
||||
|
||||
def get_changes(self, datasource_action):
|
||||
return self.make_pickleable([],
|
||||
MOCK_DATASOURCE,
|
||||
datasource_action)
|
||||
return self.make_pickleable_iter([],
|
||||
MOCK_DATASOURCE,
|
||||
datasource_action)
|
||||
|
||||
def _get_mock_entities(self):
|
||||
definitions = json.loads(self.e_graph.json_output_graph())
|
||||
def _get_mock_entities(self, definitions):
|
||||
for node in definitions['nodes']:
|
||||
node[StaticFields.STATIC_ID] = str(node[VProps.GRAPH_INDEX])
|
||||
node[StaticFields.TYPE] = node[VProps.VITRAGE_TYPE]
|
||||
|
Loading…
Reference in New Issue
Block a user