Merge "Collector rpc datasource works at 200k entities."

This commit is contained in:
Zuul 2018-08-07 06:37:51 +00:00 committed by Gerrit Code Review
commit 4b7dac42a6
11 changed files with 115 additions and 28 deletions

View File

@ -8,6 +8,7 @@ General
.. toctree::
:maxdepth: 1
high-scale
resource-state-config
alarm-severity-config
profiler-config

View File

@ -0,0 +1,57 @@
================================
Configure Vitrage for high-scale
================================
In a production environment with more than 50,000 entities, the following configuration changes are suggested:
Tune RPC
--------
Vitrage-graph uses RPC to request data from vitrage-collector. At high scale these requests take longer, so the RPC response timeout should be increased.
The following should be set in ``/etc/vitrage/vitrage.conf``, under the ``[DEFAULT]`` section (an example snippet follows the table):
+----------------------+---------------------------------------------------------+-----------------+-----------------+
| Name | Description | Default Value | Suggested Value |
+======================+=========================================================+=================+=================+
| rpc_response_timeout | Seconds to wait for a response from a call | 60 | 300 |
+----------------------+---------------------------------------------------------+-----------------+-----------------+
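For example, the corresponding fragment of ``/etc/vitrage/vitrage.conf`` might look like this (only the relevant option is shown)::

    [DEFAULT]
    # wait up to 5 minutes for a collector RPC reply
    rpc_response_timeout = 300
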
To apply, restart these services:
``sudo service vitrage-graph restart``
``sudo service vitrage-collector restart``
Also restart the Vitrage API (either vitrage-api or Apache).
Tune Memory
-----------
Most of the data is held in memory. To conserve memory, the number of evaluator workers should be decreased.
If many Vitrage templates are used, the number of evaluator workers can be increased, but it should be kept to the minimum needed.
The following should be set in ``/etc/vitrage/vitrage.conf``, under the ``[evaluator]`` section (an example snippet follows the table):
+----------------------+---------------------------------------------------------+-----------------+-----------------+
| Name | Description | Default Value | Suggested Value |
+======================+=========================================================+=================+=================+
| workers | Number of workers for template evaluator | number of cores | 1 |
+----------------------+---------------------------------------------------------+-----------------+-----------------+
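For example, the corresponding fragment of ``/etc/vitrage/vitrage.conf`` might look like this::

    [evaluator]
    # run a single template evaluator worker to conserve memory
    workers = 1
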
To apply, run ``sudo service vitrage-graph restart``
Tune MySQL
----------
Vitrage periodically persists the graph to MySQL as a blob. As the graph grows, it is recommended to increase the MySQL ``max_allowed_packet`` setting.
The following should be set in ``/etc/mysql/my.cnf``, under the ``[mysqld]`` section (an example snippet follows the table):
+----------------------+---------------------------------------------------------+-----------------+-----------------+
| Name | Description | Default Value | Suggested Value |
+======================+=========================================================+=================+=================+
| max_allowed_packet | The maximum size of one packet or any string | 4M-64M | 100M |
+----------------------+---------------------------------------------------------+-----------------+-----------------+
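For example, the corresponding fragment of ``/etc/mysql/my.cnf`` might look like this::

    [mysqld]
    # allow larger packets so the persisted graph blob fits in a single packet
    max_allowed_packet = 100M
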
To apply, run ``sudo service mysql restart``

View File

@ -0,0 +1,3 @@
features:
- Support for graphs with more than 100,000 vertices has been added and
tested. See the high-scale configuration document.

View File

@ -52,8 +52,8 @@ class DriverBase(object):
@classmethod
def make_pickleable(cls, entities, entity_type, datasource_action, *args):
pickleable_entities = cls.prepare_entities(entities, entity_type,
datasource_action, args)
pickleable_entities = cls.make_pickleable_without_end_msg(
entities, entity_type, datasource_action, *args)
if datasource_action == DatasourceAction.INIT_SNAPSHOT:
pickleable_entities.append(cls._get_end_message(entity_type))
@ -63,12 +63,6 @@ class DriverBase(object):
@classmethod
def make_pickleable_without_end_msg(cls, entities, entity_type,
datasource_action, *args):
pickleable_entities = cls.prepare_entities(entities, entity_type,
datasource_action, args)
return pickleable_entities
@classmethod
def prepare_entities(cls, entities, entity_type, datasource_action, args):
pickleable_entities = []
for entity in entities:
for field in args:
@ -80,6 +74,18 @@ class DriverBase(object):
pickleable_entities.append(entity)
return pickleable_entities
@classmethod
def make_pickleable_iter(cls, entities, entity_type,
datasource_action, *args):
for entity in entities:
for field in args:
entity.pop(field, None)
cls._add_entity_type(entity, entity_type)
cls._add_datasource_action(entity, datasource_action)
cls._add_sampling_time(entity)
yield entity
@staticmethod
def _add_entity_type(entity, entity_type):
if DSProps.ENTITY_TYPE not in entity:

View File

@ -43,4 +43,5 @@ class NetworkDriver(NeutronBase):
return self.make_pickleable(
self.client.list_networks()['networks'],
NEUTRON_NETWORK_DATASOURCE,
datasource_action)
datasource_action,
*self.properties_to_filter_out())

View File

@ -44,4 +44,5 @@ class PortDriver(NeutronBase):
return self.make_pickleable(
ports,
NEUTRON_PORT_DATASOURCE,
datasource_action)
datasource_action,
*self.properties_to_filter_out())

View File

@ -11,9 +11,11 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import base64
from concurrent import futures
from six.moves import cPickle
import time
import zlib
from oslo_log import log
@ -44,6 +46,11 @@ class CollectorRpcHandlerService(object):
LOG.info("Collector Rpc Handler Service - Stopped!")
def compress_events(events):
str_data = cPickle.dumps(events, cPickle.HIGHEST_PROTOCOL)
return base64.b64encode(zlib.compress(str_data))
class DriversEndpoint(object):
def __init__(self, conf):
@ -71,7 +78,8 @@ class DriversEndpoint(object):
time.sleep(fault_interval)
result.extend(list(self.pool.map(run_driver, failed_drivers)))
events = [e for success, events in result if success for e in events]
events = compress_events([e for success, events in result if success
for e in events])
LOG.debug("run drivers get_all done.")
return events

View File

@ -11,7 +11,10 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from base64 import standard_b64decode
from six.moves import cPickle
import time
import zlib
from oslo_log import log
import oslo_messaging
@ -35,12 +38,15 @@ def get_all(rpc_client, events_coordination, driver_names, action,
t1 = time.time()
def _call():
return rpc_client.call(
result = rpc_client.call(
{},
'driver_get_all',
driver_names=driver_names,
action=action,
retry_on_fault=retry_on_fault)
events = cPickle.loads(zlib.decompress(standard_b64decode(result)))
for e in events:
yield e
try:
events = _call()
@ -48,10 +54,10 @@ def get_all(rpc_client, events_coordination, driver_names, action,
LOG.exception('Got MessagingTimeout')
events = _call() if retry_on_fault else []
t2 = time.time()
events_coordination.handle_multiple_low_priority(events)
count = events_coordination.handle_multiple_low_priority(events)
t3 = time.time()
LOG.info('get_all took %s, processing took %s for %s events',
t2 - t1, t3 - t2, len(events))
t2 - t1, t3 - t2, count)
def get_changes(rpc_client, events_coordination, driver_name):

View File

@ -173,8 +173,10 @@ class EventsCoordination(object):
self._lock.release()
def handle_multiple_low_priority(self, events):
for e in events:
index = 0
for index, e in enumerate(events):
self._do_low_priority_work(e)
return index
def _init_listener(self, topic, callback):
if not topic:

View File

@ -322,7 +322,10 @@ class NXGraph(Graph):
node_link_data['links'][i]['target'] = vitrage_id_to_index[
node_link_data['links'][i]['target']]
return json.dumps(node_link_data)
if kwargs.get('raw', False):
return node_link_data
else:
return json.dumps(node_link_data)
def write_gpickle(self):
return cPickle.dumps(self._g, cPickle.HIGHEST_PROTOCOL)

View File

@ -11,8 +11,6 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import json
from oslo_log import log
@ -30,7 +28,7 @@ class MockDriver(StaticDriver):
def __init__(self, conf):
super(StaticDriver, self).__init__()
mock_cfg = conf.mock_graph_datasource
self.e_graph = GraphGenerator(
e_graph = GraphGenerator(
networks=mock_cfg.networks,
zones_per_cluster=mock_cfg.zones_per_cluster,
hosts_per_zone=mock_cfg.hosts_per_zone,
@ -42,19 +40,20 @@ class MockDriver(StaticDriver):
tripleo_controllers=mock_cfg.tripleo_controllers,
zabbix_alarms_per_controller=mock_cfg.zabbix_alarms_per_controller
).create_graph()
definitions = e_graph.json_output_graph(raw=True)
self.mock_entities = self._get_mock_entities(definitions)
def get_all(self, datasource_action):
return self.make_pickleable(self._get_mock_entities(),
MOCK_DATASOURCE,
datasource_action)
return self.make_pickleable_iter(self.mock_entities,
MOCK_DATASOURCE,
datasource_action)
def get_changes(self, datasource_action):
return self.make_pickleable([],
MOCK_DATASOURCE,
datasource_action)
return self.make_pickleable_iter([],
MOCK_DATASOURCE,
datasource_action)
def _get_mock_entities(self):
definitions = json.loads(self.e_graph.json_output_graph())
def _get_mock_entities(self, definitions):
for node in definitions['nodes']:
node[StaticFields.STATIC_ID] = str(node[VProps.GRAPH_INDEX])
node[StaticFields.TYPE] = node[VProps.VITRAGE_TYPE]