diff --git a/doc/source/contributor/configuration.rst b/doc/source/contributor/configuration.rst index 77f4a0bfd..d26883200 100644 --- a/doc/source/contributor/configuration.rst +++ b/doc/source/contributor/configuration.rst @@ -8,6 +8,7 @@ General .. toctree:: :maxdepth: 1 + high-scale resource-state-config alarm-severity-config profiler-config diff --git a/doc/source/contributor/high-scale.rst b/doc/source/contributor/high-scale.rst new file mode 100644 index 000000000..287de67d5 --- /dev/null +++ b/doc/source/contributor/high-scale.rst @@ -0,0 +1,57 @@ +================================ +Configure Vitrage for high-scale +================================ +In a production environment with more than 50,000 entities, the following configuration changes are suggested: + + +Tune RPC +-------- + +Vitrage-graph uses RPC to request data from vitrage-collector. At high scale these requests take longer, so the timeout needs to be increased. +The following should be set in ``/etc/vitrage/vitrage.conf``, under the ``[DEFAULT]`` section: + ++----------------------+---------------------------------------------------------+-----------------+-----------------+ +| Name | Description | Default Value | Suggested Value | ++======================+=========================================================+=================+=================+ +| rpc_response_timeout | Seconds to wait for a response from a call | 60 | 300 | ++----------------------+---------------------------------------------------------+-----------------+-----------------+ + +To apply, restart these: + +``sudo service vitrage-graph restart`` + +``sudo service vitrage-collector restart`` + +Restart the Vitrage API (either vitrage-api or apache) + + +Tune Memory +----------- + +Most of the data is held in memory. To reduce memory usage, the number of evaluator workers should be decreased. +If many Vitrage templates are used, the number of evaluator workers can be increased, but it should be kept to the minimum needed. 
+ +The following should be set in ``/etc/vitrage/vitrage.conf``, under the ``[evaluator]`` section: + ++----------------------+---------------------------------------------------------+-----------------+-----------------+ +| Name | Description | Default Value | Suggested Value | ++======================+=========================================================+=================+=================+ +| workers | Number of workers for template evaluator | number of cores | 1 | ++----------------------+---------------------------------------------------------+-----------------+-----------------+ + +To apply, run ``sudo service vitrage-graph restart`` + + +Tune MySQL +---------- +Vitrage periodically persists the graph to MySQL as a blob. As the graph size increases, it is recommended to increase the MySQL ``max_allowed_packet`` setting. + +The following should be set in ``/etc/mysql/my.cnf``, under the ``[mysqld]`` section: + ++----------------------+---------------------------------------------------------+-----------------+-----------------+ +| Name | Description | Default Value | Suggested Value | ++======================+=========================================================+=================+=================+ +| max_allowed_packet | The maximum size of one packet or any string | 4M-64M | 100M | ++----------------------+---------------------------------------------------------+-----------------+-----------------+ + +To apply, run ``sudo service mysql restart`` diff --git a/releasenotes/notes/support_high_scale-fa1053f06954aed7.yaml b/releasenotes/notes/support_high_scale-fa1053f06954aed7.yaml new file mode 100644 index 000000000..ade368bc2 --- /dev/null +++ b/releasenotes/notes/support_high_scale-fa1053f06954aed7.yaml @@ -0,0 +1,3 @@ +features: + - Support for graphs with more than 100,000 vertices has been added and + tested. See the high-scale configuration document. 
diff --git a/vitrage/datasources/driver_base.py b/vitrage/datasources/driver_base.py index 5c004a18d..96afe401f 100644 --- a/vitrage/datasources/driver_base.py +++ b/vitrage/datasources/driver_base.py @@ -52,8 +52,8 @@ class DriverBase(object): @classmethod def make_pickleable(cls, entities, entity_type, datasource_action, *args): - pickleable_entities = cls.prepare_entities(entities, entity_type, - datasource_action, args) + pickleable_entities = cls.make_pickleable_without_end_msg( + entities, entity_type, datasource_action, *args) if datasource_action == DatasourceAction.INIT_SNAPSHOT: pickleable_entities.append(cls._get_end_message(entity_type)) @@ -63,12 +63,6 @@ class DriverBase(object): @classmethod def make_pickleable_without_end_msg(cls, entities, entity_type, datasource_action, *args): - pickleable_entities = cls.prepare_entities(entities, entity_type, - datasource_action, args) - return pickleable_entities - - @classmethod - def prepare_entities(cls, entities, entity_type, datasource_action, args): pickleable_entities = [] for entity in entities: for field in args: @@ -80,6 +74,18 @@ class DriverBase(object): pickleable_entities.append(entity) return pickleable_entities + @classmethod + def make_pickleable_iter(cls, entities, entity_type, + datasource_action, *args): + for entity in entities: + for field in args: + entity.pop(field, None) + + cls._add_entity_type(entity, entity_type) + cls._add_datasource_action(entity, datasource_action) + cls._add_sampling_time(entity) + yield entity + @staticmethod def _add_entity_type(entity, entity_type): if DSProps.ENTITY_TYPE not in entity: diff --git a/vitrage/datasources/neutron/network/driver.py b/vitrage/datasources/neutron/network/driver.py index d23019fd6..73811779b 100644 --- a/vitrage/datasources/neutron/network/driver.py +++ b/vitrage/datasources/neutron/network/driver.py @@ -43,4 +43,5 @@ class NetworkDriver(NeutronBase): return self.make_pickleable( self.client.list_networks()['networks'], 
NEUTRON_NETWORK_DATASOURCE, - datasource_action) + datasource_action, + *self.properties_to_filter_out()) diff --git a/vitrage/datasources/neutron/port/driver.py b/vitrage/datasources/neutron/port/driver.py index 018743be3..fa53aef2a 100644 --- a/vitrage/datasources/neutron/port/driver.py +++ b/vitrage/datasources/neutron/port/driver.py @@ -44,4 +44,5 @@ class PortDriver(NeutronBase): return self.make_pickleable( ports, NEUTRON_PORT_DATASOURCE, - datasource_action) + datasource_action, + *self.properties_to_filter_out()) diff --git a/vitrage/datasources/rpc_service.py b/vitrage/datasources/rpc_service.py index 4a3f8f0d9..cf25230bc 100644 --- a/vitrage/datasources/rpc_service.py +++ b/vitrage/datasources/rpc_service.py @@ -11,9 +11,11 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. - +import base64 from concurrent import futures +from six.moves import cPickle import time +import zlib from oslo_log import log @@ -44,6 +46,11 @@ class CollectorRpcHandlerService(object): LOG.info("Collector Rpc Handler Service - Stopped!") +def compress_events(events): + str_data = cPickle.dumps(events, cPickle.HIGHEST_PROTOCOL) + return base64.b64encode(zlib.compress(str_data)) + + class DriversEndpoint(object): def __init__(self, conf): @@ -71,7 +78,8 @@ class DriversEndpoint(object): time.sleep(fault_interval) result.extend(list(self.pool.map(run_driver, failed_drivers))) - events = [e for success, events in result if success for e in events] + events = compress_events([e for success, events in result if success + for e in events]) LOG.debug("run drivers get_all done.") return events diff --git a/vitrage/entity_graph/datasource_rpc.py b/vitrage/entity_graph/datasource_rpc.py index 2a80d11ce..cd3d5a851 100644 --- a/vitrage/entity_graph/datasource_rpc.py +++ b/vitrage/entity_graph/datasource_rpc.py @@ -11,7 +11,10 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, 
either express or implied. See the # License for the specific language governing permissions and limitations # under the License. +from base64 import standard_b64decode +from six.moves import cPickle import time +import zlib from oslo_log import log import oslo_messaging @@ -35,12 +38,15 @@ def get_all(rpc_client, events_coordination, driver_names, action, t1 = time.time() def _call(): - return rpc_client.call( + result = rpc_client.call( {}, 'driver_get_all', driver_names=driver_names, action=action, retry_on_fault=retry_on_fault) + events = cPickle.loads(zlib.decompress(standard_b64decode(result))) + for e in events: + yield e try: events = _call() @@ -48,10 +54,10 @@ def get_all(rpc_client, events_coordination, driver_names, action, LOG.exception('Got MessagingTimeout') events = _call() if retry_on_fault else [] t2 = time.time() - events_coordination.handle_multiple_low_priority(events) + count = events_coordination.handle_multiple_low_priority(events) t3 = time.time() LOG.info('get_all took %s, processing took %s for %s events', - t2 - t1, t3 - t2, len(events)) + t2 - t1, t3 - t2, count) def get_changes(rpc_client, events_coordination, driver_name): diff --git a/vitrage/entity_graph/graph_init.py b/vitrage/entity_graph/graph_init.py index 5694e8e2d..5a463b657 100644 --- a/vitrage/entity_graph/graph_init.py +++ b/vitrage/entity_graph/graph_init.py @@ -173,8 +173,10 @@ class EventsCoordination(object): self._lock.release() def handle_multiple_low_priority(self, events): - for e in events: + index = 0 + for index, e in enumerate(events): self._do_low_priority_work(e) + return index def _init_listener(self, topic, callback): if not topic: diff --git a/vitrage/graph/driver/networkx_graph.py b/vitrage/graph/driver/networkx_graph.py index be6749139..e2d8d9d6e 100644 --- a/vitrage/graph/driver/networkx_graph.py +++ b/vitrage/graph/driver/networkx_graph.py @@ -322,7 +322,10 @@ class NXGraph(Graph): node_link_data['links'][i]['target'] = vitrage_id_to_index[ 
node_link_data['links'][i]['target']] - return json.dumps(node_link_data) + if kwargs.get('raw', False): + return node_link_data + else: + return json.dumps(node_link_data) def write_gpickle(self): return cPickle.dumps(self._g, cPickle.HIGHEST_PROTOCOL) diff --git a/vitrage/tests/mocks/mock_graph_datasource/driver.py b/vitrage/tests/mocks/mock_graph_datasource/driver.py index ff9e3cc2b..9403253e8 100644 --- a/vitrage/tests/mocks/mock_graph_datasource/driver.py +++ b/vitrage/tests/mocks/mock_graph_datasource/driver.py @@ -11,8 +11,6 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. -import json - from oslo_log import log @@ -30,7 +28,7 @@ class MockDriver(StaticDriver): def __init__(self, conf): super(StaticDriver, self).__init__() mock_cfg = conf.mock_graph_datasource - self.e_graph = GraphGenerator( + e_graph = GraphGenerator( networks=mock_cfg.networks, zones_per_cluster=mock_cfg.zones_per_cluster, hosts_per_zone=mock_cfg.hosts_per_zone, @@ -42,19 +40,20 @@ class MockDriver(StaticDriver): tripleo_controllers=mock_cfg.tripleo_controllers, zabbix_alarms_per_controller=mock_cfg.zabbix_alarms_per_controller ).create_graph() + definitions = e_graph.json_output_graph(raw=True) + self.mock_entities = self._get_mock_entities(definitions) def get_all(self, datasource_action): - return self.make_pickleable(self._get_mock_entities(), - MOCK_DATASOURCE, - datasource_action) + return self.make_pickleable_iter(self.mock_entities, + MOCK_DATASOURCE, + datasource_action) def get_changes(self, datasource_action): - return self.make_pickleable([], - MOCK_DATASOURCE, - datasource_action) + return self.make_pickleable_iter([], + MOCK_DATASOURCE, + datasource_action) - def _get_mock_entities(self): - definitions = json.loads(self.e_graph.json_output_graph()) + def _get_mock_entities(self, definitions): for node in definitions['nodes']: 
node[StaticFields.STATIC_ID] = str(node[VProps.GRAPH_INDEX]) node[StaticFields.TYPE] = node[VProps.VITRAGE_TYPE]