From 0e97278c8acbf573b2ca3f03d77f0316fefdd8d0 Mon Sep 17 00:00:00 2001 From: Kobi Samoray Date: Tue, 12 Mar 2019 11:53:12 +0200 Subject: [PATCH] NSXv: admin util metadata breakage recovery Due to a Neutron bug, some metadata components are missing from various backend Edge appliances. This patch addresses these issues. The admin util command can run per Edge, per AZ, or for the whole cloud. Cases handled by the utility: - The internal IPs of the existing metadata proxies differ from the IPs which are defined in the Edge's loadbalancer object. This case can happen when the metadata proxies are recreated for some reason. - The Edge appliance is lacking the metadata network connectivity and the loadbalancer objects. This case can happen when a router or a DHCP edge was created by the Neutron parent process, which failed to initialize with metadata due to a bug. - The Edge is missing the metadata firewall rules. This case can happen when the first interface attachment to the router was done in the Neutron parent process context, due to the bug described above. 
Command syntax: Update AZ: nsxadmin -r metadata -o nsx-update --property az-name=az123 Update single Edge appliance: nsxadmin -r metadata -o nsx-update --property edge-id=edge-15 Update entire cloud: nsxadmin -r metadata -o nsx-update Change-Id: I77de9e0a0c627e43d3b1c95573d151e0414a34a9 --- doc/source/admin_util.rst | 10 +- .../admin/plugins/nsxv/resources/metadata.py | 294 ++++++++++++++---- 2 files changed, 243 insertions(+), 61 deletions(-) diff --git a/doc/source/admin_util.rst b/doc/source/admin_util.rst index d81c6d6c7b..b75addb6ff 100644 --- a/doc/source/admin_util.rst +++ b/doc/source/admin_util.rst @@ -287,10 +287,18 @@ Security Groups, Firewall and Spoofguard Metadata ~~~~~~~~ -- Update loadbalancer members on router and DHCP edges:: +- Update metadata infrastructure on all router and DHCP edges:: nsxadmin -r metadata -o nsx-update +- Update metadata infrastructure on availability zone's router and DHCP edges:: + + nsxadmin -r metadata -o nsx-update --property az-name=az123 + +- Update metadata infrastructure on specific router or DHCP edge:: + + nsxadmin -r metadata -o nsx-update --property edge-id=edge-15 + - Update shared secret on router and DHCP edges:: nsxadmin -r metadata -o nsx-update-secret diff --git a/vmware_nsx/shell/admin/plugins/nsxv/resources/metadata.py b/vmware_nsx/shell/admin/plugins/nsxv/resources/metadata.py index 2db78fac39..11b2287183 100644 --- a/vmware_nsx/shell/admin/plugins/nsxv/resources/metadata.py +++ b/vmware_nsx/shell/admin/plugins/nsxv/resources/metadata.py @@ -29,6 +29,7 @@ from vmware_nsx.plugins.nsx_v import availability_zones as nsx_az from vmware_nsx.plugins.nsx_v import md_proxy from vmware_nsx.plugins.nsx_v.vshield.common import constants as vcns_constants from vmware_nsx.plugins.nsx_v.vshield import nsxv_loadbalancer as nsxv_lb +from vmware_nsx.services.lbaas.nsx_v import lbaas_common as lb_common from vmware_nsx.shell.admin.plugins.common import constants from vmware_nsx.shell.admin.plugins.common import formatters 
from vmware_nsx.shell.admin.plugins.common import utils as admin_utils @@ -36,100 +37,273 @@ from vmware_nsx.shell.admin.plugins.nsxv.resources import utils as utils from vmware_nsx.shell import resources as shell +NSXV_MD_RULES = [ + {'name': 'MDServiceIP', + 'destination': {'ipAddress': ['169.254.169.254']}, + 'enabled': True, + 'application': {'service': [{'protocol': 'tcp', + 'port': [80, 443, 8775]}]}, + 'action': 'accept', + 'ruleTag': None}, + {'name': 'MDInterEdgeNet', + 'destination': {'ipAddress': ['169.254.128.0/17']}, + 'enabled': True, + 'action': 'deny', + 'ruleTag': None}] + LOG = logging.getLogger(__name__) nsxv = utils.get_nsxv_client() +def _append_md_fw_rules(fw_rules): + # Set FW rules tags + NSXV_MD_RULES[0]['ruleTag'] = len(fw_rules) + 1 + NSXV_MD_RULES[1]['ruleTag'] = len(fw_rules) + 2 + fw_rules += NSXV_MD_RULES + return fw_rules + + +def _handle_edge_firewall_rules(edge_id): + try: + h, fw_cfg = nsxv.get_firewall(edge_id) + except Exception as e: + fw_cfg = {} + LOG.error("Failed to retrieve firewall config for edge %(edge)s " + "with exception %(e)s", {'edge': edge_id, 'e': e}) + do_update = True + fw_rules = fw_cfg.get('firewallRules', {}).get('firewallRules', []) + for rule in fw_rules: + if rule['name'] in ['MDInterEdgeNet', 'MDServiceIP']: + do_update = False + break + if do_update: + fw_rules = _append_md_fw_rules(fw_rules) + fw_cfg['firewallRules']['firewallRules'] = fw_rules + try: + nsxv.update_firewall(edge_id, fw_cfg) + LOG.info('Added missing firewall rules for edge %s', edge_id) + except Exception as e: + LOG.warning("Failed to update firewall config for edge " + "%(edge)s with exception %(e)s", + {'edge': edge_id, 'e': e}) + + +def _recreate_rtr_metadata_cfg(context, plugin, az_name, edge_id): + rtr_binding = nsxv_db.get_nsxv_router_binding_by_edge( + context.session, edge_id) + md_handler = plugin.metadata_proxy_handler[az_name] + if md_handler: + try: + md_handler.configure_router_edge( + context, rtr_binding['router_id']) + 
LOG.info('Added metadata components for edge %s', + edge_id) + except Exception as e: + LOG.error('Recreation of metadata components for edge ' + '%(edge)s failed with error %(e)s', + {'edge': edge_id, 'e': e}) + + +def _update_md_lb_members(edge_id, edge_internal_ips, lb, pool): + LOG.info('Updating metadata members for edge %s', edge_id) + pool.members = {} + + i = 0 + s_port = cfg.CONF.nsxv.nova_metadata_port + for member_ip in edge_internal_ips: + i += 1 + member = nsxv_lb.NsxvLBPoolMember( + name='Member-%d' % i, + ip_address=member_ip, + port=s_port, + monitor_port=s_port) + pool.add_member(member) + + try: + lb.submit_to_backend(nsxv, edge_id) + LOG.info('Updated members for %s', edge_id) + except Exception as e: + LOG.error('Updating members for %(edge)s failed with ' + 'error %(e)s', {'edge': edge_id, 'e': e}) + + +def _get_internal_edge_ips(context, az_name): + # Get the list of internal networks for this AZ + db_net = nsxv_db.get_nsxv_internal_network_for_az( + context.session, + vcns_constants.InternalEdgePurposes.INTER_EDGE_PURPOSE, + az_name) + + internal_net = None + internal_subnet = None + if db_net: + internal_net = db_net['network_id'] + internal_subnet = context.session.query( + models_v2.Subnet).filter_by( + network_id=internal_net).first().get('id') + + # Get the list of internal edges for this AZ + edge_list = nsxv_db.get_nsxv_internal_edges_by_purpose( + context.session, + vcns_constants.InternalEdgePurposes.INTER_EDGE_PURPOSE) + edge_az_list = [edge for edge in edge_list if + nsxv_db.get_router_availability_zone( + context.session, edge['router_id']) == az_name] + + md_rtr_ids = [edge['router_id'] for edge in edge_az_list] + + edge_internal_ips = [] + for edge in edge_az_list: + edge_internal_port = context.session.query( + models_v2.Port).filter_by(network_id=internal_net, + device_id=edge['router_id']).first() + if edge_internal_port: + edge_internal_ip = context.session.query( + models_v2.IPAllocation).filter_by( + 
port_id=edge_internal_port['id']).first() + edge_internal_ips.append(edge_internal_ip['ip_address']) + + if not internal_net or not internal_subnet or not edge_internal_ips: + return None, None + + LOG.info('Metadata proxy internal IPs are %s', edge_internal_ips) + return edge_internal_ips, md_rtr_ids + + +def _handle_edge(context, plugin, az_name, edge_id, edge_internal_ips): + with locking.LockManager.get_lock(edge_id): + lb = nsxv_lb.NsxvLoadbalancer.get_loadbalancer(nsxv, edge_id) + virt = lb.virtual_servers.get(md_proxy.METADATA_VSE_NAME) + if virt: + pool = virt.default_pool + curr_member_ips = [member.payload['ipAddress'] for member in + pool.members.values()] + if set(curr_member_ips) != set(edge_internal_ips): + _update_md_lb_members(edge_id, edge_internal_ips, lb, pool) + + else: + # Interface connectivity and LB definition are done at the same + # operation. if LB is missing then interface should be missing + # as well + LOG.info('Metadata LB components for edge %s are missing', + edge_id) + _recreate_rtr_metadata_cfg(context, plugin, az_name, edge_id) + _handle_edge_firewall_rules(edge_id) + + @admin_utils.output_header def nsx_redo_metadata_cfg(resource, event, trigger, **kwargs): + properties = admin_utils.parse_multi_keyval_opt(kwargs.get('property')) edgeapi = utils.NeutronDbClient() + plugin = utils.NsxVPluginWrapper() + + edge_id = properties.get('edge-id') + if properties: + if edge_id: + nsx_redo_metadata_cfg_for_edge(edgeapi.context, plugin, edge_id) + return + else: + # if the net-id property exist - recreate the edge for this network + az_name = properties.get('az-name') + if az_name: + nsx_redo_metadata_cfg_for_az(edgeapi.context, plugin, az_name) + return + LOG.error('Cannot parse properties %s', properties) + return + + nsx_redo_metadata_cfg_all(edgeapi.context, plugin) + + +def nsx_redo_metadata_cfg_for_edge(context, plugin, edge_id): + binding = nsxv_db.get_nsxv_router_binding_by_edge(context.session, edge_id) + if binding: + az_name = 
binding['availability_zone'] + + conf_az = nsx_az.NsxVAvailabilityZones() + az = conf_az.availability_zones[az_name] + if not az.supports_metadata(): + LOG.error('Edge %(edge)s belongs to az %(az)s which does not ' + 'support metadata', + {'az': az_name, 'edge': edge_id}) + + edge_internal_ips, md_rtr_ids = _get_internal_edge_ips(context, + az_name) + + if binding['router_id'] in md_rtr_ids: + LOG.error('Edge %s is a metadata proxy', edge_id) + return + + if (binding['router_id'].startswith( + vcns_constants.BACKUP_ROUTER_PREFIX) or + binding['router_id'].startswith( + vcns_constants.PLR_EDGE_PREFIX)or + binding['router_id'].startswith( + lb_common.RESOURCE_ID_PFX)): + LOG.error('Edge %s is not a metadata delivery appliance', edge_id) + return + + _handle_edge(context, plugin, az_name, edge_id, edge_internal_ips) + else: + LOG.error('No edge binding found for edge %s', edge_id) + + +@admin_utils.output_header +def nsx_redo_metadata_cfg_all(context, plugin): + user_confirm = admin_utils.query_yes_no("Do you want to setup metadata " + "infrastructure for all the edges", + default="no") + if not user_confirm: + LOG.info("NSXv vnics deletion aborted by user") + return config.register_nsxv_azs(cfg.CONF, cfg.CONF.nsxv.availability_zones) conf_az = nsx_az.NsxVAvailabilityZones() az_list = conf_az.list_availability_zones_objects() for az in az_list: if az.supports_metadata(): - nsx_redo_metadata_cfg_for_az(az, edgeapi) + nsx_redo_metadata_cfg_for_az(context, plugin, az.name, False) else: LOG.info("Skipping availability zone: %s - no metadata " "configuration", az.name) -def nsx_redo_metadata_cfg_for_az(az, edgeapi): - LOG.info("Updating MetaData for availability zone: %s", az.name) +def nsx_redo_metadata_cfg_for_az(context, plugin, az_name, check_az=True): + LOG.info("Updating MetaData for availability zone: %s", az_name) - # Get the list of internal networks for this AZ - db_net = nsxv_db.get_nsxv_internal_network_for_az( - edgeapi.context.session, - 
vcns_constants.InternalEdgePurposes.INTER_EDGE_PURPOSE, - az.name) + if check_az: + conf_az = nsx_az.NsxVAvailabilityZones() + az = conf_az.availability_zones.get(az_name) + if not az: + LOG.error('Availability zone %s not found', az_name) + return + if not az.supports_metadata(): + LOG.error('Availability zone %s is not configured with metadata', + az_name) + return - internal_net = None - internal_subnet = None - if db_net: - internal_net = db_net['network_id'] - internal_subnet = edgeapi.context.session.query( - models_v2.Subnet).filter_by( - network_id=internal_net).first().get('id') - - # Get the list of internal edges for this AZ - edge_list = nsxv_db.get_nsxv_internal_edges_by_purpose( - edgeapi.context.session, - vcns_constants.InternalEdgePurposes.INTER_EDGE_PURPOSE) - edge_az_list = [edge for edge in edge_list if - nsxv_db.get_router_availability_zone( - edgeapi.context.session, edge['router_id']) == az.name] - - md_rtr_ids = [edge['router_id'] for edge in edge_az_list] - - edge_internal_ips = [] - for edge in edge_az_list: - edge_internal_port = edgeapi.context.session.query( - models_v2.Port).filter_by(network_id=internal_net, - device_id=edge['router_id']).first() - if edge_internal_port: - edge_internal_ip = edgeapi.context.session.query( - models_v2.IPAllocation).filter_by( - port_id=edge_internal_port['id']).first() - edge_internal_ips.append(edge_internal_ip['ip_address']) - - if not internal_net or not internal_subnet or not edge_internal_ips: + edge_internal_ips, md_rtr_ids = _get_internal_edge_ips(context, + az_name) + if not edge_internal_ips and not md_rtr_ids: LOG.error("Metadata infrastructure is missing or broken. 
" "It is recommended to restart neutron service before " "proceeding with configuration restoration") return router_bindings = nsxv_db.get_nsxv_router_bindings( - edgeapi.context.session, + context.session, filters={'edge_type': [nsxv_constants.SERVICE_EDGE], - 'availability_zone': az.name}) + 'availability_zone': [az_name]}) edge_ids = list(set([binding['edge_id'] for binding in router_bindings if (binding['router_id'] not in set(md_rtr_ids) and not binding['router_id'].startswith( vcns_constants.BACKUP_ROUTER_PREFIX) and not binding['router_id'].startswith( - vcns_constants.PLR_EDGE_PREFIX))])) + vcns_constants.PLR_EDGE_PREFIX)and + not binding['router_id'].startswith( + lb_common.RESOURCE_ID_PFX))])) for edge_id in edge_ids: - with locking.LockManager.get_lock(edge_id): - lb = nsxv_lb.NsxvLoadbalancer.get_loadbalancer(nsxv, edge_id) - virt = lb.virtual_servers.get(md_proxy.METADATA_VSE_NAME) - if virt: - pool = virt.default_pool - pool.members = {} - - i = 0 - s_port = cfg.CONF.nsxv.nova_metadata_port - for member_ip in edge_internal_ips: - i += 1 - member = nsxv_lb.NsxvLBPoolMember( - name='Member-%d' % i, - ip_address=member_ip, - port=s_port, - monitor_port=s_port) - pool.add_member(member) - - lb.submit_to_backend(nsxv, edge_id) + _handle_edge(context, plugin, az_name, edge_id, edge_internal_ips) @admin_utils.output_header