diff --git a/doc/source/devspecs/async_job_management.rst b/doc/source/devspecs/async_job_management.rst new file mode 100644 index 00000000..896211ed --- /dev/null +++ b/doc/source/devspecs/async_job_management.rst @@ -0,0 +1,276 @@ +========================================= +Tricircle Asynchronous Job Management API +========================================= + +Background +========== +In the Tricircle, XJob provides OpenStack multi-region functionality. It +receives and processes jobs from the Admin API or Tricircle Central +Neutron Plugin and handles them in an asynchronous way. For example, when +booting an instance in the first time for the project, router, security +group rule, FIP and other resources may have not already been created in +the local Neutron(s), these resources could be created asynchronously to +accelerate response for the initial instance booting request, different +from network, subnet and security group resources that must be created +before an instance booting. Central Neutron could send such creation jobs +to local Neutron(s) through XJob and then local Neutron(s) handle them +with their own speed. + +Implementation +============== +XJob server may strike occasionally so tenants and cloud administrators +need to know the job status and delete or redo the failed job if necessary. +Asynchronous job management APIs provide such functionality and they are +listed as following: + +* Create a job + + Create a job to synchronize resource if necessary. + + Create Job Request:: + + POST /v1.0/jobs + { + "job": { + "type": "port_delete", + "project_id": "d01246bc5792477d9062a76332b7514a", + "resource": { + "pod_id": "0eb59465-5132-4f57-af01-a9e306158b86", + "port_id": "8498b903-9e18-4265-8d62-3c12e0ce4314" + } + } + } + + Response: + { + "job": { + "id": "3f4ecf30-0213-4f1f-9cb0-0233bcedb767", + "project_id": "d01246bc5792477d9062a76332b7514a", + "type": "port_delete", + "timestamp": "2017-03-03 11:05:36", + "status": "NEW", + "resource": { + "pod_id": "0eb59465-5132-4f57-af01-a9e306158b86", + "port_id": "8498b903-9e18-4265-8d62-3c12e0ce4314" + } + } + } + + Normal Response Code: 202 + + +* Get a job + + Retrieve a job from the Tricircle database. + + The detailed information of the job will be shown. Otherwise + it will return "Resource not found" exception. + + List Request:: + + GET /v1.0/jobs/3f4ecf30-0213-4f1f-9cb0-0233bcedb767 + + Response: + { + "job": { + "id": "3f4ecf30-0213-4f1f-9cb0-0233bcedb767", + "project_id": "d01246bc5792477d9062a76332b7514a", + "type": "port_delete", + "timestamp": "2017-03-03 11:05:36", + "status": "NEW", + "resource": { + "pod_id": "0eb59465-5132-4f57-af01-a9e306158b86", + "port_id": "8498b903-9e18-4265-8d62-3c12e0ce4314" + } + } + } + + Normal Response Code: 200 + +* Get all jobs + + Retrieve all of the jobs from the Tricircle database. 
+ + List Request:: + + GET /v1.0/jobs/detail + + Response: + { + "jobs": + [ + { + "id": "3f4ecf30-0213-4f1f-9cb0-0233bcedb767", + "project_id": "d01246bc5792477d9062a76332b7514a", + "type": "port_delete", + "timestamp": "2017-03-03 11:05:36", + "status": "NEW", + "resource": { + "pod_id": "0eb59465-5132-4f57-af01-a9e306158b86", + "port_id": "8498b903-9e18-4265-8d62-3c12e0ce4314" + } + }, + { + "id": "b01fe514-5211-4758-bbd1-9f32141a7ac2", + "project_id": "d01246bc5792477d9062a76332b7514a", + "type": "seg_rule_setup", + "timestamp": "2017-03-01 17:14:44", + "status": "FAIL", + "resource": { + "project_id": "d01246bc5792477d9062a76332b7514a" + } + } + ] + } + + Normal Response Code: 200 + +* Get all jobs with filter(s) + + Retrieve job(s) from the Tricircle database. We can filter them by + project ID, job type and job status. If no filter is provided, + GET /v1.0/jobs will return all jobs. + + The response contains a list of jobs. Using filters, a subset of jobs + will be returned. + + List Request:: + + GET /v1.0/jobs?project_id=d01246bc5792477d9062a76332b7514a + + Response: + { + "jobs": + [ + { + "id": "3f4ecf30-0213-4f1f-9cb0-0233bcedb767", + "project_id": "d01246bc5792477d9062a76332b7514a", + "type": "port_delete", + "timestamp": "2017-03-03 11:05:36", + "status": "NEW", + "resource": { + "pod_id": "0eb59465-5132-4f57-af01-a9e306158b86", + "port_id": "8498b903-9e18-4265-8d62-3c12e0ce4314" + } + }, + { + "id": "b01fe514-5211-4758-bbd1-9f32141a7ac2", + "project_id": "d01246bc5792477d9062a76332b7514a", + "type": "seg_rule_setup", + "timestamp": "2017-03-01 17:14:44", + "status": "FAIL", + "resource": { + "project_id": "d01246bc5792477d9062a76332b7514a" + } + } + ] + } + + Normal Response Code: 200 + + +* Get all jobs' schemas + + Retrieve all jobs' schemas. User may want to know what the resources + are needed for a specific job. + + List Request:: + + GET /v1.0/jobs/schemas + + return all jobs' schemas. + Response: + { + "schemas": + [ + { + "type": "configure_route", + "resource": ["router_id"] + }, + { + "type": "router_setup", + "resource": ["pod_id", "router_id", "network_id"] + }, + { + "type": "port_delete", + "resource": ["pod_id", "port_id"] + }, + { + "type": "seg_rule_setup", + "resource": ["project_id"] + }, + { + "type": "update_network", + "resource": ["pod_id", "network_id"] + }, + { + "type": "subnet_update", + "resource": ["pod_id", "subnet_id"] + }, + { + "type": "shadow_port_setup", + "resource": [pod_id", "network_id"] + } + ] + } + + Normal Response Code: 200 + + +* Delete a job + + Delete a failed or duplicated job from the Tricircle database. + A pair of curly braces will be returned if succeeds, otherwise an + exception will be thrown. What's more, we can list all jobs to verify + whether it is deleted successfully or not. + + Delete Job Request:: + + DELETE /v1.0/jobs/{id} + + Response: + This operation does not return a response body. + + Normal Response Code: 200 + + +* Redo a job + + Redo a halted job brought by the XJob server corruption or network failures. + The job handler will redo a failed job with time interval, but this Admin + API will redo a job immediately. Nothing will be returned for this request, + but we can monitor its status through the execution state. + + Redo Job Request:: + + PUT /v1.0/jobs/{id} + + Response: + This operation does not return a response body. + + Normal Response Code: 200 + + +Data Model Impact +================= + +In order to manage the jobs for each tenant, we need to filter them by +project ID. 
So project ID is going to be added to the AsyncJob model and +AsyncJobLog model. + +Dependencies +============ + +None + +Documentation Impact +==================== + +- Add documentation for asynchronous job management API +- Add release note for asynchronous job management API + +References +========== + +None + diff --git a/doc/source/devspecs/cross-neutron-l2-networking.rst b/doc/source/devspecs/cross-neutron-l2-networking.rst new file mode 100644 index 00000000..1246e399 --- /dev/null +++ b/doc/source/devspecs/cross-neutron-l2-networking.rst @@ -0,0 +1,558 @@ +======================================== +Cross Neutron L2 networking in Tricircle +======================================== + +Background +========== +The Tricircle provides unified OpenStack API gateway and networking automation +functionality. Those main functionalities allow cloud operators to manage +multiple OpenStack instances which are running in one site or multiple sites +as a single OpenStack cloud. + +Each bottom OpenStack instance which is managed by the Tricircle is also called +a pod. + +The Tricircle has the following components: + +* Nova API-GW +* Cinder API-GW +* Neutron API Server with Neutron Tricircle plugin +* Admin API +* XJob +* DB + +Nova API-GW provides the functionality to trigger automatic networking creation +when new VMs are being provisioned. Neutron Tricircle plug-in is the +functionality to create cross Neutron L2/L3 networking for new VMs. After the +binding of tenant-id and pod finished in the Tricircle, Cinder API-GW and Nova +API-GW will pass the cinder api or nova api request to appropriate bottom +OpenStack instance. + +Please refer to the Tricircle design blueprint[1], especially from +'7. Stateless Architecture Proposal' for the detail description of each +components. + + +Problem Description +=================== +When a user wants to create a network in Neutron API Server, the user can +specify the 'availability_zone_hints'(AZ or az will be used for short for +availability zone) during network creation[5], in the Tricircle, the +'az_hints' means which AZ the network should be spread into. The 'az_hints' +meaning in Tricircle is a little different from the 'az_hints' meaning in +Neutron[5]. If no 'az_hints' was specified during network creation, this created +network will be spread into any AZ. If there is a list of 'az_hints' during the +network creation, that means the network should be able to be spread into these +AZs which are suggested by a list of 'az_hints'. + +When a user creates VM or Volume, there is also one parameter called +availability zone. The AZ parameter is used for Volume and VM co-location, so +that the Volume and VM will be created into same bottom OpenStack instance. + +When a VM is being attached to a network, the Tricircle will check whether a +VM's AZ is inside in the network's AZs scope. If a VM is not in the network's +AZs scope, the VM creation will be rejected. + +Currently, the Tricircle only supports one pod in one AZ. And only supports a +network associated with one AZ. That means currently a tenant's network will +be presented only in one bottom OpenStack instance, that also means all VMs +connected to the network will be located at one bottom OpenStack instance. +If there are more than one pod in one AZ, refer to the dynamic pod binding[6]. + +There are lots of use cases where a tenant needs a network being able to be +spread out into multiple bottom OpenStack instances in one AZ or multiple AZs. 
+ +* Capacity expansion: tenants add VMs more and more, the capacity of one + OpenStack may not be enough, then a new OpenStack instance has to be added + to the cloud. But the tenant still wants to add new VMs into same network. + +* Cross Neutron network service chaining. Service chaining is based on + the port-pairs. Leveraging the cross Neutron L2 networking capability which + is provided by the Tricircle, the chaining could also be done by across sites. + For example, vRouter1 in pod1, but vRouter2 in pod2, these two VMs could be + chained. + +* Applications are often required to run in different availability zones to + achieve high availability. Application needs to be designed as + Active-Standby/Active-Active/N-Way to achieve high availability, and some + components inside one application are designed to work as distributed + cluster, this design typically leads to state replication or heart + beat among application components (directly or via replicated database + services, or via private designed message format). When this kind of + applications are distributedly deployed into multiple OpenStack instances, + cross Neutron L2 networking is needed to support heart beat + or state replication. + +* When a tenant's VMs are provisioned in different OpenStack instances, there + is E-W (East-West) traffic for these VMs, the E-W traffic should be only + visible to the tenant, and isolation is needed. If the traffic goes through + N-S (North-South) via tenant level VPN, overhead is too much, and the + orchestration for multiple site to site VPN connection is also complicated. + Therefore cross Neutron L2 networking to bridge the tenant's routers in + different Neutron servers can provide more light weight isolation. + +* In hybrid cloud, there is cross L2 networking requirement between the + private OpenStack and the public OpenStack. Cross Neutron L2 networking will + help the VMs migration in this case and it's not necessary to change the + IP/MAC/Security Group configuration during VM migration. + +The spec[5] is to explain how one AZ can support more than one pod, and how +to schedule a proper pod during VM or Volume creation. + +And this spec is to deal with the cross Neutron L2 networking automation in +the Tricircle. + +The simplest way to spread out L2 networking to multiple OpenStack instances +is to use same VLAN. But there is a lot of limitations: (1) A number of VLAN +segment is limited, (2) the VLAN network itself is not good to spread out +multiple sites, although you can use some gateways to do the same thing. + +So flexible tenant level L2 networking across multiple Neutron servers in +one site or in multiple sites is needed. + +Proposed Change +=============== + +Cross Neutron L2 networking can be divided into three categories, +``VLAN``, ``Shared VxLAN`` and ``Mixed VLAN/VxLAN``. + +* VLAN + + Network in each bottom OpenStack is VLAN type and has the same VLAN ID. + If we want VLAN L2 networking to work in multi-site scenario, i.e., + Multiple OpenStack instances in multiple sites, physical gateway needs to + be manually configured to make one VLAN networking be extended to other + sites. + + *Manual setup physical gateway is out of the scope of this spec* + +* Shared VxLAN + + Network in each bottom OpenStack instance is VxLAN type and has the same + VxLAN ID. + + Leverage L2GW[2][3] to implement this type of L2 networking. + +* Mixed VLAN/VxLAN + + Network in each bottom OpenStack instance may have different types and/or + have different segment IDs. 
+ + Leverage L2GW[2][3] to implement this type of L2 networking. + +There is another network type called “Local Network”. For “Local Network”, +the network will be only presented in one bottom OpenStack instance. And the +network won't be presented in different bottom OpenStack instances. If a VM +in another pod tries to attach to the “Local Network”, it should be failed. +This use case is quite useful for the scenario in which cross Neutron L2 +networking is not required, and one AZ will not include more than bottom +OpenStack instance. + +Cross Neutron L2 networking will be able to be established dynamically during +tenant's VM is being provisioned. + +There is assumption here that only one type of L2 networking will work in one +cloud deployment. + + +A Cross Neutron L2 Networking Creation +-------------------------------------- + +A cross Neutron L2 networking creation will be able to be done with the az_hint +attribute of the network. If az_hint includes one AZ or more AZs, the network +will be presented only in this AZ or these AZs, if no AZ in az_hint, it means +that the network can be extended to any bottom OpenStack. + +There is a special use case for external network creation. For external +network creation, you need to specify the pod_id but not AZ in the az_hint +so that the external network will be only created in one specified pod per AZ. + + *Support of External network in multiple OpenStack instances in one AZ + is out of scope of this spec.* + +Pluggable L2 networking framework is proposed to deal with three types of +L2 cross Neutron networking, and it should be compatible with the +``Local Network``. + +1. Type Driver under Tricircle Plugin in Neutron API server + +* Type driver to distinguish different type of cross Neutron L2 networking. So + the Tricircle plugin need to load type driver according to the configuration. + The Tricircle can reuse the type driver of ML2 with update. + +* Type driver to allocate VLAN segment id for VLAN L2 networking. + +* Type driver to allocate VxLAN segment id for shared VxLAN L2 networking. + +* Type driver for mixed VLAN/VxLAN to allocate VxLAN segment id for the + network connecting L2GWs[2][3]. + +* Type driver for Local Network only updating ``network_type`` for the + network to the Tricircle Neutron DB. + +When a network creation request is received in Neutron API Server in the +Tricircle, the type driver will be called based on the configured network +type. + +2. Nova API-GW to trigger the bottom networking automation + +Nova API-GW can be aware of when a new VM is provisioned if boot VM api request +is received, therefore Nova API-GW is responsible for the network creation in +the bottom OpenStack instances. + +Nova API-GW needs to get the network type from Neutron API server in the +Tricircle, and deal with the networking automation based on the network type: + +* VLAN + Nova API-GW creates network in bottom OpenStack instance in which the VM will + run with the VLAN segment id, network name and type that are retrieved from + the Neutron API server in the Tricircle. + +* Shared VxLAN + Nova API-GW creates network in bottom OpenStack instance in which the VM will + run with the VxLAN segment id, network name and type which are retrieved from + Tricricle Neutron API server. After the network in the bottom OpenStack + instance is created successfully, Nova API-GW needs to make this network in the + bottom OpenStack instance as one of the segments in the network in the Tricircle. 
+ +* Mixed VLAN/VxLAN + Nova API-GW creates network in different bottom OpenStack instance in which the + VM will run with the VLAN or VxLAN segment id respectively, network name and type + which are retrieved from Tricricle Neutron API server. After the network in the + bottom OpenStack instances is created successfully, Nova API-GW needs to update + network in the Tricircle with the segmentation information of bottom netwoks. + +3. L2GW driver under Tricircle Plugin in Neutron API server + +Tricircle plugin needs to support multi-segment network extension[4]. + +For Shared VxLAN or Mixed VLAN/VxLAN L2 network type, L2GW driver will utilize the +multi-segment network extension in Neutron API server to build the L2 network in the +Tricircle. Each network in the bottom OpenStack instance will be a segment for the +whole cross Neutron L2 networking in the Tricircle. + +After the network in the bottom OpenStack instance was created successfully, Nova +API-GW will call Neutron server API to update the network in the Tricircle with a +new segment from the network in the bottom OpenStack instance. + +If the network in the bottom OpenStack instance was removed successfully, Nova +API-GW will call Neutron server api to remove the segment in the bottom OpenStack +instance from network in the Tricircle. + +When L2GW driver under Tricircle plugin in Neutron API server receives the +segment update request, L2GW driver will start async job to orchestrate L2GW API +for L2 networking automation[2][3]. + + +Data model impact +----------------- + +In database, we are considering setting physical_network in top OpenStack instance +as ``bottom_physical_network#bottom_pod_id`` to distinguish segmentation information +in different bottom OpenStack instance. + +REST API impact +--------------- + +None + +Security impact +--------------- + +None + +Notifications impact +-------------------- + +None + +Other end user impact +--------------------- + +None + +Performance Impact +------------------ + +None + +Other deployer impact +--------------------- + +None + +Developer impact +---------------- + +None + + +Implementation +---------------- + +**Local Network Implementation** + +For Local Network, L2GW is not required. In this scenario, no cross Neutron L2/L3 +networking is required. + +A user creates network ``Net1`` with single AZ1 in az_hint, the Tricircle plugin +checks the configuration, if ``tenant_network_type`` equals ``local_network``, +it will invoke Local Network type driver. Local Network driver under the +Tricircle plugin will update ``network_type`` in database. + +For example, a user creates VM1 in AZ1 which has only one pod ``POD1``, and +connects it to network ``Net1``. ``Nova API-GW`` will send network creation +request to ``POD1`` and the VM will be booted in AZ1 (There should be only one +pod in AZ1). + +If a user wants to create VM2 in AZ2 or ``POD2`` in AZ1, and connect it to +network ``Net1`` in the Tricircle, it would be failed. Because the ``Net1`` is +local_network type network and it is limited to present in ``POD1`` in AZ1 only. + +**VLAN Implementation** + +For VLAN, L2GW is not required. This is the most simplest cross Neutron +L2 networking for limited scenario. For example, with a small number of +networks, all VLANs are extended through physical gateway to support cross +Neutron VLAN networking, or all Neutron servers under same core switch with same visible +VLAN ranges that supported by the core switch are connected by the core +switch. 
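+
+Extending one VLAN network into several pods then boils down to creating a
+provider network with the same segmentation ID in each bottom Neutron server.
+Below is a minimal sketch of that step, assuming hypothetical pod endpoints
+and a pre-fetched admin token; in the real flow it is ``Nova API-GW`` that
+triggers the creation, as described below::
+
+    from neutronclient.v2_0 import client
+
+    # Hypothetical bottom Neutron endpoints; a real deployment would read
+    # them from the Tricircle pod table.
+    BOTTOM_NEUTRON_ENDPOINTS = {
+        'POD1': 'http://pod1.example.com:9696',
+        'POD2': 'http://pod2.example.com:9696',
+    }
+
+    def extend_vlan_network(name, vlan_id, physical_network, token):
+        """Create the same provider VLAN network in every bottom pod."""
+        body = {'network': {
+            'name': name,
+            'provider:network_type': 'vlan',
+            'provider:physical_network': physical_network,
+            'provider:segmentation_id': vlan_id,
+        }}
+        for pod, endpoint in BOTTOM_NEUTRON_ENDPOINTS.items():
+            neutron = client.Client(endpoint_url=endpoint, token=token)
+            network = neutron.create_network(body)['network']
+            print('created %s in %s with VLAN %s' % (network['id'], pod, vlan_id))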
+ +when a user creates network called ``Net1``, the Tricircle plugin checks the +configuration. If ``tenant_network_type`` equals ``vlan``, the +Tricircle will invoke VLAN type driver. VLAN driver will +create ``segment``, and assign ``network_type`` with VLAN, update +``segment`` and ``network_type`` and ``physical_network`` with DB + +A user creates VM1 in AZ1, and connects it to network Net1. If VM1 will be +booted in ``POD1``, ``Nova API-GW`` needs to get the network information and +send network creation message to ``POD1``. Network creation message includes +``network_type`` and ``segment`` and ``physical_network``. + +Then the user creates VM2 in AZ2, and connects it to network Net1. If VM will +be booted in ``POD2``, ``Nova API-GW`` needs to get the network information and +send create network message to ``POD2``. Create network message includes +``network_type`` and ``segment`` and ``physical_network``. + +**Shared VxLAN Implementation** + +A user creates network ``Net1``, the Tricircle plugin checks the configuration, if +``tenant_network_type`` equals ``shared_vxlan``, it will invoke shared VxLAN +driver. Shared VxLAN driver will allocate ``segment``, and assign +``network_type`` with VxLAN, and update network with ``segment`` and +``network_type`` with DB + +A user creates VM1 in AZ1, and connects it to network ``Net1``. If VM1 will be +booted in ``POD1``, ``Nova API-GW`` needs to get the network information and send +create network message to ``POD1``, create network message includes +``network_type`` and ``segment``. + +``Nova API-GW`` should update ``Net1`` in Tricircle with the segment information +got by ``POD1``. + +Then the user creates VM2 in AZ2, and connects it to network ``Net1``. If VM2 will +be booted in ``POD2``, ``Nova API-GW`` needs to get the network information and +send network creation massage to ``POD2``, network creation message includes +``network_type`` and ``segment``. + +``Nova API-GW`` should update ``Net1`` in the Tricircle with the segment information +get by ``POD2``. + +The Tricircle plugin detects that the network includes more than one segment +network, calls L2GW driver to start async job for cross Neutron networking for +``Net1``. The L2GW driver will create L2GW1 in ``POD1`` and L2GW2 in ``POD2``. In +``POD1``, L2GW1 will connect the local ``Net1`` and create L2GW remote connection +to L2GW2, then populate the information of MAC/IP which resides in L2GW1. In +``POD2``, L2GW2 will connect the local ``Net1`` and create L2GW remote connection +to L2GW1, then populate remote MAC/IP information which resides in ``POD1`` in L2GW2. + +L2GW driver in the Tricircle will also detect the new port creation/deletion API +request. If port (MAC/IP) created or deleted in ``POD1`` or ``POD2``, it needs to +refresh the L2GW2 MAC/IP information. + +Whether to populate the information of port (MAC/IP) should be configurable according +to L2GW capability. And only populate MAC/IP information for the ports that are not +resides in the same pod. + +**Mixed VLAN/VxLAN** + +To achieve cross Neutron L2 networking, L2GW will be used to connect L2 network +in different Neutron servers, using L2GW should work for Shared VxLAN and Mixed VLAN/VxLAN +scenario. + +When L2GW connected with local network in the same OpenStack instance, no +matter it's VLAN or VxLAN or GRE, the L2GW should be able to connect the +local network, and because L2GW is extension of Neutron, only network +UUID should be enough for L2GW to connect the local network. 
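+
+For illustration, connecting a local network to a gateway through the
+networking-l2gw API referenced in [3] looks roughly like the sketch below;
+the endpoint, token, device name and network UUID are placeholders::
+
+    import requests
+
+    NEUTRON_URL = 'http://pod1.example.com:9696/v2.0'
+    HEADERS = {'X-Auth-Token': 'ADMIN_TOKEN',
+               'Content-Type': 'application/json'}
+
+    # Register the gateway device once (device and interface names are
+    # placeholders for the real hardware configuration).
+    l2gw = requests.post(
+        NEUTRON_URL + '/l2-gateways',
+        json={'l2_gateway': {
+            'name': 'pod1-l2gw',
+            'devices': [{'device_name': 'vtep1',
+                         'interfaces': [{'name': 'port1'}]}],
+        }},
+        headers=HEADERS).json()['l2_gateway']
+
+    # Connecting the local network only needs its UUID (plus an optional
+    # segmentation id), which is the point made above.
+    requests.post(
+        NEUTRON_URL + '/l2-gateway-connections',
+        json={'l2_gateway_connection': {
+            'l2_gateway_id': l2gw['id'],
+            'network_id': 'NET1_UUID',
+            'segmentation_id': 1001,
+        }},
+        headers=HEADERS)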
+ +When admin user creates network in Tricircle, he/she specifies the network +type as one of the network type as discussed above. In the phase of creating +network in Tricircle, only one record is saved in the database, no network +will be created in bottom OpenStack. + +After the network in the bottom created successfully, need to retrieve the +network information like segment id, network name and network type, and make +this network in the bottom pod as one of the segments in the network in +Tricircle. + +In the Tricircle, network could be created by tenant or admin. For tenant, no way +to specify the network type and segment id, then default network type will +be used instead. When user uses the network to boot a VM, ``Nova API-GW`` +checks the network type. For Mixed VLAN/VxLAN network, ``Nova API-GW`` first +creates network in bottom OpenStack without specifying network type and segment +ID, then updates the top network with bottom network segmentation information +returned by bottom OpenStack. + +A user creates network ``Net1``, plugin checks the configuration, if +``tenant_network_type`` equals ``mixed_vlan_vxlan``, it will invoke mixed VLAN +and VxLAN driver. The driver needs to do nothing since segment is allocated +in bottom. + +A user creates VM1 in AZ1, and connects it to the network ``Net1``, the VM is +booted in bottom ``POD1``, and ``Nova API-GW`` creates network in ``POD1`` and +queries the network detail segmentation information (using admin role), and +gets network type, segment id, then updates this new segment to the ``Net1`` +in Tricircle ``Neutron API Server``. + +Then the user creates another VM2, and with AZ info AZ2, then the VM should be +able to be booted in bottom ``POD2`` which is located in AZ2. And when VM2 should +be able to be booted in AZ2, ``Nova API-GW`` also creates a network in ``POD2``, +and queries the network information including segment and network type, +updates this new segment to the ``Net1`` in Tricircle ``Neutron API Server``. + +The Tricircle plugin detects that the ``Net1`` includes more than one network +segments, calls L2GW driver to start async job for cross Neutron networking for +``Net1``. The L2GW driver will create L2GW1 in ``POD1`` and L2GW2 in ``POD2``. In +``POD1``, L2GW1 will connect the local ``Net1`` and create L2GW remote connection +to L2GW2, then populate information of MAC/IP which resides in ``POD2`` in L2GW1. +In ``POD2``, L2GW2 will connect the local ``Net1`` and create L2GW remote connection +to L2GW1, then populate remote MAC/IP information which resides in ``POD1`` in L2GW2. + +L2GW driver in Tricircle will also detect the new port creation/deletion api +calling, if port (MAC/IP) created or deleted in ``POD1``, then needs to refresh +the L2GW2 MAC/IP information. If port (MAC/IP) created or deleted in ``POD2``, +then needs to refresh the L2GW1 MAC/IP information, + +Whether to populate MAC/IP information should be configurable according to +L2GW capability. And only populate MAC/IP information for the ports that are +not resides in the same pod. + +**L3 bridge network** + +Current implementation without cross Neutron L2 networking. + +* A special bridge network is created and connected to the routers in + different bottom OpenStack instances. We configure the extra routes of the routers + to route the packets from one OpenStack to another. In current + implementation, we create this special bridge network in each bottom + OpenStack with the same ``VLAN ID``, so we have an L2 network to connect + the routers. 
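+
+The extra-route configuration used by the current implementation is plain
+Neutron ``extraroute`` usage; a hedged sketch with placeholder IDs and
+addresses::
+
+    from neutronclient.v2_0 import client
+
+    # Placeholder endpoint, token, router UUID and addresses: the real values
+    # come from the Tricircle routing tables.
+    neutron = client.Client(endpoint_url='http://pod1.example.com:9696',
+                            token='ADMIN_TOKEN')
+
+    # On the POD1 router, send traffic for POD2's subnet to POD2's router
+    # interface on the shared VLAN bridge network.
+    neutron.update_router('POD1_ROUTER_UUID', {'router': {
+        'routes': [{'destination': '10.0.2.0/24', 'nexthop': '100.64.0.2'}],
+    }})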
+ +Difference between L2 networking for tenant's VM and for L3 bridging network. + +* The creation of bridge network is triggered during attaching router + interface and adding router external gateway. + +* The L2 network for VM is triggered by ``Nova API-GW`` when a VM is to be + created in one pod, and finds that there is no network, then the network + will be created before the VM is booted, network or port parameter is + required to boot VM. The IP/Mac for VM is allocated in the ``Tricircle``, + top layer to avoid IP/mac collision if they are allocated separately in + bottom pods. + +After cross Neutron L2 networking is introduced, the L3 bridge network should +be updated too. + +L3 bridge network N-S (North-South): + +* For each tenant, one cross Neutron N-S bridge network should be created for + router N-S inter-connection. Just replace the current VLAN N-S bridge network + to corresponding Shared VxLAN or Mixed VLAN/VxLAN. + +L3 bridge network E-W (East-West): + +* When attaching router interface happened, for VLAN, it will keep + current process to establish E-W bridge network. For Shared VxLAN and Mixed + VLAN/VxLAN, if a L2 network is able to expand to the current pod, then just + expand the L2 network to the pod, all E-W traffic will go out from local L2 + network, then no bridge network is needed. + +* For example, (Net1, Router1) in ``Pod1``, (Net2, Router1) in ``Pod2``, if + ``Net1`` is a cross Neutron L2 network, and can be expanded to Pod2, then + will just expand ``Net1`` to Pod2. After the ``Net1`` expansion ( just like + cross Neutron L2 networking to spread one network in multiple Neutron servers ), it'll + look like (Net1, Router1) in ``Pod1``, (Net1, Net2, Router1) in ``Pod2``, In + ``Pod2``, no VM in ``Net1``, only for E-W traffic. Now the E-W traffic will + look like this: + +from Net2 to Net1: + +Net2 in Pod2 -> Router1 in Pod2 -> Net1 in Pod2 -> L2GW in Pod2 ---> L2GW in +Pod1 -> Net1 in Pod1. + +Note: The traffic for ``Net1`` in ``Pod2`` to ``Net1`` in ``Pod1`` can bypass the L2GW in +``Pod2``, that means outbound traffic can bypass the local L2GW if the remote VTEP of +L2GW is known to the local compute node and the packet from the local compute +node with VxLAN encapsulation cloud be routed to remote L2GW directly. It's up +to the L2GW implementation. With the inbound traffic through L2GW, the inbound +traffic to the VM will not be impacted by the VM migration from one host to +another. + +If ``Net2`` is a cross Neutron L2 network, and can be expanded to ``Pod1`` too, +then will just expand ``Net2`` to ``Pod1``. After the ``Net2`` expansion(just +like cross Neutron L2 networking to spread one network in multiple Neutron servers ), it'll +look like (Net2, Net1, Router1) in ``Pod1``, (Net1, Net2, Router1) in ``Pod2``, +In ``Pod1``, no VM in Net2, only for E-W traffic. Now the E-W traffic will look +like this: from ``Net1`` to ``Net2``: + +Net1 in Pod1 -> Router1 in Pod1 -> Net2 in Pod1 -> L2GW in Pod1 ---> L2GW in +Pod2 -> Net2 in Pod2. + +To limit the complexity, one network's az_hint can only be specified when +creating, and no update is allowed, if az_hint need to be updated, you have +to delete the network and create again. + +If the network can't be expanded, then E-W bridge network is needed. For +example, Net1(AZ1, AZ2,AZ3), Router1; Net2(AZ4, AZ5, AZ6), Router1. +Then a cross Neutron L2 bridge network has to be established: + +Net1(AZ1, AZ2, AZ3), Router1 --> E-W bridge network ---> Router1, +Net2(AZ4, AZ5, AZ6). 
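+
+To recap the segment bookkeeping: whenever ``Nova API-GW`` expands a network
+into a new pod, it updates the top network through the multi-provider body
+described in [4]. A rough sketch against the Tricircle Neutron API server,
+with placeholder endpoint, token, IDs and segmentation values::
+
+    from neutronclient.v2_0 import client
+
+    central = client.Client(endpoint_url='http://central-neutron.example.com:9696',
+                            token='ADMIN_TOKEN')
+
+    # Append the segment just created in POD2 to the top-level Net1; the
+    # existing segments must be carried over in the update body.
+    net1 = central.show_network('NET1_UUID')['network']
+    segments = net1.get('segments', []) + [{
+        'provider:network_type': 'vxlan',
+        'provider:physical_network': None,
+        'provider:segmentation_id': 1002,
+    }]
+    central.update_network('NET1_UUID', {'network': {'segments': segments}})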
+ +Assignee(s) +------------ + +Primary assignee: + + +Other contributors: + + +Work Items +------------ + +Dependencies +---------------- + +None + + +Testing +---------------- + +None + + +References +---------------- +[1] https://docs.google.com/document/d/18kZZ1snMOCD9IQvUKI5NVDzSASpw-QKj7l2zNqMEd3g/ + +[2] https://review.openstack.org/#/c/270786/ + +[3] https://github.com/openstack/networking-l2gw/blob/master/specs/kilo/l2-gateway-api.rst + +[4] http://developer.openstack.org/api-ref-networking-v2-ext.html#networks-multi-provider-ext + +[5] http://docs.openstack.org/mitaka/networking-guide/adv-config-availability-zone.html + +[6] https://review.openstack.org/#/c/306224/ diff --git a/doc/source/devspecs/cross-neutron-vxlan-networking.rst b/doc/source/devspecs/cross-neutron-vxlan-networking.rst new file mode 100644 index 00000000..ae258f98 --- /dev/null +++ b/doc/source/devspecs/cross-neutron-vxlan-networking.rst @@ -0,0 +1,233 @@ +=========================================== +Cross Neutron VxLAN Networking in Tricircle +=========================================== + +Background +========== + +Currently we only support VLAN as the cross-Neutron network type. For VLAN network +type, central plugin in Tricircle picks a physical network and allocates a VLAN +tag(or uses what users specify), then before the creation of local network, +local plugin queries this provider network information and creates the network +based on this information. Tricircle only guarantees that instance packets sent +out of hosts in different pods belonging to the same VLAN network will be tagged +with the same VLAN ID. Deployers need to carefully configure physical networks +and switch ports to make sure that packets can be transported correctly between +physical devices. + +For more flexible deployment, VxLAN network type is a better choice. Compared +to 12-bit VLAN ID, 24-bit VxLAN ID can support more numbers of bridge networks +and cross-Neutron L2 networks. With MAC-in-UDP encapsulation of VxLAN network, +hosts in different pods only need to be IP routable to transport instance +packets. + +Proposal +======== + +There are some challenges to support cross-Neutron VxLAN network. + +1. How to keep VxLAN ID identical for the same VxLAN network across Neutron servers + +2. How to synchronize tunnel endpoint information between pods + +3. How to trigger L2 agents to build tunnels based on this information + +4. How to support different back-ends, like ODL, L2 gateway + +The first challenge can be solved as VLAN network does, we allocate VxLAN ID in +central plugin and local plugin will use the same VxLAN ID to create local +network. For the second challenge, we introduce a new table called +"shadow_agents" in Tricircle database, so central plugin can save the tunnel +endpoint information collected from one local Neutron server in this table +and use it to populate the information to other local Neutron servers when +needed. Here is the schema of the table: + +.. csv-table:: Shadow Agent Table + :header: Field, Type, Nullable, Key, Default + + id, string, no, primary, null + pod_id, string, no, , null + host, string, no, unique, null + type, string, no, unique, null + tunnel_ip, string, no, , null + +**How to collect tunnel endpoint information** + +When the host where a port will be located is determined, local Neutron server +will receive a port-update request containing host ID in the body. 
During the +process of this request, local plugin can query agent information that contains +tunnel endpoint information from local Neutron database with host ID and port +VIF type; then send tunnel endpoint information to central Neutron server by +issuing a port-update request with this information in the binding profile. + +**How to populate tunnel endpoint information** + +When the tunnel endpoint information in one pod is needed to be populated to +other pods, XJob will issue port-create requests to corresponding local Neutron +servers with tunnel endpoint information queried from Tricircle database in the +bodies. After receiving such request, local Neutron server will save tunnel +endpoint information by calling real core plugin's "create_or_update_agent" +method. This method comes from neutron.db.agent_db.AgentDbMixin class. Plugins +that support "agent" extension will have this method. Actually there's no such +agent daemon running in the target local Neutron server, but we insert a record +for it in the database so the local Neutron server will assume there exists an +agent. That's why we call it shadow agent. + +The proposed solution for the third challenge is based on the shadow agent and +L2 population mechanism. In the original Neutron process, if the port status +is updated to active, L2 population mechanism driver does two things. First, +driver checks if the updated port is the first port in the target agent. If so, +driver collects tunnel endpoint information of other ports in the same network, +then sends the information to the target agent via RPC. Second, driver sends +the tunnel endpoint information of the updated port to other agents where ports +in the same network are located, also via RPC. L2 agents will build the tunnels +based on the information they received. To trigger the above processes to build +tunnels across Neutron servers, we further introduce shadow port. + +Let's say we have two instance ports, port1 is located in host1 in pod1 and +port2 is located in host2 in pod2. To make L2 agent running in host1 build a +tunnel to host2, we create a port with the same properties of port2 in pod1. +As discussed above, local Neutron server will create shadow agent during the +process of port-create request, so local Neutron server in pod1 won't complain +that host2 doesn't exist. To trigger L2 population process, we then update the +port status to active, so L2 agent in host1 will receive tunnel endpoint +information of port2 and build the tunnel. Port status is a read-only property +so we can't directly update it via ReSTful API. Instead, we issue a port-update +request with a special key in the binding profile. After local Neutron server +receives such request, it pops the special key from the binding profile and +updates the port status to active. XJob daemon will take the job to create and +update shadow ports. 
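+
+A condensed view of what XJob would send to the pod1 local Neutron server for
+the port1/port2 example above; the endpoint, IDs and binding-profile keys (in
+particular the status-forcing key) are illustrative placeholders rather than
+the exact names used in the code::
+
+    from neutronclient.v2_0 import client
+
+    pod1 = client.Client(endpoint_url='http://pod1.example.com:9696',
+                         token='ADMIN_TOKEN')
+
+    # 1. Create a shadow copy of port2 (which really lives on host2 in pod2).
+    #    The binding profile carries host2's tunnel endpoint so that a shadow
+    #    agent record can be created.
+    shadow = pod1.create_port({'port': {
+        'network_id': 'NET1_UUID_IN_POD1',
+        'name': 'shadow_port2',
+        'mac_address': 'fa:16:3e:12:34:56',
+        'fixed_ips': [{'ip_address': '10.0.1.5'}],
+        'binding:host_id': 'host2',
+        'binding:profile': {'host': 'host2',
+                            'agent_type': 'Open vSwitch agent',
+                            'tunnel_ip': '192.168.1.102'},
+    }})['port']
+
+    # 2. Force the port status to ACTIVE via a special binding-profile key so
+    #    that L2 population kicks in; "force_up" is a placeholder key name.
+    pod1.update_port(shadow['id'],
+                     {'port': {'binding:profile': {'force_up': True}}})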
+ +Here is the flow of shadow agent and shadow port process:: + + +-------+ +---------+ +---------+ + | | | | +---------+ | | + | Local | | Local | | | +----------+ +------+ | Local | + | Nova | | Neutron | | Central | | | | | | Neutron | + | Pod1 | | Pod1 | | Neutron | | Database | | XJob | | Pod2 | + | | | | | | | | | | | | + +---+---+ +---- ----+ +----+----+ +----+-----+ +--+---+ +----+----+ + | | | | | | + | update port1 | | | | | + | [host id] | | | | | + +---------------> | | | | + | | update port1 | | | | + | | [agent info] | | | | + | +----------------> | | | + | | | save shadow | | | + | | | agent info | | | + | | +----------------> | | + | | | | | | + | | | trigger shadow | | | + | | | port setup job | | | + | | | for pod1 | | | + | | +---------------------------------> | + | | | | | query ports in | + | | | | | the same network | + | | | | +------------------> + | | | | | | + | | | | | return port2 | + | | | | <------------------+ + | | | | query shadow | | + | | | | agent info | | + | | | | for port2 | | + | | | <----------------+ | + | | | | | | + | | | | create shadow | | + | | | | port for port2 | | + | <--------------------------------------------------+ | + | | | | | | + | | create shadow | | | | + | | agent and port | | | | + | +-----+ | | | | + | | | | | | | + | | | | | | | + | <-----+ | | | | + | | | | update shadow | | + | | | | port to active | | + | <--------------------------------------------------+ | + | | | | | | + | | L2 population | | | trigger shadow | + | +-----+ | | | port setup job | + | | | | | | for pod2 | + | | | | | +-----+ | + | <-----+ | | | | | + | | | | | | | + | | | | <-----+ | + | | | | | | + | | | | | | + + + + + + + + +Bridge network can support VxLAN network in the same way, we just create shadow +ports for router interface and router gateway. In the above graph, local Nova +server updates port with host ID to trigger the whole process. L3 agent will +update interface port and gateway port with host ID, so similar process will +be triggered to create shadow ports for router interface and router gateway. + +Currently Neutron team is working on push notification [1]_, Neutron server +will send resource data to agents; agents cache this data and use it to do the +real job like configuring openvswitch, updating iptables, configuring dnsmasq, +etc. Agents don't need to retrieve resource data from Neutron server via RPC +any more. Based on push notification, if tunnel endpoint information is stored +in port object later, and this information supports updating via ReSTful API, +we can simplify the solution for challenge 3 and 4. We just need to create +shadow port containing tunnel endpoint information. This information will be +pushed to agents and agents use it to create necessary tunnels and flows. + +**How to support different back-ends besides ML2+OVS implementation** + +We consider two typical back-ends that can support cross-Neutron VxLAN networking, +L2 gateway and SDN controller like ODL. For L2 gateway, we consider only +supporting static tunnel endpoint information for L2 gateway at the first step. +Shadow agent and shadow port process is almost the same with the ML2+OVS +implementation. The difference is that, for L2 gateway, the tunnel IP of the +shadow agent is set to the tunnel endpoint of the L2 gateway. So after L2 +population, L2 agents will create tunnels to the tunnel endpoint of the L2 +gateway. 
For SDN controller, we assume that SDN controller has the ability to +manage tunnel endpoint information across Neutron servers, so Tricircle only helps to +allocate VxLAN ID and keep the VxLAN ID identical across Neutron servers for one network. +Shadow agent and shadow port process will not be used in this case. However, if +different SDN controllers are used in different pods, it will be hard for each +SDN controller to connect hosts managed by other SDN controllers since each SDN +controller has its own mechanism. This problem is discussed in this page [2]_. +One possible solution under Tricircle is as what L2 gateway does. We create +shadow ports that contain L2 gateway tunnel endpoint information so SDN +controller can build tunnels in its own way. We then configure L2 gateway in +each pod to forward the packets between L2 gateways. L2 gateways discussed here +are mostly hardware based, and can be controlled by SDN controller. SDN +controller will use ML2 mechanism driver to receive the L2 network context and +further control L2 gateways for the network. + +To distinguish different back-ends, we will add a new configuration option +cross_pod_vxlan_mode whose valid values are "p2p", "l2gw" and "noop". Mode +"p2p" works for the ML2+OVS scenario, in this mode, shadow ports and shadow +agents containing host tunnel endpoint information are created; mode "l2gw" +works for the L2 gateway scenario, in this mode, shadow ports and shadow agents +containing L2 gateway tunnel endpoint information are created. For the SDN +controller scenario, as discussed above, if SDN controller can manage tunnel +endpoint information by itself, we only need to use "noop" mode, meaning that +neither shadow ports nor shadow agents will be created; or if SDN controller +can manage hardware L2 gateway, we can use "l2gw" mode. + +Data Model Impact +================= + +New table "shadow_agents" is added. + +Dependencies +============ + +None + +Documentation Impact +==================== + +- Update configuration guide to introduce options for VxLAN network +- Update networking guide to discuss new scenarios with VxLAN network +- Add release note about cross-Neutron VxLAN networking support + +References +========== + +.. [1] https://blueprints.launchpad.net/neutron/+spec/push-notifications +.. [2] http://etherealmind.com/help-wanted-stitching-a-federated-sdn-on-openstack-with-evpn/ diff --git a/doc/source/devspecs/devspecs-guide.rst b/doc/source/devspecs/devspecs-guide.rst new file mode 100644 index 00000000..7b9409ed --- /dev/null +++ b/doc/source/devspecs/devspecs-guide.rst @@ -0,0 +1,18 @@ +Devspecs Guide +------------------ +Some specs for developers. Who are interest in tricircle. + +.. include:: ./async_job_management.rst +.. include:: ./cross-neutron-l2-networking.rst +.. include:: ./cross-neutron-vxlan-networking.rst +.. include:: ./dynamic-pod-binding.rst +.. include:: ./enhance-xjob-reliability.rst +.. include:: ./l3-networking-combined-bridge-net.rst +.. include:: ./l3-networking-multi-NS-with-EW-enabled.rst +.. include:: ./lbaas.rst +.. include:: ./legacy_tables_clean.rst +.. include:: ./local-neutron-plugin.rst +.. include:: ./new-l3-networking-mulit-NS-with-EW.rst +.. include:: ./quality-of-service.rst +.. include:: ./resource_deleting.rst +.. 
include:: ./smoke-test-engine.rst diff --git a/doc/source/devspecs/dynamic-pod-binding.rst b/doc/source/devspecs/dynamic-pod-binding.rst new file mode 100644 index 00000000..d85c6832 --- /dev/null +++ b/doc/source/devspecs/dynamic-pod-binding.rst @@ -0,0 +1,236 @@ +================================= +Dynamic Pod Binding in Tricircle +================================= + +Background +=========== + +Most public cloud infrastructure is built with Availability Zones (AZs). +Each AZ is consisted of one or more discrete data centers, each with high +bandwidth and low latency network connection, separate power and facilities. +These AZs offer cloud tenants the ability to operate production +applications and databases deployed into multiple AZs are more highly +available, fault tolerant and scalable than a single data center. + +In production clouds, each AZ is built by modularized OpenStack, and each +OpenStack is one pod. Moreover, one AZ can include multiple pods. Among the +pods, they are classified into different categories. For example, servers +in one pod are only for general purposes, and the other pods may be built +for heavy load CAD modeling with GPU. So pods in one AZ could be divided +into different groups. Different pod groups for different purposes, and +the VM's cost and performance are also different. + +The concept "pod" is created for the Tricircle to facilitate managing +OpenStack instances among AZs, which therefore is transparent to cloud +tenants. The Tricircle maintains and manages a pod binding table which +records the mapping relationship between a cloud tenant and pods. When the +cloud tenant creates a VM or a volume, the Tricircle tries to assign a pod +based on the pod binding table. + +Motivation +=========== + +In resource allocation scenario, when a tenant creates a VM in one pod and a +new volume in a another pod respectively. If the tenant attempt to attach the +volume to the VM, the operation will fail. In other words, the volume should +be in the same pod where the VM is, otherwise the volume and VM would not be +able to finish the attachment. Hence, the Tricircle needs to ensure the pod +binding so as to guarantee that VM and volume are created in one pod. + +In capacity expansion scenario, when resources in one pod are exhausted, +then a new pod with the same type should be added into the AZ. Therefore, +new resources of this type should be provisioned in the new added pod, which +requires dynamical change of pod binding. The pod binding could be done +dynamically by the Tricircle, or by admin through admin api for maintenance +purpose. For example, for maintenance(upgrade, repairement) window, all +new provision requests should be forwarded to the running one, but not +the one under maintenance. + +Solution: dynamic pod binding +============================== + +It's quite headache for capacity expansion inside one pod, you have to +estimate, calculate, monitor, simulate, test, and do online grey expansion +for controller nodes and network nodes whenever you add new machines to the +pod. It's quite big challenge as more and more resources added to one pod, +and at last you will reach limitation of one OpenStack. If this pod's +resources exhausted or reach the limit for new resources provisioning, the +Tricircle needs to bind tenant to a new pod instead of expanding the current +pod unlimitedly. The Tricircle needs to select a proper pod and stay binding +for a duration, in this duration VM and volume will be created for one tenant +in the same pod. 
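+
+The binding-with-duration idea can be sketched as below; the in-memory table,
+names and lifetime value are purely illustrative, while the real state lives
+in the pod binding table (a concrete pod-group example follows the sketch)::
+
+    import time
+
+    BINDING_LIFETIME = 30 * 24 * 3600  # e.g. keep a binding for about a month
+
+    # tenant_id -> {'pod': ..., 'bound_at': ...}
+    pod_bindings = {}
+
+    def get_bound_pod(tenant_id, candidate_pods):
+        """Return the pod bound to the tenant, rebinding if the binding expired."""
+        binding = pod_bindings.get(tenant_id)
+        now = time.time()
+        if binding and now - binding['bound_at'] < BINDING_LIFETIME:
+            return binding['pod']
+        # Expired or first request: pick a new pod (here simply the first
+        # candidate; filtering and weighting are discussed later).
+        pod = candidate_pods[0]
+        pod_bindings[tenant_id] = {'pod': pod, 'bound_at': now}
+        return pod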
+ +For example, suppose we have two groups of pods, and each group has 3 pods, +i.e., + +GroupA(Pod1, Pod2, Pod3) for general purpose VM, + +GroupB(Pod4, Pod5, Pod6) for CAD modeling. + +Tenant1 is bound to Pod1, Pod4 during the first phase for several months. +In the first phase, we can just add weight in Pod, for example, Pod1, weight 1, +Pod2, weight2, this could be done by adding one new field in pod table, or no +field at all, just link them by the order created in the Tricircle. In this +case, we use the pod creation time as the weight. + +If the tenant wants to allocate VM/volume for general VM, Pod1 should be +selected. It can be implemented with flavor or volume type metadata. For +general VM/Volume, there is no special tag in flavor or volume type metadata. + +If the tenant wants to allocate VM/volume for CAD modeling VM, Pod4 should be +selected. For CAD modeling VM/Volume, a special tag "resource: CAD Modeling" +in flavor or volume type metadata determines the binding. + +When it is detected that there is no more resources in Pod1, Pod4. Based on +the resource_affinity_tag, the Tricircle queries the pod table for available +pods which provision a specific type of resources. The field resource_affinity +is a key-value pair. The pods will be selected when there are matched +key-value in flavor extra-spec or volume extra-spec. A tenant will be bound +to one pod in one group of pods with same resource_affinity_tag. In this case, +the Tricircle obtains Pod2 and Pod3 for general purpose, as well as Pod5 an +Pod6 for CAD purpose. The Tricircle needs to change the binding, for example, +tenant1 needs to be bound to Pod2, Pod5. + +Implementation +============== + +Measurement +---------------- + +To get the information of resource utilization of pods, the Tricircle needs to +conduct some measurements on pods. The statistic task should be done in +bottom pod. + +For resources usages, current cells provide interface to retrieve usage for +cells [1]. OpenStack provides details of capacity of a cell, including disk +and ram via api of showing cell capacities [1]. + +If OpenStack is not running with cells mode, we can ask Nova to provide +an interface to show the usage detail in AZ. Moreover, an API for usage +query at host level is provided for admins [3], through which we can obtain +details of a host, including cpu, memory, disk, and so on. + +Cinder also provides interface to retrieve the backend pool usage, +including updated time, total capacity, free capacity and so on [2]. + +The Tricircle needs to have one task to collect the usage in the bottom on +daily base, to evaluate whether the threshold is reached or not. A threshold +or headroom could be configured for each pod, but not to reach 100% exhaustion +of resources. + +On top there should be no heavy process. So getting the sum info from the +bottom can be done in the Tricircle. After collecting the details, the +Tricircle can judge whether a pod reaches its limit. + +Tricircle +---------- + +The Tricircle needs a framework to support different binding policy (filter). + +Each pod is one OpenStack instance, including controller nodes and compute +nodes. E.g., + +:: + + +-> controller(s) - pod1 <--> compute nodes <---+ + | + The tricircle +-> controller(s) - pod2 <--> compute nodes <---+ resource migration, if necessary + (resource controller) .... | + +-> controller(s) - pod{N} <--> compute nodes <-+ + + +The Tricircle selects a pod to decide where the requests should be forwarded +to which controller. 
Then the controllers in the selected pod will do its own +scheduling. + +One simplest binding filter is as follows. Line up all available pods in a +list and always select the first one. When all the resources in the first pod +has been allocated, remove it from the list. This is quite like how production +cloud is built: at first, only a few pods are in the list, and then add more +and more pods if there is not enough resources in current cloud. For example, + +List1 for general pool: Pod1 <- Pod2 <- Pod3 +List2 for CAD modeling pool: Pod4 <- Pod5 <- Pod6 + +If Pod1's resource exhausted, Pod1 is removed from List1. The List1 is changed +to: Pod2 <- Pod3. +If Pod4's resource exhausted, Pod4 is removed from List2. The List2 is changed +to: Pod5 <- Pod6 + +If the tenant wants to allocate resources for general VM, the Tricircle +selects Pod2. If the tenant wants to allocate resources for CAD modeling VM, +the Tricircle selects Pod5. + +Filtering +------------- + +For the strategy of selecting pods, we need a series of filters. Before +implementing dynamic pod binding, the binding criteria are hard coded to +select the first pod in the AZ. Hence, we need to design a series of filter +algorithms. Firstly, we plan to design an ALLPodsFilter which does no +filtering and passes all the available pods. Secondly, we plan to design an +AvailabilityZoneFilter which passes the pods matching the specified available +zone. Thirdly, we plan to design a ResourceAffiniyFilter which passes the pods +matching the specified resource type. Based on the resource_affinity_tag, +the Tricircle can be aware of which type of resource the tenant wants to +provision. In the future, we can add more filters, which requires adding more +information in the pod table. + +Weighting +------------- + +After filtering all the pods, the Tricircle obtains the available pods for a +tenant. The Tricircle needs to select the most suitable pod for the tenant. +Hence, we need to define a weight function to calculate the corresponding +weight of each pod. Based on the weights, the Tricircle selects the pod which +has the maximum weight value. When calculating the weight of a pod, we need +to design a series of weigher. We first take the pod creation time into +consideration when designing the weight function. The second one is the idle +capacity, to select a pod which has the most idle capacity. Other metrics +will be added in the future, e.g., cost. + +Data Model Impact +----------------- + +Firstly, we need to add a column “resource_affinity_tag” to the pod table, +which is used to store the key-value pair, to match flavor extra-spec and +volume extra-spec. + +Secondly, in the pod binding table, we need to add fields of start binding +time and end binding time, so the history of the binding relationship could +be stored. + +Thirdly, we need a table to store the usage of each pod for Cinder/Nova. +We plan to use JSON object to store the usage information. Hence, even if +the usage structure is changed, we don't need to update the table. And if +the usage value is null, that means the usage has not been initialized yet. +As just mentioned above, the usage could be refreshed in daily basis. If it's +not initialized yet, it means there is still lots of resources available, +which could be scheduled just like this pod has not reach usage threshold. 
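+
+A skeletal version of the filter and weigher framework described in the
+Filtering and Weighting sections above; the class names mirror the spec but
+the code is only an illustration, not the final implementation::
+
+    class AvailabilityZoneFilter(object):
+        """Pass pods that belong to the requested availability zone."""
+        def filter_pods(self, pods, request):
+            return [p for p in pods if p['az_name'] == request.get('az_name')]
+
+    class ResourceAffinityFilter(object):
+        """Pass pods whose resource_affinity_tag matches the flavor or
+        volume type extra spec, e.g. {'resource': 'CAD Modeling'}."""
+        def filter_pods(self, pods, request):
+            wanted = request.get('resource_affinity', {})
+            return [p for p in pods
+                    if all(p.get('resource_affinity_tag', {}).get(k) == v
+                           for k, v in wanted.items())]
+
+    class CreationTimeWeigher(object):
+        """Prefer earlier-created pods, mimicking the fill-first behaviour."""
+        def weigh(self, pod):
+            return -pod['created_at']
+
+    def select_pod(pods, request, filters, weighers):
+        for f in filters:
+            pods = f.filter_pods(pods, request)
+        if not pods:
+            return None
+        return max(pods, key=lambda p: sum(w.weigh(p) for w in weighers))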
+ +Dependencies +------------ + +None + + +Testing +------- + +None + + +Documentation Impact +-------------------- + +None + + +Reference +--------- + +[1] http://developer.openstack.org/api-ref-compute-v2.1.html#showCellCapacities + +[2] http://developer.openstack.org/api-ref-blockstorage-v2.html#os-vol-pool-v2 + +[3] http://developer.openstack.org/api-ref-compute-v2.1.html#showinfo diff --git a/doc/source/devspecs/enhance-xjob-reliability.rst b/doc/source/devspecs/enhance-xjob-reliability.rst new file mode 100644 index 00000000..e29ebde6 --- /dev/null +++ b/doc/source/devspecs/enhance-xjob-reliability.rst @@ -0,0 +1,234 @@ +======================================= +Enhance Reliability of Asynchronous Job +======================================= + +Background +========== + +Currently we are using cast method in our RPC client to trigger asynchronous +job in XJob daemon. After one of the worker threads receives the RPC message +from the message broker, it registers the job in the database and starts to +run the handle function. The registration guarantees that asynchronous job will +not be lost after the job fails and the failed job can be redone. The detailed +discussion of the asynchronous job process in XJob daemon is covered in our +design document [1]. + +Though asynchronous jobs are correctly saved after worker threads get the RPC +message, we still have risk to lose jobs. By using cast method, it's only +guaranteed that the message is received by the message broker, but there's no +guarantee that the message can be received by the message consumer, i.e., the +RPC server thread running in XJob daemon. According to the RabbitMQ document, +undelivered messages will be lost if RabbitMQ server stops [2]. Message +persistence or publisher confirm can be used to increase reliability, but +they sacrifice performance. On the other hand, we can not assume that message +brokers other than RabbitMQ will provide similar persistence or confirmation +functionality. Therefore, Tricircle itself should handle the asynchronous job +reliability problem as far as possible. Since we already have a framework to +register, run and redo asynchronous jobs in XJob daemon, we propose a cheaper +way to improve reliability. + +Proposal +======== + +One straightforward way to make sure that the RPC server has received the RPC +message is to use call method. RPC client will be blocked until the RPC server +replies the message if it uses call method to send the RPC request. So if +something wrong happens before the reply, RPC client can be aware of it. Of +course we cannot make RPC client wait too long, thus RPC handlers in the RPC +server side need to be simple and quick to run. Thanks to the asynchronous job +framework we already have, migrating from cast method to call method is easy. 
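+
+The difference between the two invocation styles, expressed with
+oslo.messaging (the topic and method name are illustrative)::
+
+    from oslo_config import cfg
+    import oslo_messaging
+
+    transport = oslo_messaging.get_transport(cfg.CONF)
+    target = oslo_messaging.Target(topic='xjob', version='1.0')
+    client = oslo_messaging.RPCClient(transport, target)
+    ctxt = {}  # request context
+
+    # Fire-and-forget: returns once the broker accepts the message; the XJob
+    # daemon may still never receive it.
+    client.cast(ctxt, 'setup_bottom_router', net_id='NET_ID')
+
+    # Blocking: returns only after the RPC server side has replied, so the
+    # caller knows the job has at least been registered.
+    client.call(ctxt, 'setup_bottom_router', net_id='NET_ID')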
+ +Here is the flow of the current process:: + + +--------+ +--------+ +---------+ +---------------+ +----------+ + | | | | | | | | | | + | API | | RPC | | Message | | RPC Server | | Database | + | Server | | client | | Broker | | Handle Worker | | | + | | | | | | | | | | + +---+----+ +---+----+ +----+----+ +-------+-------+ +----+-----+ + | | | | | + | call RPC API | | | | + +--------------> | | | + | | send cast message | | | + | +-------------------> | | + | call return | | dispatch message | | + <--------------+ +------------------> | + | | | | register job | + | | | +----------------> + | | | | | + | | | | obtain lock | + | | | +----------------> + | | | | | + | | | | run job | + | | | +----+ | + | | | | | | + | | | | | | + | | | <----+ | + | | | | | + | | | | | + + + + + + + +We can just leave **register job** phase in the RPC handle and put **obtain +lock** and **run job** phase in a separate thread, so the RPC handle is simple +enough to use call method to invoke it. Here is the proposed flow:: + + +--------+ +--------+ +---------+ +---------------+ +----------+ +-------------+ +-------+ + | | | | | | | | | | | | | | + | API | | RPC | | Message | | RPC Server | | Database | | RPC Server | | Job | + | Server | | client | | Broker | | Handle Worker | | | | Loop Worker | | Queue | + | | | | | | | | | | | | | | + +---+----+ +---+----+ +----+----+ +-------+-------+ +----+-----+ +------+------+ +---+---+ + | | | | | | | + | call RPC API | | | | | | + +--------------> | | | | | + | | send call message | | | | | + | +--------------------> | | | | + | | | dispatch message | | | | + | | +------------------> | | | + | | | | register job | | | + | | | +----------------> | | + | | | | | | | + | | | | job enqueue | | | + | | | +------------------------------------------------> + | | | | | | | + | | | reply message | | | job dequeue | + | | <------------------+ | |--------------> + | | send reply message | | | obtain lock | | + | <--------------------+ | <----------------+ | + | call return | | | | | | + <--------------+ | | | run job | | + | | | | | +----+ | + | | | | | | | | + | | | | | | | | + | | | | | +----> | + | | | | | | | + | | | | | | | + + + + + + + + + +In the above graph, **Loop Worker** is a new-introduced thread to do the actual +work. **Job Queue** is an eventlet queue used to coordinate **Handle +Worker** who produces job entries and **Loop Worker** who consumes job entries. +While accessing an empty queue, **Loop Worker** will be blocked until some job +entries are put into the queue. **Loop Worker** retrieves job entries from the +job queue then starts to run it. Similar to the original flow, since multiple +workers may get the same type of job for the same resource at the same time, +workers need to obtain the lock before it can run the job. One problem occurs +whenever XJob daemon stops before it finishes all the jobs in the job queue; +all unfinished jobs are lost. To solve it, we make changes to the original +periodical task that is used to redo failed job, and let it also handle the +jobs which have been registered for a certain time but haven't been started. +So both failed jobs and "orphan" new jobs can be picked up and redone. + +You can see that **Handle Worker** doesn't do many works, it just consumes RPC +messages, registers jobs then puts job items in the job queue. So one extreme +solution here, will be to register new jobs in the API server side and start +worker threads to retrieve jobs from the database and run them. 
In this way, we +can remove all the RPC processes and use database to coordinate. The drawback +of this solution is that we don't dispatch jobs. All the workers query jobs +from the database so there is high probability that some of the workers obtain +the same job and thus race occurs. In the first solution, message broker +helps us to dispatch messages, and so dispatch jobs. + +Considering job dispatch is important, we can make some changes to the second +solution and move to the third one, that is to also register new jobs in the +API server side, but we still use cast method to trigger asynchronous job in +XJob daemon. Since job registration is done in the API server side, we are not +afraid that the jobs will be lost if cast messages are lost. If API server side +fails to register the job, it will return response of failure; If registration +of job succeeds, the job will be done by XJob daemon at last. By using RPC, we +dispatch jobs with the help of message brokers. One thing which makes cast +method better than call method is that retrieving RPC messages and running job +handles are done in the same thread so if one XJob daemon is busy handling +jobs, RPC messages will not be dispatched to it. However when using call +method, RPC messages are retrieved by one thread(the **Handle Worker**) and job +handles are run by another thread(the **Loop Worker**), so XJob daemon may +accumulate many jobs in the queue and at the same time it's busy handling jobs. +This solution has the same problem with the call method solution. If cast +messages are lost, the new jobs are registered in the database but no XJob +daemon is aware of these new jobs. Same way to solve it, use periodical task to +pick up these "orphan" jobs. Here is the flow:: + + +--------+ +--------+ +---------+ +---------------+ +----------+ + | | | | | | | | | | + | API | | RPC | | Message | | RPC Server | | Database | + | Server | | client | | Broker | | Handle Worker | | | + | | | | | | | | | | + +---+----+ +---+----+ +----+----+ +-------+-------+ +----+-----+ + | | | | | + | call RPC API | | | | + +--------------> | | | + | | register job | | | + | +-------------------------------------------------------> + | | | | | + | | [if succeed to | | | + | | register job] | | | + | | send cast message | | | + | +-------------------> | | + | call return | | dispatch message | | + <--------------+ +------------------> | + | | | | obtain lock | + | | | +----------------> + | | | | | + | | | | run job | + | | | +----+ | + | | | | | | + | | | | | | + | | | <----+ | + | | | | | + | | | | | + + + + + + + +Discussion +========== + +In this section we discuss the pros and cons of the above three solutions. + +.. list-table:: **Solution Comparison** + :header-rows: 1 + + * - Solution + - Pros + - Cons + * - API server uses call + - no RPC message lost + - downtime of unfinished jobs in the job queue when XJob daemon stops, + job dispatch not based on XJob daemon workload + * - API server register jobs + no RPC + - no requirement on RPC(message broker), no downtime + - no job dispatch, conflict costs time + * - API server register jobs + uses cast + - job dispatch based on XJob daemon workload + - downtime of lost jobs due to cast messages lost + +Downtime means that after a job is dispatched to a worker, other workers need +to wait for a certain time to determine that job is expired and take over it. 
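A minimal sketch of the third solution's control flow is given below; the
helper names ("register_job_in_db", "rpc_client",
"list_failed_or_expired_new_jobs", "run_job") are placeholders for the
existing Tricircle job framework, not actual function names::

    def trigger_async_job(ctxt, project_id, job_type, resource):
        # 1. register the job synchronously in the Tricircle database, so
        #    the job survives even if every RPC message is dropped; if this
        #    step fails, the API server returns a failure response
        job = register_job_in_db(ctxt, project_id, job_type, resource)
        # 2. best-effort cast, letting the message broker dispatch the job
        #    to a not-too-busy XJob daemon; losing this message only delays
        #    the job, it does not lose it
        rpc_client.cast(ctxt, 'run_job', job_id=job['id'])
        return job

    def redo_jobs_periodically(ctxt):
        # periodical task in XJob daemon: failed jobs and "orphan" NEW jobs
        # that have been registered for a certain time without being started
        # are both picked up and redone locally
        for job in list_failed_or_expired_new_jobs(ctxt):
            run_job(ctxt, job)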
+ +Conclusion +========== + +We decide to implement the third solution(API server register jobs + uses cast) +since it improves the asynchronous job reliability and at the mean time has +better work load dispatch. + +Data Model Impact +================= + +None + +Dependencies +============ + +None + +Documentation Impact +==================== + +None + +References +========== + +..[1] https://docs.google.com/document/d/1zcxwl8xMEpxVCqLTce2-dUOtB-ObmzJTbV1uSQ6qTsY +..[2] https://www.rabbitmq.com/tutorials/tutorial-two-python.html +..[3] https://www.rabbitmq.com/confirms.html +..[4] http://eventlet.net/doc/modules/queue.html diff --git a/doc/source/devspecs/index.rst b/doc/source/devspecs/index.rst new file mode 100644 index 00000000..6b0ca733 --- /dev/null +++ b/doc/source/devspecs/index.rst @@ -0,0 +1,8 @@ +========================== +Tricircle Devspecs Guide +========================== + +.. toctree:: + :maxdepth: 4 + + devspecs-guide diff --git a/doc/source/devspecs/l3-networking-combined-bridge-net.rst b/doc/source/devspecs/l3-networking-combined-bridge-net.rst new file mode 100644 index 00000000..a471a6d9 --- /dev/null +++ b/doc/source/devspecs/l3-networking-combined-bridge-net.rst @@ -0,0 +1,554 @@ +============================================== +Layer-3 Networking and Combined Bridge Network +============================================== + +Background +========== + +To achieve cross-Neutron layer-3 networking, we utilize a bridge network to +connect networks in each Neutron server, as shown below: + +East-West networking:: + + +-----------------------+ +-----------------------+ + | OpenStack1 | | OpenStack2 | + | | | | + | +------+ +---------+ | +------------+ | +---------+ +------+ | + | | net1 | | ip1| | | bridge net | | |ip2 | | net2 | | + | | +--+ R +---+ +---+ R +--+ | | + | | | | | | | | | | | | | | + | +------+ +---------+ | +------------+ | +---------+ +------+ | + +-----------------------+ +-----------------------+ + + Fig 1 + +North-South networking:: + + +---------------------+ +-------------------------------+ + | OpenStack1 | | OpenStack2 | + | | | | + | +------+ +-------+ | +--------------+ | +-------+ +----------------+ | + | | net1 | | ip1| | | bridge net | | | ip2| | external net | | + | | +--+ R1 +---+ +---+ R2 +--+ | | + | | | | | | | 100.0.1.0/24 | | | | | 163.3.124.0/24 | | + | +------+ +-------+ | +--------------+ | +-------+ +----------------+ | + +---------------------+ +-------------------------------+ + + Fig 2 + +To support east-west networking, we configure extra routes in routers in each +OpenStack cloud:: + + In OpenStack1, destination: net2, nexthop: ip2 + In OpenStack2, destination: net1, nexthop: ip1 + +To support north-south networking, we set bridge network as the external +network in OpenStack1 and as the internal network in OpenStack2. For instance +in net1 to access the external network, the packets are SNATed twice, first +SNATed to ip1, then SNATed to ip2. For floating ip binding, ip in net1 is first +bound to ip(like 100.0.1.5) in bridge network(bridge network is attached to R1 +as external network), then the ip(100.0.1.5) in bridge network is bound to ip +(like 163.3.124.8)in the real external network (bridge network is attached to +R2 as internal network). + +Problems +======== + +The idea of introducing a bridge network is good, but there are some problems +in the current usage of the bridge network. + +Redundant Bridge Network +------------------------ + +We use two bridge networks to achieve layer-3 networking for each tenant. 
If +VLAN is used as the bridge network type, limited by the range of VLAN tag, only +2048 pairs of bridge networks can be created. The number of tenants supported +is far from enough. + +Redundant SNAT +-------------- + +In the current implementation, packets are SNATed two times for outbound +traffic and are DNATed two times for inbound traffic. The drawback is that +packets of outbound traffic consume extra operations. Also, we need to maintain +extra floating ip pool for inbound traffic. + +DVR support +----------- + +Bridge network is attached to the router as an internal network for east-west +networking and north-south networking when the real external network and the +router are not located in the same OpenStack cloud. It's fine when the bridge +network is VLAN type, since packets directly go out of the host and are +exchanged by switches. But if we would like to support VxLAN as the bridge +network type later, attaching bridge network as an internal network in the +DVR scenario will cause some troubles. How DVR connects the internal networks +is that packets are routed locally in each host, and if the destination is not +in the local host, the packets are sent to the destination host via a VxLAN +tunnel. Here comes the problem, if bridge network is attached as an internal +network, the router interfaces will exist in all the hosts where the router +namespaces are created, so we need to maintain lots of VTEPs and VxLAN tunnels +for bridge network in the Tricircle. Ports in bridge network are located in +different OpenStack clouds so local Neutron server is not aware of ports in +other OpenStack clouds and will not setup VxLAN tunnel for us. + +Proposal +-------- + +To address the above problems, we propose to combine the bridge networks for +east-west and north-south networking. Bridge network is always attached to +routers as an external network. In the DVR scenario, different from router +interfaces, router gateway will only exist in the SNAT namespace in a specific +host, which reduces the number of VTEPs and VxLAN tunnels the Tricircle needs +to handle. By setting "enable_snat" option to "False" when attaching the router +gateway, packets will not be SNATed when go through the router gateway, so +packets are only SNATed and DNATed one time in the real external gateway. +However, since one router can only be attached to one external network, in the +OpenStack cloud where the real external network is located, we need to add one +more router to connect the bridge network with the real external network. 
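For illustration, attaching the bridge network to a router as an external
gateway with SNAT disabled could look like the following; the router and
network names are examples only::

    # attach bridge-net as R1's gateway without SNAT, so packets are only
    # SNATed/DNATed once, at the real external gateway
    neutron router-gateway-set --disable-snat R1 bridge-net

    # equivalent form with the openstack client
    openstack router set --external-gateway bridge-net --disable-snat R1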
The +network topology is shown below:: + + +-------------------------+ +-------------------------+ + |OpenStack1 | |OpenStack2 | + | +------+ +--------+ | +------------+ | +--------+ +------+ | + | | | | IP1| | | | | |IP2 | | | | + | | net1 +---+ R1 XXXXXXX bridge net XXXXXXX R2 +---+ net2 | | + | | | | | | | | | | | | | | + | +------+ +--------+ | +---X----+---+ | +--------+ +------+ | + | | X | | | + +-------------------------+ X | +-------------------------+ + X | + X | + +--------------------------------X----|-----------------------------------+ + |OpenStack3 X | | + | X | | + | +------+ +--------+ X | +--------+ +--------------+ | + | | | | IP3| X | |IP4 | | | | + | | net3 +----+ R3 XXXXXXXXXX +---+ R4 XXXXXX external net | | + | | | | | | | | | | + | +------+ +--------+ +--------+ +--------------+ | + | | + +-------------------------------------------------------------------------+ + + router interface: ----- + router gateway: XXXXX + IPn: router gateway ip or router interface ip + + Fig 3 + +Extra routes and gateway ip are configured to build the connection:: + + routes of R1: net2 via IP2 + net3 via IP3 + external gateway ip of R1: IP4 + (IP2 and IP3 are from bridge net, so routes will only be created in + SNAT namespace) + + routes of R2: net1 via IP1 + net3 via IP3 + external gateway ip of R2: IP4 + (IP1 and IP3 are from bridge net, so routes will only be created in + SNAT namespace) + + routes of R3: net1 via IP1 + net2 via IP2 + external gateway ip of R3: IP4 + (IP1 and IP2 are from bridge net, so routes will only be created in + SNAT namespace) + + routes of R4: net1 via IP1 + net2 via IP2 + net3 via IP3 + external gateway ip of R1: real-external-gateway-ip + disable DVR mode + +An alternative solution which can reduce the extra router is that for the +router that locates in the same OpenStack cloud with the real external network, +we attach the bridge network as an internal network, so the real external +network can be attached to the same router. Here is the topology:: + + +-------------------------+ +-------------------------+ + |OpenStack1 | |OpenStack2 | + | +------+ +--------+ | +------------+ | +--------+ +------+ | + | | | | IP1| | | | | |IP2 | | | | + | | net1 +---+ R1 XXXXXXX bridge net XXXXXXX R2 +---+ net2 | | + | | | | | | | | | | | | | | + | +------+ +--------+ | +-----+------+ | +--------+ +------+ | + | | | | | + +-------------------------+ | +-------------------------+ + | + | + +----------------------|---------------------------------+ + |OpenStack3 | | + | | | + | +------+ +---+----+ +--------------+ | + | | | | IP3 | | | | + | | net3 +----+ R3 XXXXXXXX external net | | + | | | | | | | | + | +------+ +--------+ +--------------+ | + | | + +--------------------------------------------------------+ + + router interface: ----- + router gateway: XXXXX + IPn: router gateway ip or router interface ip + + Fig 4 + +The limitation of this solution is that R3 needs to be set as non-DVR mode. +As is discussed above, for network attached to DVR mode router, the router +interfaces of this network will be created in all the hosts where the router +namespaces are created. Since these interfaces all have the same IP and MAC, +packets sent between instances(could be virtual machine, container or bare +metal) can't be directly wrapped in the VxLAN packets, otherwise packets sent +from different hosts will have the same MAC. How Neutron solve this problem is +to introduce DVR MACs which are allocated by Neutron server and assigned to +each host hosting DVR mode router. 
Before wrapping the packets in the VxLAN +packets, the source MAC of the packets are replaced by the DVR MAC of the host. +If R3 is DVR mode, source MAC of packets sent from net3 to bridge network will +be changed, but after the packets reach R1 or R2, R1 and R2 don't recognize the +DVR MAC, so the packets are dropped. + +The same, extra routes and gateway ip are configured to build the connection:: + + routes of R1: net2 via IP2 + net3 via IP3 + external gateway ip of R1: IP3 + (IP2 and IP3 are from bridge net, so routes will only be created in + SNAT namespace) + + routes of R2: net1 via IP1 + net3 via IP3 + external gateway ip of R1: IP3 + (IP1 and IP3 are from bridge net, so routes will only be created in + SNAT namespace) + + routes of R3: net1 via IP1 + net2 via IP2 + external gateway ip of R3: real-external-gateway-ip + (non-DVR mode, routes will all be created in the router namespace) + +The real external network can be deployed in one dedicated OpenStack cloud. In +that case, there is no need to run services like Nova and Cinder in that cloud. +Instance and volume will not be provisioned in that cloud. Only Neutron service +is required. Then the above two topologies transform to the same one:: + + +-------------------------+ +-------------------------+ + |OpenStack1 | |OpenStack2 | + | +------+ +--------+ | +------------+ | +--------+ +------+ | + | | | | IP1| | | | | |IP2 | | | | + | | net1 +---+ R1 XXXXXXX bridge net XXXXXXX R2 +---+ net2 | | + | | | | | | | | | | | | | | + | +------+ +--------+ | +-----+------+ | +--------+ +------+ | + | | | | | + +-------------------------+ | +-------------------------+ + | + | + +-----------|-----------------------------------+ + |OpenStack3 | | + | | | + | | +--------+ +--------------+ | + | | |IP3 | | | | + | +---+ R3 XXXXXX external net | | + | | | | | | + | +--------+ +--------------+ | + | | + +-----------------------------------------------+ + + Fig 5 + +The motivation of putting the real external network in a dedicated OpenStack +cloud is to simplify the real external network management, and also to separate +the real external network and the internal networking area, for better security +control. + +Discussion +---------- + +The implementation of DVR does bring some restrictions to our cross-Neutron +layer-2 and layer-3 networking, resulting in the limitation of the above two +proposals. In the first proposal, if the real external network is deployed with +internal networks in the same OpenStack cloud, one extra router is needed in +that cloud. Also, since one of the router is DVR mode and the other is non-DVR +mode, we need to deploy at least two l3 agents, one is dvr-snat mode and the +other is legacy mode. The limitation of the second proposal is that the router +is non-DVR mode, so east-west and north-south traffic are all go through the +router namespace in the network node. + +Also, cross-Neutron layer-2 networking can not work with DVR because of +source MAC replacement. 
Considering the following topology:: + + +----------------------------------------------+ +-------------------------------+ + |OpenStack1 | |OpenStack2 | + | +-----------+ +--------+ +-----------+ | | +--------+ +------------+ | + | | | | | | | | | | | | | | + | | net1 +---+ R1 +---+ net2 | | | | R2 +---+ net2 | | + | | Instance1 | | | | Instance2 | | | | | | Instance3 | | + | +-----------+ +--------+ +-----------+ | | +--------+ +------------+ | + | | | | + +----------------------------------------------+ +-------------------------------+ + + Fig 6 + +net2 supports cross-Neutron layer-2 networking, so instances in net2 can be +created in both OpenStack clouds. If the router net1 and net2 connected to is +DVR mode, when Instance1 ping Instance2, the packets are routed locally and +exchanged via a VxLAN tunnel. Source MAC replacement is correctly handled +inside OpenStack1. But when Instance1 tries to ping Instance3, OpenStack2 does +not recognize the DVR MAC from OpenStack1, thus connection fails. Therefore, +only local type network can be attached to a DVR mode router. + +Cross-Neutron layer-2 networking and DVR may co-exist after we address the +DVR MAC recognition problem(we will issue a discussion about this problem in +the Neutron community) or introduce l2 gateway. Actually this bridge network +approach is just one of the implementation, we are considering in the near +future to provide a mechanism to let SDN controller to plug in, which DVR and +bridge network may be not needed. + +Having the above limitation, can our proposal support the major user scenarios? +Considering whether the tenant network and router are local or across Neutron +servers, we divide the user scenarios into four categories. For the scenario of +cross-Neutron router, we use the proposal shown in Fig 3 in our discussion. + +Local Network and Local Router +------------------------------ + +Topology:: + + +-----------------+ +-----------------+ + |OpenStack1 | |OpenStack2 | + | | | | + | ext net1 | | ext net2 | + | +-----+-----+ | | +-----+-----+ | + | | | | | | + | | | | | | + | +--+--+ | | +--+--+ | + | | | | | | | | + | | R1 | | | | R2 | | + | | | | | | | | + | +--+--+ | | +--+--+ | + | | | | | | + | | | | | | + | +---+---+ | | +---+---+ | + | net1 | | net2 | + | | | | + +-----------------+ +-----------------+ + + Fig 7 + +Each OpenStack cloud has its own external network, instance in each local +network accesses the external network via the local router. If east-west +networking is not required, this scenario has no requirement on cross-Neutron +layer-2 and layer-3 networking functionality. Both central Neutron server and +local Neutron server can process network resource management request. 
While if +east-west networking is needed, we have two choices to extend the above +topology:: + + * + +-----------------+ +-----------------+ * +-----------------+ +-----------------+ + |OpenStack1 | |OpenStack2 | * |OpenStack1 | |OpenStack2 | + | | | | * | | | | + | ext net1 | | ext net2 | * | ext net1 | | ext net2 | + | +-----+-----+ | | +-----+-----+ | * | +-----+-----+ | | +-----+-----+ | + | | | | | | * | | | | | | + | | | | | | * | | | | | | + | +--+--+ | | +--+--+ | * | +--+--+ | | +--+--+ | + | | | | | | | | * | | | | | | | | + | | R1 | | | | R2 | | * | | R1 +--+ | | +---+ R2 | | + | | | | | | | | * | | | | | | | | | | + | +--+--+ | | +--+--+ | * | +--+--+ | | | | +--+--+ | + | | | | | | * | | | | | | | | + | | | | | | * | | | | | | | | + | +---+-+-+ | | +---+-+-+ | * | +---+---+ | | | | +---+---+ | + | net1 | | | net2 | | * | net1 | | | | net2 | + | | | | | | * | | | | | | + | +--------+--+ | | +--------+--+ | * | | | net3 | | | + | | Instance1 | | | | Instance2 | | * | +------------+------------+-----------+ | + | +-----------+ | | +-----------+ | * | | | | + | | | | | | * +-----------------+ +-----------------+ + | | | net3 | | | * + | +------+-------------------------+----+ | * Fig 8.2 + | | | | * + +-----------------+ +-----------------+ * + * + Fig 8.1 + +In the left topology, two instances are connected by a shared VxLAN network, +only local network is attached to local router, so it can be either legacy or +DVR mode. In the right topology, two local routers are connected by a shared +VxLAN network, so they can only be legacy mode. + +Cross-Neutron Network and Local Router +-------------------------------------- + +Topology:: + + +-----------------+ +-----------------+ + |OpenStack1 | |OpenStack2 | + | | | | + | ext net1 | | ext net2 | + | +-----+-----+ | | +-----+-----+ | + | | | | | | + | | | | | | + | +--+--+ | | +--+--+ | + | | | | | | | | + | | R1 | | | | R2 | | + | | | | | | | | + | +--+--+ | | +--+--+ | + | | | | | | + | net1 | | | | | + | +--+---+---------------------+---+---+ | + | | | | | | + | | | | | | + | +--+--------+ | | +--+--------+ | + | | Instance1 | | | | Instance2 | | + | +-----------+ | | +-----------+ | + | | | | + +-----------------+ +-----------------+ + + Fig 9 + +From the Neutron API point of view, attaching a network to different routers +that each has its own external gateway is allowed but packets can only get out +via one of the external network because there is only one gateway ip in one +subnet. But in the Tricircle, we allocate one gateway ip for network in each +OpenStack cloud, so instances can access specific external network via specific +gateway according to which OpenStack cloud they are located. + +We can see this topology as a simplification of the topology shown in Fig 8.1 +that it doesn't require an extra network interface for instances. And if no +other networks are attached to R1 and R2 except net1, R1 and R2 can be DVR +mode. + +In the NFV scenario, usually instance itself acts as a router, so there's no +need to create a Neutron router and we directly attach the instance to the +provider network and access the real external network via the provider network. +In that case, when creating Neutron network, "router:external" label should be +set to "False". 
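For example, such a provider network could be created in the central Neutron
server with the external flag unset; the command below is illustrative and the
provider segment details depend on the deployment::

    openstack --os-region-name=CentralRegion network create \
        --provider-network-type vlan --provider-physical-network extern \
        --internal provider-net1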
See Fig 10:: + + +-----------------+ +-----------------+ + |OpenStack1 | |OpenStack2 | + | | | | + | provider net1 | | provider net2 | + | +--+---------+ | | +--+---------+ | + | | | | | | + | | | | | | + | +--+--------+ | | +--+--------+ | + | | VNF | | | | VNF | | + | | Instance1 | | | | Instance2 | | + | +------+----+ | | +------+----+ | + | | | | | | + | | | | | | + | net1 | | | | | + | +------+-------------------------+---+ | + | | | | + +-----------------+ +-----------------+ + + Fig 10 + +Local Network and Cross-Neutron Router +-------------------------------------- + +Topology:: + + +-----------------+ +-----------------+ + |OpenStack1 | |OpenStack2 | + | | | | + | | | ext net | + | | | +-------+---+ | + | bridge net | | | | + | +-----+-----------------+-+-+ | | + | | | | | | +--+--+ | + | | | | | | | | | + | +--+--+ | | | +----+ R | | + | | | | | | | | | + | | R | | | | +-----+ | + | | | | | | | + | +--+--+ | | | +-----+ | + | | | | | | | | + | | | | +---+ R | | + | +---+---+ | | | | | + | net1 | | +--+--+ | + | | | | | + | | | | | + | | | +---+---+ | + | | | net2 | + | | | | + +-----------------+ +-----------------+ + + Fig 11 + +Since the router is cross-Neutron type, the Tricircle automatically creates +bridge network to connect router instances inside the two Neutron servers and +connect the router instance to the real external network. Networks attached to +the router are local type, so the router can be either legacy or DVR mode. + +Cross-Neutron Network and Cross-Neutron Router +---------------------------------------------- + +Topology:: + + * + +-----------------+ +-----------------+ * +-----------------+ +-----------------+ + |OpenStack1 | |OpenStack2 | * |OpenStack1 | |OpenStack2 | + | | | | * | | | | + | | | ext net | * | | | ext net | + | | | +-------+---+ | * | | | +-------+---+ | + | bridge net | | | | * | bridge net | | | | + | +-----+-----------------+-+-+ | | * | +-----+-----------------+-+-+ | | + | | | | | | +--+--+ | * | | | | | | +--+--+ | + | | | | | | | | | * | | | | | | | | | + | | | | | +----+ R | | * | | | | | +----+ R | | + | | | | | | | | * | | | | | | | | + | +--+--+ | | | +-----+ | * | +--+--+ | | | +-----+ | + | | | | | | | * | | | | | | | + | | R | | | | +-----+ | * | +--+ R | | | | +-----+ | + | | | | | | | | | * | | | | | | | | | | + | +--+--+ | | +---+ R | | * | | +--+--+ | | +---+ R +--+ | + | | | | | | | * | | | | | | | | | + | | | | +--+--+ | * | | | | | +--+--+ | | + | | | | | | * | | | | | | | | + | | | | | | * | | | | | | | | + | +---+------------------------+---+ | * | | +---+------------------------+---+ | | + | net1 | | | * | | net1 | | | | + | | | | * | | | | | | + +-----------------+ +-----------------+ * | | | | | | + * | +-+------------------------------------++ | + Fig 12.1 * | net2 | | | + * | | | | + * +-----------------+ +-----------------+ + * + Fig 12.2 + +In Fig 12.1, the router can only be legacy mode since net1 attached to the +router is shared VxLAN type. Actually in this case the bridge network is not +needed for east-west networking. Let's see Fig 12.2, both net1 and net2 are +shared VxLAN type and are attached to the router(also this router can only be +legacy mode), so packets between net1 and net2 are routed in the router of the +local OpenStack cloud and then sent to the target. Extra routes will be cleared +so no packets will go through the bridge network. This is the current +implementation of the Tricircle to support VLAN network. 
+ +Recommended Layer-3 Networking Mode +----------------------------------- + +Let's make a summary of the above discussion. Assume that DVR mode is a must, +the recommended layer-3 topology for each scenario is listed below. + ++----------------------------+---------------------+------------------+ +| north-south networking via | isolated east-west | Fig 7 | +| multiple external networks | networking | | +| +---------------------+------------------+ +| | connected east-west | Fig 8.1 or Fig 9 | +| | networking | | ++----------------------------+---------------------+------------------+ +| north-south networking via | Fig 11 | +| single external network | | ++----------------------------+---------------------+------------------+ +| north-south networking via | Fig 10 | +| direct provider network | | ++--------------------------------------------------+------------------+ + + +Guide of multi-node DevStack installation needs to be updated to introduce +the new bridge network solution. diff --git a/doc/source/devspecs/l3-networking-multi-NS-with-EW-enabled.rst b/doc/source/devspecs/l3-networking-multi-NS-with-EW-enabled.rst new file mode 100644 index 00000000..d8451b5d --- /dev/null +++ b/doc/source/devspecs/l3-networking-multi-NS-with-EW-enabled.rst @@ -0,0 +1,393 @@ +=========================================== +Layer-3 Networking multi-NS-with-EW-enabled +=========================================== + +Problems +======== + +There are already several scenarios fulfilled in Tricircle for north- +south networking. + +Scenario "North South Networking via Multiple External Networks"[1] meets +the demand for multiple external networks, but local network can not +reach other local networks which are not in the same OpenStack cloud. + +Scenario "North South Networking via Single External Network"[2] can meet +local networks east-west networking requirement, but the north-south traffic +needs to go to single gateway. + +In multi-region cloud deployment, a requirement is that each OpenStack cloud +provides external network, north-south traffic is expected to be handled +locally for shortest path, and/or use multiple external networks to ensure +application north-south traffic redundancy, at the same time east-west +networking of tenant's networks between OpenStack cloud is also needed. + +Proposal +======== + +To address the above problems, the key limitation is the pattern for router +gateway, one router in Neutron can only be attached to one external network. +As what's described in the spec of combined bridge network[3], only external +network is suitable for working as bridge network due to DVR challenge. + +North-south traffic via the external network in the same region is conflict +with external network as bridge network. 
+ +The proposal is to introduce a new networking mode for this scenario:: + + + +-----------------------+ +----------------------+ + | ext-net1 | | ext-net2 | + | +---+---+ | | +--+---+ | + |RegionOne | | | RegionTwo | | + | +---+---+ | | +----+--+ | + | | R1 | | | | R2 | | + | +--+----+ | | +--+----+ | + | | net1 | | net2 | | + | +---+--+---+-+ | | ++-----+--+---+ | + | | | | | | | | + | +---------+-+ | | | | +--+--------+ | + | | Instance1 | | | | | | Instance2 | | + | +-----------+ | | | | +-----------+ | + | +----+--+ | bridge-net | +-+-----+ | + | | R3(1) +--------------------+ R3(2) | | + | +-------+ | | +-------+ | + +-----------------------+ +----------------------+ + Figure.1 Multiple external networks with east-west networking + +R1 is the router to connect the external network ext-net1 directly +in RegionOne. Net1's default gateway is R1, so all north-south traffic +will be forwarded by R1 by default. In short, north-south traffic of net2 +will be processed by R2 in RegionTwo. R1 and R2 are local routers which +is supposed to be presented in only one region. Region name should be +specified in availability-zone-hint during router creation in central +Neutron, for example:: + + openstack --os-region-name=CentralRegion router create --availability-zone-hint=RegionOne R1 + openstack --os-region-name=CentralRegion router create --availability-zone-hint=RegionTwo R2 + + openstack --os-region-name=CentralRegion router add subnet R1 + openstack --os-region-name=CentralRegion router add subnet R2 + +In order to process the east-west traffic from net1 to net2, R3(1) and R3(2) +will be introduced, R3(1) and R3(2) will be inter-connected by bridge-net. +Bridge-net could be VLAN or VxLAN cross Neutron L2 network, and it's the +"external network" for both R3(1) and R3(2), please note here the bridge-net +is not real external network, just the concept of Neutron network. R3(1) and +R3(2) will only forward the east-west traffic across Neutron for local +networks, so it's not necessary to work as DVR, centralized router is good +enough. + +In central Neutron, we only need to create a virtual logical router R3, +and R3 router is called as east-west gateway, to handle the east-west +traffic for local networks in different region, and it's non-local router. +Tricircle central Neutron plugin will help to create R3(1) in RegionOne and +R3(2) in RegionTwo, and use the bridge network to inter-connect R3(1) and +R3(2). The logical topology in central Neutron looks like follows:: + + ext-net1 ext-net2 + +-------+ +--+---+ + | | + +---+---+ +----+--+ + | R1 | | R2 | + +--+----+ +--+----+ + | net1 net2 | + +---+--+---++ ++-----+--+---+ + | | | | + +---------+-+ | | +--+--------+ + | Instance1 | | | | Instance2 | + +-----------+ | | +-----------+ + +-+----+--+ + | R3 | + +---------+ + + Figure.2 Logical topology in central Neutron + +Tricircle central Neutron plugin will use logical router R3 to create R3(1) +in RegionOne, and R3(2) in RegionTwo. + +Please note that R3(1) is not the default gateway of net1, and R3(2) is not +the default gateway of net2 too. So the user has to create a port and use +this port as the router interface explicitly between router and local +network. 
+ +In central Neutron, the topology could be created like this:: + + openstack --os-region-name=CentralRegion port create --network=net1 net1-R3-interface + openstack --os-region-name=CentralRegion router add port R3 + + openstack --os-region-name=CentralRegion port create --network=net2 net2-R3-interface + openstack --os-region-name=CentralRegion router add port R3 + +Tricircle central Neutron plugin will automatically configure R3(1), R3(2) and +bridge-network as follows: + +For net1, host route should be added:: + + destination=net2's cidr, nexthop= + +For net2, host route should be added:: + + destination=net1's cidr, nexthop= + +In R3(1), extra route will be configured:: + + destination=net2's cidr, nexthop=R3(2)'s interface in bridge-net + +In R3(2), extra route will be configured:: + + destination=net1's cidr, nexthop=R3(1)'s interface in bridge-net + +R3(1) and R3(2) will set the external gateway to bridge-net:: + + router-gateway-set R3(1) bridge-net + router-gateway-set R3(2) bridge-net + +Now, north-south traffic of Instance1 and Instance2 work like follows:: + + Instance1 -> net1 -> R1 -> ext-net1 + Instance2 -> net2 -> R2 -> ext-net2 + +Only one hop for north-south traffic. + +East-west traffic between Instance1 and Instance2 work like follows:: + + Instance1 <-> net1 <-> R3(1) <-> bridge-net <-> R3(2) <-> net2 <-> Instance2 + +Two hops for cross Neutron east-west traffic. + +The topology will be more complex if there are cross Neutron L2 networks +except local networks:: + + +-----------------------+ +----------------------+ + | ext-net1 | | ext-net2 | + | +-------+ | | +--+---+ | + |RegionOne | | | RegionTwo | | + | +---+----------+ | | +-------------+--+ | + | | R1 | | | | R2 | | + | +--+--+---+--+-+ | | ++-+----+---+----+ | + | net1 | | | | | | | | | | net2 | + | ++--++ | | | | | | | | +-+---+ | + | | net3| | | | | | | |net4| | + | | ++---+ | | | | | | ++---+ | | + | | | | | | net5 | | | | | | + | | | +++-------------------------+-++| | | + | | | | | | net6 | | | | | | + | | | |++-+--------------------+++ | | | | + | | | | | | | | | | | | + | | | | | | | | | | | | + | | | | | | | | | | | | + | | | | | | | | | | | | + | +----+---+----+-+-+ | bridge-net | ++--+-+-----+-----+ | + | | R3(1) +--------------------+ R3(2) | | + | +-----------------+ | | +-----------------+ | + +-----------------------+ +----------------------+ + + Figure.3 Multi-NS and cross Neutron L2 networks + +The logical topology in central Neutron for Figure.3 looks like as follows:: + + ext-net1 ext-net2 + +-------+ +--+---+ + | | + +---+----------+ +-------------+--+ + | R1 | | R2 | + +--+--+---+--+-+ ++-+----+---+----+ + net1 | | | | | | | | net2 + ++--++ | | | | | | +-+---+ + | net3| | | | | |net4| + | ++---+ | | | | ++---+ | + | | | | net5 | | | | + | | +------+------------------+ | | + | | | | net6 | | | + | | +-------------+------+ | | + | | | | | | + | | | | | | + | | | | | | + | | | | | | + +-+---+------------+---------+------------+-----+-+ + | R3 | + +-------------------------------------------------+ + Figure.4 Logical topology in central Neutron with cross Neutron L2 network + +East-west traffic inside one region will be processed locally through default +gateway. 
For example, in RegionOne, R1 has router interfaces in net1, net3, +net5, net6, the east-west traffic between these networks will work as follows:: + + net1 <-> R1 <-> net3 + net1 <-> R1 <-> net5 + net1 <-> R1 <-> net6 + net3 <-> R1 <-> net5 + net3 <-> R1 <-> net6 + net5 <-> R1 <-> net6 + +There is nothing special for east-west traffic between local networks +in different OpenStack regions. + +Net5 and net6 are cross Neutron L2 networks, instances could be attached +to network from different regions, and instances are reachable in a remote +region via the cross Neutron L2 network itself. There is no need to add host +route for cross Neutron L2 network, for it's routable in the same region for +other local networks or cross Neutron L2 networks, default route is enough +for east-west traffic. + +It's needed to address how one cross Neutron L2 network will be +attached different local router: different gateway IP address will be used. +For example, in central Neutron, net5's default gateway IP is 192.168.0.1 +in R1, the user needs to create a gateway port explicitly for local router R2 +and net5, for example 192.168.0.2, then net5 will be attached to R2 using this +gateway port 192.168.0.2. Tricircle central Neutron plugin will make this +port's IP 192.168.0.2 as the default gateway IP for net5 in RegionTwo. + +Besides of gateway ports creation for local router R2, it's also needed to +create a gateway port for R3 and net5, which is used for east-west traffic. +Because R3 will be spread into RegionOne and RegionTwo, so net5 will have +different gateway ports in RegionOne and RegionTwo. Tricircle central Neutron +plugin needs to reserve the gateway ports in central Neutron, and create these +gateway ports in RegionOne and RegionTwo for net5 on R3. Because R3 is the +east-west gateway router for net5, so these gateway ports are not the default +gateway port. Then host route in net5 should be updated for local networks +which are not in the same region: + +For net5 in RegionOne, host route should be added:: + + destination=net2's cidr, nexthop= + destination=net4's cidr, nexthop= + +For net5 in RegionTwo, host route should be added:: + + destination=net1's cidr, nexthop= + destination=net3's cidr, nexthop= + +Similar operation for net6 in RegionOne and RegionTwo. + +If R1 and R2 are centralized routers, cross Neutron L2 network will +work, but if R1 and R2 are DVRs, then DVR MAC issue mentioned in the +spec "l3-networking-combined-bridge-net" should be fixed[2]. + +In order to make the topology not too complex, this use case will not be +supported: a cross Neutron L2 network is not able to be stretched into +the region where there are local networks. 
This use case is not useful +and will make the east-west traffic even more complex:: + + +-----------------------+ +----------+ +-----------------+ + | ext-net1 | | ext-net2 | | ext-net4 | + | +-------+ | | +------+ | | +--+---+ | + |RegionOne | | | RegionTwo| | Region4 | | + | +---+----------+ | | +------+ | | +-------+--+ | + | | R1 | | | | R2 | | | | R4 | | + | +--+--+---+--+-+ | | ++-+---+ | | +-+---+----+ | + | net1 | | | | | | | | | | | | net2 | + | ++--++ | | | | | | | | | | +-+---+ | + | | net3| | | | | | | | | |net4| | + | | ++---+ | | | | | | | | ++---+ | | + | | | | | | net5 | | | | | | | | + | | | +-+-------------------------+-+ | | | | | + | | | | | net6 | | | | | | | + | | | +-+--------------------+ | | | | | + | | | | | | | | | | + | | | | | | | | | | + | | | | | | | | | | + | | | | | | | | | | + | +----+---+--------+ | | +-----+ | | +-+-----+-----+ | + | | R3(1) | | | |R3(2)| | | | R3(3) | | + | +-----------+-----+ | | +-+---+ | | +-----+-------+ | + | | | | | | | | | + +-----------------------+ +----------+ +-----------------+ + | bridge-net | | + +----------------------------+-------------------+ + + Figure.5 Cross Neutron L2 network not able to be stretched into some region + + +Implementation +-------------- + +Local router: It's a router which is created with region name specified in the +availability zone hint, this will be present only in the specific region. + +East-west gateway router: It's a router which will be spread into multiple +regions and this will handle the east-west traffic to attached local networks. + +The following description of implementation is not pseudo code, it's the +logical judgemenet for different conditions combination. + +Adding router interface to east-west gateway router:: + + if IP of the router interface is the subnet default gateway IP + # north-south traffic and east-west traffic will + # go through this router + # router is the default router gateway, it's the + # single north-south external network mode + if the network is cross Neutron L2 network + reserve gateway port in different region + add router interface in each region using reserved gateway port IP + make sure the gateway port IP is the default route + else # local network + add router interface using the default gateway port or the port + specified in request + else # not the default gateway IP in this subnet + if the network is cross Neutron L2 network + reserve gateway port in different region + add router interface in each region using reserved gateway port IP + update host route in each connected local network in each region, + next hop is the reserved gateway port IP + else # local network + create router in the region as needed + add router interface using the port specified in request + if there are more than one interfaces on this router + update host route in each connected local network in each + region, next hop is port IP on this router. 
+ + Configure extra route to the router in each region for EW traffic + +Adding router interface to local router for cross Neutron L2 network will +make the local router as the default gateway router in this region:: + + # default north-south traffic will go through this router + add router interface using the default gateway port or the port + specified in request + make sure this local router in the region is the default gateway + +If external network is attached to east-west gateway router, and network's +default gateway is the east-west gateway router, then the router will be +upgraded to north-south networking via single external network mode. + +Constraints: + Network can only be attached to one local router in one region. + + If a network has already been attached to a east-west gateway router, + and the east-west gateway router is the default gateway of this network, + then the network can't be attached to another local router. + +.. note:: Host route update in a subnet will function only in next + dhcp request. It may take dhcp_lease_duration for VMs in the subnet + to update the host route. It's better to compose the networking + topology before attached VMs to the netwrok. dhcp_lease_duration is + configured by the cloud operator. If tenant wants to make the host + route work immediately, can send dhcp request directly in VMs. + + +Data Model Impact +----------------- + +None + +Dependencies +------------ + +None + +Documentation Impact +-------------------- + +1. Add new guide for North South Networking via Multiple External Networks + with east-west enabled. +2. Release notes. + +Reference +--------- + +[1] North South Networking via Multiple External Networks: https://docs.openstack.org/tricircle/latest/networking/networking-guide-multiple-external-networks.html +[2] l3-networking-combined-bridge-net: https://github.com/openstack/tricircle/blob/master/specs/ocata/l3-networking-combined-bridge-net.rst +[3] North South Networking via Single External Network: https://docs.openstack.org/tricircle/latest/networking/networking-guide-single-external-network.html diff --git a/doc/source/devspecs/lbaas.rst b/doc/source/devspecs/lbaas.rst new file mode 100644 index 00000000..1c1efcdf --- /dev/null +++ b/doc/source/devspecs/lbaas.rst @@ -0,0 +1,185 @@ +========================================== +Distributed LBaaS in Multi-Region Scenario +========================================== + +Background +========== + +Currently, LBaaS (Load-Balancing-as-a-Service) is not supported in the +Tricircle. This spec is to describe how LBaaS will be implemented in +the Tricircle. LBaaS is an advanced service of Neutron, which allows for +proprietary and open-source load balancing technologies to drive the actual +load balancing of requests. Based on the networking guide of Ocata release, +LBaaS can be configured with an agent or Octavia. Given that the OpenStack +community try to take Octavia as the reference implementation of LBaaS, we +only enable LBaaS based on Octavia in the Tricircle. + +Different from existing LBaaS implementation, Octavia accomplishes its +delivery of load balancing services by managing a fleet of virtual machines, +containers, or bare metal servers, collectively known as amphorae, which it +spins up on demand. This spec file is dedicated to how to implement LBaaS +in multiple regions with the Tricircle. + +Overall Implementation +====================== + +The Tricircle is designed in a central-local fashion, where all the local +neutrons are managed by the central neutron. 
As a result, in order to adapt +the central-local design and the amphorae mechanism of +Octavia, we plan to deploy LBaaS as follows. :: + + +---------------------------+ + | | + | Central Neutron | + | | + +---------------------------+ + Central Region + + +----------------------------+ +-----------------------------+ + | +----------------+ | | +----------------+ | + | | LBaaS Octavia | | | | LBaaS Octavia | | + | +----------------+ | | +----------------+ | + | +------+ +---------------+ | | +-------+ +---------------+ | + | | Nova | | Local Neutron | | | | Nova | | Local Neutron | | + | +------+ +---------------+ | | +-------+ +---------------+ | + +----------------------------+ +-----------------------------+ + Region One Region Two + +As demonstrated in the figure above, for each region where a local neutron +is installed, admins can optionally choose to configure and install Octavia. +Typically, Octavia leverages nova installed in its region to spin up amphorae. +By employing load balancing softwares (e.g. haproxy) installed in the +amphorae and Virtual Router Redundancy Protocol (VRRP), a load balancer which +consists of a VIP and an amphora, can balance load across members with +high availability. However, under the central-local scenario, we plan to let +Octavia employ the central neutron in Central Region to manage networking +resources, while still employ services in its region to manage amphora. +Hence, the workflow of networking resource management in Tricircle can be +described as follows. + +Tenant-->local neutron API-->neutron-LBaaS--->local Octavia--->central neutron + +Specifically, when a tenant attempts to create a load balancer, he/she needs to +send a request to the local neutron-lbaas service. The service plugin of +neutron-lbaas then prepares for creating the load balancer, including +creating port via local plugin, inserting the info of the port into the +database, and so on. Next the service plugin triggers the creating function +of the corresponding driver of Octavia, i.e., +Octavia.network.drivers.neutron.AllowedAddressPairsDriver to create the +amphora. During the creation, Octavia employs the central neutron to +complete a series of operations, for instance, allocating VIP, plugging +in VIP, updating databases. Given that the main features of managing +networking resource are implemented, we hence need to adapt the mechanism +of Octavia and neutron-lbaas by improving the functionalities of the local +and central plugins. + +Considering the Tricircle is dedicated to enabling networking automation +across Neutrons, the implementation can be divided as two parts, +i.e., LBaaS members in one OpenStack instance, and LBaaS members in +multiple OpenStack instances. + +LBaaS members in single region +============================== + +For LBaaS in one region, after installing octavia, cloud tenants should +build a management network and two security groups for amphorae manually +in the central neutron. Next, tenants need to create an interface for health +management. Then, tenants need to configure the newly created networking +resources for octavia and let octavia employ central neutron to create +resources. Finally, tenants can create load balancers, listeners, pools, +and members in the local neutron. In this case, all the members of a +loadbalancer are in one region, regardless of whether the members reside +in the same subnet or not. + +LBaaS members in multiple regions +================================= + +1. 
members in the same subnet yet locating in different regions +--------------------------------------------------------------- +As shown below. :: + + +-------------------------------+ +-----------------------+ + | +---------------------------+ | | | + | | Amphora | | | | + | | | | | | + | | +-------+ +---------+ | | | | + | +--+ mgmt +--+ subnet1 +---+ | | | + | +-------+ +---------+ | | | + | | | | + | +--------------------------+ | | +-------------------+ | + | | +---------+ +---------+ | | | | +---------+ | | + | | | member1 | | member2 | | | | | | member3 | | | + | | +---------+ +---------+ | | | | +---------+ | | + | +--------------------------+ | | +-------------------+ | + | network1(subnet1) | | network1(subnet1) | + +-------------------------------+ +-----------------------+ + Region One Region Two + Fig. 1. The scenario of balancing load across instances of one subnet which + reside in different regions. + +As shown in Fig. 1, suppose that a load balancer is created in Region one, +and hence a listener, a pool, and two members in subnet1. When adding an +instance in Region Two to the pool as a member, the local neutron creates +the network in Region Two. Members that locate in different regions yet +reside in the same subnet form a shared VLAN/VxLAN network. As a result, +the Tricircle supports adding members that locates in different regions to +a pool. + +2. members residing in different subnets and regions +---------------------------------------------------- +As shown below. :: + + +---------------------------------------+ +-----------------------+ + | +-----------------------------------+ | | | + | | Amphora | | | | + | | | | | | + | | +---------+ +------+ +---------+ | | | | + | +-+ subnet2 +--+ mgmt +-+ subnet1 +-+ | | | + | +---------+ +------+ +---------+ | | | + | | | | + | +----------------------------------+ | | +-------------------+ | + | | | | | | | | + | | +---------+ +---------+ | | | | +---------+ | | + | | | member1 | | member2 | | | | | | member3 | | | + | | +---------+ +---------+ | | | | +---------+ | | + | | | | | | | | + | +----------------------------------+ | | +-------------------+ | + | network1(subnet1) | | network2(subnet2) | + +---------------------------------------+ +-----------------------+ + Region One Region Two + Fig. 2. The scenario of balancing load across instances of different subnets + which reside in different regions as well. + +As show in Fig. 2, supposing that a load balancer is created in region one, as +well as a listener, a pool, and two members in subnet1. When adding an instance +of subnet2 located in region two, the local neutron-lbaas queries the central +neutron whether subnet2 exist or not. If subnet2 exists, the local +neutron-lbaas employ octavia to plug a port of subnet2 to the amphora. This +triggers cross-region vxlan networking process, then the amphora can reach +the members. As a result, the LBaaS in multiple regions works. + +Please note that LBaaS in multiple regions should not be applied to the local +network case. When adding a member in a local network which resides in other +regions, neutron-lbaas use 'get_subnet' will fail and returns "network not +located in current region" + +Data Model Impact +----------------- + +None + +Dependencies +------------ + +None + +Documentation Impact +-------------------- + +Configuration guide needs to be updated to introduce the configuration of +Octavia, local neutron, and central neutron. 
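As a rough illustration of that configuration split, the Octavia services in
each region keep using the local Nova while their network driver points at the
central Neutron. The snippet below is only a sketch, the exact option names
and sections depend on the Octavia release in use::

    # octavia.conf in RegionOne (illustrative, not a verified template)
    [nova]
    region_name = RegionOne          # amphorae are booted by the local Nova

    [neutron]
    region_name = CentralRegion      # VIP and member ports come from central Neutron

    [controller_worker]
    amp_boot_network_list = <management network created in central Neutron>
    amp_secgroup_list = <security groups created in central Neutron>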
+ +References +---------- + +None diff --git a/doc/source/devspecs/legacy_tables_clean.rst b/doc/source/devspecs/legacy_tables_clean.rst new file mode 100644 index 00000000..8448830b --- /dev/null +++ b/doc/source/devspecs/legacy_tables_clean.rst @@ -0,0 +1,111 @@ +===================================== +Tricircle Table Clean After Splitting +===================================== + +Background +========== +Originally the Tricircle provided unified OpenStack API gateway and networking +automation functionality. But now the Tricircle narrows its scope to networking +automation across Neutron servers, the functionality of OpenStack API gateway +is developed in another project called Trio2o[1]. + +Problem Description +=================== +After this splitting, many tables would no longer be used, including quota, +volume, aggregate and pod binding, etc. The data models, tables and APIs of +them should be removed. As for the rest of the tables that are still in use +in the Tricircle, they should be renamed for better understanding. + +Apart from the table cleaning work and table renaming work, a new feature +will be developed to remove the dependency on old table. During the period +of external network creation, it will take 'availability_zone_hints' (AZ or +az will be used for short for availability zone) as a parameter. Previously +az_hints was searched in the pod binding table by az_name and tenant_id, now +the pod binding table is deprecated and new search strategy is needed to fix +the problem[2]. A function named find_pod_by_az will be developed to find the +az_hints by az_name in the pod table. Given the az_name, if it is not empty, +we first match it with region_name in the pod table. When a pod with the same +region_name is found, it will be returned back. The search procedure is +complete. If no pod is found with the same region_name, then we try to match +it with az_name in the pod table. If multiple pods are found, then we will +raise an exception. If only one pod is found, this pod will be returned back. +An exception will be raised if no pod is matched at the end of the previous +search procedure. However, if the az_name is empty, we will return None, a new +configuration item "default_region_for_external_network" will be used. + +Proposed Change +=============== + +All tables that need to be changed can be divided into two categories, +``Table to be removed``, ``Table to be renamed``. + +Table to be removed: + +- quality_of_service_specs + +- quota_classes + +- quota_usages + +- quotas + +- reservations + +- volume_type_extra_specs + +- volume_type_projects + +- volume_types + +- aggregates + +- aggregate_metadata + +- instance_types + +- instance_type_projects + +- instance_type_extra_specs + +- key_pairs + +- pod_binding + +Table to be renamed: + +- cascaded_pod_service_configuration(new name: cached_endpoints) + +- cascaded_pods(new name: pods) + +- cascaded_pods_resource_routing(new name: resource_routings) + +- job(new name: async_jobs) + +The deprecated tables will be removed from the repository directly, and other +tables containing old meanings will be renamed for better understanding. + +After the deletion of pod binding table, a new feature will be developed to +lookup the az in the pod table rather than the pod binding table. + +Data Model Impact +================= + +In database, many tables are removed, other tables are renamed for better +understanding. 
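A minimal sketch of the find_pod_by_az lookup described above is given below;
"list_pods" stands for the existing pod table query helper and the exception
types are placeholders::

    def find_pod_by_az(context, az_name):
        if not az_name:
            # fall back to the new configuration item
            # "default_region_for_external_network"
            return None
        pods = list_pods(context)
        # 1. first try to match az_name against region_name in the pod table
        matched = [p for p in pods if p['region_name'] == az_name]
        if matched:
            return matched[0]
        # 2. then try to match it against az_name in the pod table
        matched = [p for p in pods if p['az_name'] == az_name]
        if len(matched) > 1:
            raise Exception('Multiple pods matched az_name %s' % az_name)
        if not matched:
            raise Exception('No pod matched az_name %s' % az_name)
        return matched[0]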
+ +Documentation Impact +==================== + +After the pod binding table is removed, the explanation of the pod binding +API in the doc/source/api_v1.rst will be removed as well. + +Dependencies +============ + +None + +References +========== +[1] https://github.com/openstack/trio2o + +[2] https://review.openstack.org/#/c/412325/ diff --git a/doc/source/devspecs/local-neutron-plugin.rst b/doc/source/devspecs/local-neutron-plugin.rst new file mode 100644 index 00000000..6dec61ab --- /dev/null +++ b/doc/source/devspecs/local-neutron-plugin.rst @@ -0,0 +1,214 @@ +============================== +Tricircle Local Neutron Plugin +============================== + +Background +========== + +One of the key value we would like to achieve via the Tricircle project is to +provide networking automation functionality across several Neutron servers. +Each OpenStack instance runs its own Nova and Neutron services but shares the +same Keystone service or uses federated Keystone, which is a multi-region +deployment mode. With networking automation, virtual machines or bare metals +booted in different OpenStack instances can inter-communicate via layer2 or +layer3 network. + +Considering the cross Neutron layer2 network case, if Neutron service in each +OpenStack instance allocates ip address independently, the same ip address +could be assigned to virtual machines in different OpenStack instances, thus ip +address conflict could occur. One straightforward solution to this problem is +to divide the ip allocation pool into several parts and each OpenStack instance +has one. The drawback is that since virtual machines are not distributed evenly +in each OpenStack instance, we may see some OpenStack instances uses up ip +addresses while other OpenStack instances still have ip addresses not +allocated. What's worse, dividing the ip allocation pool makes it impossible +for us to process virtual machine migration from one OpenStack instance to +another. + +Thanks to Neutron's flexible plugin framework, by writing a new plugin and +configuring Neutron server to use it, developers can define what Neutron server +should do after receiving a network resources operation request. So for the +ip address conflict issue discussed above, we decide to run one central Neutron +server with the Tricircle central Neutron plugin(abbr: "central plugin") to +manage ip allocation pool centrally. + +Besides central plugin, we need a bridge to connect central and local Neutron +servers since each OpenStack instance has its own local Nova and Neutron server +but these two services are not aware of the central Neutron server. This bridge +should validate requested network data via the central Neutron server, then +create necessary network resources in the target OpenStack instance with the +data retrieved from the central Neutron server. + +Local Plugin +============ + +For connecting central and local Neutron servers, Neutron plugin is again a +good place for us to build the bridge. We can write our own plugin, the +Tricircle local Neutron plugin(abbr: "local plugin") to trigger the cross +Neutron networking automation in local Neutron server. During virtual machine +booting, local Nova server will interact with local Neutron server to query +network or create port, which will trigger local plugin to retrieve data from +central Neutron server and create necessary network resources according to the +data. To support different core plugins, we will introduce a new option +"real_core_plugin" in the "tricircle" configuration group. 
During +initialization, local plugin will load the plugin specified by +"real_core_plugin". Local plugin only adds logic to interact with central +Neutron server, but invokes the real core plugin to finish the CRUD operations +of local network resources. The following graph shows the relation between user +and Nova and Neutron servers: :: + + +------+ + | user | + +-+--+-+ + | | + +-----------+ +----------------------+ + | boot vm create and query | + | network resource | + v | + +----+-------+ | + | local Nova | xxxxxxxxxxxxxxx | + +----+-------+ xxx xxx | + | xx xx | + +---+ xxx +--------+ xxx | + | x | | x | + | x | | x | + v V | v x v + +--------+---------+ | +----+----------+----+ + | local Neutron | | | central Neutron | + | +--------------+ | | | +----------------+ | + | | local plugin | | | | | central plugin | | + | +--------------+ | | | +----------------+ | + +------------------+ | +--------------------+ + | | + +-------------+ + +Next using virtual machine booting procedure to elaborate how local plugin +works. To begin with, user creates network and subnet via central Neutron +server. Then this user passes the network id as the requested network +information to local Nova server to boot a virtual machine. During parameter +validation, local Nova server queries local Neutron server to ensure the +passed-in network id is valid, which is a "network-get" request. In the +"network-get" handle function, local plugin first checks if local Neutron +already has a network with that id. If not, local plugin retrieves network and +also subnet information from central Neutron server then creates network and +subnet based on this information. User may pass an invalid network id by +mistake, in this case, local plugin will receive a 404 response from central +Neutron server, it just returns a 404 response to local Nova server. + +After the network id validation passes, local Nova server continues to schedule +a host so compute manager running in that host will do the left works. Compute +manager creates a port in the requested network via local Neutron server, which +is a "port-create" request. In the "port-create" handle function, local plugin +sends the same request to central Neutron server to create a port, and uses +the returned port information to create a local port. With local plugin, we +ensure all ip addresses are allocated by central Neutron server. + +At the end of the network setup of the virtual machine, compute manager issues +a "port-update" request to local Neutron server to associate the host with the +port. In the "port-update" handle function, local plugin recognizes that this +request is sent from local Nova server by the request body that the request +body contains host information, so it sends a "port-update" request to central +Neutron server with region name in the request body. In Keystone, we register +services inside one OpenStack instance as one unique region, so we can use +region name to identify one OpenStack instance. After receiving the request, +central Neutron server is informed that one virtual machine port is correctly +setup in one OpenStack instance, so it starts the cross Neutron networking +automation process, like security group rule population, tunnel setup for +layer2 communication and route setup for layer3 communication, which are done +by making Neutron API call to each local Neutron server. + + +Implementation +============== + +Implementation details of the local plugin is discussed in this section. 
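+
+As an illustration of the "real_core_plugin" option introduced above, the
+delegation could be wired roughly as shown below. This is a hedged sketch, not
+the actual plugin code; the class name, the method shown and the option
+registration details are assumptions: ::
+
+    # Sketch of how the local plugin may wrap the configured core plugin.
+    from oslo_config import cfg
+    from oslo_utils import importutils
+
+    tricircle_opts = [
+        cfg.StrOpt('real_core_plugin',
+                   help='The core plugin the local plugin delegates to')]
+    cfg.CONF.register_opts(tricircle_opts, group='tricircle')
+
+
+    class TricircleLocalPlugin(object):
+        def __init__(self):
+            # Instantiate the configured core plugin, e.g. the ML2 plugin.
+            self.core_plugin = importutils.import_object(
+                cfg.CONF.tricircle.real_core_plugin)
+
+        def get_network(self, context, network_id, fields=None):
+            # Interact with the central Neutron server first if needed, then
+            # let the real core plugin do the local CRUD work.
+            return self.core_plugin.get_network(context, network_id, fields)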
+ +Resource Id +----------- + +Local plugin always retrieves data of networks resources from central Neutron +server and use these data to create network resources in local Neutron server. +During the creation of these network resources, we need to guarantee resource +ids in central and local server the same. Consider the scenario that user +creates a port via central Neutron server then use this port to boot a virtual +machine. After local Nova server receives the request, it will use the port id +to create a tap device for the virtual machine. If port ids in central and +local Neutron servers are different, OVS agent can't correctly recognize the +tap device and configure it. As a result, virtual machine fails to connect to +the network. Fortunately, database access module in Neutron allow us to specify +id before creating the resource record, so in local plugin, we just specify id +the same as central resource's to create local resource. + +Network Type Adaption +--------------------- + +Two network types are supported currently in central plugin, which are local +and vlan type. Before creating network based on information retrieved +from central Neutron server, local plugin needs to adapt network type. For +local type, local plugin creates the network without specifying the network +type, so the default tenant network type is used. For vlan type, local plugin +keeps the network type, segmentation id and physical network parameter. + +We plan to support another two network types later. They are shared_vxlan and +mixed network type. For shared_vxlan type, local plugin changes the network +type parameter from "shared_vxlan" to "vxlan", but keeps the segmentation id +parameter(vxlan type doesn't need physical network parameter). For mixed type, +like local type, local plugin uses the default tenant network type to create +the network, but it needs to do one more thing, that is to save the segment +information in central Neutron server. Neutron has a extension which allows one +network to carry multiple segments information[1], so segment information of +each local network can all be saved in the central network. + +Dhcp Port Handle +---------------- + +After local subnet creation, local Neutron server will schedule one dhcp agent +for that subnet, and dhcp agent will automatically create a dhcp port. The ip +address of this dhcp port is not allocated by central Neutron server, so we may +encounter ip address conflict. We need to address this problem to ensure all ip +addresses are allocated by central Neutron server. + +Here is the approach. After central Neutron server receives subnet creation +subnet, central plugin not only creates the requested subnet, but also create a +port to pre-allocate an ip address for the dhcp port. So during creation of +local subnet, local plugin will query central Neutron server to retrieve the +data of the pre-created port and use its ip address to create a local dhcp +port. The "device_id" of the dhcp port is set to "reserved_dhcp_port" so after +one dhcp agent is scheduled, it will use this port other than create a new one. + +Gateway Port Handle +------------------- + +If cross Neutron layer2 networking is enabled in one network, we need to +allocate one gateway ip for that network in each OpenStack instance. The reason +is that we want layer3 routing to be finished locally in each OpenStack +instance. 
If all the OpenStack instances have the same gateway ip, packets sent +to the gateway may reach the remote one, so the path is not the best and not +predictable. + +How we address this problem in local plugin is that before creating local +subnet, local plugin sends request to central Neutron server to create an +"gateway port", then uses the ip of this port as the gateway ip of the local +subnet. Name of the gateway port includes the region name of the OpenStack +instance and the id of the subnet so each OpenStack instance can have its own +gateway port and gateway ip for one specific subnet. + +Data Model Impact +----------------- + +None + +Dependencies +------------ + +None + +Documentation Impact +-------------------- + +Installation guide needs to be updated to introduce the configuration of +central and local plugin. + +References +---------- +[1] https://blueprints.launchpad.net/neutron/+spec/ml2-multi-segment-api diff --git a/doc/source/devspecs/new-l3-networking-mulit-NS-with-EW.rst b/doc/source/devspecs/new-l3-networking-mulit-NS-with-EW.rst new file mode 100644 index 00000000..4d3ecf31 --- /dev/null +++ b/doc/source/devspecs/new-l3-networking-mulit-NS-with-EW.rst @@ -0,0 +1,327 @@ +================================================= +A New Layer-3 Networking multi-NS-with-EW-enabled +================================================= + +Problems +======== +Based on spec for l3 networking [1], a l3 networking which enables multiple +NS traffic along with EW traffic is demonstrated. However, in the +aforementioned l3 networking model, the host route will be only valid after +DHCP lease time expired and renewed. It may take dhcp_lease_duration for VMs +in the subnet to update the host route, after a new pod with external +network is added to Tricircle. To solve the problem, this spec is written +to introduce a new l3 networking model. + +Proposal +======== +For the networking model in [1], a tenant network is attached to two +routers, one for NS traffic, the other for EW traffic. In the new networking +model, inspired by combined bridge network [2], we propose to attach the +tenant network to one router, and the router takes charge of routing NS +and EW traffic. The new networking mode is plotted in Fig. 1. :: + + +-----------------------+ +----------------------+ + | ext-net1 | | ext-net2 | + | +---+---+ | | +--+---+ | + |RegionOne | | | RegionTwo | | + | +---+---+ | | +----+--+ | + | | R1 +------+ | | +--------+ R2 | | + | +-------+ | | | | +-------+ | + | net1 | | | | net2 | + | +------+---+-+ | | | | +-+----+------+ | + | | | | | | | | | | + | +---------+-+ | | | | | | +--+--------+ | + | | Instance1 | | | | | | | | Instance2 | | + | +-----------+ | | | | | | +-----------+ | + | +----+--+ | | | | ++------+ | + | | R3(1) +-+-----------------+--+ R3(2) | | + | +-------+ | bridge net | +-------+ | + +-----------------------+ +----------------------+ + + Figure 1 Multiple external networks with east-west networking + +As shown in Fig. 1, R1 connects to external network (i.e., ext-net1) and +ext-net1 is the default gateway of R1. Meanwhile, net1 is attached to R3 +and R3's default gateway is the bridge net. Further, interfaces of bridge +net are only attached to R1 and R2 which are regarded as local routers. + +In such a scenario, all traffic (no matter NS or EW traffic) flows to R3. +For EW traffic, from net1 to net2, R3(1) will forwards packets to the +interface of net2 in R3(2) router namespace. 
For NS traffic, R3 forwards +packets to the interface of an available local router (i.e., R1 or R2) +which attached to the real external network. As a result, bridge net is +an internal net where NS and EW traffic is steered, rather than the real +external network of R3. + +To create such a topology, we need to create a logical (non-local) router +R3 in the central Neutron. Tricircle central Neutron plugin then creates +R3(1) in RegionOne and R3(2) in RegionTwo, as well as the bridge network +to inter-connect R3(1) and R3(2). As such, the networking for EW traffic +is ready for tenants. To enable NS traffic, real external networks are +required to be attached to R3. When explicitly adding the gateway port +of each external network to R3, Tricircle automatically creates a local +router (e.g. R1) for external network and set the gateway to the local +router. Then to connect the local router (e.g. R1) and the non-local +router (R3), two interfaces of bridge-net are also created and attached +to respect router. The logical topology in central Neutron is plotted +in Fig. 2. :: + + ext-net1 ext-net2 + +---+---+ +---+---+ + | | + +---+---+ +---+---+ + | R1 | | R2 | + +---+---+ +---+---+ + | | + +---+--------------------+---+ + | bridge-net | + +-------------+--------------+ + | + | + +-------------+--------------+ + | R3 | + +---+--------------------+---+ + | net1 net2 | + +---+-----+-+ +---+-+---+ + | | + +---------+-+ +--+--------+ + | Instance1 | | Instance2 | + +-----------+ +-----------+ + + Figure 2 Logical topology in central Neutron + +To improve the logic of building l3 networking, we introduce routed network to +manage external networks in central Neutron. In central Neutron, one routed +network is created as a logical external network, and real external networks +are stored as segments of the external network. As such, the local routers +(e.g., R1 and R2 in Fig. 2) are transparent to users. As a result, when a real +external network is created, a local router is created and the external +network's gateway is set to the router. Moreover, a port of bridge-net is +created and added to the local router. + +The routed network is created as follows: :: + + openstack --os-region-name=CentralRegion network create --share --provider-physical-network extern --provider-network-type vlan --provider-segment 3005 ext-net + openstack --os-region-name=CentralRegion network segment create --physical-network extern --network-type vlan --segment 3005 --network ext-net ext-sm-net1 + openstack --os-region-name=CentralRegion network segment create --physical-network extern --network-type vlan --segment 3005 --network ext-net ext-sm-net2 + openstack --os-region-name=CentralRegion subnet create --network ext-net --network-segment ext-net1 --ip-version 4 --subnet-range 203.0.113.0/24 net1-subnet-v4 + openstack --os-region-name=CentralRegion subnet create --network ext-net --network-segment ext-net1 --ip-version 4 --subnet-range 203.0.114.0/24 net2--subnet-v4 + +The logical topology exposed to users is plotted in Fig. 3. 
:: + + ext-net (routed network) + +---+---+ + | + | + +--------------+-------------+ + | R3 | + +---+--------------------+---+ + | net1 net2 | + +---+-----+-+ +---+-+---+ + | | + +---------+-+ +--+--------+ + | Instance1 | | Instance2 | + +-----------+ +-----------+ + + Figure 3 Logical topology exposed to users in central Neutron + +For R3, net1 and net2 should be attached to R3: :: + + openstack --os-region-name=CentralRegion router add subnet R3 + openstack --os-region-name=CentralRegion router add subnet R3 + +The gateway of the ext-net, i.e., the routed network, is set to R3: :: + + openstack --os-region-name=CentralRegion router set R3 + +However, a routed network does not have a gateway. Consequently, the command +above fails for trying adding the gateway of a routed network to the router, +i.e., R3. To ensure the command works, we plan to create a gateway port for +the routed network before setting the gateway to a router. Actually, the port +is a blank port which does not have an IP, because a routed network is a +software entity of multiple segments (i.e., subnets). To make sure the +gateways of real external networks can be retrieved, we manage the IPs of +gateways in "tags" field of the gateway port. + +This command creates a port of bridget-net and add it to R3, which is plotted in +Fig. 2. + +Tricircle central Neutron plugin will automatically configure R3(1), R3(2) +and bridge-network as follows: + +For net1 and net2, no host route is needed, so in such an l3 networking +model, users are no longer required to wait for DHCP renew to update +host route. All traffic is forwarded to R3 by default. + +In R3(1), extra route will be configured: :: + + destination=net2's cidr, nexthop=R3(2)'s interface in bridge-net + destination=ext-net1's cidr, nexthop=R1's interface in bridge-net + +In R3(2), extra route will be configured: :: + + destination=net1's cidr, nexthop=R3(1)'s interface in bridge-net + destination=ext-net2's cidr, nexthop=R2's interface in bridge-net + +R3(1) and R3(2) will set the external gateway to bridge-net: :: + + router-gateway-set R3(1) bridge-net + router-gateway-set R3(2) bridge-net + +Now, north-south traffic of Instance1 and Instance2 work as follows: :: + + Instance1 -> net1 -> R3(1) -> R1 -> ext-net1 + Instance2 -> net2 -> R3(2) -> R2 -> ext-net2 + +Two hops for north-south traffic. + +East-west traffic between Instance1 and Instance2 work as follows: :: + + Instance1 <-> net1 <-> R3(1) <-> bridge-net <-> R3(2) <-> net2 <-> Instance2 + +Two hops for cross Neutron east-west traffic. + +The topology with cross Neutron L2 networks except local networks is +illustrated in Fig. 4. 
:: + + +-----------------------+ +-----------------------+ + | ext-net1 | | ext-net2 | + | +---+---+ | | +--+---+ | + |RegionOne | | | RegionTwo | | + | +---+------+ | | +----------+--+ | + | | R1 +---+ | | +---+ R2 | | + | +----------+ | | | | +-------------+ | + | net1 | | | | net2 | + | ++---+ | | | | +-----+ | + | | net3 | | | | net4| | + | | ++---+ | | | | +--+-+ | | + | | | | | net5 | | | | | + | | | +-+-----------------------------+-+ | | | + | | | | | | net6 | | | | | | + | | | | ++-----------------------++ | | | | + | | | | | | | | | | | | | | + | | | | | | | | | | | | | | + | | | | | | | | | | | | | | + | | | | | | | | | | | | | | + | +----+---+---+--+-+ | | bridge-net | | ++--+---+---+-----+ | + | | R3(1) +-+----------------+-+ R3(2) | | + | +-----------------+ | | +-----------------+ | + +-----------------------+ +-----------------------+ + + Figure 4 Multi-NS and cross Neutron L2 networks + +The logical topology in central Neutron for Figure. 4 is plotted in Fig. 5. :: + + ext-net1 ext-net2 + +---+---+ +--+---+ + | | + +--+-----------+ +---+------------+ + | R1 | | R2 | + +----------+---+ +----+-----------+ + | | + +----------+--------------------------+-----------+ + | bridge-net | + +-----------------------+-------------------------+ + | + +-----------------------+-------------------------+ + | R3 | + +--+----+------+-----------------+---------+----+-+ + | | | | | | + | | | | | | + | | | | | | + | | +-+--------------------+ | | + | | net5 | | | + | | +--------------+------+ | | + | | net6 | | + | +-+---+ +---+-+ | + | net3 net2 | + +-+---+ +---+-+ + net1 net4 + + Figure 5 Logical topology in central Neutron with cross Neutron L2 network + +By adding networks to R3, EW traffic is routed by R3. + +For net5 in RegionOne, extra route in R3(1) should be added: :: + + destination=net1's cidr, nexthop= + destination=net3's cidr, nexthop= + +For net5 in RegionTwo, extra route in R3(2) should be added: :: + + destination=net1's cidr, nexthop= + destination=net3's cidr, nexthop= + +The east-west traffic between these networks will work as follows:: + + net1 <-> R3 <-> net3 + net1 <-> R3 <-> net5 + net1 <-> R3 <-> net6 + net3 <-> R3 <-> net5 + net3 <-> R3 <-> net6 + net5 <-> R3 <-> net6 + +For NS traffic, the route to external network is already configured, +so NS traffic is routed to R1 or R2. + +Implementation +============== + +Part 0: add an option in local.conf to enable the new l3 networking model + +Add an option "ENABLE_HOST_ROUTE_INDEPENDENT_L3_NETWORKING", whose value +is TRUE or FALSE, to indicate whether users expect to adopt such new l3 +networking model. + +Part 1: enable external network creation with transparent (local) router + +This part mainly ensures a real external network is created along with a +local router, and set the gateway of the external network to the router. +As shown in Fig. 2, when ext-net1 is created, R1 is created, too. And the +gateway of ext-net1 is set to R1. Moreover, the local router, e.g. R1, is +transparent to users. In other words, users only create external network, +while tricircle complete the creation of the local router. As a result, +users are unaware of the local routers. + +Part 2: enable routed network and gateway setting process + +This part enables routed network in the central neutron. Meanwhile, this +part also needs to complete the process of setting gateway of the routed +network to the distributed router, e.g. R3 in Fig. 2. 
Here since the routed +network is a software entity of multiple real external networks, the gateway +ip of the routed network is set as NULL. And the gateway ips of real external +networks is planned to stored in tag field of the routed network. So this +part mainly deal with the blank gateway ip of the routed network when setting +gateway to the router. + +Part 3: modify floating ip creation + +In the existing l3 networking, external network and tenant network is +connected by a router, so implementing floating ip only needs NAT once. +However, in the new l3 networking model, as shown in Fig. 2, external network +and tenant network connect two routers, respectively. And the two routers +are connected by bridge network. So implementing floating ip needs to be NATed +twice. This part mainly deal with such an issue. + +Data Model Impact +================= + +None + +Dependencies +============ + +None + +Documentation Impact +==================== + +1. Add a new guide for North South Networking via Multiple External Networks + with east-west enabled. +2. Release notes. + +Reference +========= + +[1] https://github.com/openstack/tricircle/blob/master/specs/pike/l3-networking-multi-NS-with-EW-enabled.rst +[2] https://github.com/openstack/tricircle/blob/master/specs/ocata/l3-networking-combined-bridge-net.rst diff --git a/doc/source/devspecs/quality-of-service.rst b/doc/source/devspecs/quality-of-service.rst new file mode 100644 index 00000000..489592c3 --- /dev/null +++ b/doc/source/devspecs/quality-of-service.rst @@ -0,0 +1,247 @@ +============================= +Tricircle Quality of Service +============================= + +Background +========== + +QoS is defined as the ability to guarantee certain network requirements +like bandwidth, latency, jitter and reliability in order to satisfy a +Service Level Agreement (SLA) between an application provider and end +tenants. In the Tricircle, each OpenStack instance runs its own Nova and +Neutron services but shares the same Keystone service or uses federated +KeyStones, which is a multi-region deployment mode. With networking automation, +networks or ports created in different OpenStack cloud should be able to be +associated with QoS policies. + +Proposal +======== + +As networking automation across Neutron could be done through the Tricircle, +the QoS automation should be able to work based on tenant's need too. When +tenant wants to apply QoS to the network or port from the central Neutron, QoS +can't be created in the local Neutron server in the bottom pod directly, since +it's still unclear whether the network will be presented in this pod or not. + +In order to achieve QoS automation operations, QoS can't be created in the +local Neutron server directly until there are some existing networks/ports +in bottom pod. The Tricircle central Neutron plugin(abbr: "central plugin") +will operate QoS information in the local Neutron server, QoS service isn't +like network/port that needs to be created during VM booting, in order to +speed up the local VMs booting and reduce the delay that caused by +synchronization between central Neutron and local Neutron, Tricircle central +plugin should use an asynchronous method to associate QoS with the local +network/port, or remove QoS association in each local Neutron if needed. + +Implementation +============== + +Case 1, QoS policy creation +---------------------------- + +In this case, we only create QoS in the central Neutron. 
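+
+For example, a QoS policy might be created against the central Neutron
+endpoint roughly as follows. This is a hedged openstacksdk sketch; the cloud
+name, policy name and rule values are assumptions: ::
+
+    # Sketch only: "mycloud" and the rule values are illustrative.
+    import openstack
+
+    central = openstack.connect(cloud='mycloud', region_name='CentralRegion')
+
+    # The policy lives only in the central Neutron at this point; nothing is
+    # created in any local Neutron yet.
+    policy = central.network.create_qos_policy(name='bw-limit-policy')
+    central.network.create_qos_bandwidth_limit_rule(
+        policy, max_kbps=10000, max_burst_kbps=1000)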
+
+Case 2, QoS policy association without local network/port in place
+--------------------------------------------------------------------
+
+QoS has been created in the central Neutron but the local network/port has not
+yet been created.
+
+In this case, we just need to update the network/port with the QoS policy id
+in the central Neutron.
+
+Case 3, QoS policy association with local network/port in place
+-----------------------------------------------------------------
+
+After QoS has been created in the central Neutron and the local network/port
+has also been created, associate the QoS policy with the network/port in the
+central Neutron.
+
+In this case, the network/port has been created in the local Neutron. After
+the network/port is updated with the QoS policy id in the central Neutron, a
+similar association also needs to be done in the local Neutron. The central
+Neutron first uses the "create_qos_policy" job to create the local QoS policy,
+then asynchronously updates the network/port QoS association in the local
+Neutron through the network/port routing information, and adds the QoS routing
+information to the routing table. XJob will interact with the local Neutron to
+update the QoS policy id for the network/port in the local Neutron.
+
+Case 4, provision VM with QoS policy associated central port/network
+----------------------------------------------------------------------
+
+QoS has been associated to the central port/network first, and the local
+network/port is created later during VM provision.
+
+In this case, QoS has been associated to the central network/port and at this
+point the local network/port does not exist. Since QoS has not been created in
+the local Neutron but the central Neutron has finished the association, the
+local Neutron needs to trigger the central Neutron to finish the local
+network/port QoS association when VMs boot locally. When a VM boots in the
+bottom pod, the local Neutron sends a port update request with the port
+information to the central Neutron; if the QoS id field exists in the
+network/port, the central Neutron will be triggered to use XJob to create a
+QoS policy creation job in the local Neutron (which also speeds up VM booting)
+and to add the QoS routing information to the routing table.
+
+Case 5, QoS policy updating
+----------------------------
+
+In this case, if the local network/port isn't associated with QoS, we only
+update the QoS policy in the central Neutron.
+
+If the QoS policy has been associated with a local network/port in place,
+after the central Neutron updates the QoS policy, it will use XJob to create
+an asynchronous QoS updating job based on the network/port routing
+information. XJob will asynchronously update the QoS policy in the local
+Neutron.
+
+Case 6, QoS policy disassociation
+-----------------------------------
+
+For QoS policy disassociation, we just need to set the "qos_policy_id"
+parameter to None when updating the network/port in the central Neutron, and
+then the network/port is disassociated.
+
+In this case, if the network/port in the local Neutron isn't associated with
+QoS, we only disassociate the network/port in the central Neutron.
+
+If the QoS policy has been associated with a network/port in the local
+Neutron, after the central Neutron disassociates the network, it will use XJob
+to create a network update job to disassociate the network from the QoS
+policy; for a port, the central Neutron will synchronously update the port to
+disassociate it from the QoS policy in the local Neutron.
+
+Case 7, QoS policy deletion
+----------------------------
+
+A QoS policy can only be deleted when it has no association left in the
+central Neutron.
+In this case, if the local network/port isn't associated with QoS, we only
+delete the QoS policy in the central Neutron.
+
+If there is QoS policy routing info, after the central Neutron deletes the QoS
+policy, it will use XJob to create an asynchronous QoS deletion job based on
+the network/port routing information. XJob will asynchronously delete the QoS
+policy in the local Neutron.
+
+Case 8, QoS rule creation
+--------------------------
+
+In this case, if the local network/port isn't associated with QoS, we only
+create the QoS rule in the central Neutron.
+
+If the QoS policy has been associated with a local network/port in place,
+after the central Neutron creates the QoS rules, it will use XJob to create a
+QoS rule syncing job based on the network/port routing information, which then
+asynchronously creates the QoS rules in the local Neutron.
+
+Case 9, QoS rule updating
+--------------------------
+
+In this case, if the local network/port isn't associated with QoS, we only
+update the QoS rule in the central Neutron. If the QoS policy has been
+associated with a local network/port in place, after the central Neutron
+updates the QoS rule, it will trigger XJob to create a QoS rule syncing job in
+the local Neutron based on the network/port routing information. XJob will
+asynchronously update the QoS rule in the local Neutron.
+
+Case 10, QoS rule deletion
+----------------------------
+
+In this case, if the local network/port isn't associated with QoS, we only
+delete the QoS rule in the central Neutron.
+
+If the QoS policy has been associated with a local network/port in place,
+after the central Neutron deletes the QoS rule, it will use XJob to create a
+QoS rule syncing job based on the network/port routing information. XJob will
+asynchronously delete the QoS rule in the local Neutron.
+
+QoS XJob jobs list
+-------------------
+
+- **1: create_qos_policy(self, ctxt, policy_id, pod_id, res_type, res_id=None)**
+
+Asynchronously creates a QoS policy in the pod whose id equals "pod_id"; the
+target network or port is specified through the parameters res_type and
+res_id. If res_type is RT_NETWORK, res_id is the network's uuid; if res_type
+is RT_PORT, res_id is the port's uuid.
+
+**Triggering condition:**
+
+When a network/port is associated with a QoS policy in the central Neutron and
+this network/port already exists in the local Neutron, this asynchronous job
+is triggered to complete the local association.
+
+When the central plugin processes a port update request sent by the local
+plugin and finds that the port is associated with a QoS policy.
+
+If pod_id is POD_NOT_SPECIFIED, the async job will process all related pods,
+so the create_qos_policy(self, ctxt, policy_id, pod_id) job will deal with
+more than a single pod's QoS association.
+
+If the res_type is RT_NETWORK/RT_PORT, after creating the QoS policy in the
+pod, the async job will bind the newly created QoS policy to the network/port
+specified by res_id.
+
+- **2: update_qos_policy(self, ctxt, policy_id, pod_id)**
+
+Asynchronously updates the QoS policy in the pod whose id equals "pod_id".
+
+**Triggering condition:**
+
+When a QoS policy is updated in the central Neutron and it also exists in the
+local Neutron, this asynchronous job is triggered to complete the local QoS
+update.
+
+If pod_id is POD_NOT_SPECIFIED, the async job will process all related pods,
+so the update_qos_policy(self, ctxt, policy_id, pod_id) job will deal with
+more than a single pod's QoS association.
+ +- **3: delete_qos_policy(self, ctxt, policy_id, pod_id)** + +Asynchronously deleting QoS policy for the corresponding pod which id equals +"pod_id". + +**Triggering condition:** + +When deleting QoS policy in the central Neutron, if this QoS policy exists in +the local Neutron, triggering this asynchronous job to complete the local QoS +deletion. +(Warning: the deleted QoS policy must be disassociated first.) + +If pod_id is POD_NOT_SPECIFIED then the async job will process all related +pods, so the delete_qos_policy(self,ctxt,policy_id,pod_id) job will deal with +not only single pod's QoS association. + +- **4: sync_qos_policy_rules(self, ctxt, policy_id)** + +Asynchronous operation for rules of one QoS policy for specified project. +There are two trigger conditions. The one is that central Neutron +creates/updates/deletes QoS rules after QoS policy has been associated with +local network/port. The other is that central plugin processes a port update request +sent by local plugin and finds the port is associated with QoS policy. + +If the rule both exists in the central Neutron and local Neutron, but with +inconsistent content, just asynchronously updating this QoS rule in the local +Neutron. + +If the rule exits in the central Neutron, but it does not exist in the local +Neutron, just asynchronously creating this QoS rule in the local Neutron. + +If the rule exits in the local Neutron, but it does not exist in the central +Neutron, just asynchronously deleting this QoS rule in the local Neutron. + + +Data Model Impact +----------------- + +None + +Dependencies +------------ + +None + +Documentation Impact +-------------------- + +Release notes + diff --git a/doc/source/devspecs/resource_deleting.rst b/doc/source/devspecs/resource_deleting.rst new file mode 100644 index 00000000..58770e7d --- /dev/null +++ b/doc/source/devspecs/resource_deleting.rst @@ -0,0 +1,66 @@ +======================================== +Reliable resource deleting in Tricircle +======================================== + +Background +========== +During the deletion of resources which are mapped to several local Neutron(s), +it may bring some conflict operations. For example, deleting a network in +central neutron which is also resided in several local Neutron(s). The reason +is that network-get request will trigger local neutron to query central +neutron and create the network, and we delete local networks before deleting +central network. When a network-get request comes to a local neutron server +after the local network is completely deleted in that region and at this time +the network in central neutron still exists (assuming it takes certain time to +delete all local networks), local neutron will still retrieve the network from +central neutron and the deleted local network will be recreated. This issue +also applies to the deletion cases of other resource types. + +Proposed Solution +================= +Recently, Tricircle adds a feature to distinguish the source of requests[1], so +we can distinguish the deletion request from 'Central Neutron' or +'Local Neutron'. In order to avoid the conflict mentioned above, we introduce a +new table called "deleting_resource" in Tricircle database, so central plugin +can save the resource deletion information and set the information when it +receives a deletion request. Here is the schema of the table: + +.. 
csv-table:: Resource deleting table
+    :header: Field, Type, Nullable, pk/fk/uk, Description
+
+    resource_id, string, False, uk, resource id in central Neutron
+    resource_type, string, False, uk, resource_type denotes one of the available resource types
+    deleted_at, timestamp, False, n/a, deletion timestamp
+
+**How to delete a resource without conflicting operations**
+
+Let's take network deletion as an example.
+
+At the beginning of the network-delete handling, the central Neutron server
+records the information of the network being deleted in the
+"deleting_resource" table.
+
+From this point on, if a get-request from a local Neutron server comes in, the
+central Neutron server will check the "deleting_resource" table to see whether
+the associated resource has been recorded, and will return 404 to the local
+Neutron server if the associated resource is being deleted.
+
+Likewise, if another deletion request comes from the central Neutron, the
+central Neutron server will check the "deleting_resource" table to see whether
+the associated resource has been recorded, and will return 204 to the user if
+the associated resource is being deleted.
+
+For a get-request from the user, the central Neutron server will query the
+related network information in the "deleting_resource" table and will return
+the resource in deleting status if the queried network exists in the table.
+When the user deletes the network again after something has gone wrong, the
+central Neutron will return 204 to the user.
+
+At the end of the network-delete handling, once all the mapped local networks
+have been deleted, the central Neutron server will remove the deleting
+resource record and then remove the network itself.
+
+In addition, the table keeps a timestamp so that the cloud administrator is
+able to delete a resource that has stayed in deleting status for too long
+(too long to delete, or in an abnormal status).
+
+[1] https://review.openstack.org/#/c/518421/
diff --git a/doc/source/devspecs/smoke-test-engine.rst b/doc/source/devspecs/smoke-test-engine.rst
new file mode 100644
index 00000000..1fea3758
--- /dev/null
+++ b/doc/source/devspecs/smoke-test-engine.rst
@@ -0,0 +1,219 @@
+=================
+Smoke Test Engine
+=================
+
+Problems
+========
+Currently we are running a simple smoke test in the CI job. Several resources
+are created to build a simple topology, then we query to check whether the
+resources are also created in the local Neutron servers as expected. The
+existing problems are:
+
+- 1 Bash scripts are used to invoke the client to send API requests while
+  python scripts are used to check the results. Mixing bash and python makes
+  it hard to write new tests.
+- 2 Resources are not cleaned up at the end of the test, so we can't run other
+  tests in the same environment.
+
+Proposal
+========
+Using bash scripts for both API requests and result validation is tricky and
+hard to read; working with python is a better choice. We have several python
+libraries that can help us send API requests: openstackclient, neutronclient
+and openstacksdk. The main goal of the first two libraries is to provide a
+command line interface (CLI), so they don't expose methods for us to send API
+requests, but we can still use them by calling the internal functions used by
+their CLI instances. The drawback of using internal functions is that they are
+undocumented and may be changed or removed someday. Compared to
+openstackclient and neutronclient, openstacksdk is a library that is aimed at
+application building and is well-documented.
Actually openstackclient +uses openstacksdk for some of its commands' implementation. The limitation of +openstacksdk is that some service extensions like trunk and service function +chaining have not been supported yet, but it's easy to extend by our own. + +Before starting to write python code to prepare, validate and finally clean +resources for each test scenario, let's hold on and move one step forward. Heat +uses template to define resources and networking topologies that need to be +created, we can also use YAML file to describe our test tasks. + +Schema +------ + +A task can be defined as a dict that has the following basic fields: + +.. csv-table:: + :header: Field, Type, Description, Required or not + :widths: 10, 10, 40, 10 + + task_id, string, user specified task ID, required + region, string, keystone region to send API, required + type, string, resource type, required + depend, list, task IDs the current task depends on, optional + params, dict, "parameters to run the task, usage differs in different task types", optional + +Currently four type of tasks are defined. The usage of "params" field for each +type of task is listed below: + +.. csv-table:: + :header: Task type, Usage of "params" field + :widths: 10, 50 + + create, used as the post body of the create request + query, used as the query filter + action, used as the put body of the action request + validate, used as the filter to query resources that need to be validated + +Task doesn't have "task type" field, but it can have an extra dict type field +to include extra needed information for that task. This extra field differs in +different task types. "Create" task doesn't have an extra field. + +.. list-table:: + :widths: 15, 10, 10, 40, 10 + :header-rows: 1 + + * - Extra field + - Sub field + - Type + - Description + - Required or not + * - query(for query task) + - get_one + - bool + - whether to return an element or a list + - required + * - action(for action task) + - target + - string + - target resource ID + - required + * - + - method + - string + - action method, "update" and "delete" are also included + - required + * - + - retries + - int + - times to retry the current task + - optional + * - validate(for validate task) + - predicate + - string + - value should be "any" or "all", "any" means that for each condition, + there exists an resource satisfying that condition; "all" means that + every condition is satisfied by all the resources + - required + * - + - condition + - list + - each condition is a dict, key of the dict is the field of the resource, + value of the dict is the expected value of the resource field + - required + * - + - retries + - int + - times to retry the current task + - optional + +Several related tasks can be grouped to form a task set. A task set is a dict +with the following fields: + +.. csv-table:: + :header: Field, Type, Description, Required or not + :widths: 10, 10, 40, 10 + + task_set_id, string, user specified task set ID, required + depend, list, task set IDs the current task set depends on, optional + tasks, list, task dicts of the task set, required + +So the YAML file contains a list of task sets. + +Result and Reference +-------------------- + +"Create" and "query" type tasks will return results, which can be used in the +definition of other tasks that depend on them. Use ``task_id@resource_field`` +to refer to "resource_field" of the resource returned by "task_id". 
If the task +relied on belongs to other task set, use ``task_set_id@task_id@resource_field`` +to specify the task set ID. The reference can be used in the "params", "action +target" and "validate condition" field. If reference is used, task_id needs to +be in the list of task's "depend" field, and task_set_id needs to be in the +list of task set's "depend" field. For the "query" type task which is depended +on, "get_one" field needs to be true. + +Example +------- + +Give an example to show how to use the above schema to define tasks:: + + - task_set_id: preparation + tasks: + - task_id: image1 + region: region1 + type: image + query: + get_one: true + - task_id: net1 + region: central + type: network + params: + name: net1 + - task_id: subnet1 + region: central + type: subnet + depend: [net1] + params: + name: subnet1 + ip_version: 4 + cidr: 10.0.1.0/24 + network_id: net1@id + - task_id: vm1 + region: region1 + type: server + depend: + - net1 + - subnet1 + - image1 + params: + flavor_id: 1 + image_id: image1@id + name: vm1 + networks: + - uuid: net1@id + - task_set_id: wait-for-job + tasks: + - task_id: check-job + region: central + type: job + validate: + predicate: all + retries: 10 + condition: + - status: SUCCESS + - task_set_id: check + depend: [preparation] + tasks: + - task_id: check-servers1 + region: region1 + type: server + validate: + predicate: any + condition: + - status: ACTIVE + name: vm1 + +The above YAML content define three task sets. "Preparation" task set create +network, subnet and server, then "wait-for-job" task set waits for asynchronous +jobs to finish, finally "check" task set check whether the server is active. + +Implementation +-------------- + +A task engine needs to be implemented to parse the YAML file, analyse the task +and task set dependency and then run the tasks. A runner based on openstacksdk +will also be implemented. + +Dependencies +------------ + +None diff --git a/doc/source/index.rst b/doc/source/index.rst index 8162f2db..8af95b7c 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -47,4 +47,11 @@ Tricircle Networking Guide .. toctree:: :maxdepth: 4 - networking/index \ No newline at end of file + networking/index + +Tricircle Devspecs Guide +========================== +.. toctree:: + :maxdepth: 4 + + devspecs/index