Merge "Write-affinity aware object deletion"
commit c3f6e82ae1
@ -1676,187 +1676,207 @@ ionice_priority None I/O scheduling p
[proxy-server]

====================================== =============== =====================================
Option                                 Default         Description
-------------------------------------- --------------- -------------------------------------
use                                                     Entry point for paste.deploy for
                                                        the proxy server. For most
                                                        cases, this should be
                                                        `egg:swift#proxy`.
set log_name                           proxy-server    Label used when logging
set log_facility                       LOG_LOCAL0      Syslog log facility
set log_level                          INFO            Log level
set log_headers                        True            If True, log headers in each
                                                        request
set log_handoffs                       True            If True, the proxy will log
                                                        whenever it has to failover to a
                                                        handoff node
recheck_account_existence              60              Cache timeout in seconds to
                                                        send memcached for account
                                                        existence
recheck_container_existence            60              Cache timeout in seconds to
                                                        send memcached for container
                                                        existence
object_chunk_size                      65536           Chunk size to read from
                                                        object servers
client_chunk_size                      65536           Chunk size to read from
                                                        clients
memcache_servers                       127.0.0.1:11211 Comma separated list of
                                                        memcached servers
                                                        ip:port or [ipv6addr]:port
memcache_max_connections               2               Max number of connections to
                                                        each memcached server per
                                                        worker
node_timeout                           10              Request timeout to external
                                                        services
recoverable_node_timeout               node_timeout    Request timeout to external
                                                        services for requests that, on
                                                        failure, can be recovered
                                                        from. For example, object GET.
client_timeout                         60              Timeout to read one chunk
                                                        from a client
conn_timeout                           0.5             Connection timeout to
                                                        external services
error_suppression_interval             60              Time in seconds that must
                                                        elapse since the last error
                                                        for a node to be considered
                                                        no longer error limited
error_suppression_limit                10              Error count to consider a
                                                        node error limited
allow_account_management               false           Whether account PUTs and DELETEs
                                                        are even callable
object_post_as_copy                    false           Deprecated.
account_autocreate                     false           If set to 'true' authorized
                                                        accounts that do not yet exist
                                                        within the Swift cluster will
                                                        be automatically created.
max_containers_per_account             0               If set to a positive value,
                                                        trying to create a container
                                                        when the account already has at
                                                        least this maximum containers
                                                        will result in a 403 Forbidden.
                                                        Note: This is a soft limit,
                                                        meaning a user might exceed the
                                                        cap for
                                                        recheck_account_existence before
                                                        the 403s kick in.
max_containers_whitelist                               This is a comma separated list
                                                        of account names that ignore
                                                        the max_containers_per_account
                                                        cap.
rate_limit_after_segment               10              Rate limit the download of
                                                        large object segments after
                                                        this segment is downloaded.
rate_limit_segments_per_sec            1               Rate limit large object
                                                        downloads at this rate.
request_node_count                     2 * replicas    Set to the number of nodes to
                                                        contact for a normal request.
                                                        You can use '* replicas' at the
                                                        end to have it use the number
                                                        given times the number of
                                                        replicas for the ring being used
                                                        for the request.
swift_owner_headers                    <see the sample These are the headers whose
                                       conf file for   values will only be shown to
                                       the list of     swift_owners. The exact
                                       default         definition of a swift_owner is
                                       headers>        up to the auth system in use,
                                                        but usually indicates
                                                        administrative responsibilities.
sorting_method                         shuffle         Storage nodes can be chosen at
                                                        random (shuffle), by using timing
                                                        measurements (timing), or by using
                                                        an explicit match (affinity).
                                                        Using timing measurements may allow
                                                        for lower overall latency, while
                                                        using affinity allows for finer
                                                        control. In both the timing and
                                                        affinity cases, equally-sorting nodes
                                                        are still randomly chosen to spread
                                                        load. This option may be overridden
                                                        in a per-policy configuration
                                                        section.
timing_expiry                          300             If the "timing" sorting_method is
                                                        used, the timings will only be valid
                                                        for the number of seconds configured
                                                        by timing_expiry.
concurrent_gets                        off             Use replica count number of
                                                        threads concurrently during a
                                                        GET/HEAD and return with the
                                                        first successful response. In
                                                        the EC case, this parameter only
                                                        affects an EC HEAD as an EC GET
                                                        behaves differently.
concurrency_timeout                    conn_timeout    This parameter controls how long
                                                        to wait before firing off the
                                                        next concurrent_get thread. A
                                                        value of 0 would be fully concurrent;
                                                        any other number will stagger the
                                                        firing of the threads. This number
                                                        should be between 0 and node_timeout.
                                                        The default is conn_timeout (0.5).
nice_priority                          None            Scheduling priority of server
                                                        processes.
                                                        Niceness values range from -20 (most
                                                        favorable to the process) to 19 (least
                                                        favorable to the process). The default
                                                        does not modify priority.
ionice_class                           None            I/O scheduling class of server
                                                        processes. I/O niceness class values
                                                        are IOPRIO_CLASS_RT (realtime),
                                                        IOPRIO_CLASS_BE (best-effort),
                                                        and IOPRIO_CLASS_IDLE (idle).
                                                        The default does not modify class and
                                                        priority. Linux supports I/O scheduling
                                                        priorities and classes since 2.6.13
                                                        with the CFQ I/O scheduler.
                                                        Works only with ionice_priority.
ionice_priority                        None            I/O scheduling priority of server
                                                        processes. I/O niceness priority is
                                                        a number which goes from 0 to 7.
                                                        The higher the value, the lower the
                                                        I/O priority of the process. Works
                                                        only with ionice_class.
                                                        Ignored if IOPRIO_CLASS_IDLE is set.
read_affinity                          None            Specifies which backend servers to
                                                        prefer on reads; used in conjunction
                                                        with the sorting_method option being
                                                        set to 'affinity'. Format is a comma
                                                        separated list of affinity descriptors
                                                        of the form <selection>=<priority>.
                                                        The <selection> may be r<N> for
                                                        selecting nodes in region N or
                                                        r<N>z<M> for selecting nodes in
                                                        region N, zone M. The <priority>
                                                        value should be a whole number
                                                        that represents the priority to
                                                        be given to the selection; lower
                                                        numbers are higher priority.
                                                        Default is empty, meaning no
                                                        preference. This option may be
                                                        overridden in a per-policy
                                                        configuration section.
write_affinity                         None            Specifies which backend servers to
                                                        prefer on writes. Format is a comma
                                                        separated list of affinity
                                                        descriptors of the form r<N> for
                                                        region N or r<N>z<M> for region N,
                                                        zone M. Default is empty, meaning no
                                                        preference. This option may be
                                                        overridden in a per-policy
                                                        configuration section.
write_affinity_node_count              2 * replicas    The number of local (as governed by
                                                        the write_affinity setting) nodes to
                                                        attempt to contact first on writes,
                                                        before any non-local ones. The value
                                                        should be an integer number, or use
                                                        '* replicas' at the end to have it
                                                        use the number given times the number
                                                        of replicas for the ring being used
                                                        for the request. This option may be
                                                        overridden in a per-policy
                                                        configuration section.
write_affinity_handoff_delete_count    auto            The number of local (as governed by
                                                        the write_affinity setting) handoff
                                                        nodes to attempt to contact on
                                                        deletion, in addition to primary
                                                        nodes. Example: in a geographically
                                                        distributed deployment with
                                                        replicas=3, there may be one primary
                                                        node and two local handoff nodes in
                                                        one region holding the object after
                                                        upload, but before the object has
                                                        been replicated to its proper
                                                        locations in other regions. Sending
                                                        the DELETE request to these handoff
                                                        nodes as well helps the proxy return
                                                        the correct response. The default
                                                        value 'auto' means Swift will
                                                        calculate the number automatically
                                                        as (replicas - len(local_primary_nodes)).
                                                        This option may be overridden in a
                                                        per-policy configuration section.
====================================== =============== =====================================

.. _proxy_server_per_policy_config:

@ -1871,6 +1891,7 @@ options are:

- ``read_affinity``
- ``write_affinity``
- ``write_affinity_node_count``
- ``write_affinity_handoff_delete_count``

The per-policy config section name must be of the form::

@ -1900,6 +1921,7 @@ policy with index ``3``::

    read_affinity = r2=1
    write_affinity = r2
    write_affinity_node_count = 1 * replicas
    write_affinity_handoff_delete_count = 2

.. note::

@ -82,9 +82,9 @@ Note that read_affinity only affects the ordering of primary nodes
(see ring docs for definition of primary node), not the ordering of
handoff nodes.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
write_affinity and write_affinity_node_count
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~
write_affinity
~~~~~~~~~~~~~~

This setting makes the proxy server prefer local backend servers for
object PUT requests over non-local ones. For example, it may be
@ -97,9 +97,15 @@ the object won't immediately have any replicas in NY. However,
replication will move the object's replicas to their proper homes in
both SF and NY.

Note that only object PUT requests are affected by the write_affinity
setting; POST, GET, HEAD, DELETE, OPTIONS, and account/container PUT
requests are not affected.

One potential issue with write_affinity is that an end user may get a 404
error when deleting an object before it has been replicated to its proper
locations. The write_affinity_handoff_delete_count setting is used together
with write_affinity to address that issue: with its default configuration,
Swift calculates the proper number of handoff nodes to send the DELETE
requests to.

Note that only object PUT/DELETE requests are affected by the write_affinity
setting; POST, GET, HEAD, OPTIONS, and account/container PUT requests are
not affected.
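
A minimal example of the two settings working together; the region number and
the use of the 'auto' default here are illustrative only::

    [proxy-server]
    sorting_method = affinity
    read_affinity = r1=100
    write_affinity = r1
    write_affinity_node_count = 2 * replicas
    # 'auto' lets the proxy compute replicas - len(local primary nodes)
    write_affinity_handoff_delete_count = auto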

This setting lets you trade data distribution for throughput. If
write_affinity is enabled, then object replicas will initially be
@ -236,6 +236,20 @@ use = egg:swift#proxy
# This option may be overridden in a per-policy configuration section.
# write_affinity_node_count = 2 * replicas
#
# The number of local (as governed by the write_affinity setting) handoff nodes
# to attempt to contact on deletion, in addition to primary nodes.
#
# Example: in a geographically distributed deployment of 2 regions with
# replicas=3, there may be 1 primary node and 2 local handoff nodes in one
# region holding the object after upload, but before the object has been
# replicated to its proper locations in the other region. Sending the DELETE
# request to these handoff nodes as well helps the proxy make the correct
# decision for the response. The default value 'auto' means Swift will
# calculate the number automatically as
# (replicas - len(local_primary_nodes)). This option may be overridden in a
# per-policy configuration section.
# write_affinity_handoff_delete_count = auto
#
# These are the headers whose values will only be shown to swift_owners. The
# exact definition of a swift_owner is up to the auth system in use, but
# usually indicates administrative responsibilities.
@ -264,6 +278,7 @@ use = egg:swift#proxy
# read_affinity =
# write_affinity =
# write_affinity_node_count =
# write_affinity_handoff_delete_count =

[filter:tempauth]
use = egg:swift#tempauth

@ -1596,7 +1596,8 @@ class Controller(object):
                                        {'method': method, 'path': path})

    def make_requests(self, req, ring, part, method, path, headers,
                      query_string='', overrides=None):
                      query_string='', overrides=None, node_count=None,
                      node_iterator=None):
        """
        Sends an HTTP request to multiple nodes and aggregates the results.
        It attempts the primary nodes concurrently, then iterates over the
@ -1613,11 +1614,16 @@ class Controller(object):
        :param query_string: optional query string to send to the backend
        :param overrides: optional return status override map used to override
                          the returned status of a request.
        :param node_count: optional number of nodes to send request to.
        :param node_iterator: optional node iterator.
        :returns: a swob.Response object
        """
        start_nodes = ring.get_part_nodes(part)
        nodes = GreenthreadSafeIterator(self.app.iter_nodes(ring, part))
        pile = GreenAsyncPile(len(start_nodes))
        nodes = GreenthreadSafeIterator(
            node_iterator or self.app.iter_nodes(ring, part)
        )
        node_number = node_count or len(ring.get_part_nodes(part))
        pile = GreenAsyncPile(node_number)

        for head in headers:
            pile.spawn(self._make_request, nodes, part, method, path,
                       head, query_string, self.app.logger.thread_locals)
@ -1628,7 +1634,7 @@ class Controller(object):
                continue
            response.append(resp)
            statuses.append(resp[0])
            if self.have_quorum(statuses, len(start_nodes)):
            if self.have_quorum(statuses, node_number):
                break
        # give any pending requests *some* chance to finish
        finished_quickly = pile.waitall(self.app.post_quorum_timeout)
@ -1637,7 +1643,7 @@ class Controller(object):
                continue
            response.append(resp)
            statuses.append(resp[0])
        while len(response) < len(start_nodes):
        while len(response) < node_number:
            response.append((HTTP_SERVICE_UNAVAILABLE, '', '', ''))
        statuses, reasons, resp_headers, bodies = zip(*response)
        return self.best_response(req, statuses, reasons, bodies,
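
The key change above is that the quorum check and the padding of missing
responses are now sized by node_number (primaries plus any extra local handoff
nodes passed in via node_count) instead of always by the number of primary
nodes. A simplified, self-contained sketch of that bookkeeping; the status
values and the simple-majority rule below are illustrative assumptions, not
Swift's actual have_quorum/best_response logic::

    HTTP_SERVICE_UNAVAILABLE = 503

    def settle_statuses(statuses, node_number):
        # pad unanswered slots, like the `while len(response) < node_number` loop
        statuses = list(statuses)
        statuses += [HTTP_SERVICE_UNAVAILABLE] * (node_number - len(statuses))
        quorum = node_number // 2 + 1     # simple majority of contacted nodes
        for candidate in (204, 404, HTTP_SERVICE_UNAVAILABLE):
            if statuses.count(candidate) >= quorum:
                return candidate
        return HTTP_SERVICE_UNAVAILABLE

    # DELETE sent to 3 primaries + 2 local handoffs: three 204s carry the
    # quorum of five even though the handoffs answered 404.
    assert settle_statuses([204, 204, 404, 404, 204], 5) == 204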
@ -128,7 +128,8 @@ class BaseObjectController(Controller):
        self.container_name = unquote(container_name)
        self.object_name = unquote(object_name)

    def iter_nodes_local_first(self, ring, partition, policy=None):
    def iter_nodes_local_first(self, ring, partition, policy=None,
                               local_handoffs_first=False):
        """
        Yields nodes for a ring partition.

@ -141,6 +142,9 @@ class BaseObjectController(Controller):

        :param ring: ring to get nodes from
        :param partition: ring partition to yield nodes for
        :param policy: optional, an instance of :class:`BaseStoragePolicy
        :param local_handoffs_first: optional, if True prefer primaries and
                                     local handoff nodes first before looking elsewhere.
        """
        policy_options = self.app.get_policy_options(policy)
        is_local = policy_options.write_affinity_is_local_fn
@ -148,23 +152,38 @@ class BaseObjectController(Controller):
            return self.app.iter_nodes(ring, partition, policy=policy)

        primary_nodes = ring.get_part_nodes(partition)
        num_locals = policy_options.write_affinity_node_count_fn(
            len(primary_nodes))
        handoff_nodes = ring.get_more_nodes(partition)
        all_nodes = itertools.chain(primary_nodes, handoff_nodes)

        all_nodes = itertools.chain(primary_nodes,
                                    ring.get_more_nodes(partition))
        first_n_local_nodes = list(itertools.islice(
            (node for node in all_nodes if is_local(node)), num_locals))
        if local_handoffs_first:
            num_locals = policy_options.write_affinity_handoff_delete_count
            if num_locals is None:
                local_primaries = [node for node in primary_nodes
                                   if is_local(node)]
                num_locals = len(primary_nodes) - len(local_primaries)

        # refresh it; it moved when we computed first_n_local_nodes
        all_nodes = itertools.chain(primary_nodes,
                                    ring.get_more_nodes(partition))
        local_first_node_iter = itertools.chain(
            first_n_local_nodes,
            (node for node in all_nodes if node not in first_n_local_nodes))
            first_local_handoffs = list(itertools.islice(
                (node for node in handoff_nodes if is_local(node)), num_locals)
            )
            preferred_nodes = primary_nodes + first_local_handoffs
        else:
            num_locals = policy_options.write_affinity_node_count_fn(
                len(primary_nodes)
            )
            preferred_nodes = list(itertools.islice(
                (node for node in all_nodes if is_local(node)), num_locals)
            )
        # refresh it; it moved when we computed preferred_nodes
        handoff_nodes = ring.get_more_nodes(partition)
        all_nodes = itertools.chain(primary_nodes, handoff_nodes)

        return self.app.iter_nodes(
            ring, partition, node_iter=local_first_node_iter, policy=policy)
        node_iter = itertools.chain(
            preferred_nodes,
            (node for node in all_nodes if node not in preferred_nodes)
        )

        return self.app.iter_nodes(ring, partition, node_iter=node_iter,
                                   policy=policy)

    def GETorHEAD(self, req):
        """Handle HTTP GET or HEAD requests."""
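
A standalone illustration of the ordering produced with
local_handoffs_first=True: all primary nodes first, then up to num_locals
local handoff nodes, then every remaining node. The plain dicts and the
is_local lambda below are stand-ins for Swift's ring and policy configuration,
not the real APIs::

    import itertools

    def local_handoffs_first(primary_nodes, handoff_nodes, is_local, num_locals=None):
        if num_locals is None:
            # the 'auto' default: replicas minus the number of local primaries
            num_locals = len(primary_nodes) - sum(
                1 for n in primary_nodes if is_local(n))
        first_local_handoffs = list(itertools.islice(
            (n for n in handoff_nodes if is_local(n)), num_locals))
        preferred = primary_nodes + first_local_handoffs
        rest = [n for n in itertools.chain(primary_nodes, handoff_nodes)
                if n not in preferred]
        return preferred + rest

    # one local primary (region 1) and two local handoffs out of three handoffs
    primaries = [{'id': 1, 'region': 1}, {'id': 2, 'region': 2}, {'id': 3, 'region': 2}]
    handoffs = [{'id': 4, 'region': 1}, {'id': 5, 'region': 1}, {'id': 6, 'region': 2}]
    order = local_handoffs_first(primaries, handoffs, lambda n: n['region'] == 1)
    assert [n['id'] for n in order] == [1, 2, 3, 4, 5, 6]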
@ -592,10 +611,12 @@ class BaseObjectController(Controller):
        raise NotImplementedError()

    def _delete_object(self, req, obj_ring, partition, headers):
        """
        send object DELETE request to storage nodes. Subclasses of
        the BaseObjectController can provide their own implementation
        of this method.
        """Delete object considering write-affinity.

        When deleting an object in a write-affinity deployment, also take the
        configured number of handoff nodes into consideration, instead of just
        sending requests to the primary nodes. Otherwise (when write affinity
        is disabled), behave the same way as before.

        :param req: the DELETE Request
        :param obj_ring: the object ring
@ -603,11 +624,37 @@ class BaseObjectController(Controller):
        :param headers: system headers to storage nodes
        :return: Response object
        """
        # When deleting objects treat a 404 status as 204.
        policy_index = req.headers.get('X-Backend-Storage-Policy-Index')
        policy = POLICIES.get_by_index(policy_index)

        node_count = None
        node_iterator = None

        policy_options = self.app.get_policy_options(policy)
        is_local = policy_options.write_affinity_is_local_fn
        if is_local is not None:
            primaries = obj_ring.get_part_nodes(partition)
            node_count = len(primaries)

            local_handoffs = policy_options.write_affinity_handoff_delete_count
            if local_handoffs is None:
                local_primaries = [node for node in primaries
                                   if is_local(node)]
                local_handoffs = len(primaries) - len(local_primaries)

            node_count += local_handoffs

            node_iterator = self.iter_nodes_local_first(
                obj_ring, partition, policy=policy, local_handoffs_first=True
            )

        status_overrides = {404: 204}
        resp = self.make_requests(req, obj_ring,
                                  partition, 'DELETE', req.swift_entity_path,
                                  headers, overrides=status_overrides)
                                  headers, overrides=status_overrides,
                                  node_count=node_count,
                                  node_iterator=node_iterator)

        return resp

    def _post_object(self, req, obj_ring, partition, headers):
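
For concreteness, the arithmetic behind the 'auto' default above, with
hypothetical numbers (one local primary out of three replicas)::

    replicas = 3
    local_primaries = 1        # primaries that satisfy write_affinity_is_local_fn
    configured = None          # write_affinity_handoff_delete_count = auto
    local_handoffs = configured if configured is not None else replicas - local_primaries
    node_count = replicas + local_handoffs    # 3 primaries + 2 local handoffs
    assert (local_handoffs, node_count) == (2, 5)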
@ -734,8 +781,20 @@ class BaseObjectController(Controller):
        else:
            req.headers['X-Timestamp'] = Timestamp(time.time()).internal

        # Include local handoff nodes if write-affinity is enabled.
        node_count = len(nodes)
        policy = POLICIES.get_by_index(policy_index)
        policy_options = self.app.get_policy_options(policy)
        is_local = policy_options.write_affinity_is_local_fn
        if is_local is not None:
            local_handoffs = policy_options.write_affinity_handoff_delete_count
            if local_handoffs is None:
                local_primaries = [node for node in nodes if is_local(node)]
                local_handoffs = len(nodes) - len(local_primaries)
            node_count += local_handoffs

        headers = self._backend_requests(
            req, len(nodes), container_partition, container_nodes)
            req, node_count, container_partition, container_nodes)
        return self._delete_object(req, obj_ring, partition, headers)

@ -35,7 +35,7 @@ from swift.common.ring import Ring
from swift.common.utils import cache_from_env, get_logger, \
    get_remote_client, split_path, config_true_value, generate_trans_id, \
    affinity_key_function, affinity_locality_predicate, list_from_csv, \
    register_swift_info, readconf
    register_swift_info, readconf, config_auto_int_value
from swift.common.constraints import check_utf8, valid_api_version
from swift.proxy.controllers import AccountController, ContainerController, \
    ObjectControllerRouter, InfoController
@ -130,13 +130,18 @@ class ProxyOverrideOptions(object):
                'Invalid write_affinity_node_count value: %r' %
                (' '.join(value)))

        self.write_affinity_handoff_delete_count = config_auto_int_value(
            get('write_affinity_handoff_delete_count', 'auto'), None
        )

    def __repr__(self):
        return '%s({}, {%s})' % (self.__class__.__name__, ', '.join(
            '%r: %r' % (k, getattr(self, k)) for k in (
                'sorting_method',
                'read_affinity',
                'write_affinity',
                'write_affinity_node_count')))
                'write_affinity_node_count',
                'write_affinity_handoff_delete_count')))

    def __eq__(self, other):
        if not isinstance(other, ProxyOverrideOptions):
@ -145,7 +150,8 @@ class ProxyOverrideOptions(object):
            'sorting_method',
            'read_affinity',
            'write_affinity',
            'write_affinity_node_count'))
            'write_affinity_node_count',
            'write_affinity_handoff_delete_count'))


class Application(object):
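
config_auto_int_value (already part of swift.common.utils) is what turns the
'auto' keyword into None here. A minimal stand-in sketch of the intended
behaviour, not the actual implementation::

    def auto_int(value, default):
        # 'auto' (or an unset value) falls back to the default; anything else
        # must parse as an integer
        if value is None or str(value).lower() == 'auto':
            return default
        return int(value)

    assert auto_int('auto', None) is None   # write_affinity_handoff_delete_count = auto
    assert auto_int('2', None) == 2         # explicit per-policy override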
@ -279,6 +279,86 @@ class BaseObjectControllerMixin(object):
        self.assertEqual(len(all_nodes), len(local_first_nodes))
        self.assertEqual(sorted(all_nodes), sorted(local_first_nodes))

    def test_iter_nodes_local_handoff_first_noops_when_no_affinity(self):
        # this test needs a stable node order - most don't
        self.app.sort_nodes = lambda l, *args, **kwargs: l
        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')
        policy = self.policy
        self.app.get_policy_options(policy).write_affinity_is_local_fn = None
        object_ring = policy.object_ring
        all_nodes = object_ring.get_part_nodes(1)
        all_nodes.extend(object_ring.get_more_nodes(1))

        local_first_nodes = list(controller.iter_nodes_local_first(
            object_ring, 1, local_handoffs_first=True))

        self.maxDiff = None

        self.assertEqual(all_nodes, local_first_nodes)

    def test_iter_nodes_handoff_local_first_default(self):
        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')
        policy_conf = self.app.get_policy_options(self.policy)
        policy_conf.write_affinity_is_local_fn = (
            lambda node: node['region'] == 1)

        object_ring = self.policy.object_ring
        primary_nodes = object_ring.get_part_nodes(1)
        handoff_nodes_iter = object_ring.get_more_nodes(1)
        all_nodes = primary_nodes + list(handoff_nodes_iter)
        handoff_nodes_iter = object_ring.get_more_nodes(1)
        local_handoffs = [n for n in handoff_nodes_iter if
                          policy_conf.write_affinity_is_local_fn(n)]

        prefered_nodes = list(controller.iter_nodes_local_first(
            object_ring, 1, local_handoffs_first=True))

        self.assertEqual(len(all_nodes), self.replicas() +
                         POLICIES.default.object_ring.max_more_nodes)

        first_primary_nodes = prefered_nodes[:len(primary_nodes)]
        self.assertEqual(sorted(primary_nodes), sorted(first_primary_nodes))

        handoff_count = self.replicas() - len(primary_nodes)
        first_handoffs = prefered_nodes[len(primary_nodes):][:handoff_count]
        self.assertEqual(first_handoffs, local_handoffs[:handoff_count])

    def test_iter_nodes_handoff_local_first_non_default(self):
        # Obviously this test doesn't work if we're testing 1 replica.
        # In that case, we don't have any failovers to check.
        if self.replicas() == 1:
            return

        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')
        policy_conf = self.app.get_policy_options(self.policy)
        policy_conf.write_affinity_is_local_fn = (
            lambda node: node['region'] == 1)
        policy_conf.write_affinity_handoff_delete_count = 1

        object_ring = self.policy.object_ring
        primary_nodes = object_ring.get_part_nodes(1)
        handoff_nodes_iter = object_ring.get_more_nodes(1)
        all_nodes = primary_nodes + list(handoff_nodes_iter)
        handoff_nodes_iter = object_ring.get_more_nodes(1)
        local_handoffs = [n for n in handoff_nodes_iter if
                          policy_conf.write_affinity_is_local_fn(n)]

        prefered_nodes = list(controller.iter_nodes_local_first(
            object_ring, 1, local_handoffs_first=True))

        self.assertEqual(len(all_nodes), self.replicas() +
                         POLICIES.default.object_ring.max_more_nodes)

        first_primary_nodes = prefered_nodes[:len(primary_nodes)]
        self.assertEqual(sorted(primary_nodes), sorted(first_primary_nodes))

        handoff_count = policy_conf.write_affinity_handoff_delete_count
        first_handoffs = prefered_nodes[len(primary_nodes):][:handoff_count]
        self.assertEqual(first_handoffs, local_handoffs[:handoff_count])

    def test_connect_put_node_timeout(self):
        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')
@ -369,6 +449,36 @@ class BaseObjectControllerMixin(object):
        resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 204)

    def test_DELETE_write_affinity_before_replication(self):
        policy_conf = self.app.get_policy_options(self.policy)
        policy_conf.write_affinity_handoff_delete_count = self.replicas() / 2
        policy_conf.write_affinity_is_local_fn = (
            lambda node: node['region'] == 1)
        handoff_count = policy_conf.write_affinity_handoff_delete_count

        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        codes = [204] * self.replicas() + [404] * handoff_count
        with set_http_connect(*codes):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 204)

    def test_DELETE_write_affinity_after_replication(self):
        policy_conf = self.app.get_policy_options(self.policy)
        policy_conf.write_affinity_handoff_delete_count = self.replicas() / 2
        policy_conf.write_affinity_is_local_fn = (
            lambda node: node['region'] == 1)
        handoff_count = policy_conf.write_affinity_handoff_delete_count

        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        codes = ([204] * (self.replicas() - handoff_count) +
                 [404] * handoff_count +
                 [204] * handoff_count)
        with set_http_connect(*codes):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 204)

    def test_POST_non_int_delete_after(self):
        t = str(int(time.time() + 100)) + '.1'
        req = swob.Request.blank('/v1/a/c/o', method='POST',
@ -1366,16 +1366,19 @@ class TestProxyServerConfigLoading(unittest.TestCase):
        read_affinity = r1=100
        write_affinity = r1
        write_affinity_node_count = 1 * replicas
        write_affinity_handoff_delete_count = 4
        """
        expected_default = {"read_affinity": "",
                            "sorting_method": "shuffle",
                            "write_affinity": "",
                            "write_affinity_node_count_fn": 6}
                            "write_affinity_node_count_fn": 6,
                            "write_affinity_handoff_delete_count": None}
        exp_options = {None: expected_default,
                       POLICIES[0]: {"read_affinity": "r1=100",
                                     "sorting_method": "affinity",
                                     "write_affinity": "r1",
                                     "write_affinity_node_count_fn": 3},
                                     "write_affinity_node_count_fn": 3,
                                     "write_affinity_handoff_delete_count": 4},
                       POLICIES[1]: expected_default}
        exp_is_local = {POLICIES[0]: [({'region': 1, 'zone': 2}, True),
                                      ({'region': 2, 'zone': 1}, False)],
@ -1387,7 +1390,8 @@ class TestProxyServerConfigLoading(unittest.TestCase):
        self.assertEqual(
            "ProxyOverrideOptions({}, {'sorting_method': 'shuffle', "
            "'read_affinity': '', 'write_affinity': '', "
            "'write_affinity_node_count': '2 * replicas'})",
            "'write_affinity_node_count': '2 * replicas', "
            "'write_affinity_handoff_delete_count': None})",
            repr(default_options))
        self.assertEqual(default_options, eval(repr(default_options), {
            'ProxyOverrideOptions': default_options.__class__}))
@ -1396,7 +1400,8 @@ class TestProxyServerConfigLoading(unittest.TestCase):
        self.assertEqual(
            "ProxyOverrideOptions({}, {'sorting_method': 'affinity', "
            "'read_affinity': 'r1=100', 'write_affinity': 'r1', "
            "'write_affinity_node_count': '1 * replicas'})",
            "'write_affinity_node_count': '1 * replicas', "
            "'write_affinity_handoff_delete_count': 4})",
            repr(policy_0_options))
        self.assertEqual(policy_0_options, eval(repr(policy_0_options), {
            'ProxyOverrideOptions': policy_0_options.__class__}))
@ -1411,6 +1416,7 @@ class TestProxyServerConfigLoading(unittest.TestCase):
        use = egg:swift#proxy
        sorting_method = affinity
        write_affinity_node_count = 1 * replicas
        write_affinity_handoff_delete_count = 3

        [proxy-server:policy:0]
        read_affinity = r1=100
@ -1419,12 +1425,14 @@ class TestProxyServerConfigLoading(unittest.TestCase):
        expected_default = {"read_affinity": "",
                            "sorting_method": "affinity",
                            "write_affinity": "",
                            "write_affinity_node_count_fn": 3}
                            "write_affinity_node_count_fn": 3,
                            "write_affinity_handoff_delete_count": 3}
        exp_options = {None: expected_default,
                       POLICIES[0]: {"read_affinity": "r1=100",
                                     "sorting_method": "affinity",
                                     "write_affinity": "r1",
                                     "write_affinity_node_count_fn": 3},
                                     "write_affinity_node_count_fn": 3,
                                     "write_affinity_handoff_delete_count": 3},
                       POLICIES[1]: expected_default}
        exp_is_local = {POLICIES[0]: [({'region': 1, 'zone': 2}, True),
                                      ({'region': 2, 'zone': 1}, False)],
@ -1440,29 +1448,35 @@ class TestProxyServerConfigLoading(unittest.TestCase):
        read_affinity = r2=10
        write_affinity_node_count = 1 * replicas
        write_affinity = r2
        write_affinity_handoff_delete_count = 2

        [proxy-server:policy:0]
        read_affinity = r1=100
        write_affinity = r1
        write_affinity_node_count = 5
        write_affinity_handoff_delete_count = 3

        [proxy-server:policy:1]
        read_affinity = r1=1
        write_affinity = r3
        write_affinity_node_count = 4
        write_affinity_handoff_delete_count = 4
        """
        exp_options = {None: {"read_affinity": "r2=10",
                              "sorting_method": "affinity",
                              "write_affinity": "r2",
                              "write_affinity_node_count_fn": 3},
                              "write_affinity_node_count_fn": 3,
                              "write_affinity_handoff_delete_count": 2},
                       POLICIES[0]: {"read_affinity": "r1=100",
                                     "sorting_method": "affinity",
                                     "write_affinity": "r1",
                                     "write_affinity_node_count_fn": 5},
                                     "write_affinity_node_count_fn": 5,
                                     "write_affinity_handoff_delete_count": 3},
                       POLICIES[1]: {"read_affinity": "r1=1",
                                     "sorting_method": "affinity",
                                     "write_affinity": "r3",
                                     "write_affinity_node_count_fn": 4}}
                                     "write_affinity_node_count_fn": 4,
                                     "write_affinity_handoff_delete_count": 4}}
        exp_is_local = {POLICIES[0]: [({'region': 1, 'zone': 2}, True),
                                      ({'region': 2, 'zone': 1}, False)],
                        POLICIES[1]: [({'region': 3, 'zone': 2}, True),
@ -1533,18 +1547,21 @@ class TestProxyServerConfigLoading(unittest.TestCase):
            None: {"read_affinity": "r1=100",
                   "sorting_method": "shuffle",
                   "write_affinity": "r0",
                   "write_affinity_node_count_fn": 6},
                   "write_affinity_node_count_fn": 6,
                   "write_affinity_handoff_delete_count": None},
            # policy 0 read affinity is r2, dictated by policy 0 section
            POLICIES[0]: {"read_affinity": "r2=100",
                          "sorting_method": "affinity",
                          "write_affinity": "r2",
                          "write_affinity_node_count_fn": 6},
                          "write_affinity_node_count_fn": 6,
                          "write_affinity_handoff_delete_count": None},
            # policy 1 read_affinity is r0, dictated by DEFAULT section,
            # overrides proxy server section
            POLICIES[1]: {"read_affinity": "r0=100",
                          "sorting_method": "affinity",
                          "write_affinity": "r0",
                          "write_affinity_node_count_fn": 6}}
                          "write_affinity_node_count_fn": 6,
                          "write_affinity_handoff_delete_count": None}}
        exp_is_local = {
            # default write_affinity is r0, dictated by DEFAULT section
            None: [({'region': 0, 'zone': 2}, True),