diff --git a/.mailmap b/.mailmap index 00ee2a3459..6827a2d4a3 100644 --- a/.mailmap +++ b/.mailmap @@ -87,3 +87,9 @@ Donagh McCabe Eamonn O'Toole Gerry Drudy Mark Seger +Timur Alperovich +Mehdi Abaakouk +Richard Hawkins +Ondrej Novy +Peter Lisak +Ke Liang diff --git a/.manpages b/.manpages new file mode 100755 index 0000000000..69fcfc74d5 --- /dev/null +++ b/.manpages @@ -0,0 +1,18 @@ +#!/bin/sh + +RET=0 +for MAN in doc/manpages/* ; do + OUTPUT=$(LC_ALL=en_US.UTF-8 MANROFFSEQ='' MANWIDTH=80 man --warnings -E UTF-8 -l \ + -Tutf8 -Z "$MAN" 2>&1 >/dev/null) + if [ -n "$OUTPUT" ] ; then + RET=1 + echo "$MAN:" + echo "$OUTPUT" + fi +done + +if [ "$RET" -eq "0" ] ; then + echo "All manpages are fine" +fi + +exit "$RET" diff --git a/AUTHORS b/AUTHORS index 7efcfaca64..f3225dff2a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -18,6 +18,7 @@ CORE Emeritus Chmouel Boudjnah (chmouel@enovance.com) Florian Hines (syn@ronin.io) Greg Holt (gholt@rackspace.com) +Paul Luse (paul.e.luse@intel.com) Jay Payne (letterj@gmail.com) Peter Portante (peter.portante@redhat.com) Will Reese (wreese@gmail.com) @@ -25,7 +26,7 @@ Chuck Thier (cthier@gmail.com) Contributors ------------ -Mehdi Abaakouk (mehdi.abaakouk@enovance.com) +Mehdi Abaakouk (sileht@redhat.com) Timur Alperovich (timur.alperovich@gmail.com) Jesse Andrews (anotherjesse@gmail.com) Joe Arnold (joe@swiftstack.com) @@ -41,7 +42,7 @@ James E. Blair (jeblair@openstack.org) Fabien Boucher (fabien.boucher@enovance.com) Clark Boylan (clark.boylan@gmail.com) Pádraig Brady (pbrady@redhat.com) -Lorcan Browne (lorcan.browne@hp.com) +Lorcan Browne (lorcan.browne@hpe.com) Russell Bryant (rbryant@redhat.com) Jay S. Bryant (jsbryant@us.ibm.com) Tim Burke (tim.burke@gmail.com) @@ -56,15 +57,17 @@ François Charlier (francois.charlier@enovance.com) Ray Chen (oldsharp@163.com) Harshit Chitalia (harshit@acelio.com) Brian Cline (bcline@softlayer.com) -Alistair Coles (alistair.coles@hp.com) +Alistair Coles (alistair.coles@hpe.com) Clément Contini (ccontini@cloudops.com) Brian Curtin (brian.curtin@rackspace.com) Thiago da Silva (thiago@redhat.com) Julien Danjou (julien@danjou.info) +Paul Dardeau (paul.dardeau@intel.com) +Zack M. Davis (zdavis@swiftstack.com) Ksenia Demina (kdemina@mirantis.com) Dan Dillinger (dan.dillinger@sonian.net) Cedric Dos Santos (cedric.dos.sant@gmail.com) -Gerry Drudy (gerry.drudy@hp.com) +Gerry Drudy (gerry.drudy@hpe.com) Morgan Fainberg (morgan.fainberg@gmail.com) ZhiQiang Fan (aji.zqfan@gmail.com) Oshrit Feder (oshritf@il.ibm.com) @@ -85,6 +88,7 @@ David Goetz (david.goetz@rackspace.com) Tushar Gohad (tushar.gohad@intel.com) Jonathan Gonzalez V (jonathan.abdiel@gmail.com) Joe Gordon (jogo@cloudscaling.com) +ChangBo Guo(gcb) (eric.guo@easystack.cn) David Hadas (davidh@il.ibm.com) Andrew Hale (andy@wwwdata.eu) Soren Hansen (soren@linux2go.dk) @@ -92,9 +96,12 @@ Richard Hawkins (richard.hawkins@rackspace.com) Gregory Haynes (greg@greghaynes.net) Doug Hellmann (doug.hellmann@dreamhost.com) Dan Hersam (dan.hersam@hp.com) +hgangwx (hgangwx@cn.ibm.com) Derek Higgins (derekh@redhat.com) +Jonathan Hinson (jlhinson@us.ibm.com) Alex Holden (alex@alexjonasholden.com) Edward Hope-Morley (opentastic@gmail.com) +Ferenc Horváth (hferenc@inf.u-szeged.hu) Charles Hsu (charles0126@gmail.com) Joanna H. Huang (joanna.huitzu.huang@gmail.com) Kun Huang (gareth@unitedstack.com) @@ -111,6 +118,7 @@ Jason Johnson (jajohnson@softlayer.com) Brian K. 
Jones (bkjones@gmail.com) Arnaud JOST (arnaud.jost@ovh.net) Kiyoung Jung (kiyoung.jung@kt.com) +Harshada Mangesh Kakad (harshadak@metsi.co.uk) Takashi Kajinami (kajinamit@nttdata.co.jp) Matt Kassawara (mkassawara@gmail.com) Morita Kazutaka (morita.kazutaka@gmail.com) @@ -136,6 +144,8 @@ Eohyung Lee (liquidnuker@gmail.com) Zhao Lei (zhaolei@cn.fujitsu.com) Jamie Lennox (jlennox@redhat.com) Tong Li (litong01@us.ibm.com) +Ke Liang (ke.liang@easystack.cn) +Peter Lisak (peter.lisak@firma.seznam.cz) Changbin Liu (changbin.liu@gmail.com) Jing Liuqing (jing.liuqing@99cloud.net) Victor Lowther (victor.lowther@gmail.com) @@ -143,6 +153,7 @@ Sergey Lukjanov (slukjanov@mirantis.com) Zhongyue Luo (zhongyue.nah@intel.com) Paul Luse (paul.e.luse@intel.com) Christopher MacGown (chris@pistoncloud.com) +Ganesh Maharaj Mahalingam (ganesh.mahalingam@intel.com) Dragos Manolescu (dragosm@hp.com) Ben Martin (blmartin@us.ibm.com) Steve Martinelli (stevemar@ca.ibm.com) @@ -152,7 +163,7 @@ Nakagawa Masaaki (nakagawamsa@nttdata.co.jp) Dolph Mathews (dolph.mathews@gmail.com) Kenichiro Matsuda (matsuda_kenichi@jp.fujitsu.com) Michael Matur (michael.matur@gmail.com) -Donagh McCabe (donagh.mccabe@hp.com) +Donagh McCabe (donagh.mccabe@hpe.com) Andy McCrae (andy.mccrae@gmail.com) Paul McMillan (paul.mcmillan@nebula.com) Ewan Mellor (ewan.mellor@citrix.com) @@ -168,19 +179,22 @@ Maru Newby (mnewby@internap.com) Newptone (xingchao@unitedstack.com) Colin Nicholson (colin.nicholson@iomart.com) Zhenguo Niu (zhenguo@unitedstack.com) +Catherine Northcott (catherine@northcott.nz) Ondrej Novy (ondrej.novy@firma.seznam.cz) Timothy Okwii (tokwii@cisco.com) Matthew Oliver (matt@oliver.net.au) Hisashi Osanai (osanai.hisashi@jp.fujitsu.com) -Eamonn O'Toole (eamonn.otoole@hp.com) +Eamonn O'Toole (eamonn.otoole@hpe.com) James Page (james.page@ubuntu.com) Prashanth Pai (ppai@redhat.com) +Venkateswarlu Pallamala (p.venkatesh551@gmail.com) Pawel Palucki (pawel.palucki@gmail.com) Alex Pecoraro (alex.pecoraro@emc.com) Sascha Peilicke (saschpe@gmx.de) Constantine Peresypkin (constantine.peresypk@rackspace.com) Dieter Plaetinck (dieter@vimeo.com) Dan Prince (dprince@redhat.com) +Sivasathurappan Radhakrishnan (siva.radhakrishnan@intel.com) Sarvesh Ranjan (saranjan@cisco.com) Falk Reimann (falk.reimann@sap.com) Brian Reitz (brian.reitz@oracle.com) @@ -198,7 +212,7 @@ Shilla Saebi (shilla.saebi@gmail.com) Atsushi Sakai (sakaia@jp.fujitsu.com) Cristian A Sanchez (cristian.a.sanchez@intel.com) Christian Schwede (cschwede@redhat.com) -Mark Seger (Mark.Seger@hp.com) +Mark Seger (mark.seger@hpe.com) Azhagu Selvan SP (tamizhgeek@gmail.com) Alexandra Settle (alexandra.settle@rackspace.com) Andrew Clay Shafer (acs@parvuscaptus.com) @@ -212,6 +226,7 @@ Pradeep Kumar Singh (pradeep.singh@nectechnologies.in) Liu Siqi (meizu647@gmail.com) Adrian Smith (adrian_f_smith@dell.com) Jon Snitow (otherjon@swiftstack.com) +Emile Snyder (emile.snyder@gmail.com) Emett Speer (speer.emett@gmail.com) TheSriram (sriram@klusterkloud.com) Jeremy Stanley (fungi@yuggoth.org) @@ -234,7 +249,9 @@ Dmitry Ukov (dukov@mirantis.com) Vincent Untz (vuntz@suse.com) Daniele Valeriani (daniele@dvaleriani.net) Koert van der Veer (koert@cloudvps.com) +Béla Vancsics (vancsics@inf.u-szeged.hu) Vladimir Vechkanov (vvechkanov@mirantis.com) +venkatamahesh (venkatamaheshkotha@gmail.com) Gil Vernik (gilv@il.ibm.com) Hou Ming Wang (houming.wang@easystack.cn) Shane Wang (shane.wang@intel.com) @@ -248,7 +265,7 @@ Ye Jia Xu (xyj.asmy@gmail.com) Alex Yang (alex890714@gmail.com) Lin Yang 
(lin.a.yang@intel.com) Yee (mail.zhang.yee@gmail.com) -Guang Yee (guang.yee@hp.com) +Guang Yee (guang.yee@hpe.com) Pete Zaitcev (zaitcev@kotori.zaitcev.us) Hua Zhang (zhuadl@cn.ibm.com) Jian Zhang (jian.zhang@intel.com) diff --git a/CHANGELOG b/CHANGELOG index 0eb08b11a1..c1b335d548 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,92 @@ +swift (2.6.0) + + * Dependency changes + - Updated minimum version of eventlet to 0.17.4 to support IPv6. + + - Updated the minimum version of PyECLib to 1.0.7. + + * The ring rebalancing algorithm was updated to better handle edge cases + and to give better (more balanced) rings in the general case. New rings + will have better initial placement, capacity adjustments will move less + data for better balance, and existing rings that were imbalanced should + start to become better balanced as they go through rebalance cycles. + + * Added container and account reverse listings. + + A GET request to an account or container resource with a "reverse=true" + query parameter will return the listing in reverse order. When + iterating over pages of reverse listings, the relative order of marker + and end_marker are swapped. + + * Storage policies now support having more than one name. + + This allows operators to fix a typo without breaking existing clients, + or, alternatively, have "short names" for policies. This is implemented + with the "aliases" config key in the storage policy config in + swift.conf. The aliases value is a list of names that the storage + policy may also be identified by. The storage policy "name" is used to + report the policy to users (eg in container headers). The aliases have + the same naming restrictions as the policy's primary name. + + * The object auditor learned the "interval" config value to control the + time between each audit pass. + + * `swift-recon --all` now includes the config checksum check. + + * `swift-init` learned the --kill-after-timeout option to force a service + to quit (SIGKILL) after a designated time. + + * `swift-recon` now correctly shows timestamps in UTC instead of local + time. + + * Fixed bug where `swift-ring-builder` couldn't select device id 0. + + * Documented the previously undocumented + `swift-ring-builder pretend_min_part_hours_passed` command. + + * The "node_timeout" config value now accepts decimal values. + + * `swift-ring-builder` now properly removes devices with zero weight. + + * `swift-init` return codes are updated via "--strict" and "--non-strict" + options. Please see the usage string for more information. + + * `swift-ring-builder` now reports the min_part_hours lockout time + remaining + + * Container sync has been improved to more quickly find and iterate over + the containers to be synced. This reduced server load and lowers the + time required to see data propagate between two clusters. Please see + http://swift.openstack.org/overview_container_sync.html for more details + about the new on-disk structure for tracking synchronized containers. + + * A container POST will now update that container's put-timestamp value. + + * TempURL header restrictions are now exposed in /info. + + * Error messages on static large object manifest responses have been + greatly improved. + + * Closed a bug where an unfinished read of a large object would leak a + socket file descriptor and a small amount of memory. (CVE-2016-0738) + + * Fixed an issue where a zero-byte object PUT with an incorrect Etag + would return a 503. 
+ + * Fixed an error when a static large object manifest references the same + object more than once. + + * Improved performance of finding handoff nodes if a zone is empty. + + * Fixed duplication of headers in Access-Control-Expose-Headers on CORS + requests. + + * Fixed handling of IPv6 connections to memcache pools. + + * Continued work towards python 3 compatibility. + + * Various other minor bug fixes and improvements. + swift (2.5.0, OpenStack Liberty) * Added the ability to specify ranges for Static Large Object (SLO) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6a81d6a8c6..1f69a82562 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -89,8 +89,8 @@ Specs The [``swift-specs``](https://github.com/openstack/swift-specs) repo can be used for collaborative design work before a feature is implemented. -Openstack's gerrit system is used to collaborate on the design spec. Once -approved Openstack provides a doc site to easily read these [specs](http://specs.openstack.org/openstack/swift-specs/) +OpenStack's gerrit system is used to collaborate on the design spec. Once +approved OpenStack provides a doc site to easily read these [specs](http://specs.openstack.org/openstack/swift-specs/) A spec is needed for more impactful features. Coordinating a feature between many devs (especially across companies) is a great example of when a spec is diff --git a/bin/swift-dispersion-report b/bin/swift-dispersion-report index a1b5fdaab0..48dff80a89 100755 --- a/bin/swift-dispersion-report +++ b/bin/swift-dispersion-report @@ -23,7 +23,6 @@ from time import time from eventlet import GreenPool, hubs, patcher, Timeout from eventlet.pools import Pool -from eventlet.green import urllib2 from swift.common import direct_client try: @@ -174,8 +173,8 @@ def object_dispersion_report(coropool, connpool, account, object_ring, try: objects = [o['name'] for o in conn.get_container( container, prefix='dispersion_', full_listing=True)[1]] - except urllib2.HTTPError as err: - if err.getcode() != 404: + except ClientException as err: + if err.http_status != 404: raise print >>stderr, 'No objects to query. Has ' \ diff --git a/bin/swift-drive-audit b/bin/swift-drive-audit index a0a1fbac35..013bc20226 100755 --- a/bin/swift-drive-audit +++ b/bin/swift-drive-audit @@ -200,6 +200,10 @@ if __name__ == '__main__': (mount_point)) comment_fstab(mount_point) unmounts += 1 + else: + logger.info("Detected %s with %d errors " + "(Device not unmounted)" % + (mount_point, count)) recon_errors[mount_point] = count total_errors += count recon_file = recon_cache_path + "/drive.recon" diff --git a/bin/swift-init b/bin/swift-init index 3fe18cdaa6..0fcbff5708 100755 --- a/bin/swift-init +++ b/bin/swift-init @@ -74,6 +74,11 @@ def main(): help="Return zero status code even if some config is " "missing. Default mode if any server is a glob or " "one of aliases `all`, `main` or `rest`.") + # SIGKILL daemon after kill_wait period + parser.add_option('--kill-after-timeout', dest='kill_after_timeout', + action='store_true', + help="Kill daemon and all children after kill-wait " + "period.") options, args = parser.parse_args() diff --git a/bin/swift-oldies b/bin/swift-oldies index 46263ab95d..74854d78d8 100755 --- a/bin/swift-oldies +++ b/bin/swift-oldies @@ -59,7 +59,7 @@ Lists old Swift processes.
listing.append((str(hours), pid, args)) if not listing: - exit() + sys.exit() hours_len = len('Hours') pid_len = len('PID') diff --git a/bin/swift-orphans b/bin/swift-orphans index d6cf2d0801..90311c9816 100755 --- a/bin/swift-orphans +++ b/bin/swift-orphans @@ -93,7 +93,7 @@ Example (sends SIGTERM to all orphaned Swift processes older than two hours): listing.append((str(hours), pid, args)) if not listing: - exit() + sys.exit() hours_len = len('Hours') pid_len = len('PID') diff --git a/doc/manpages/account-server.conf.5 b/doc/manpages/account-server.conf.5 index 73cadc62e0..4a7e8c597e 100644 --- a/doc/manpages/account-server.conf.5 +++ b/doc/manpages/account-server.conf.5 @@ -102,8 +102,10 @@ adapted_logger. The default is empty. If set, log_udp_host will override log_address. .IP "\fBlog_udp_port\fR UDP log port, the default is 514. -.IP \fBlog_statsd_host\fR = localhost -log_statsd_* enable StatsD logging. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. .IP \fBlog_statsd_port\fR The default is 8125. .IP \fBlog_statsd_default_sample_rate\fR diff --git a/doc/manpages/container-server.conf.5 b/doc/manpages/container-server.conf.5 index d07e08cc7b..970fa18f2c 100644 --- a/doc/manpages/container-server.conf.5 +++ b/doc/manpages/container-server.conf.5 @@ -108,8 +108,10 @@ adapted_logger. The default is empty. If set, log_udp_host will override log_address. .IP "\fBlog_udp_port\fR UDP log port, the default is 514. -.IP \fBlog_statsd_host\fR = localhost -log_statsd_* enable StatsD logging. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. .IP \fBlog_statsd_port\fR The default is 8125. .IP \fBlog_statsd_default_sample_rate\fR diff --git a/doc/manpages/object-expirer.conf.5 b/doc/manpages/object-expirer.conf.5 index 2235358ee4..c8a47a45c1 100644 --- a/doc/manpages/object-expirer.conf.5 +++ b/doc/manpages/object-expirer.conf.5 @@ -76,8 +76,10 @@ adapted_logger. The default is empty. If set, log_udp_host will override log_address. .IP "\fBlog_udp_port\fR UDP log port, the default is 514. -.IP \fBlog_statsd_host\fR = localhost -log_statsd_* enable StatsD logging. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. .IP \fBlog_statsd_port\fR The default is 8125. .IP \fBlog_statsd_default_sample_rate\fR diff --git a/doc/manpages/object-server.conf.5 b/doc/manpages/object-server.conf.5 index 1ba4290641..6e3dea7a09 100644 --- a/doc/manpages/object-server.conf.5 +++ b/doc/manpages/object-server.conf.5 @@ -111,8 +111,10 @@ adapted_logger. The default is empty. If set, log_udp_host will override log_address. .IP "\fBlog_udp_port\fR UDP log port, the default is 514. -.IP \fBlog_statsd_host\fR = localhost -log_statsd_* enable StatsD logging. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. .IP \fBlog_statsd_port\fR The default is 8125. .IP \fBlog_statsd_default_sample_rate\fR @@ -365,7 +367,7 @@ Depending on the method of deployment you may need to create this directory manu and ensure that swift has read/write.The default is /var/cache/swift. 
.IP "\fBhandoffs_first\fR" The flag to replicate handoffs prior to canonical partitions. -It allows to force syncing and deleting handoffs quickly. +It allows one to force syncing and deleting handoffs quickly. If set to a True value(e.g. "True" or "1"), partitions that are not supposed to be on the node will be replicated first. The default is false. @@ -425,7 +427,7 @@ Depending on the method of deployment you may need to create this directory manu and ensure that swift has read/write.The default is /var/cache/swift. .IP "\fBhandoffs_first\fR" The flag to replicate handoffs prior to canonical partitions. -It allows to force syncing and deleting handoffs quickly. +It allows one to force syncing and deleting handoffs quickly. If set to a True value(e.g. "True" or "1"), partitions that are not supposed to be on the node will be replicated first. The default is false. diff --git a/doc/manpages/proxy-server.conf.5 b/doc/manpages/proxy-server.conf.5 index a606c6bcff..45531685c6 100644 --- a/doc/manpages/proxy-server.conf.5 +++ b/doc/manpages/proxy-server.conf.5 @@ -118,8 +118,10 @@ adapted_logger. The default is empty. If set, log_udp_host will override log_address. .IP "\fBlog_udp_port\fR UDP log port, the default is 514. -.IP \fBlog_statsd_host\fR = localhost -log_statsd_* enable StatsD logging. +.IP \fBlog_statsd_host\fR +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. .IP \fBlog_statsd_port\fR The default is 8125. .IP \fBlog_statsd_default_sample_rate\fR @@ -328,8 +330,8 @@ This allows middleware higher in the WSGI pipeline to override auth processing, useful for middleware such as tempurl and formpost. If you know you're not going to use such middleware and you want a bit of extra security, you can set this to false. -.IP \fBis_admin [DEPRECATED]\fR -If is_admin is true, a user whose username is the same as the project name +.IP \fBis_admin\fR +[DEPRECATED] If is_admin is true, a user whose username is the same as the project name and who has any role on the project will have access rights elevated to be the same as if the user had an operator role. Note that the condition compares names rather than UUIDs. This option is deprecated. @@ -384,7 +386,8 @@ Sets the maximum number of connections to each memcached server per worker. If not set in the configuration file, the value for memcache_servers will be read from /etc/swift/memcache.conf (see memcache.conf-sample) or lacking that file, it will default to 127.0.0.1:11211. You can specify multiple servers -separated with commas, as in: 10.1.2.3:11211,10.1.2.4:11211. +separated with commas, as in: 10.1.2.3:11211,10.1.2.4:11211. (IPv6 +addresses must follow rfc3986 section-3.2.2, i.e. [::1]:11211) .IP \fBmemcache_serialization_support\fR This sets how memcache values are serialized and deserialized: .RE @@ -665,7 +668,9 @@ unset. Default is 514. .IP \fBaccess_log_statsd_host\fR You can use log_statsd_* from [DEFAULT], or override them here. -Default is localhost. +StatsD server. IPv4/IPv6 addresses and hostnames are +supported. If a hostname resolves to an IPv4 and IPv6 address, the IPv4 +address will be used. .IP \fBaccess_log_statsd_port\fR Default is 8125. .IP \fBaccess_log_statsd_default_sample_rate\fR @@ -949,7 +954,7 @@ chunk of data from the object servers while serving GET / HEAD requests. 
Timeouts from these requests can be recovered from so setting this to something lower than node_timeout would provide quicker error recovery while allowing for a longer timeout for non-recoverable requests (PUTs). -Defaults to node_timeout, should be overriden if node_timeout is set to a +Defaults to node_timeout, should be overridden if node_timeout is set to a high number to prevent client timeouts from firing before the proxy server has a chance to retry. .IP \fBconn_timeout\fR @@ -997,11 +1002,9 @@ The valid values for sorting_method are "affinity", "shuffle", and "timing". .IP \fBtiming_expiry\fR If the "timing" sorting_method is used, the timings will only be valid for the number of seconds configured by timing_expiry. The default is 300. -.IP \fBmax_large_object_get_time\fR -The maximum time (seconds) that a large object connection is allowed to last. The default is 86400. .IP \fBrequest_node_count\fR -Set to the number of nodes to contact for a normal request. You can use -'* replicas' at the end to have it use the number given times the number of +Set to the number of nodes to contact for a normal request. You can use '* replicas' +at the end to have it use the number given times the number of replicas for the ring being used for the request. The default is '2 * replicas'. .IP \fBread_affinity\fR Which backend servers to prefer on reads. Format is r for region diff --git a/doc/manpages/swift-init.1 b/doc/manpages/swift-init.1 index 3a0e112659..de700bfb0a 100644 --- a/doc/manpages/swift-init.1 +++ b/doc/manpages/swift-init.1 @@ -111,6 +111,7 @@ allows one to use the keywords such as "all", "main" and "rest" for the .IP "-r RUN_DIR, --run-dir=RUN_DIR directory where the pids will be stored (default /var/run/swift) .IP "--strict return non-zero status code if some config is missing. Default mode if server is explicitly named." .IP "--non-strict return zero status code even if some config is missing. Default mode if server is one of aliases `all`, `main` or `rest`." +.IP "--kill-after-timeout kill daemon and all children after kill-wait period." 
.PD .RE diff --git a/doc/saio/swift/container-reconciler.conf b/doc/saio/swift/container-reconciler.conf index 09261d294b..8dcb92f8f8 100644 --- a/doc/saio/swift/container-reconciler.conf +++ b/doc/saio/swift/container-reconciler.conf @@ -17,7 +17,7 @@ user = # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 diff --git a/doc/saio/swift/object-expirer.conf b/doc/saio/swift/object-expirer.conf index 6e77e9cdf7..0d52fddba6 100644 --- a/doc/saio/swift/object-expirer.conf +++ b/doc/saio/swift/object-expirer.conf @@ -17,7 +17,7 @@ log_level = INFO # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 diff --git a/doc/saio/swift/swift.conf b/doc/saio/swift/swift.conf index 25e1002646..e01a0ac881 100644 --- a/doc/saio/swift/swift.conf +++ b/doc/saio/swift/swift.conf @@ -1,5 +1,6 @@ [swift-hash] # random unique strings that can never change (DO NOT LOSE) +# Use only printable chars (python -c "import string; print(string.printable)") swift_hash_path_prefix = changeme swift_hash_path_suffix = changeme @@ -15,6 +16,6 @@ policy_type = replication [storage-policy:2] name = ec42 policy_type = erasure_coding -ec_type = jerasure_rs_vand +ec_type = liberasurecode_rs_vand ec_num_data_fragments = 4 ec_num_parity_fragments = 2 diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index cb6532b4be..06c4244822 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -463,7 +463,12 @@ Example:: Assuming 3 replicas, this configuration will make object PUTs try storing the object's replicas on up to 6 disks ("2 * replicas") in -region 1 ("r1"). +region 1 ("r1"). The proxy server tries to find 3 devices to store the +object. If a device is unavailable, it asks the ring for the 4th device, +and so on up to the 6th device. If the 6th device is also unavailable, +the last replica will be sent to another region. This does not mean that +there will be 6 replicas in region 1. + You should be aware that, if you have data coming into SF faster than your link to NY can transfer it, then your cluster's data distribution @@ -624,7 +629,11 @@ configuration entries (see the sample configuration files):: log_statsd_metric_prefix = [empty-string] If `log_statsd_host` is not set, this feature is disabled. The default values -for the other settings are given above. +for the other settings are given above. The `log_statsd_host` can be a +hostname, an IPv4 address, or an IPv6 address (not surrounded with brackets, as +this is unnecessary since the port is specified separately). If a hostname +resolves to an IPv4 address, an IPv4 socket will be used to send StatsD UDP +packets, even if the hostname would also resolve to an IPv6 address. .. _StatsD: http://codeascraft.etsy.com/2011/02/15/measure-anything-measure-everything/ .. _Graphite: http://graphite.wikidot.com/ @@ -675,8 +684,7 @@ of async_pendings in real-time, but will not tell you the current number of async_pending container updates on disk at any point in time. Note also that the set of metrics collected, their names, and their semantics -are not locked down and will change over time. StatsD logging is currently in -a "beta" stage and will continue to evolve. +are not locked down and will change over time.
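As a quick, hedged way to confirm that metrics are actually being emitted once ``log_statsd_host`` is set (the port is the 8125 default; the metric name shown and the netcat flags are illustrative and vary by platform), you can listen on the StatsD UDP port and watch for metric lines::

    # Listen on the StatsD UDP port and print whatever the Swift services
    # send; flag syntax differs between netcat implementations.
    nc -u -l 8125
    # Expect plain-text StatsD lines, for example (name is illustrative):
    #   proxy-server.object.GET.200.timing:23.4|ms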
Metrics for `account-auditor`: diff --git a/doc/source/api/form_post_middleware.rst b/doc/source/api/form_post_middleware.rst index 1d83484780..b14fa5bf9d 100644 --- a/doc/source/api/form_post_middleware.rst +++ b/doc/source/api/form_post_middleware.rst @@ -27,7 +27,7 @@ request. The format of the form **POST** request is: -**Example 1.14. Form POST format** +**Example 1.14. Form POST format** .. code:: @@ -140,7 +140,7 @@ Form **POST** middleware uses an HMAC-SHA1 cryptographic signature. This signature includes these elements from the form: - The path. Starting with ``/v1/`` onwards and including a container - name and, optionally, an object prefix. In `Example 1.15`, “HMAC-SHA1 + name and, optionally, an object prefix. In `Example 1.15`, “HMAC-SHA1 signature for form POST” the path is ``/v1/my_account/container/object_prefix``. Do not URL-encode the @@ -148,15 +148,15 @@ signature includes these elements from the form: - A redirect URL. If there is no redirect URL, use the empty string. -- Maximum file size. In `Example 1.15`, “HMAC-SHA1 signature for form +- Maximum file size. In `Example 1.15`, “HMAC-SHA1 signature for form POST” the ``max_file_size`` is ``104857600`` bytes. -- The maximum number of objects to upload. In `Example 1.15`, “HMAC-SHA1 +- The maximum number of objects to upload. In `Example 1.15`, “HMAC-SHA1 signature for form POST” ``max_file_count`` is ``10``. -- Expiry time. In `Example 1.15, “HMAC-SHA1 signature for form +- Expiry time. In `Example 1.15, “HMAC-SHA1 signature for form POST” the expiry time is set to ``600`` seconds into the future. @@ -167,7 +167,7 @@ signature includes these elements from the form: The following example code generates a signature for use with form **POST**: -**Example 1.15. HMAC-SHA1 signature for form POST** +**Example 1.15. HMAC-SHA1 signature for form POST** .. code:: diff --git a/doc/source/api/large_objects.rst b/doc/source/api/large_objects.rst index 144605f117..3e167f7530 100644 --- a/doc/source/api/large_objects.rst +++ b/doc/source/api/large_objects.rst @@ -2,7 +2,7 @@ Large objects ============= -By default, the content of an object cannot be greater than 5 GB. +By default, the content of an object cannot be greater than 5 GB. However, you can use a number of smaller objects to construct a large object. The large object is comprised of two types of objects: @@ -40,9 +40,9 @@ Note If you make a **COPY** request by using a manifest object as the source, the new object is a normal, and not a segment, object. If the total size -of the source segment objects exceeds 5 GB, the **COPY** request fails. +of the source segment objects exceeds 5 GB, the **COPY** request fails. However, you can make a duplicate of the manifest object and this new -object can be larger than 5 GB. +object can be larger than 5 GB. Static large objects ~~~~~~~~~~~~~~~~~~~~ @@ -58,7 +58,7 @@ header. This ensures that the upload cannot corrupt your data. List the name of each segment object along with its size and MD5 checksum in order. -Create a manifest object. Include the *``?multipart-manifest=put``* +Create a manifest object. Include the ``multipart-manifest=put`` query string at the end of the manifest object name to indicate that this is a manifest object. @@ -74,7 +74,7 @@ list, where each element contains the following attributes: - ``size_bytes``. The size of the segment object. This value must match the ``Content-Length`` of that object. 
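For illustration only (the token, storage URL, container, object, and segment names below are placeholders rather than values defined in this guide), such a manifest might be uploaded with a request along these lines::

    # PUT the manifest object; the request body is the JSON list of
    # segments (path, etag, size_bytes) described above.
    curl -i -X PUT -H "X-Auth-Token: $TOKEN" \
         -d '[{"path": "/segments/part-001", "etag": "<md5-of-part-001>", "size_bytes": 1048576}]' \
         "$STORAGE_URL/my_container/my_large_object?multipart-manifest=put"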
-**Example Static large object manifest list** +**Example Static large object manifest list** This example shows three segment objects. You can use several containers and the object names do not have to conform to a specific pattern, in @@ -112,8 +112,8 @@ set to be the MD5 checksum of the concatenated ``ETag`` values of the object segments. You can also set the ``Content-Type`` request header and custom object metadata. -When the **PUT** operation sees the *``?multipart-manifest=put``* query -parameter, it reads the request body and verifies that each segment +When the **PUT** operation sees the ``multipart-manifest=put`` query +string, it reads the request body and verifies that each segment object exists and that the sizes and ETags match. If there is a mismatch, the **PUT**\ operation fails. @@ -124,25 +124,25 @@ this is a static object manifest. Normally when you perform a **GET** operation on the manifest object, the response body contains the concatenated content of the segment objects. To download the manifest list, use the -*``?multipart-manifest=get``* query parameter. The resulting list is not +``multipart-manifest=get`` query string. The resulting list is not formatted the same as the manifest you originally used in the **PUT** operation. If you use the **DELETE** operation on a manifest object, the manifest object is deleted. The segment objects are not affected. However, if you -add the *``?multipart-manifest=delete``* query parameter, the segment +add the ``multipart-manifest=delete`` query string, the segment objects are deleted and if all are successfully deleted, the manifest object is also deleted. To change the manifest, use a **PUT** operation with the -*``?multipart-manifest=put``* query parameter. This request creates a +``multipart-manifest=put`` query string. This request creates a manifest object. You can also update the object metadata in the usual way. Dynamic large objects ~~~~~~~~~~~~~~~~~~~~~ -You must segment objects that are larger than 5 GB before you can upload +You must segment objects that are larger than 5 GB before you can upload them. You then upload the segment objects like you would any other object and create a dynamic large manifest object. The manifest object tells Object Storage how to find the segment objects that comprise the @@ -168,7 +168,7 @@ of segments to a second location and update the manifest to point to this new location. During the upload of the new segments, the original manifest is still available to download the first set of segments. -**Example Upload segment of large object request: HTTP** +**Example Upload segment of large object request: HTTP** .. code:: @@ -190,7 +190,7 @@ Unprocessable Entity response is returned. You can continue uploading segments like this example shows, prior to uploading the manifest. -**Example Upload next segment of large object request: HTTP** +**Example Upload next segment of large object request: HTTP** .. code:: @@ -220,7 +220,7 @@ subsequent additional segments. X-Object-Manifest: {container}/{prefix} -**Example Upload manifest response: HTTP** +**Example Upload manifest response: HTTP** .. code:: @@ -238,67 +238,88 @@ Comparison of static and dynamic large objects While static and dynamic objects have similar behavior, here are their differences: -**Comparing static and dynamic large objects** +End-to-end integrity +-------------------- -Static large object: Assured end-to-end integrity. The list of segments -includes the MD5 checksum (``ETag``) of each segment. 
You cannot upload the -manifest object if the ``ETag`` in the list differs from the uploaded segment -object. If a segment is somehow lost, an attempt to download the manifest -object results in an error. You must upload the segment objects before you -upload the manifest object. You cannot add or remove segment objects from the -manifest. However, you can create a completely new manifest object of the same -name with a different manifest list. +With static large objects, integrity can be assured. +The list of segments may include the MD5 checksum (``ETag``) of each segment. +You cannot upload the manifest object if the ``ETag`` in the list differs +from the uploaded segment object. If a segment is somehow lost, an attempt +to download the manifest object results in an error. -With static large objects, you can upload new segment objects or remove -existing segments. The names must simply match the ``{prefix}`` supplied -in ``X-Object-Manifest``. The segment objects must be at least 1 MB in size -(by default). The final segment object can be any size. At most, 1000 segments -are supported (by default). The manifest list includes the container name of -each object. Segment objects can be in different containers. - -Dynamic large object: End-to-end integrity is not guaranteed. The eventual +With dynamic large objects, integrity is not guaranteed. The eventual consistency model means that although you have uploaded a segment object, it might not appear in the container listing until later. If you download the manifest before it appears in the container, it does not form part of the content returned in response to a **GET** request. +Upload Order +------------ + +With static large objects, you must upload the +segment objects before you upload the manifest object. + With dynamic large objects, you can upload manifest and segment objects in any order. In case a premature download of the manifest occurs, we recommend users upload the manifest object after the segments. However, -the system does not enforce the order. Segment objects can be any size. All -segment objects must be in the same container. +the system does not enforce the order. + +Removal or addition of segment objects +-------------------------------------- + +With static large objects, you cannot add or +remove segment objects from the manifest. However, you can create a +completely new manifest object of the same name with a different manifest +list. + +With dynamic large objects, you can upload new segment objects or remove +existing segments. The names must simply match the ``{prefix}`` supplied +in ``X-Object-Manifest``. + +Segment object size and number +------------------------------ + +With static large objects, the segment objects must be at least 1 byte in size. +However, if the segment objects are less than 1MB (by default), +the SLO download is (by default) rate limited. At most, +1000 segments are supported (by default) and the manifest has a limit +(by default) of 2MB in size. + +With dynamic large objects, segment objects can be any size. + +Segment object container name +----------------------------- + +With static large objects, the manifest list includes the container name of each object. +Segment objects can be in different containers. + +With dynamic large objects, all segment objects must be in the same container. Manifest object metadata ------------------------ -For static large objects, the object has ``X-Static-Large-Object`` set to -``true``. You do not set this metadata directly. 
Instead the system sets -it when you **PUT** a static manifest object. +With static large objects, the manifest object has ``X-Static-Large-Object`` +set to ``true``. You do not set this +metadata directly. Instead the system sets it when you **PUT** a static +manifest object. -For dynamic object,s the ``X-Object-Manifest`` value is the -``{container}/{prefix}``, which indicates where the segment objects are -located. You supply this request header in the **PUT** operation. +With dynamic large objects, the ``X-Object-Manifest`` value is the +``{container}/{prefix}``, which indicates +where the segment objects are located. You supply this request header in the +**PUT** operation. Copying the manifest object --------------------------- -With static large objects, you include the *``?multipart-manifest=get``* +The semantics are the same for both static and dynamic large objects. +When copying large objects, the **COPY** operation does not create +a manifest object but a normal object with content same as what you would +get on a **GET** request to the original manifest object. + +To copy the manifest object, you include the ``multipart-manifest=get`` query string in the **COPY** request. The new object contains the same manifest as the original. The segment objects are not copied. Instead, both the original and new manifest objects share the same set of segment objects. -When creating dynamic large objects, the **COPY** operation does not create -a manifest object but a normal object with content same as what you would -get on a **GET** request to original manifest object. -To duplicate a manifest object: - -* Use the **GET** operation to read the value of ``X-Object-Manifest`` and - use this value in the ``X-Object-Manifest`` request header in a **PUT** - operation. -* Alternatively, you can include *``?multipart-manifest=get``* query - string in the **COPY** request. - -This creates a new manifest object that shares the same set of segment -objects as the original manifest object. diff --git a/doc/source/api/object_api_v1_overview.rst b/doc/source/api/object_api_v1_overview.rst index 99578904c0..04bd0cde22 100644 --- a/doc/source/api/object_api_v1_overview.rst +++ b/doc/source/api/object_api_v1_overview.rst @@ -58,7 +58,7 @@ The Object Storage system organizes data in a hierarchy, as follows: object versioning, at the container level. You can bulk-delete up to 10,000 containers in a single request. - + You can set a storage policy on a container with predefined names and definitions from your cloud provider. @@ -68,7 +68,7 @@ The Object Storage system organizes data in a hierarchy, as follows: With the Object Storage API, you can: - Store an unlimited number of objects. Each object can be as large - as 5 GB, which is the default. You can configure the maximum + as 5 GB, which is the default. You can configure the maximum object size. - Upload and store objects of any size with large object creation. @@ -78,7 +78,7 @@ The Object Storage system organizes data in a hierarchy, as follows: - Compress files using content-encoding metadata. - Override browser behavior for an object using content-disposition metadata. - + - Schedule objects for deletion. - Bulk-delete up to 10,000 objects in a single request. @@ -154,11 +154,11 @@ Your service provider might use different default values. 
Item Maximum value Notes ============================ ============= ===== Number of HTTP headers 90 -Length of HTTP headers 4096 bytes -Length per HTTP request line 8192 bytes -Length of HTTP request 5 GB -Length of container names 256 bytes Cannot contain the ``/`` character. -Length of object names 1024 bytes By default, there are no character restrictions. +Length of HTTP headers 4096 bytes +Length per HTTP request line 8192 bytes +Length of HTTP request 5 GB +Length of container names 256 bytes Cannot contain the ``/`` character. +Length of object names 1024 bytes By default, there are no character restrictions. ============================ ============= ===== You must UTF-8-encode and then URL-encode container and object names diff --git a/doc/source/api/use_content-encoding_metadata.rst b/doc/source/api/use_content-encoding_metadata.rst index c12175ab16..69b3314723 100644 --- a/doc/source/api/use_content-encoding_metadata.rst +++ b/doc/source/api/use_content-encoding_metadata.rst @@ -7,7 +7,7 @@ the ``Content-Encoding`` metadata. This metadata enables you to indicate that the object content is compressed without losing the identity of the underlying media type (``Content-Type``) of the file, such as a video. -**Example Content-Encoding header request: HTTP** +**Example Content-Encoding header request: HTTP** This example assigns an attachment type to the ``Content-Encoding`` header that indicates how the file is downloaded: diff --git a/doc/source/associated_projects.rst b/doc/source/associated_projects.rst index cdd8a2837b..8f2ce5f072 100644 --- a/doc/source/associated_projects.rst +++ b/doc/source/associated_projects.rst @@ -25,7 +25,7 @@ Application Bindings * `java-openstack-swift `_ - Java bindings for OpenStack Swift * `swift_client `_ - Small but powerful Ruby client to interact with OpenStack Swift * `nightcrawler_swift `_ - This Ruby gem teleports your assets to a OpenStack Swift bucket/container - * `swift storage `_ - Simple Openstack Swift storage client. + * `swift storage `_ - Simple OpenStack Swift storage client. Authentication -------------- @@ -65,6 +65,7 @@ Alternative API * `Swift3 `_ - Amazon S3 API emulation. * `CDMI `_ - CDMI support +* `SwiftHLM `_ - a middleware for using OpenStack Swift with tape and other high latency media storage backends Benchmarking/Load Generators @@ -106,7 +107,7 @@ Other * `Glance `_ - Provides services for discovering, registering, and retrieving virtual machine images (for OpenStack Compute [Nova], for example). * `Better Staticweb `_ - Makes swift containers accessible by default. * `Swiftsync `_ - A massive syncer between two swift clusters. -* `Django Swiftbrowser `_ - Simple Django web app to access Openstack Swift. +* `Django Swiftbrowser `_ - Simple Django web app to access OpenStack Swift. * `Swift-account-stats `_ - Swift-account-stats is a tool to report statistics on Swift usage at tenant and global levels. * `PyECLib `_ - High Level Erasure Code library used by Swift * `liberasurecode `_ - Low Level Erasure Code library used by PyECLib diff --git a/doc/source/conf.py b/doc/source/conf.py index 5d2fbd304d..b5e0d6f071 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,4 +1,17 @@ # -*- coding: utf-8 -*- +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # Copyright (c) 2010-2012 OpenStack Foundation. # # Swift documentation build configuration file, created by @@ -13,9 +26,11 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys -import os import datetime +import os +from swift import __version__ +import subprocess +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -28,7 +43,7 @@ sys.path.extend([os.path.abspath('../swift'), os.path.abspath('..'), # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath', + 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.ifconfig', 'oslosphinx'] todo_include_todos = True @@ -36,17 +51,17 @@ todo_include_todos = True # Changing the path so that the Hudson build output contains GA code and the # source docs do not contain the code so local, offline sphinx builds are # "clean." -#templates_path = [] -#if os.getenv('HUDSON_PUBLISH_DOCS'): -# templates_path = ['_ga', '_templates'] -#else: -# templates_path = ['_templates'] +# templates_path = [] +# if os.getenv('HUDSON_PUBLISH_DOCS'): +# templates_path = ['_ga', '_templates'] +# else: +# templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. -#source_encoding = 'utf-8' +# source_encoding = 'utf-8' # The master toctree document. master_doc = 'index' @@ -60,23 +75,22 @@ copyright = u'%d, OpenStack Foundation' % datetime.datetime.now().year # built documents. # # The short X.Y version. -from swift import __version__ version = __version__.rsplit('.', 1)[0] # The full version, including alpha/beta/rc tags. release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. -#unused_docs = [] +# unused_docs = [] # List of directories, relative to source directory, that shouldn't be searched # for source files. @@ -84,14 +98,14 @@ exclude_trees = [] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. 
@@ -109,74 +123,76 @@ modindex_common_prefix = ['swift.'] # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. # html_theme = 'default' -#html_theme_path = ["."] -#html_theme = '_theme' +# html_theme_path = ["."] +# html_theme = '_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = ['_static'] +# html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' -git_cmd = "git log --pretty=format:'%ad, commit %h' --date=local -n1" -html_last_updated_fmt = os.popen(git_cmd).read() +# html_last_updated_fmt = '%b %d, %Y' +git_cmd = ["git", "log", "--pretty=format:'%ad, commit %h'", "--date=local", + "-n1"] +html_last_updated_fmt = subprocess.Popen( + git_cmd, stdout=subprocess.PIPE).communicate()[0] # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_use_modindex = True +# html_use_modindex = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = '' +# html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'swiftdoc' @@ -185,10 +201,10 @@ htmlhelp_basename = 'swiftdoc' # -- Options for LaTeX output ------------------------------------------------- # The paper size ('letter' or 'a4'). 
-#latex_paper_size = 'letter' +# latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' +# latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass @@ -200,17 +216,17 @@ latex_documents = [ # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # Additional stuff for the LaTeX preamble. -#latex_preamble = '' +# latex_preamble = '' # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_use_modindex = True +# latex_use_modindex = True diff --git a/doc/source/deployment_guide.rst b/doc/source/deployment_guide.rst index f06afc483b..9ed83e4a30 100644 --- a/doc/source/deployment_guide.rst +++ b/doc/source/deployment_guide.rst @@ -478,7 +478,11 @@ log_custom_handlers None Comma-separated list of functions t to setup custom log handlers. log_udp_host Override log_address log_udp_port 514 UDP log port -log_statsd_host localhost StatsD logging +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. log_statsd_port 8125 log_statsd_default_sample_rate 1.0 log_statsd_sample_rate_factor 1.0 @@ -526,9 +530,10 @@ set log_address /dev/log Logging directory user swift User to run as max_upload_time 86400 Maximum time allowed to upload an object -slow 0 If > 0, Minimum time in seconds - for a PUT or DELETE request to - complete +slow 0 If > 0, Minimum time in seconds for a PUT or + DELETE request to complete. This is only + useful to simulate slow devices during testing + and development. mb_per_sync 512 On PUT requests, sync file every n MB keep_cache_size 5242880 Largest object size to keep in @@ -719,6 +724,8 @@ log_facility LOG_LOCAL0 Syslog log facility log_level INFO Logging level log_address /dev/log Logging directory log_time 3600 Frequency of status logs in seconds. +interval 30 Time in seconds to wait between + auditor passes disk_chunk_size 65536 Size of chunks read during auditing files_per_second 20 Maximum files audited per second per auditor process. Should be tuned according @@ -787,7 +794,11 @@ log_custom_handlers None Comma-separated list of functions t to setup custom log handlers. log_udp_host Override log_address log_udp_port 514 UDP log port -log_statsd_host localhost StatsD logging +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. log_statsd_port 8125 log_statsd_default_sample_rate 1.0 log_statsd_sample_rate_factor 1.0 @@ -998,7 +1009,11 @@ log_custom_handlers None Comma-separated list of functions t to setup custom log handlers. log_udp_host Override log_address log_udp_port 514 UDP log port -log_statsd_host localhost StatsD logging +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. log_statsd_port 8125 log_statsd_default_sample_rate 1.0 log_statsd_sample_rate_factor 1.0 @@ -1226,7 +1241,11 @@ log_custom_handlers None Comma separated handlers. 
log_udp_host Override log_address log_udp_port 514 UDP log port -log_statsd_host localhost StatsD logging +log_statsd_host None Enables StatsD logging; IPv4/IPv6 + address or a hostname. If a + hostname resolves to an IPv4 and IPv6 + address, the IPv4 address will be + used. log_statsd_port 8125 log_statsd_default_sample_rate 1.0 log_statsd_sample_rate_factor 1.0 @@ -1278,7 +1297,8 @@ object_chunk_size 65536 Chunk size to read from client_chunk_size 65536 Chunk size to read from clients memcache_servers 127.0.0.1:11211 Comma separated list of - memcached servers ip:port + memcached servers + ip:port or [ipv6addr]:port memcache_max_connections 2 Max number of connections to each memcached server per worker @@ -1469,7 +1489,7 @@ At Rackspace, our Proxy servers have dual quad core processors, giving us 8 cores. Our testing has shown 16 workers to be a pretty good balance when saturating a 10g network and gives good CPU utilization. -Our Storage servers all run together on the same servers. These servers have +Our Storage server processes all run together on the same servers. These servers have dual quad core processors, for 8 cores total. We run the Account, Container, and Object servers with 8 workers each. Most of the background jobs are run at a concurrency of 1, with the exception of the replicators which are run at a diff --git a/doc/source/development_guidelines.rst b/doc/source/development_guidelines.rst index ec2c45c5ba..2f3d70f78e 100644 --- a/doc/source/development_guidelines.rst +++ b/doc/source/development_guidelines.rst @@ -83,15 +83,35 @@ For example, this command would run the functional tests using policy SWIFT_TEST_POLICY=silver tox -e func + +In-process functional testing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + If the ``test.conf`` file is not found then the functional test framework will instantiate a set of Swift servers in the same process that executes the functional tests. This 'in-process test' mode may also be enabled (or disabled) by setting the environment variable ``SWIFT_TEST_IN_PROCESS`` to a true (or false) value prior to executing `tox -e func`. -When using the 'in-process test' mode, the optional in-memory -object server may be selected by setting the environment variable -``SWIFT_TEST_IN_MEMORY_OBJ`` to a true value. +When using the 'in-process test' mode some server configuration options may be +set using environment variables: + +- the optional in-memory object server may be selected by setting the + environment variable ``SWIFT_TEST_IN_MEMORY_OBJ`` to a true value. + +- the proxy-server ``object_post_as_copy`` option may be set using the + environment variable ``SWIFT_TEST_IN_PROCESS_OBJECT_POST_AS_COPY``. + +For example, this command would run the in-process mode functional tests with +the proxy-server using object_post_as_copy=False (the 'fast-POST' mode):: + + SWIFT_TEST_IN_PROCESS=1 SWIFT_TEST_IN_PROCESS_OBJECT_POST_AS_COPY=False \ + tox -e func + +This particular example may also be run using the ``func-in-process-fast-post`` +tox environment:: + + tox -e func-in-process-fast-post The 'in-process test' mode searches for ``proxy-server.conf`` and ``swift.conf`` config files from which it copies config options and overrides @@ -127,7 +147,7 @@ using config files found in ``$HOME/my_tests`` and policy 'silver':: Coding Style ------------ -Swift use flake8 with the OpenStack `hacking`_ module to enforce +Swift uses flake8 with the OpenStack `hacking`_ module to enforce coding style. 
Install flake8 and hacking with pip or by the packages of your @@ -164,6 +184,14 @@ Installing Sphinx: #. Install sphinx (On Ubuntu: `sudo apt-get install python-sphinx`) #. `python setup.py build_sphinx` +-------- +Manpages +-------- + +For sanity check of your change in manpage, use this command in the root +of your Swift repo:: + + ./.manpages --------------------- License and Copyright diff --git a/doc/source/development_saio.rst b/doc/source/development_saio.rst index 1a282983b6..bca218cad5 100644 --- a/doc/source/development_saio.rst +++ b/doc/source/development_saio.rst @@ -37,7 +37,8 @@ Installing dependencies sudo apt-get update sudo apt-get install curl gcc memcached rsync sqlite3 xfsprogs \ - git-core libffi-dev python-setuptools + git-core libffi-dev python-setuptools \ + liberasurecode-dev sudo apt-get install python-coverage python-dev python-nose \ python-xattr python-eventlet \ python-greenlet python-pastedeploy \ @@ -48,7 +49,8 @@ Installing dependencies sudo yum update sudo yum install curl gcc memcached rsync sqlite xfsprogs git-core \ - libffi-devel xinetd python-setuptools \ + libffi-devel xinetd liberasurecode-devel \ + python-setuptools \ python-coverage python-devel python-nose \ pyxattr python-eventlet \ python-greenlet python-paste-deploy \ @@ -585,3 +587,7 @@ doesn't work, here are some good starting places to look for issues: cannot rate limit (unit tests generate a lot of logs very quickly). Open the file ``SWIFT_TEST_CONFIG_FILE`` points to, and change the value of ``fake_syslog`` to ``True``. +#. If you encounter a ``401 Unauthorized`` when following Step 12 where + you check that you can ``GET`` account, use ``sudo service memcached status`` + and check if memcache is running. If memcache is not running, start it using + ``sudo service memcached start``. Once memcache is running, rerun ``GET`` account. diff --git a/doc/source/howto_installmultinode.rst b/doc/source/howto_installmultinode.rst index bd46561acd..e45a8adb77 100644 --- a/doc/source/howto_installmultinode.rst +++ b/doc/source/howto_installmultinode.rst @@ -3,31 +3,31 @@ Instructions for a Multiple Server Swift Installation ===================================================== Please refer to the latest official -`Openstack Installation Guides `_ +`OpenStack Installation Guides `_ for the most up-to-date documentation. 
-Object Storage installation guide for Openstack Liberty ----------------------------------------------------- +Object Storage installation guide for OpenStack Liberty +------------------------------------------------------- * `openSUSE 13.2 and SUSE Linux Enterprise Server 12 `_ * `RHEL 7, CentOS 7 `_ * `Ubuntu 14.04 `_ -Object Storage installation guide for Openstack Kilo +Object Storage installation guide for OpenStack Kilo ---------------------------------------------------- * `openSUSE 13.2 and SUSE Linux Enterprise Server 12 `_ * `RHEL 7, CentOS 7, and Fedora 21 `_ * `Ubuntu 14.04 `_ -Object Storage installation guide for Openstack Juno +Object Storage installation guide for OpenStack Juno ---------------------------------------------------- * `openSUSE 13.1 and SUSE Linux Enterprise Server 11 `_ * `RHEL 7, CentOS 7, and Fedora 20 `_ * `Ubuntu 14.04 `_ -Object Storage installation guide for Openstack Icehouse +Object Storage installation guide for OpenStack Icehouse -------------------------------------------------------- * `openSUSE and SUSE Linux Enterprise Server `_ diff --git a/doc/source/images/ec_overview.png b/doc/source/images/ec_overview.png old mode 100755 new mode 100644 diff --git a/doc/source/index.rst b/doc/source/index.rst index 30bfe31808..8f045cfb18 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -86,10 +86,15 @@ Administrator Documentation admin_guide replication_network logs + ops_runbook/index Object Storage v1 REST API Documentation ======================================== +See `Complete Reference for the Object Storage REST API `_ + +The following provides supporting information for the REST API: + .. toctree:: :maxdepth: 1 @@ -104,6 +109,14 @@ Object Storage v1 REST API Documentation api/use_content-encoding_metadata.rst api/use_the_content-disposition_metadata.rst +OpenStack End User Guide +======================== + +The `OpenStack End User Guide `_ +has additional information on using Swift. +See the `Manage objects and containers `_ +section. + Source Documentation ==================== diff --git a/doc/source/ops_runbook/diagnose.rst b/doc/source/ops_runbook/diagnose.rst new file mode 100644 index 0000000000..d34b38c52b --- /dev/null +++ b/doc/source/ops_runbook/diagnose.rst @@ -0,0 +1,1031 @@ +================================== +Identifying issues and resolutions +================================== + +Diagnose: General approach +-------------------------- + +- Look at service status in your monitoring system. + +- In addition to system monitoring tools and issue logging by users, + swift errors will often result in log entries in the ``/var/log/swift`` + files: ``proxy.log``, ``server.log`` and ``background.log`` (see:``Swift + logs``). + +- Look at any logs your deployment tool produces. + +- Log files should be reviewed for error signatures (see below) that + may point to a known issue, or root cause issues reported by the + diagnostics tools, prior to escalation. + +Dependencies +^^^^^^^^^^^^ + +The Swift software is dependent on overall system health. Operating +system level issues with network connectivity, domain name resolution, +user management, hardware and system configuration and capacity in terms +of memory and free disk space, may result is secondary Swift issues. +System level issues should be resolved prior to diagnosis of swift +issues. + + +Diagnose: Swift-dispersion-report +--------------------------------- + +The swift-dispersion-report is a useful tool to gauge the general +health of the system. 
Configure the ``swift-dispersion`` report for +100% coverage. The dispersion report regularly monitors +these and gives a report of the amount of objects/containers are still +available as well as how many copies of them are also there. + +The dispersion-report output is logged on the first proxy of the first +AZ or each system (proxy with the monitoring role) under +``/var/log/swift/swift-dispersion-report.log``. + +Diagnose: Is swift running? +--------------------------- + +When you want to establish if a swift endpoint is running, run ``curl -k`` +against either: https://*[REPLACEABLE]*./healthcheck OR +https:*[REPLACEABLE]*.crossdomain.xml + + +Diagnose: Interpreting messages in ``/var/log/swift/`` files +------------------------------------------------------------ + +.. note:: + + In the Hewlett Packard Enterprise Helion Public Cloud we send logs to + ``proxy.log`` (proxy-server logs), ``server.log`` (object-server, + account-server, container-server logs), ``background.log`` (all + other servers [object-replicator, etc]). + +The following table lists known issues: + +.. list-table:: + :widths: 25 25 25 25 + :header-rows: 1 + + * - **Logfile** + - **Signature** + - **Issue** + - **Steps to take** + * - /var/log/syslog + - kernel: [] hpsa .... .... .... has check condition: unknown type: + Sense: 0x5, ASC: 0x20, ASC Q: 0x0 .... + - An unsupported command was issued to the storage hardware + - Understood to be a benign monitoring issue, ignore + * - /var/log/syslog + - kernel: [] sd .... [csbu:sd...] Sense Key: Medium Error + - Suggests disk surface issues + - Run swift diagnostics on the target node to check for disk errors, + repair disk errors + * - /var/log/syslog + - kernel: [] sd .... [csbu:sd...] Sense Key: Hardware Error + - Suggests storage hardware issues + - Run swift diagnostics on the target node to check for disk failures, + replace failed disks + * - /var/log/syslog + - kernel: [] .... I/O error, dev sd.... ,sector .... + - + - Run swift diagnostics on the target node to check for disk errors + * - /var/log/syslog + - pound: NULL get_thr_arg + - Multiple threads woke up + - Noise, safe to ignore + * - /var/log/swift/proxy.log + - .... ERROR .... ConnectionTimeout .... + - A storage node is not responding in a timely fashion + - Run swift diagnostics on the target node to check for node down, + node unconfigured, storage off-line or network issues between the + proxy and non responding node + * - /var/log/swift/proxy.log + - proxy-server .... HTTP/1.0 500 .... + - A proxy server has reported an internal server error + - Run swift diagnostics on the target node to check for issues + * - /var/log/swift/server.log + - .... ERROR .... ConnectionTimeout .... + - A storage server is not responding in a timely fashion + - Run swift diagnostics on the target node to check for a node or + service, down, unconfigured, storage off-line or network issues + between the two nodes + * - /var/log/swift/server.log + - .... ERROR .... Remote I/O error: '/srv/node/disk.... + - A storage device is not responding as expected + - Run swift diagnostics and check the filesystem named in the error + for corruption (unmount & xfs_repair) + * - /var/log/swift/background.log + - object-server ERROR container update failed .... Connection refused + - Peer node is not responding + - Check status of the network and peer node + * - /var/log/swift/background.log + - object-updater ERROR with remote .... 
ConnectionTimeout + - + - Check status of the network and peer node + * - /var/log/swift/background.log + - account-reaper STDOUT: .... error: ECONNREFUSED + - Network connectivity issue + - Resolve network issue and re-run diagnostics + * - /var/log/swift/background.log + - .... ERROR .... ConnectionTimeout + - A storage server is not responding in a timely fashion + - Run swift diagnostics on the target node to check for a node + or service, down, unconfigured, storage off-line or network issues + between the two nodes + * - /var/log/swift/background.log + - .... ERROR syncing .... Timeout + - A storage server is not responding in a timely fashion + - Run swift diagnostics on the target node to check for a node + or service, down, unconfigured, storage off-line or network issues + between the two nodes + * - /var/log/swift/background.log + - .... ERROR Remote drive not mounted .... + - A storage server disk is unavailable + - Run swift diagnostics on the target node to check for a node or + service, failed or unmounted disk on the target, or a network issue + * - /var/log/swift/background.log + - object-replicator .... responded as unmounted + - A storage server disk is unavailable + - Run swift diagnostics on the target node to check for a node or + service, failed or unmounted disk on the target, or a network issue + * - /var/log/swift/\*.log + - STDOUT: EXCEPTION IN + - A unexpected error occurred + - Read the Traceback details, if it matches known issues + (e.g. active network/disk issues), check for re-ocurrences + after the primary issues have been resolved + * - /var/log/rsyncd.log + - rsync: mkdir "/disk....failed: No such file or directory.... + - A local storage server disk is unavailable + - Run swift diagnostics on the node to check for a failed or + unmounted disk + * - /var/log/swift* + - Exception: Could not bind to 0.0.0.0:600xxx + - Possible Swift process restart issue. This indicates an old swift + process is still running. + - Run swift diagnostics, if some swift services are reported down, + check if they left residual process behind. + * - /var/log/rsyncd.log + - rsync: recv_generator: failed to stat "/disk....." (in object) + failed: Not a directory (20) + - Swift directory structure issues + - Run swift diagnostics on the node to check for issues + +Diagnose: Parted reports the backup GPT table is corrupt +-------------------------------------------------------- + +- If a GPT table is broken, a message like the following should be + observed when the following command is run: + + .. code:: + + $ sudo parted -l + + .. code:: + + Error: The backup GPT table is corrupt, but the primary appears OK, + so that will be used. + + OK/Cancel? + +To fix, go to: Fix broken GPT table (broken disk partition) + + +Diagnose: Drives diagnostic reports a FS label is not acceptable +---------------------------------------------------------------- + +If diagnostics reports something like "FS label: obj001dsk011 is not +acceptable", it indicates that a partition has a valid disk label, but an +invalid filesystem label. In such cases proceed as follows: + +#. Verify that the disk labels are correct: + + .. code:: + + FS=/dev/sd#1 + + sudo parted -l | grep object + +#. If partition labels are inconsistent then, resolve the disk label issues + before proceeding: + + .. code:: + + sudo parted -s ${FS} name ${PART_NO} ${PART_NAME} #Partition Label + #PART_NO is 1 for object disks and 3 for OS disks + #PART_NAME follows the convention seen in "sudo parted -l | grep object" + +#. 
If the Filesystem label is missing then create it with care: + + .. code:: + + sudo xfs_admin -l ${FS} #Filesystem label (12 Char limit) + + #Check for the existence of a FS label + + OBJNO=<3 Length Object No.> + + #I.E OBJNO for sw-stbaz3-object0007 would be 007 + + DISKNO=<3 Length Disk No.> + + #I.E DISKNO for /dev/sdb would be 001, /dev/sdc would be 002 etc. + + sudo xfs_admin -L "obj${OBJNO}dsk${DISKNO}" ${FS} + + #Create a FS Label + +Diagnose: Failed LUNs +--------------------- + +.. note:: + + The HPE Helion Public Cloud uses direct attach SmartArry + controllers/drives. The information here is specific to that + environment. + +The ``swift_diagnostics`` mount checks may return a warning that a LUN has +failed, typically accompanied by DriveAudit check failures and device +errors. + +Such cases are typically caused by a drive failure, and if drive check +also reports a failed status for the underlying drive, then follow +the procedure to replace the disk. + +Otherwise the lun can be re-enabled as follows: + +#. Generate a hpssacli diagnostic report. This report allows the swift + team to troubleshoot potential cabling or hardware issues so it is + imperative that you run it immediately when troubleshooting a failed + LUN. You will come back later and grep this file for more details, but + just generate it for now. + + .. code:: + + sudo hpssacli controller all diag file=/tmp/hpacu.diag ris=on \ + xml=off zip=off + +Export the following variables using the below instructions before +proceeding further. + +#. Print a list of logical drives and their numbers and take note of the + failed drive's number and array value (example output: "array A + logicaldrive 1..." would be exported as LDRIVE=1): + + .. code:: + + sudo hpssacli controller slot=1 ld all show + +#. Export the number of the logical drive that was retrieved from the + previous command into the LDRIVE variable: + + .. code:: + + export LDRIVE= + +#. Print the array value and Port:Box:Bay for all drives and take note of + the Port:Box:Bay for the failed drive (example output: " array A + physicaldrive 2C:1:1..." would be exported as PBOX=2C:1:1). Match the + array value of this output with the array value obtained from the + previous command to be sure you are working on the same drive. Also, + the array value usually matches the device name (For example, /dev/sdc + in the case of "array c"), but we will run a different command to be sure + we are operating on the correct device. + + .. code:: + + sudo hpssacli controller slot=1 pd all show + +.. note:: + + Sometimes a LUN may appear to be failed as it is not and cannot + be mounted but the hpssacli/parted commands may show no problems with + the LUNS/drives. In this case, the filesystem may be corrupt and may be + necessary to run ``sudo xfs_check /dev/sd[a-l][1-2]`` to see if there is + an xfs issue. The results of running this command may require that + ``xfs_repair`` is run. + +#. Export the Port:Box:Bay for the failed drive into the PBOX variable: + + .. code:: + + export PBOX= + +#. Print the physical device information and take note of the Disk Name + (example output: "Disk Name: /dev/sdk" would be exported as + DEV=/dev/sdk): + + .. code:: + + sudo hpssacli controller slot=1 ld ${LDRIVE} show detail \ + grep -i "Disk Name" + +#. Export the device name variable from the preceding command (example: + /dev/sdk): + + .. code:: + + export DEV= + +#. Export the filesystem variable. 
Disks that are split between the
+   operating system and data storage, typically sda and sdb, should only
+   have repairs done on their data filesystem, usually /dev/sda2 and
+   /dev/sdb2. Other data-only disks have just one partition on the device,
+   so the filesystem will be partition 1. In any case you should verify the
+   data filesystem by running ``df -h | grep /srv/node`` and using the
+   listed data filesystem for the device in question as the export. For
+   example: /dev/sdk1.
+
+   .. code::
+
+      export FS=
+
+#. Verify the LUN is failed, and the device is not:
+
+   .. code::
+
+      sudo hpssacli controller slot=1 ld all show
+      sudo hpssacli controller slot=1 pd all show
+      sudo hpssacli controller slot=1 ld ${LDRIVE} show detail
+      sudo hpssacli controller slot=1 pd ${PBOX} show detail
+
+#. Stop the swift and rsync services:
+
+   .. code::
+
+      sudo service rsync stop
+      sudo swift-init shutdown all
+
+#. Unmount the problem drive, fix the LUN and the filesystem:
+
+   .. code::
+
+      sudo umount ${FS}
+
+#. If umount fails, you should run lsof to search for the mountpoint and
+   kill any lingering processes before repeating the unmount:
+
+   .. code::
+
+      sudo hpacucli controller slot=1 ld ${LDRIVE} modify reenable
+      sudo xfs_repair ${FS}
+
+#. If ``xfs_repair`` complains about possible journal data, use the
+   ``xfs_repair -L`` option to zeroise the journal log.
+
+#. Once complete, test-mount the filesystem and tidy up its lost and
+   found area:
+
+   .. code::
+
+      sudo mount ${FS} /mnt
+      sudo rm -rf /mnt/lost+found/
+      sudo umount /mnt
+
+#. Mount the filesystem and restart swift and rsync.
+
+#. Run the following to determine if a DC ticket is needed to check the
+   cables on the node:
+
+   .. code::
+
+      grep -y media.exchanged /tmp/hpacu.diag
+      grep -y hot.plug.count /tmp/hpacu.diag
+
+#. If the output reports any non 0x00 values, it suggests that the cables
+   should be checked. For example, log a DC ticket to check the SAS cables
+   between the drive and the expander.
+
+Diagnose: Slow disk devices
+---------------------------
+
+.. note::
+
+   collectl is an open-source performance gathering/analysis tool.
+
+If the diagnostics report a message such as ``sda: drive is slow``, you
+should log onto the node and run the following command:
+
+.. code::
+
+   $ /usr/bin/collectl -s D -c 1
+   waiting for 1 second sample...
+   # DISK STATISTICS (/sec)
+   # <---------reads---------><---------writes---------><--------averages--------> Pct
+   #Name KBytes Merged IOs Size KBytes Merged IOs Size RWSize QLen Wait SvcTim Util
+   sdb 204 0 33 6 43 0 4 11 6 1 7 6 23
+   sda 84 0 13 6 108 21 6 18 10 1 7 7 13
+   sdc 100 0 16 6 0 0 0 0 6 1 7 6 9
+   sdd 140 0 22 6 22 0 2 11 6 1 9 9 22
+   sde 76 0 12 6 255 0 52 5 5 1 2 1 10
+   sdf 276 0 44 6 0 0 0 0 6 1 11 8 38
+   sdg 112 0 17 7 18 0 2 9 6 1 7 7 13
+   sdh 3552 0 73 49 0 0 0 0 48 1 9 8 62
+   sdi 72 0 12 6 0 0 0 0 6 1 8 8 10
+   sdj 112 0 17 7 22 0 2 11 7 1 10 9 18
+   sdk 120 0 19 6 21 0 2 11 6 1 8 8 16
+   sdl 144 0 22 7 18 0 2 9 6 1 9 7 18
+   dm-0 0 0 0 0 0 0 0 0 0 0 0 0 0
+   dm-1 0 0 0 0 60 0 15 4 4 0 0 0 0
+   dm-2 0 0 0 0 48 0 12 4 4 0 0 0 0
+   dm-3 0 0 0 0 0 0 0 0 0 0 0 0 0
+   dm-4 0 0 0 0 0 0 0 0 0 0 0 0 0
+   dm-5 0 0 0 0 0 0 0 0 0 0 0 0 0
+   ...
+   (repeats -- type Ctrl/C to stop)
+
+Look at the ``Wait`` and ``SvcTim`` values. It is not normal for
+these values to exceed 50msec. This is known to impact customer
+performance (upload/download). For a controller problem, many/all drives
+will show high wait and service times. A reboot may correct the problem;
+otherwise hardware replacement is needed.
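+
+As a rough filter, the same collectl output can be reduced to just the
+drives that exceed the 50msec guideline. This is only a sketch; the awk
+column numbers assume the exact layout shown above and may need
+adjusting for your collectl version:
+
+.. code::
+
+   /usr/bin/collectl -s D -c 1 | awk '/^sd/ {if ($12 > 50 || $13 > 50) print $1, "Wait:", $12, "SvcTim:", $13}'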
+ +Another way to look at the data is as follows: + +.. code:: + + $ /opt/hp/syseng/disk-anal.pl -d + Disk: sda Wait: 54580 371 65 25 12 6 6 0 1 2 0 46 + Disk: sdb Wait: 54532 374 96 36 16 7 4 1 0 2 0 46 + Disk: sdc Wait: 54345 554 105 29 15 4 7 1 4 4 0 46 + Disk: sdd Wait: 54175 553 254 31 20 11 6 6 2 2 1 53 + Disk: sde Wait: 54923 66 56 15 8 7 7 0 1 0 2 29 + Disk: sdf Wait: 50952 941 565 403 426 366 442 447 338 99 38 97 + Disk: sdg Wait: 50711 689 808 562 642 675 696 185 43 14 7 82 + Disk: sdh Wait: 51018 668 688 483 575 542 692 275 55 22 9 87 + Disk: sdi Wait: 51012 1011 849 672 568 240 344 280 38 13 6 81 + Disk: sdj Wait: 50724 743 770 586 662 509 684 283 46 17 11 79 + Disk: sdk Wait: 50886 700 585 517 633 511 729 352 89 23 8 81 + Disk: sdl Wait: 50106 617 794 553 604 504 532 501 288 234 165 216 + Disk: sda Time: 55040 22 16 6 1 1 13 0 0 0 3 12 + + Disk: sdb Time: 55014 41 19 8 3 1 8 0 0 0 3 17 + Disk: sdc Time: 55032 23 14 8 9 2 6 1 0 0 0 19 + Disk: sdd Time: 55022 29 17 12 6 2 11 0 0 0 1 14 + Disk: sde Time: 55018 34 15 11 12 1 9 0 0 0 2 12 + Disk: sdf Time: 54809 250 45 7 1 0 0 0 0 0 1 1 + Disk: sdg Time: 55070 36 6 2 0 0 0 0 0 0 0 0 + Disk: sdh Time: 55079 33 2 0 0 0 0 0 0 0 0 0 + Disk: sdi Time: 55074 28 7 2 0 0 2 0 0 0 0 1 + Disk: sdj Time: 55067 35 10 0 1 0 0 0 0 0 0 1 + Disk: sdk Time: 55068 31 10 3 0 0 1 0 0 0 0 1 + Disk: sdl Time: 54905 130 61 7 3 4 1 0 0 0 0 3 + +This shows the historical distribution of the wait and service times +over a day. This is how you read it: + +- sda did 54580 operations with a short wait time, 371 operations with + a longer wait time and 65 with an even longer wait time. + +- sdl did 50106 operations with a short wait time, but as you can see + many took longer. + +There is a clear pattern that sdf to sdl have a problem. Actually, sda +to sde would more normally have lots of zeros in their data. But maybe +this is a busy system. In this example it is worth changing the +controller as the individual drives may be ok. + +After the controller is changed, use collectl -s D as described above to +see if the problem has cleared. disk-anal.pl will continue to show +historical data. You can look at recent data as follows. It only looks +at data from 13:15 to 14:15. As you can see, this is a relatively clean +system (few if any long wait or service times): + +.. 
code:: + + $ /opt/hp/syseng/disk-anal.pl -d -t 13:15-14:15 + Disk: sda Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdb Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdc Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdd Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sde Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdf Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdg Wait: 3594 6 0 0 0 0 0 0 0 0 0 0 + Disk: sdh Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdi Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdj Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdk Wait: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdl Wait: 3599 1 0 0 0 0 0 0 0 0 0 0 + Disk: sda Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdb Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdc Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdd Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sde Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdf Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdg Time: 3594 6 0 0 0 0 0 0 0 0 0 0 + Disk: sdh Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdi Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdj Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdk Time: 3600 0 0 0 0 0 0 0 0 0 0 0 + Disk: sdl Time: 3599 1 0 0 0 0 0 0 0 0 0 0 + +For long wait times, where the service time appears normal is to check +the logical drive cache status. While the cache may be enabled, it can +be disabled on a per-drive basis. + +Diagnose: Slow network link - Measuring network performance +----------------------------------------------------------- + +Network faults can cause performance between Swift nodes to degrade. The +following tests are recommended. Other methods (such as copying large +files) may also work, but can produce inconclusive results. + +Use netperf on all production systems. Install on all systems if not +already installed. And the UFW rules for its control port are in place. +However, there are no pre-opened ports for netperf's data connection. Pick a +port number. In this example, 12866 is used because it is one higher +than netperf's default control port number, 12865. If you get very +strange results including zero values, you may not have gotten the data +port opened in UFW at the target or may have gotten the netperf +command-line wrong. + +Pick a ``source`` and ``target`` node. The source is often a proxy node +and the target is often an object node. Using the same source proxy you +can test communication to different object nodes in different AZs to +identity possible bottlekecks. + +Running tests +^^^^^^^^^^^^^ + +#. Prepare the ``target`` node as follows: + + .. code:: + + sudo iptables -I INPUT -p tcp -j ACCEPT + + Or, do: + + .. code:: + + sudo ufw allow 12866/tcp + +#. On the ``source`` node, run the following command to check + throughput. Note the double-dash before the -P option. + The command takes 10 seconds to complete. + + .. code:: + + $ netperf -H .72.4 + MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 12866 AF_INET to + .72.4 (.72.4) port 12866 AF_INET : demo + Recv Send Send + Socket Socket Message Elapsed + Size Size Size Time Throughput + bytes bytes bytes secs. 10^6bits/sec + 87380 16384 16384 10.02 923.69 + +#. On the ``source`` node, run the following command to check latency: + + .. code:: + + $ netperf -H .72.4 -t TCP_RR -- -P 12866 + MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 12866 + AF_INET to .72.4 (.72.4) port 12866 AF_INET : demo + : first burst 0 + Local Remote Socket Size Request Resp. Elapsed Trans. + Send Recv Size Size Time Rate + bytes Bytes bytes bytes secs. 
per sec + 16384 87380 1 1 10.00 11753.37 + 16384 87380 + +Expected results +^^^^^^^^^^^^^^^^ + +Faults will show up as differences between different pairs of nodes. +However, for reference, here are some expected numbers: + +- For throughput, proxy to proxy, expect ~9300 Mbit/sec (proxies have + a 10Ge link). + +- For throughout, proxy to object, expect ~920 Mbit/sec (at time of + writing this, object nodes have a 1Ge link). + +- For throughput, object to object, expect ~920 Mbit/sec. + +- For latency (all types), expect ~11000 transactions/sec. + +Diagnose: Remapping sectors experiencing UREs +--------------------------------------------- + +#. Find the bad sector, device, and filesystem in ``kern.log``. + +#. Set the environment variables SEC, DEV & FS, for example: + + .. code:: + + SEC=2930954256 + DEV=/dev/sdi + FS=/dev/sdi1 + +#. Verify that the sector is bad: + + .. code:: + + sudo dd if=${DEV} of=/dev/null bs=512 count=1 skip=${SEC} + +#. If the sector is bad this command will output an input/output error: + + .. code:: + + dd: reading `/dev/sdi`: Input/output error + 0+0 records in + 0+0 records out + +#. Prevent chef from attempting to re-mount the filesystem while the + repair is in progress: + + .. code:: + + sudo mv /etc/chef/client.pem /etc/chef/xx-client.xx-pem + +#. Stop the swift and rsync service: + + .. code:: + + sudo service rsync stop + sudo swift-init shutdown all + +#. Unmount the problem drive: + + .. code:: + + sudo umount ${FS} + +#. Overwrite/remap the bad sector: + + .. code:: + + sudo dd_rescue -d -A -m8b -s ${SEC}b ${DEV} ${DEV} + +#. This command should report an input/output error the first time + it is run. Run the command a second time, if it successfully remapped + the bad sector it should not report an input/output error. + +#. Verify the sector is now readable: + + .. code:: + + sudo dd if=${DEV} of=/dev/null bs=512 count=1 skip=${SEC} + +#. If the sector is now readable this command should not report an + input/output error. + +#. If more than one problem sector is listed, set the SEC environment + variable to the next sector in the list: + + .. code:: + + SEC=123456789 + +#. Repeat from step 8. + +#. Repair the filesystem: + + .. code:: + + sudo xfs_repair ${FS} + +#. If ``xfs_repair`` reports that the filesystem has valuable filesystem + changes: + + .. code:: + + sudo xfs_repair ${FS} + Phase 1 - find and verify superblock... + Phase 2 - using internal log + - zero log... + ERROR: The filesystem has valuable metadata changes in a log which + needs to be replayed. + Mount the filesystem to replay the log, and unmount it before + re-running xfs_repair. + If you are unable to mount the filesystem, then use the -L option to + destroy the log and attempt a repair. Note that destroying the log may + cause corruption -- please attempt a mount of the filesystem before + doing this. + +#. You should attempt to mount the filesystem, and clear the lost+found + area: + + .. code:: + + sudo mount $FS /mnt + sudo rm -rf /mnt/lost+found/* + sudo umount /mnt + +#. If the filesystem fails to mount then you will need to use the + ``xfs_repair -L`` option to force log zeroing. + Repeat step 11. + +#. If ``xfs_repair`` reports that an additional input/output error has been + encountered, get the sector details as follows: + + .. code:: + + sudo grep "I/O error" /var/log/kern.log | grep sector | tail -1 + +#. If new input/output error is reported then set the SEC environment + variable to the problem sector number: + + .. code:: + + SEC=234567890 + +#. 
Repeat from step 8.
+
+
+#. Remount the filesystem and restart swift and rsync.
+
+   - If all UREs in the kern.log have been fixed and you are still unable
+     to repair the disk with ``xfs_repair``, it is possible that the UREs
+     have corrupted the filesystem or possibly destroyed the drive
+     altogether. In this case, the first step is to re-format the
+     filesystem and if this fails, get the disk replaced.
+
+
+Diagnose: High system latency
+-----------------------------
+
+.. note::
+
+   The latency measurements described here are specific to the HPE
+   Helion Public Cloud.
+
+Possible causes include:
+
+- A bad NIC on a proxy server. However, as explained above, this
+  usually causes the peak to rise, but the average should remain near
+  normal parameters. A quick fix is to shutdown the proxy.
+
+- A stuck memcache server. It accepts connections, but then will not respond.
+  Expect to see timeout messages in ``/var/log/proxy.log`` (port 11211).
+  Swift Diags will also report this as a failed node/port. A quick fix
+  is to shutdown the proxy server.
+
+- A bad/broken object server can also cause problems if the accounts
+  used by the monitor program happen to live on the bad object server.
+
+- A general network problem within the data center. Compare the results
+  with the Pingdom monitors to see if they also have a problem.
+
+Diagnose: Interface reports errors
+----------------------------------
+
+Should a network interface on a Swift node begin reporting network
+errors, it may well indicate a cable, switch, or network issue.
+
+Get an overview of the interface with:
+
+.. code::
+
+   sudo ifconfig eth{n}
+   sudo ethtool eth{n}
+
+The ``Link Detected:`` indicator will read ``yes`` if the nic is
+cabled.
+
+Establish the adapter type with:
+
+.. code::
+
+   sudo ethtool -i eth{n}
+
+Gather the interface statistics with:
+
+.. code::
+
+   sudo ethtool -S eth{n}
+
+If the nic supports self-test, this can be performed with:
+
+.. code::
+
+   sudo ethtool -t eth{n}
+
+Self tests should read ``PASS`` if the nic is operating correctly.
+
+Nic module drivers can be re-initialised by carefully removing and
+re-installing the modules. A case in point is the mellanox drivers on
+Swift Proxy servers, which use a two-part driver, mlx4_en and
+mlx4_core. To reload these you must carefully remove the mlx4_en
+(ethernet) then the mlx4_core modules, and reinstall them in the
+reverse order.
+
+As the interface will be disabled while the modules are unloaded, you
+must be very careful not to lock the interface out. The following
+script can be used to reload the mellanox drivers; as a side effect,
+this resets the error counts on the interface.
+
+
+Diagnose: CorruptDir diagnostic reports corrupt directories
+-----------------------------------------------------------
+
+From time to time Swift data structures may become corrupted by
+misplaced files in filesystem locations where swift would normally place
+a directory. This causes issues for swift when directory creation is
+attempted at said location; it may fail due to the pre-existing file. If
+the CorruptDir diagnostic reports corrupt directories, they should be
+checked to see if they exist.
+
+Checking existence of entries
+-----------------------------
+
+Swift data filesystems are located under the ``/srv/node/disk``
+mountpoints and contain accounts, containers and objects
+subdirectories which in turn contain partition number subdirectories.
+The partition number directories contain md5 hash subdirectories. md5
+hash directories contain md5sum subdirectories. md5sum directories
+contain the Swift data payload as either a database (.db), for
+accounts and containers, or a data file (.data) for objects.
+If the entries reported in diagnostics correspond to a partition
+number, md5 hash or md5sum directory, check the entry with
+``ls -ld *entry*``.
+If it turns out to be a file rather than a directory, it should be
+carefully removed.
+
+.. note::
+
+   Please do not ``ls`` the partition level directory contents, as
+   this, *especially for objects*, may take a lot of time and system
+   resources; if you need to check the contents, use:
+
+   .. code::
+
+      echo /srv/node/disk#/type/partition#/
+
+Diagnose: Hung swift object replicator
+--------------------------------------
+
+The swift diagnostic message ``Object replicator: remaining exceeds
+100hrs:`` may indicate that the swift ``object-replicator`` is stuck and not
+making progress. Another useful way to check this is with the
+``swift-recon -r`` command on a swift proxy server:
+
+.. code::
+
+   sudo swift-recon -r
+   ===============================================================================
+
+   --> Starting reconnaissance on 384 hosts
+   ===============================================================================
+   [2013-07-17 12:56:19] Checking on replication
+   http://.72.63:6000/recon/replication:
+   [replication_time] low: 2, high: 80, avg: 28.8, total: 11037, Failed: 0.0%, no_result: 0, reported: 383
+   Oldest completion was 2013-06-12 22:46:50 (12 days ago) by .31:6000.
+   Most recent completion was 2013-07-17 12:56:19 (5 seconds ago) by .204.113:6000.
+   ===============================================================================
+
+The ``Oldest completion`` line in this example indicates that the
+object-replicator on swift object server .31 has not completed
+the replication cycle in 12 days. This replicator is stuck. The object
+replicator cycle is generally less than 1 hour, though a replicator
+cycle of 15-20 hours can occur if nodes are added to the system and a
+new ring has been deployed.
+
+You can further check if the object replicator is stuck by logging on
+to the object server and checking the object replicator progress with
+the following command:
+
+.. 
code:: + + # sudo grep object-rep /var/log/swift/background.log | grep -e "Starting object replication" -e "Object replication complete" -e "partitions rep" + Jul 16 06:25:46 object-replicator 15344/16450 (93.28%) partitions replicated in 69018.48s (0.22/sec, 22h remaining) + Jul 16 06:30:46 object-replicator 15344/16450 (93.28%) partitions replicated in 69318.58s (0.22/sec, 22h remaining) + Jul 16 06:35:46 object-replicator 15344/16450 (93.28%) partitions replicated in 69618.63s (0.22/sec, 23h remaining) + Jul 16 06:40:46 object-replicator 15344/16450 (93.28%) partitions replicated in 69918.73s (0.22/sec, 23h remaining) + Jul 16 06:45:46 object-replicator 15348/16450 (93.30%) partitions replicated in 70218.75s (0.22/sec, 24h remaining) + Jul 16 06:50:47 object-replicator 15348/16450 (93.30%) partitions replicated in 70518.85s (0.22/sec, 24h remaining) + Jul 16 06:55:47 object-replicator 15348/16450 (93.30%) partitions replicated in 70818.95s (0.22/sec, 25h remaining) + Jul 16 07:00:47 object-replicator 15348/16450 (93.30%) partitions replicated in 71119.05s (0.22/sec, 25h remaining) + Jul 16 07:05:47 object-replicator 15348/16450 (93.30%) partitions replicated in 71419.15s (0.21/sec, 26h remaining) + Jul 16 07:10:47 object-replicator 15348/16450 (93.30%) partitions replicated in 71719.25s (0.21/sec, 26h remaining) + Jul 16 07:15:47 object-replicator 15348/16450 (93.30%) partitions replicated in 72019.27s (0.21/sec, 27h remaining) + Jul 16 07:20:47 object-replicator 15348/16450 (93.30%) partitions replicated in 72319.37s (0.21/sec, 27h remaining) + Jul 16 07:25:47 object-replicator 15348/16450 (93.30%) partitions replicated in 72619.47s (0.21/sec, 28h remaining) + Jul 16 07:30:47 object-replicator 15348/16450 (93.30%) partitions replicated in 72919.56s (0.21/sec, 28h remaining) + Jul 16 07:35:47 object-replicator 15348/16450 (93.30%) partitions replicated in 73219.67s (0.21/sec, 29h remaining) + Jul 16 07:40:47 object-replicator 15348/16450 (93.30%) partitions replicated in 73519.76s (0.21/sec, 29h remaining) + +The above status is output every 5 minutes to ``/var/log/swift/background.log``. + +.. note:: + + The 'remaining' time is increasing as time goes on, normally the + time remaining should be decreasing. Also note the partition number. For example, + 15344 remains the same for several status lines. Eventually the object + replicator detects the hang and attempts to make progress by killing the + problem thread. The replicator then progresses to the next partition but + quite often it again gets stuck on the same partition. + +One of the reasons for the object replicator hanging like this is +filesystem corruption on the drive. The following is a typical log entry +of a corrupted filesystem detected by the object replicator: + +.. code:: + + # sudo bzgrep "Remote I/O error" /var/log/swift/background.log* |grep srv | - tail -1 + Jul 12 03:33:30 object-replicator STDOUT: ERROR:root:Error hashing suffix#012Traceback (most recent call last):#012 File + "/usr/lib/python2.7/dist-packages/swift/obj/replicator.py", line 199, in get_hashes#012 hashes[suffix] = hash_suffix(suffix_dir, + reclaim_age)#012 File "/usr/lib/python2.7/dist-packages/swift/obj/replicator.py", line 84, in hash_suffix#012 path_contents = + sorted(os.listdir(path))#012OSError: [Errno 121] Remote I/O error: '/srv/node/disk4/objects/1643763/b51' + +An ``ls`` of the problem file or directory usually shows something like the following: + +.. 
code:: + + # ls -l /srv/node/disk4/objects/1643763/b51 + ls: cannot access /srv/node/disk4/objects/1643763/b51: Remote I/O error + +If no entry with ``Remote I/O error`` occurs in the ``background.log`` it is +not possible to determine why the object-replicator is hung. It may be +that the ``Remote I/O error`` entry is older than 7 days and so has been +rotated out of the logs. In this scenario it may be best to simply +restart the object-replicator. + +#. Stop the object-replicator: + + .. code:: + + # sudo swift-init object-replicator stop + +#. Make sure the object replicator has stopped, if it has hung, the stop + command will not stop the hung process: + + .. code:: + + # ps auxww | - grep swift-object-replicator + +#. If the previous ps shows the object-replicator is still running, kill + the process: + + .. code:: + + # kill -9 + +#. Start the object-replicator: + + .. code:: + + # sudo swift-init object-replicator start + +If the above grep did find an ``Remote I/O error`` then it may be possible +to repair the problem filesystem. + +#. Stop swift and rsync: + + .. code:: + + # sudo swift-init all shutdown + # sudo service rsync stop + +#. Make sure all swift process have stopped: + + .. code:: + + # ps auxww | grep swift | grep python + +#. Kill any swift processes still running. + +#. Unmount the problem filesystem: + + .. code:: + + # sudo umount /srv/node/disk4 + +#. Repair the filesystem: + + .. code:: + + # sudo xfs_repair -P /dev/sde1 + +#. If the ``xfs_repair`` fails then it may be necessary to re-format the + filesystem. See Procedure: fix broken XFS filesystem. If the + ``xfs_repair`` is successful, re-enable chef using the following command + and replication should commence again. + + +Diagnose: High CPU load +----------------------- + +The CPU load average on an object server, as shown with the +'uptime' command, is typically under 10 when the server is +lightly-moderately loaded: + +.. code:: + + $ uptime + 07:59:26 up 99 days, 5:57, 1 user, load average: 8.59, 8.39, 8.32 + +During times of increased activity, due to user transactions or object +replication, the CPU load average can increase to to around 30. + +However, sometimes the CPU load average can increase significantly. The +following is an example of an object server that has extremely high CPU +load: + +.. code:: + + $ uptime + 07:44:02 up 18:22, 1 user, load average: 407.12, 406.36, 404.59 + +.. toctree:: + :maxdepth: 2 + + sec-furtherdiagnose.rst diff --git a/doc/source/ops_runbook/general.rst b/doc/source/ops_runbook/general.rst new file mode 100644 index 0000000000..60d19badee --- /dev/null +++ b/doc/source/ops_runbook/general.rst @@ -0,0 +1,36 @@ +================== +General Procedures +================== + +Getting a swift account stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + ``swift-direct`` is specific to the HPE Helion Public Cloud. Go look at + ``swifty`` for an alternate, this is an example. + +This procedure describes how you determine the swift usage for a given +swift account, that is the number of containers, number of objects and +total bytes used. To do this you will need the project ID. + +Log onto one of the swift proxy servers. + +Use swift-direct to show this accounts usage: + +.. 
code::
+
+   $ sudo -u swift /opt/hp/swift/bin/swift-direct show AUTH_redacted-9a11-45f8-aa1c-9e7b1c7904c8
+   Status: 200
+   Content-Length: 0
+   Accept-Ranges: bytes
+   X-Timestamp: 1379698586.88364
+   X-Account-Bytes-Used: 67440225625994
+   X-Account-Container-Count: 1
+   Content-Type: text/plain; charset=utf-8
+   X-Account-Object-Count: 8436776
+   Status: 200
+   name: my_container count: 8436776 bytes: 67440225625994
+
+This account has 1 container. That container has 8436776 objects. The
+total bytes used is 67440225625994.
\ No newline at end of file
diff --git a/doc/source/ops_runbook/index.rst b/doc/source/ops_runbook/index.rst
new file mode 100644
index 0000000000..6fdb9c8c90
--- /dev/null
+++ b/doc/source/ops_runbook/index.rst
@@ -0,0 +1,79 @@
+=================
+Swift Ops Runbook
+=================
+
+This document contains operational procedures that Hewlett Packard Enterprise (HPE) uses to operate
+and monitor the Swift system within the HPE Helion Public Cloud. This
+document is an excerpt of a larger product-specific handbook. As such,
+the material may appear incomplete. The suggestions and recommendations
+made in this document are for our particular environment, and may not be
+suitable for your environment or situation. We make no representations
+concerning the accuracy, adequacy, completeness or suitability of the
+information, suggestions or recommendations. This document is provided
+for reference only. We are not responsible for your use of any
+information, suggestions or recommendations contained herein.
+
+This document also contains references to certain tools that we use to
+operate the Swift system within the HPE Helion Public Cloud.
+Descriptions of these tools are provided for reference only, as the tools themselves
+are not publicly available at this time.
+
+- ``swift-direct``: This is similar to the ``swiftly`` tool.
+
+
+.. toctree::
+   :maxdepth: 2
+
+   general.rst
+   diagnose.rst
+   procedures.rst
+   maintenance.rst
+   troubleshooting.rst
+
+Is the system up?
+~~~~~~~~~~~~~~~~~
+
+If you have a report that Swift is down, perform the following basic checks:
+
+#. Run swift functional tests.
+
+#. From a server in your data center, use ``curl`` to check ``/healthcheck``.
+
+#. If you have a monitoring system, check your monitoring system.
+
+#. Check on your hardware load balancer infrastructure.
+
+#. Run swift-recon on a proxy node.
+
+Run swift functional tests
+--------------------------
+
+We recommend that you set up your functional tests to run against your
+production system.
+
+A script for running the functional tests is located in ``swift/.functests``.
+
+
+External monitoring
+-------------------
+
+- We use pingdom.com to monitor the external Swift API. We suggest the
+  following:
+
+  - Do a GET on ``/healthcheck``
+
+  - Create a container, make it public (x-container-read:
+    .r\*,.rlistings), create a small file in the container; do a GET
+    on the object
+
+Reference information
+~~~~~~~~~~~~~~~~~~~~~
+
+Reference: Swift startup/shutdown
+---------------------------------
+
+- Use reload - not stop/start/restart.
+
+- Try to roll sets of servers (especially proxy) in groups of less
+  than 20% of your servers.
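+
+For example, a rolling restart of the proxy tier would reload each
+proxy in turn rather than stopping it (a sketch; substitute the name of
+the service you are rolling):
+
+.. code::
+
+   sudo swift-init proxy-server reload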
+ diff --git a/doc/source/ops_runbook/maintenance.rst b/doc/source/ops_runbook/maintenance.rst new file mode 100644 index 0000000000..b3c9e582ac --- /dev/null +++ b/doc/source/ops_runbook/maintenance.rst @@ -0,0 +1,322 @@ +================== +Server maintenance +================== + +General assumptions +~~~~~~~~~~~~~~~~~~~ + +- It is assumed that anyone attempting to replace hardware components + will have already read and understood the appropriate maintenance and + service guides. + +- It is assumed that where servers need to be taken off-line for + hardware replacement, that this will be done in series, bringing the + server back on-line before taking the next off-line. + +- It is assumed that the operations directed procedure will be used for + identifying hardware for replacement. + +Assessing the health of swift +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can run the swift-recon tool on a Swift proxy node to get a quick +check of how Swift is doing. Please note that the numbers below are +necessarily somewhat subjective. Sometimes parameters for which we +say 'low values are good' will have pretty high values for a time. Often +if you wait a while things get better. + +For example: + +.. code:: + + sudo swift-recon -rla + =============================================================================== + [2012-03-10 12:57:21] Checking async pendings on 384 hosts... + Async stats: low: 0, high: 1, avg: 0, total: 1 + =============================================================================== + + [2012-03-10 12:57:22] Checking replication times on 384 hosts... + [Replication Times] shortest: 1.4113877813, longest: 36.8293570836, avg: 4.86278064749 + =============================================================================== + + [2012-03-10 12:57:22] Checking load avg's on 384 hosts... + [5m load average] lowest: 2.22, highest: 9.5, avg: 4.59578125 + [15m load average] lowest: 2.36, highest: 9.45, avg: 4.62622395833 + [1m load average] lowest: 1.84, highest: 9.57, avg: 4.5696875 + =============================================================================== + +In the example above we ask for information on replication times (-r), +load averages (-l) and async pendings (-a). This is a healthy Swift +system. Rules-of-thumb for 'good' recon output are: + +- Nodes that respond are up and running Swift. If all nodes respond, + that is a good sign. But some nodes may time out. For example: + + .. code:: + + \-> [http://.29:6000/recon/load:] + \-> [http://.31:6000/recon/load:] + +- That could be okay or could require investigation. + +- Low values (say < 10 for high and average) for async pendings are + good. Higher values occur when disks are down and/or when the system + is heavily loaded. Many simultaneous PUTs to the same container can + drive async pendings up. This may be normal, and may resolve itself + after a while. If it persists, one way to track down the problem is + to find a node with high async pendings (with ``swift-recon -av | sort + -n -k4``), then check its Swift logs, Often async pendings are high + because a node cannot write to a container on another node. Often + this is because the node or disk is offline or bad. This may be okay + if we know about it. + +- Low values for replication times are good. These values rise when new + rings are pushed, and when nodes and devices are brought back on + line. + +- Our 'high' load average values are typically in the 9-15 range. If + they are a lot bigger it is worth having a look at the systems + pushing the average up. 
Run ``swift-recon -av`` to get the individual + averages. To sort the entries with the highest at the end, + run ``swift-recon -av | sort -n -k4``. + +For comparison here is the recon output for the same system above when +two entire racks of Swift are down: + +.. code:: + + [2012-03-10 16:56:33] Checking async pendings on 384 hosts... + -> http://.22:6000/recon/async: + -> http://.18:6000/recon/async: + -> http://.16:6000/recon/async: + -> http://.13:6000/recon/async: + -> http://.30:6000/recon/async: + -> http://.6:6000/recon/async: + ......... + -> http://.5:6000/recon/async: + -> http://.15:6000/recon/async: + -> http://.9:6000/recon/async: + -> http://.27:6000/recon/async: + -> http://.4:6000/recon/async: + -> http://.8:6000/recon/async: + Async stats: low: 243, high: 659, avg: 413, total: 132275 + =============================================================================== + [2012-03-10 16:57:48] Checking replication times on 384 hosts... + -> http://.22:6000/recon/replication: + -> http://.18:6000/recon/replication: + -> http://.16:6000/recon/replication: + -> http://.13:6000/recon/replication: + -> http://.30:6000/recon/replication: + -> http://.6:6000/recon/replication: + ............ + -> http://.5:6000/recon/replication: + -> http://.15:6000/recon/replication: + -> http://.9:6000/recon/replication: + -> http://.27:6000/recon/replication: + -> http://.4:6000/recon/replication: + -> http://.8:6000/recon/replication: + [Replication Times] shortest: 1.38144306739, longest: 112.620954418, avg: 10.285 + 9475361 + =============================================================================== + [2012-03-10 16:59:03] Checking load avg's on 384 hosts... + -> http://.22:6000/recon/load: + -> http://.18:6000/recon/load: + -> http://.16:6000/recon/load: + -> http://.13:6000/recon/load: + -> http://.30:6000/recon/load: + -> http://.6:6000/recon/load: + ............ + -> http://.15:6000/recon/load: + -> http://.9:6000/recon/load: + -> http://.27:6000/recon/load: + -> http://.4:6000/recon/load: + -> http://.8:6000/recon/load: + [5m load average] lowest: 1.71, highest: 4.91, avg: 2.486375 + [15m load average] lowest: 1.79, highest: 5.04, avg: 2.506125 + [1m load average] lowest: 1.46, highest: 4.55, avg: 2.4929375 + =============================================================================== + +.. note:: + + The replication times and load averages are within reasonable + parameters, even with 80 object stores down. Async pendings, however is + quite high. This is due to the fact that the containers on the servers + which are down cannot be updated. When those servers come back up, async + pendings should drop. If async pendings were at this level without an + explanation, we have a problem. + +Recon examples +~~~~~~~~~~~~~~ + +Here is an example of noting and tracking down a problem with recon. + +Running reccon shows some async pendings: + +.. code:: + + bob@notso:~/swift-1.4.4/swift$ ssh \\-q .132.7 sudo swift-recon \\-alr + =============================================================================== + \[2012-03-14 17:25:55\\] Checking async pendings on 384 hosts... + Async stats: low: 0, high: 23, avg: 8, total: 3356 + =============================================================================== + \[2012-03-14 17:25:55\\] Checking replication times on 384 hosts... 
+ \[Replication Times\\] shortest: 1.49303831657, longest: 39.6982825994, avg: 4.2418222066 + =============================================================================== + \[2012-03-14 17:25:56\\] Checking load avg's on 384 hosts... + \[5m load average\\] lowest: 2.35, highest: 8.88, avg: 4.45911458333 + \[15m load average\\] lowest: 2.41, highest: 9.11, avg: 4.504765625 + \[1m load average\\] lowest: 1.95, highest: 8.56, avg: 4.40588541667 + =============================================================================== + +Why? Running recon again with -av swift (not shown here) tells us that +the node with the highest (23) is .72.61. Looking at the log +files on .72.61 we see: + +.. code:: + + souzab@:~$ sudo tail -f /var/log/swift/background.log | - grep -i ERROR + Mar 14 17:28:06 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6001} + Mar 14 17:28:06 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6001} + Mar 14 17:28:09 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:11 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:13 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6001} + Mar 14 17:28:13 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6001} + Mar 14 17:28:15 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:15 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:19 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:19 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:20 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.119', 'id': 5481, 'meta': '', 'device': 'disk6', 'port': 6001} + Mar 14 17:28:21 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:21 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + Mar 14 17:28:22 container-replicator ERROR Remote drive not mounted + {'zone': 5, 'weight': 1952.0, 'ip': '.204.20', 'id': 2311, 'meta': '', 'device': 'disk5', 'port': 6001} + +That is why this node has a lot of async pendings: a bunch of disks that +are not mounted on and . There may be other issues, +but clearing this up will likely drop the async pendings a fair bit, as +other nodes will be having the same problem. 
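+
+A quick way to see which remote drives are being reported, based on the
+log format shown above (a sketch; adjust the path and pattern to match
+your deployment):
+
+.. code::
+
+   sudo grep "ERROR Remote drive not mounted" /var/log/swift/background.log | grep -o "'ip': '[^']*'.*'device': '[^']*'" | sort | uniq -c | sort -rn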
+
+Assessing the availability risk when multiple storage servers are down
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+   This procedure will tell you if you have a problem; however, in practice
+   you will find that you will not use this procedure frequently.
+
+If three storage nodes (or, more precisely, three disks on three
+different storage nodes) are down, there is a small but nonzero
+probability that user objects, containers, or accounts will not be
+available.
+
+Procedure
+---------
+
+.. note::
+
+   Swift has three rings: one each for objects, containers and accounts.
+   This procedure should be run three times, each time specifying the
+   appropriate ``*.builder`` file.
+
+#. Determine whether all three nodes are in different Swift zones by
+   running the ring builder on a proxy node to determine which zones
+   the storage nodes are in. For example:
+
+   .. code::
+
+      % sudo swift-ring-builder /etc/swift/object.builder
+      /etc/swift/object.builder, build version 1467
+      2097152 partitions, 3 replicas, 5 zones, 1320 devices, 0.02 balance
+      The minimum number of hours before a partition can be reassigned is 24
+      Devices: id zone ip address port name weight partitions balance meta
+      0 1 .4 6000 disk0 1708.00 4259 -0.00
+      1 1 .4 6000 disk1 1708.00 4260 0.02
+      2 1 .4 6000 disk2 1952.00 4868 0.01
+      3 1 .4 6000 disk3 1952.00 4868 0.01
+      4 1 .4 6000 disk4 1952.00 4867 -0.01
+
+#. Here, node .4 is in zone 1. If two or more of the three
+   nodes under consideration are in the same Swift zone, they do not
+   have any ring partitions in common; there is little/no data
+   availability risk if all three nodes are down.
+
+#. If the nodes are in three distinct Swift zones, it is necessary to
+   determine whether the nodes have ring partitions in common. Run
+   ``swift-ring-builder`` again, this time with the ``list_parts`` option,
+   and specify the nodes under consideration. For example (all on one line):
+
+   .. code::
+
+      % sudo swift-ring-builder /etc/swift/object.builder list_parts .8 .15 .72.2
+      Partition Matches
+      91 2
+      729 2
+      3754 2
+      3769 2
+      3947 2
+      5818 2
+      7918 2
+      8733 2
+      9509 2
+      10233 2
+
+#. The ``list_parts`` option to the ring builder indicates how many ring
+   partitions the nodes have in common. If, as in this case, the
+   first entry in the list has a ‘Matches’ column of 2 or less, there
+   is no data availability risk if all three nodes are down.
+
+#. If the ‘Matches’ column has entries equal to 3, there is some data
+   availability risk if all three nodes are down. The risk is generally
+   small, and is proportional to the number of entries that have a 3 in
+   the Matches column. For example:
+
+   .. code::
+
+      Partition Matches
+      26865 3
+      362367 3
+      745940 3
+      778715 3
+      797559 3
+      820295 3
+      822118 3
+      839603 3
+      852332 3
+      855965 3
+      858016 3
+
+#. A quick way to count the number of rows with 3 matches is:
+
+   .. code::
+
+      % sudo swift-ring-builder /etc/swift/object.builder list_parts .8 .15 .72.2 | grep "3$" | wc -l
+
+      30
+
+#. In this case the nodes have 30 out of a total of 2097152 partitions
+   in common; about 0.001%. The risk is small but nonzero.
+   Recall that a partition is simply a portion of the ring mapping
+   space, not actual data. So having partitions in common is a necessary
+   but not sufficient condition for data unavailability.
+
+   .. note::
+
+      We should not bring down a node for repair if it shows
+      Matches entries of 3 with other nodes that are also down.
+#. In this case the nodes have 30 out of a total of 2097152 partitions
+   in common; about 0.001%. In this case the risk is small but nonzero.
+   Recall that a partition is simply a portion of the ring mapping
+   space, not actual data. So having partitions in common is a necessary
+   but not sufficient condition for data unavailability.
+
+   .. note::
+
+      We should not bring down a node for repair if it shows
+      Matches entries of 3 with other nodes that are also down.
+
+      If three nodes that have 3 partitions in common are all down, there is
+      a nonzero probability that data are unavailable and we should work to
+      bring some or all of the nodes up ASAP.
diff --git a/doc/source/ops_runbook/procedures.rst b/doc/source/ops_runbook/procedures.rst
new file mode 100644
index 0000000000..899df6d694
--- /dev/null
+++ b/doc/source/ops_runbook/procedures.rst
@@ -0,0 +1,367 @@
+=================================
+Software configuration procedures
+=================================
+
+Fix broken GPT table (broken disk partition)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- If a GPT table is broken, a message like the following is observed when
+  the following command is run:
+
+  .. code::
+
+     $ sudo parted -l
+
+  .. code::
+
+     ...
+     Error: The backup GPT table is corrupt, but the primary appears OK, so that will
+     be used.
+     OK/Cancel?
+
+#. To fix this, first install the ``gdisk`` program:
+
+   .. code::
+
+      $ sudo aptitude install gdisk
+
+#. Run ``gdisk`` for the particular drive with the damaged partition:
+
+   .. code::
+
+      $ sudo gdisk /dev/sd*a-l*
+      GPT fdisk (gdisk) version 0.6.14
+
+      Caution: invalid backup GPT header, but valid main header; regenerating
+      backup header from main header.
+
+      Warning! One or more CRCs don't match. You should repair the disk!
+
+      Partition table scan:
+        MBR: protective
+        BSD: not present
+        APM: not present
+        GPT: damaged
+      /dev/sd
+      *****************************************************************************
+      Caution: Found protective or hybrid MBR and corrupt GPT. Using GPT, but disk
+      verification and recovery are STRONGLY recommended.
+      *****************************************************************************
+
+#. At the command prompt, type ``r`` (recovery and transformation
+   options), followed by ``d`` (use main GPT header), ``v`` (verify disk)
+   and finally ``w`` (write table to disk and exit). You will also need to
+   enter ``Y`` when prompted in order to confirm actions.
+
+   .. code::
+
+      Command (? for help): r
+
+      Recovery/transformation command (? for help): d
+
+      Recovery/transformation command (? for help): v
+
+      Caution: The CRC for the backup partition table is invalid. This table may
+      be corrupt. This program will automatically create a new backup partition
+      table when you save your partitions.
+
+      Caution: Partition 1 doesn't begin on a 8-sector boundary. This may
+      result in degraded performance on some modern (2009 and later) hard disks.
+
+      Caution: Partition 2 doesn't begin on a 8-sector boundary. This may
+      result in degraded performance on some modern (2009 and later) hard disks.
+
+      Caution: Partition 3 doesn't begin on a 8-sector boundary. This may
+      result in degraded performance on some modern (2009 and later) hard disks.
+
+      Identified 1 problems!
+
+      Recovery/transformation command (? for help): w
+
+      Final checks complete. About to write GPT data. THIS WILL OVERWRITE EXISTING
+      PARTITIONS!!
+
+      Do you want to proceed, possibly destroying your data? (Y/N): Y
+
+      OK; writing new GUID partition table (GPT).
+      The operation has completed successfully.
+
+#. Running the following command should now show that the partition is
+   recovered and healthy again:
+
+   .. code::
+
+      $ sudo parted /dev/sd#
+
+#. Finally, uninstall ``gdisk`` from the node:
+
+   .. code::
+
+      $ sudo aptitude remove gdisk
+
+Procedure: Fix broken XFS filesystem
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. A filesystem may be corrupt or broken if the following output is
+   observed when checking its label:
+
+   .. code::
+
+      $ sudo xfs_admin -l /dev/sd#
+      cache_node_purge: refcount was 1, not zero (node=0x25d5ee0)
+      xfs_admin: cannot read root inode (117)
+      cache_node_purge: refcount was 1, not zero (node=0x25d92b0)
+      xfs_admin: cannot read realtime bitmap inode (117)
+      bad sb magic # 0 in AG 1
+      failed to read label in AG 1
+
+#. Run the following commands to remove the broken/corrupt filesystem and
+   replace it. (This example uses the filesystem ``/dev/sdb2``.) First,
+   replace the partition:
+
+   .. code::
+
+      $ sudo parted
+      GNU Parted 2.3
+      Using /dev/sda
+      Welcome to GNU Parted! Type 'help' to view a list of commands.
+      (parted) select /dev/sdb
+      Using /dev/sdb
+      (parted) p
+      Model: HP LOGICAL VOLUME (scsi)
+      Disk /dev/sdb: 2000GB
+      Sector size (logical/physical): 512B/512B
+      Partition Table: gpt
+
+      Number  Start   End     Size    File system  Name                       Flags
+       1      17.4kB  1024MB  1024MB  ext3                                    boot
+       2      1024MB  1751GB  1750GB  xfs          sw-aw2az1-object045-disk1
+       3      1751GB  2000GB  249GB                                           lvm
+
+      (parted) rm 2
+      (parted) mkpart primary 2 -1
+      Warning: You requested a partition from 2000kB to 2000GB.
+      The closest location we can manage is 1024MB to 1751GB.
+      Is this still acceptable to you?
+      Yes/No? Yes
+      Warning: The resulting partition is not properly aligned for best performance.
+      Ignore/Cancel? Ignore
+      (parted) p
+      Model: HP LOGICAL VOLUME (scsi)
+      Disk /dev/sdb: 2000GB
+      Sector size (logical/physical): 512B/512B
+      Partition Table: gpt
+
+      Number  Start   End     Size    File system  Name     Flags
+       1      17.4kB  1024MB  1024MB  ext3                  boot
+       2      1024MB  1751GB  1750GB  xfs          primary
+       3      1751GB  2000GB  249GB                         lvm
+
+      (parted) quit
+
+#. The next step is to scrub the filesystem and format it:
+
+   .. code::
+
+      $ sudo dd if=/dev/zero of=/dev/sdb2 bs=$((1024*1024)) count=1
+      1+0 records in
+      1+0 records out
+      1048576 bytes (1.0 MB) copied, 0.00480617 s, 218 MB/s
+      $ sudo /sbin/mkfs.xfs -f -i size=1024 /dev/sdb2
+      meta-data=/dev/sdb2              isize=1024   agcount=4, agsize=106811524 blks
+               =                       sectsz=512   attr=2, projid32bit=0
+      data     =                       bsize=4096   blocks=427246093, imaxpct=5
+               =                       sunit=0      swidth=0 blks
+      naming   =version 2              bsize=4096   ascii-ci=0
+      log      =internal log           bsize=4096   blocks=208616, version=2
+               =                       sectsz=512   sunit=0 blks, lazy-count=1
+      realtime =none                   extsz=4096   blocks=0, rtextents=0
+
+#. You should now label and mount your filesystem.
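+   For example (a sketch only; the label ``disk1`` and mount point
+   ``/srv/node/disk1`` are placeholders, so use the label, mount point and
+   mount options that your deployment normally applies to this device,
+   typically via its ``/etc/fstab`` entry):
+
+   .. code::
+
+      $ sudo xfs_admin -L disk1 /dev/sdb2
+      $ sudo mount -t xfs -o noatime /dev/sdb2 /srv/node/disk1
+      $ sudo chown swift:swift /srv/node/disk1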
+#. You can now check whether the filesystem is mounted using the command:
+
+   .. code::
+
+      $ mount
+
+Procedure: Checking if an account is okay
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+   ``swift-direct`` is only available in the HPE Helion Public Cloud.
+   Use ``swiftly`` as an alternative.
+
+If you have a tenant ID you can check that the account is okay as follows,
+from a proxy:
+
+.. code::
+
+   $ sudo -u swift /opt/hp/swift/bin/swift-direct show
+
+The response will either be similar to a swift list of the account
+containers, or an error indicating that the resource could not be found.
+
+In the latter case you can establish whether a backend database exists for
+the tenant ID by running the following on a proxy:
+
+.. code::
+
+   $ sudo -u swift swift-get-nodes /etc/swift/account.ring.gz
+
+The response will list ssh commands that will list the replicated
+account databases, if they exist.
+
+Procedure: Revive a deleted account
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Swift accounts are normally not recreated. If a tenant unsubscribes from
+Swift, the account is deleted. To re-subscribe to Swift, you can create
+a new tenant (new tenant ID), and subscribe to Swift. This creates a
+new Swift account with the new tenant ID.
+
+However, until the unsubscribe/new tenant process is supported, you may
+hit a situation where a Swift account is deleted and the user is locked
+out of Swift.
+
+Deleting the account database files
+-----------------------------------
+
+Here is one possible solution. The containers and objects may be lost
+forever. The solution is to delete the account database files and
+re-create the account. This may only be done once the containers and
+objects are completely deleted. This process is untested, but could
+work as follows:
+
+#. Use swift-get-nodes to locate the account's database file (on three
+   servers).
+
+#. Rename the database files (on three servers).
+
+#. Use ``swiftly`` to create the account (use the original name).
+
+Renaming account database so it can be revived
+----------------------------------------------
+
+Get the locations of the database files that hold the account data:
+
+.. code::
+
+   $ sudo swift-get-nodes /etc/swift/account.ring.gz AUTH_redacted-1856-44ae-97db-31242f7ad7a1
+
+   Account  AUTH_redacted-1856-44ae-97db-31242f7ad7a1
+   Container  None
+   Object  None
+
+   Partition 18914
+
+   Hash 93c41ef56dd69173a9524193ab813e78
+
+   Server:Port Device  15.184.9.126:6002 disk7
+   Server:Port Device  15.184.9.94:6002 disk11
+   Server:Port Device  15.184.9.103:6002 disk10
+   Server:Port Device  15.184.9.80:6002 disk2   [Handoff]
+   Server:Port Device  15.184.9.120:6002 disk2  [Handoff]
+   Server:Port Device  15.184.9.98:6002 disk2   [Handoff]
+
+   curl -I -XHEAD "http://15.184.9.126:6002/disk7/18914/AUTH_redacted-1856-44ae-97db-31242f7ad7a1"
+   curl -I -XHEAD "http://15.184.9.94:6002/disk11/18914/AUTH_redacted-1856-44ae-97db-31242f7ad7a1"
+   curl -I -XHEAD "http://15.184.9.103:6002/disk10/18914/AUTH_redacted-1856-44ae-97db-31242f7ad7a1"
+   curl -I -XHEAD "http://15.184.9.80:6002/disk2/18914/AUTH_redacted-1856-44ae-97db-31242f7ad7a1"   # [Handoff]
+   curl -I -XHEAD "http://15.184.9.120:6002/disk2/18914/AUTH_redacted-1856-44ae-97db-31242f7ad7a1"  # [Handoff]
+   curl -I -XHEAD "http://15.184.9.98:6002/disk2/18914/AUTH_redacted-1856-44ae-97db-31242f7ad7a1"   # [Handoff]
+
+   ssh 15.184.9.126 "ls -lah /srv/node/disk7/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/"
+   ssh 15.184.9.94 "ls -lah /srv/node/disk11/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/"
+   ssh 15.184.9.103 "ls -lah /srv/node/disk10/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/"
+   ssh 15.184.9.80 "ls -lah /srv/node/disk2/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/"   # [Handoff]
+   ssh 15.184.9.120 "ls -lah /srv/node/disk2/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/"  # [Handoff]
+   ssh 15.184.9.98 "ls -lah /srv/node/disk2/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/"   # [Handoff]
+
+Check that the handoff nodes do not have account databases:
+
+.. code::
+
+   $ ssh 15.184.9.80 "ls -lah /srv/node/disk2/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/"
+   ls: cannot access /srv/node/disk2/accounts/18914/e78/93c41ef56dd69173a9524193ab813e78/: No such file or directory
+
+If the handoff node has a database, wait for rebalancing to occur.
+
+Procedure: Temporarily stop load balancers from directing traffic to a proxy server
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can stop the load balancers sending requests to a proxy server as
+follows. This can be useful when a proxy is misbehaving but you need
+Swift running to help diagnose the problem. By removing the proxy from the
+load balancers, customers are not impacted by the misbehaving proxy.
+
+#. Ensure that in ``proxy-server.conf`` the ``disable_path`` variable is set to
+   ``/etc/swift/disabled-by-file``.
+
+#. Log onto the proxy node.
+
+#. Shut down Swift as follows:
+
+   .. code::
+
+      sudo swift-init proxy shutdown
+
+   .. note::
+
+      Shutdown, not stop.
+
+#. Create the ``/etc/swift/disabled-by-file`` file. For example:
+
+   .. code::
+
+      sudo touch /etc/swift/disabled-by-file
+
+#. Optionally, restart Swift:
+
+   .. code::
+
+      sudo swift-init proxy start
+
+This works because the healthcheck middleware looks for this file. If it
+finds it, it returns a 503 error instead of 200/OK, and the load balancer
+should stop sending traffic to the proxy.
+
+``/healthcheck`` will report
+``FAIL: disabled by file`` if the ``disabled-by-file`` file exists.
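+One way to confirm that the proxy now reports itself as unhealthy is to
+query the healthcheck endpoint directly from another host. This is only a
+sketch: ``<proxy-ip>`` is a placeholder and the port ``8080`` is an
+assumption, so use your proxy's actual bind address and port:
+
+.. code::
+
+   $ curl -i http://<proxy-ip>:8080/healthcheck
+   # expect a 503 response while /etc/swift/disabled-by-file exists,
+   # and 200 OK again after the file is removed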
+
+Procedure: Ad-Hoc disk performance test
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can get an idea of whether a disk drive is performing as expected as follows:
+
+.. code::
+
+   sudo dd bs=1M count=256 if=/dev/zero conv=fdatasync of=/srv/node/disk11/remember-to-delete-this-later
+
+You can expect ~600MB/sec. If you get a low number, repeat the test many
+times, as Swift itself may also be reading from or writing to the disk,
+which can give a lower number.
diff --git a/doc/source/ops_runbook/sec-furtherdiagnose.rst b/doc/source/ops_runbook/sec-furtherdiagnose.rst
new file mode 100644
index 0000000000..dd8154a3d9
--- /dev/null
+++ b/doc/source/ops_runbook/sec-furtherdiagnose.rst
@@ -0,0 +1,177 @@
+==============================
+Further issues and resolutions
+==============================
+
+.. note::
+
+   The urgency levels in each **Action** column indicate whether you need
+   to take immediate action, or whether the problem can be worked on
+   during business hours.
+
+.. list-table::
+   :widths: 33 33 33
+   :header-rows: 1
+
+   * - **Scenario**
+     - **Description**
+     - **Action**
+   * - ``/healthcheck`` latency is high.
+     - The ``/healthcheck`` test does not tax the proxy very much, so any increase in latency is probably related to
+       network issues, rather than the proxies being very busy. A very slow proxy might impact the average
+       number, but it would need to be very slow to shift the number that much.
+     - Check networks. Run ``curl https://<ip-address>/healthcheck``, where ``<ip-address>`` is an individual proxy
+       IP address, to see if you can pinpoint a problem in the network.
+
+       Urgency: If there are other indications that your system is slow, you should treat
+       this as an urgent problem.
+   * - Swift process is not running.
+     - You can use ``swift-init status`` to check if swift processes are running on any
+       given server.
+     - Run this command:
+
+       .. code::
+
+          sudo swift-init all start
+
+       Examine messages in the swift log files to see if there are any
+       error messages related to any of the swift processes since the time you
+       ran the ``swift-init`` command.
+
+       Take any corrective actions that seem necessary.
+
+       Urgency: If this only affects one server, and you have more than one,
+       identifying and fixing the problem can wait until business hours.
+       If this same problem affects many servers, then you need to take corrective
+       action immediately.
+   * - ntpd is not running.
+     - NTP is not running.
+     - Configure and start NTP.
+
+       Urgency: For proxy servers, this is vital.
+
+   * - Host clock is not synced to an NTP server.
+     - The node's time settings do not match the NTP server time.
+       This may take some time to sync after a reboot.
+     - Assuming NTP is configured and running, you have to wait until the times sync.
+   * - A swift process has hundreds to thousands of open file descriptors.
+     - May happen to any of the swift processes.
+       Known to have happened with an ``rsyslogd`` restart and where ``/tmp`` was hanging.
+
+     - Restart the swift processes on the affected node:
+
+       .. code::
+
+          % sudo swift-init all reload
+
+       Urgency:
+       If known performance problem: Immediate
+
+       If system seems fine: Medium
+   * - A swift process is not owned by the swift user.
+     - If the UID of the swift user has changed, then the processes might not be
+       owned by that UID.
+     - Urgency: If this only affects one server, and you have more than one,
+       identifying and fixing the problem can wait until business hours.
+       If this same problem affects many servers, then you need to take corrective
+       action immediately.
+   * - Object, account or container files are not owned by swift.
+     - This typically happens if, during a reinstall or a re-image of a server, the UID
+       of the swift user was changed. The data files in the object, account and container
+       directories are owned by the original swift UID. As a result, the current swift
+       user does not own these files.
+     - Correct the UID of the swift user to reflect that of the original UID. An alternate
+       action is to change the ownership of every file on all file systems. This alternate
+       action is often impractical and will take considerable time.
+
+       Urgency: If this only affects one server, and you have more than one,
+       identifying and fixing the problem can wait until business hours.
+       If this same problem affects many servers, then you need to take corrective
+       action immediately.
+   * - A disk drive has a high IO wait or service time.
+     - If high IO wait times are seen for a single disk, then the disk drive is the problem.
+       If most/all devices are slow, the controller is probably the source of the problem.
+       The controller cache may also be misconfigured, which will cause similar long
+       wait or service times.
+     - As a first step, if your controllers have a cache, check that it is enabled and that its battery/capacitor
+       is working.
+
+       Second, reboot the server.
+       If the problem persists, file a DC ticket to have the drive or controller replaced.
+       See `Diagnose: Slow disk devices` for how to check the drive wait or service times.
+
+       Urgency: Medium
+   * - The network interface is not up.
+     - Use the ``ifconfig`` and ``ethtool`` commands to determine the network state.
+     - You can try restarting the interface. However, generally the interface
+       (or cable) is probably broken, especially if the interface is flapping.
+
+       Urgency: If this only affects one server, and you have more than one,
+       identifying and fixing the problem can wait until business hours.
+       If this same problem affects many servers, then you need to take corrective
+       action immediately.
+   * - Network interface card (NIC) is not operating at the expected speed.
+     - The NIC is running at a slower speed than its nominal rated speed.
+       For example, it is running at 100 Mb/s and the NIC is a 1Ge NIC.
+     - 1. Try resetting the interface with:
+
+          .. code::
+
+             sudo ethtool -s eth0 speed 1000
+
+          ... and then run:
+
+          .. code::
+
+             sudo lshw -class
+
+          See if the speed goes to the expected value. Failing
+          that, check the hardware (NIC cable/switch port).
+
+       2. If the problem is persistent, consider shutting down the server (especially if a proxy)
+          until the problem is identified and resolved. If you leave this server
+          running it can have a large impact on overall performance.
+
+       Urgency: High
+   * - The interface RX/TX error count is non-zero.
+     - A value of 0 is typical, but counts of 1 or 2 do not indicate a problem.
+     - 1. For low numbers (for example, 1 or 2), you can simply ignore them. Numbers in the range
+          3-30 probably indicate that the error count has crept up slowly over a long time.
+          Consider rebooting the server to remove the report from the noise.
+
+          Typically, when a cable or interface is bad, the error count goes to 400+; that is,
+          it stands out. There may be other symptoms such as the interface going up and down or
+          not running at the correct speed. A server with a high error count should be watched.
+
+       2. If the error count continues to climb, consider taking the server down until
+          it can be properly investigated. In any case, a reboot should be done to clear
+          the error count.
+
+       Urgency: High, if the error count is increasing.
+
+   * - In a swift log you see a message that a process has not replicated in over 24 hours.
+     - The replicator has not successfully completed a run in the last 24 hours.
+       This indicates that the replicator has probably hung.
+     - Use ``swift-init`` to stop and then restart the replicator process.
+
+       Urgency: Low; however, if you recently added or replaced disk drives,
+       you should treat this urgently.
+   * - Container Updater has not run in 4 hour(s).
+     - The service may appear to be running; however, it may be hung. Examine the swift
+       logs to see if there are any error messages relating to the container updater. This
+       may potentially explain why the container updater is not running.
+ - Urgency: Medium + This may have been triggered by a recent restart of the rsyslog daemon. + Restart the service with: + .. code:: + + sudo swift-init reload + * - Object replicator: Reports the remaining time and that time is more than 100 hours. + - Each replication cycle the object replicator writes a log message to its log + reporting statistics about the current cycle. This includes an estimate for the + remaining time needed to replicate all objects. If this time is longer than + 100 hours, there is a problem with the replication process. + - Urgency: Medium + Restart the service with: + .. code:: + + sudo swift-init object-replicator reload + + Check that the remaining replication time is going down. diff --git a/doc/source/ops_runbook/troubleshooting.rst b/doc/source/ops_runbook/troubleshooting.rst new file mode 100644 index 0000000000..d097ce0673 --- /dev/null +++ b/doc/source/ops_runbook/troubleshooting.rst @@ -0,0 +1,264 @@ +==================== +Troubleshooting tips +==================== + +Diagnose: Customer complains they receive a HTTP status 500 when trying to browse containers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This entry is prompted by a real customer issue and exclusively focused on how +that problem was identified. +There are many reasons why a http status of 500 could be returned. If +there are no obvious problems with the swift object store, then it may +be necessary to take a closer look at the users transactions. +After finding the users swift account, you can +search the swift proxy logs on each swift proxy server for +transactions from this user. The linux ``bzgrep`` command can be used to +search all the proxy log files on a node including the ``.bz2`` compressed +files. For example: + +.. code:: + + $ PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no" pdsh -l -R ssh + + -w .68.[4-11,132-139 4-11,132-139],.132.[4-11,132-139 + 4-11,132-139] 'sudo bzgrep -w AUTH_redacted-4962-4692-98fb-52ddda82a5af /var/log/swift/proxy.log\*' + dshbak -c + . + . + \---------------\- + .132.6 + \---------------\- + Feb 29 08:51:57 sw-aw2az2-proxy011 proxy-server .16.132 + .66.8 29/Feb/2012/08/51/57 GET /v1.0/AUTH_redacted-4962-4692-98fb-52ddda82a5af + /%3Fformat%3Djson HTTP/1.0 404 - - _4f4d50c5e4b064d88bd7ab82 - - - + tx429fc3be354f434ab7f9c6c4206c1dc3 - 0.0130 + +This shows a ``GET`` operation on the users account. + +.. note:: + + The HTTP status returned is 404, not found, rather than 500 as reported by the user. + +Using the transaction ID, ``tx429fc3be354f434ab7f9c6c4206c1dc3`` you can +search the swift object servers log files for this transaction ID: + +.. code:: + + $ PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no" pdsh -l + + -R ssh + -w .72.[4-67|4-67],.[4-67|4-67],.[4-67|4-67],.204.[4-131| 4-131] + 'sudo bzgrep tx429fc3be354f434ab7f9c6c4206c1dc3 /var/log/swift/server.log*' + | dshbak -c + . + . 
+ \---------------\- + .72.16 + \---------------\- + Feb 29 08:51:57 sw-aw2az1-object013 account-server .132.6 - - + + [29/Feb/2012:08:51:57 +0000|] "GET /disk9/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + 404 - "tx429fc3be354f434ab7f9c6c4206c1dc3" "-" "-" + + 0.0016 "" + \---------------\- + .31 + \---------------\- + Feb 29 08:51:57 node-az2-object060 account-server .132.6 - - + [29/Feb/2012:08:51:57 +0000|] "GET /disk6/198875/AUTH_redacted-4962- + 4692-98fb-52ddda82a5af" 404 - "tx429fc3be354f434ab7f9c6c4206c1dc3" "-" "-" 0.0011 "" + \---------------\- + .204.70 + \---------------\- + + Feb 29 08:51:57 sw-aw2az3-object0067 account-server .132.6 - - + [29/Feb/2012:08:51:57 +0000|] "GET /disk6/198875/AUTH_redacted-4962- + 4692-98fb-52ddda82a5af" 404 - "tx429fc3be354f434ab7f9c6c4206c1dc3" "-" "-" 0.0014 "" + +.. note:: + + The 3 GET operations to 3 different object servers that hold the 3 + replicas of this users account. Each ``GET`` returns a HTTP status of 404, + not found. + +Next, use the ``swift-get-nodes`` command to determine exactly where the +users account data is stored: + +.. code:: + + $ sudo swift-get-nodes /etc/swift/account.ring.gz AUTH_redacted-4962-4692-98fb-52ddda82a5af + Account AUTH_redacted-4962-4692-98fb-52ddda82a5af + Container None + Object None + + Partition 198875 + Hash 1846d99185f8a0edaf65cfbf37439696 + + Server:Port Device .31:6002 disk6 + Server:Port Device .204.70:6002 disk6 + Server:Port Device .72.16:6002 disk9 + Server:Port Device .204.64:6002 disk11 [Handoff] + Server:Port Device .26:6002 disk11 [Handoff] + Server:Port Device .72.27:6002 disk11 [Handoff] + + curl -I -XHEAD "`http://.31:6002/disk6/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ + curl -I -XHEAD "`http://.204.70:6002/disk6/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ + curl -I -XHEAD "`http://.72.16:6002/disk9/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ + curl -I -XHEAD "`http://.204.64:6002/disk11/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ # [Handoff] + curl -I -XHEAD "`http://.26:6002/disk11/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ # [Handoff] + curl -I -XHEAD "`http://.72.27:6002/disk11/198875/AUTH_redacted-4962-4692-98fb-52ddda82a5af" + `_ # [Handoff] + + ssh .31 "ls \-lah /srv/node/disk6/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" + ssh .204.70 "ls \-lah /srv/node/disk6/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" + ssh .72.16 "ls \-lah /srv/node/disk9/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" + ssh .204.64 "ls \-lah /srv/node/disk11/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" # [Handoff] + ssh .26 "ls \-lah /srv/node/disk11/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" # [Handoff] + ssh .72.27 "ls \-lah /srv/node/disk11/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/" # [Handoff] + +Check each of the primary servers, .31, .204.70 and .72.16, for +this users account. For example on .72.16: + +.. code:: + + $ ls \\-lah /srv/node/disk9/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/ + total 1.0M + drwxrwxrwx 2 swift swift 98 2012-02-23 14:49 . + drwxrwxrwx 3 swift swift 45 2012-02-03 23:28 .. + -rw-\\-----\\- 1 swift swift 15K 2012-02-23 14:49 1846d99185f8a0edaf65cfbf37439696.db + -rw-rw-rw- 1 swift swift 0 2012-02-23 14:49 1846d99185f8a0edaf65cfbf37439696.db.pending + +So this users account db, an sqlite db is present. Use sqlite to +checkout the account: + +.. 
code:: + + $ sudo cp /srv/node/disk9/accounts/198875/696/1846d99185f8a0edaf65cfbf37439696/1846d99185f8a0edaf65cfbf37439696.db /tmp + $ sudo sqlite3 /tmp/1846d99185f8a0edaf65cfbf37439696.db + sqlite> .mode line + sqlite> select * from account_stat; + account = AUTH_redacted-4962-4692-98fb-52ddda82a5af + created_at = 1328311738.42190 + put_timestamp = 1330000873.61411 + delete_timestamp = 1330001026.00514 + container_count = 0 + object_count = 0 + bytes_used = 0 + hash = eb7e5d0ea3544d9def940b19114e8b43 + id = 2de8c8a8-cef9-4a94-a421-2f845802fe90 + status = DELETED + status_changed_at = 1330001026.00514 + metadata = + +.. note:: + + The status is ``DELETED``. So this account was deleted. This explains + why the GET operations are returning 404, not found. Check the account + delete date/time: + + .. code:: + + $ python + + >>> import time + >>> time.ctime(1330001026.00514) + 'Thu Feb 23 12:43:46 2012' + +Next try and find the ``DELETE`` operation for this account in the proxy +server logs: + +.. code:: + + $ PDSH_SSH_ARGS_APPEND="-o StrictHostKeyChecking=no" pdsh -l -R ssh -w .68.[4-11,132-139 4-11,132- + 139],.132.[4-11,132-139|4-11,132-139] 'sudo bzgrep AUTH_redacted-4962-4692-98fb-52ddda82a5af /var/log/swift/proxy.log\* | grep -w + DELETE |awk "{print \\$3,\\$10,\\$12}"' |- dshbak -c + . + . + Feb 23 12:43:46 sw-aw2az2-proxy001 proxy-server 15.203.233.76 .66.7 23/Feb/2012/12/43/46 DELETE /v1.0/AUTH_redacted-4962-4692-98fb- + 52ddda82a5af/ HTTP/1.0 204 - Apache-HttpClient/4.1.2%20%28java%201.5%29 _4f458ee4e4b02a869c3aad02 - - - + + tx4471188b0b87406899973d297c55ab53 - 0.0086 + +From this you can see the operation that resulted in the account being deleted. + +Procedure: Deleting objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Simple case - deleting small number of objects and containers +------------------------------------------------------------- + +.. note:: + + ``swift-direct`` is specific to the Hewlett Packard Enterprise Helion Public Cloud. + Use ``swiftly`` as an alternative. + +.. note:: + + Object and container names are in UTF8. Swift direct accepts UTF8 + directly, not URL-encoded UTF8 (the REST API expects UTF8 and then + URL-encoded). In practice cut and paste of foreign language strings to + a terminal window will produce the right result. + + Hint: Use the ``head`` command before any destructive commands. + +To delete a small number of objects, log into any proxy node and proceed +as follows: + +Examine the object in question: + +.. code:: + + $ sudo -u swift /opt/hp/swift/bin/swift-direct head 132345678912345 container_name obj_name + +See if ``X-Object-Manifest`` or ``X-Static-Large-Object`` is set, +then this is the manifest object and segment objects may be in another +container. + +If the ``X-Object-Manifest`` attribute is set, you need to find the +name of the objects this means it is a DLO. For example, +if ``X-Object-Manifest`` is ``container2/seg-blah``, list the contents +of the container container2 as follows: + +.. code:: + + $ sudo -u swift /opt/hp/swift/bin/swift-direct show 132345678912345 container2 + +Pick out the objects whose names start with ``seg-blah``. +Delete the segment objects as follows: + +.. code:: + + $ sudo -u swift /opt/hp/swift/bin/swift-direct delete 132345678912345 container2 seg-blah01 + $ sudo -u swift /opt/hp/swift/bin/swift-direct delete 132345678912345 container2 seg-blah02 + etc + +If ``X-Static-Large-Object`` is set, you need to read the contents. Do this by: + +- Using swift-get-nodes to get the details of the object's location. 
+- Change the ``-X HEAD`` to ``-X GET`` and run ``curl`` against one copy. +- This lists a json body listing containers and object names +- Delete the objects as described above for DLO segments + +Once the segments are deleted, you can delete the object using +``swift-direct`` as described above. + +Finally, use ``swift-direct`` to delete the container. + +Procedure: Decommissioning swift nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Should Swift nodes need to be decommissioned. For example, where they are being +re-purposed, it is very important to follow the following steps. + +#. In the case of object servers, follow the procedure for removing + the node from the rings. +#. In the case of swift proxy servers, have the network team remove + the node from the load balancers. +#. Open a network ticket to have the node removed from network + firewalls. +#. Make sure that you remove the ``/etc/swift`` directory and everything in it. diff --git a/doc/source/overview_auth.rst b/doc/source/overview_auth.rst index 42e1ad029e..29ac1459e9 100644 --- a/doc/source/overview_auth.rst +++ b/doc/source/overview_auth.rst @@ -207,7 +207,7 @@ that the user is allowed to operate on project resources. OpenStack Service Using Composite Tokens ---------------------------------------- -Some Openstack services such as Cinder and Glance may use +Some OpenStack services such as Cinder and Glance may use a "service account". In this mode, you configure a separate account where the service stores project data that it manages. This account is not used directly by the end-user. Instead, all access is done through the service. @@ -234,19 +234,19 @@ situation as follows: (see ``/etc/keystone/default_catalog.templates`` above). Normally this is ``AUTH``. * The second item in the reseller_prefix list is the prefix used by the - Openstack services(s). You must configure this value (``SERVICE`` in the - example) with whatever the other Openstack service(s) use. + OpenStack services(s). You must configure this value (``SERVICE`` in the + example) with whatever the other OpenStack service(s) use. * Set the operator_roles option to contain a role or roles that end-user's have on project's they use. * Set the SERVICE_service_roles value to a role or roles that only the - Openstack service user has. Do not use a role that is assigned to + OpenStack service user has. Do not use a role that is assigned to "normal" end users. In this example, the role ``service`` is used. The service user is granted this role to a *single* project only. You do not need to make the service user a member of every project. This configuration works as follows: -* The end-user presents a user token to an Openstack service. The service +* The end-user presents a user token to an OpenStack service. The service then makes a Swift request to the account with the ``SERVICE`` prefix. * The service forwards the original user token with the request. It also adds it's own service token. diff --git a/doc/source/overview_backing_store.rst b/doc/source/overview_backing_store.rst index 57f7f37123..db50f65ea2 100644 --- a/doc/source/overview_backing_store.rst +++ b/doc/source/overview_backing_store.rst @@ -15,7 +15,7 @@ Glance writes the image to a Swift container as a set of objects. Throughout this section, the following terminology and concepts are used: * User or end-user. This is a person making a request that will result in - an Openstack Service making a request to Swift. + an OpenStack Service making a request to Swift. * Project (also known as Tenant). 
This is the unit of resource ownership. While data such as snapshot images or block volume backups may be @@ -182,7 +182,7 @@ Using the HTTP_X_SERVICE_CATALOG to get Swift Account Name The auth_token middleware populates the wsgi environment with information when it validates the user's token. The HTTP_X_SERVICE_CATALOG item is a JSON -string containing details of the Openstack endpoints. For Swift, this also +string containing details of the OpenStack endpoints. For Swift, this also contains the project's Swift account name. Here is an example of a catalog entry for Swift:: @@ -236,7 +236,7 @@ requirement is that your Service User has the appropriate role. In practice: reseller_prefix = AUTH_, SERVICE_ SERVICE_service_role = service -The ``service`` role should only be granted to Openstack Services. It should +The ``service`` role should only be granted to OpenStack Services. It should not be granted to users. Single or multiple Service Prefixes? @@ -244,7 +244,7 @@ Single or multiple Service Prefixes? Most of the examples used in this document used a single prefix. The prefix, ``SERVICE`` was used. By using a single prefix, an operator is -allowing all Openstack Services to share the same account for data +allowing all OpenStack Services to share the same account for data associated with a given project. For test systems or deployments well protected on private firewalled networks, this is appropriate. @@ -270,4 +270,4 @@ Container Naming Since a single Service Prefix is possible, container names should be prefixed with a unique string to prevent name clashes. We suggest you use the service type field (as used in the service catalog). For example, The Glance Service -would use "image" as a prefix. \ No newline at end of file +would use "image" as a prefix. diff --git a/doc/source/overview_container_sync.rst b/doc/source/overview_container_sync.rst index 8f03bf8174..c1255acaff 100644 --- a/doc/source/overview_container_sync.rst +++ b/doc/source/overview_container_sync.rst @@ -29,7 +29,7 @@ synchronization key. Configuring Container Sync -------------------------- -Create a container-sync-realms.conf file specifying the allowable clusters +Create a ``container-sync-realms.conf`` file specifying the allowable clusters and their information:: [realm1] @@ -50,18 +50,18 @@ clusters that have agreed to allow container syncing with each other. Realm names will be considered case insensitive. The key is the overall cluster-to-cluster key used in combination with the -external users' key that they set on their containers' X-Container-Sync-Key -metadata header values. These keys will be used to sign each request the -container sync daemon makes and used to validate each incoming container sync -request. +external users' key that they set on their containers' +``X-Container-Sync-Key`` metadata header values. These keys will be used to +sign each request the container sync daemon makes and used to validate each +incoming container sync request. The key2 is optional and is an additional key incoming requests will be checked against. This is so you can rotate keys if you wish; you move the existing key to key2 and make a new key value. 
-Any values in the realm section whose names begin with cluster\_ will indicate -the name and endpoint of a cluster and will be used by external users in -their containers' X-Container-Sync-To metadata header values with the format +Any values in the realm section whose names begin with ``cluster_`` will +indicate the name and endpoint of a cluster and will be used by external users in +their containers' ``X-Container-Sync-To`` metadata header values with the format "//realm_name/cluster_name/account_name/container_name". Realm and cluster names are considered case insensitive. @@ -71,7 +71,7 @@ container servers, since that is where the container sync daemon runs. Note that the endpoint ends with /v1/ and that the container sync daemon will then add the account/container/obj name after that. -Distribute this container-sync-realms.conf file to all your proxy servers +Distribute this ``container-sync-realms.conf`` file to all your proxy servers and container servers. You also need to add the container_sync middleware to your proxy pipeline. It @@ -95,7 +95,7 @@ section, Configuring Container Sync, for the new-style. With the old-style, the Swift cluster operator must allow synchronization with a set of hosts before the user can enable container synchronization. First, the backend container server needs to be given this list of hosts in the -container-server.conf file:: +``container-server.conf`` file:: [DEFAULT] # This is a comma separated list of hosts allowed in the @@ -170,8 +170,8 @@ we'll make next:: The ``-t`` indicates the cluster to sync to, which is the realm name of the section from container-sync-realms.conf, followed by the cluster name from -that section (without the cluster\_ prefix), followed by the account and container names we want to sync to. -The ``-k`` specifies the secret key the two containers will share for +that section (without the cluster\_ prefix), followed by the account and container +names we want to sync to. The ``-k`` specifies the secret key the two containers will share for synchronization; this is the user key, the cluster key in container-sync-realms.conf will also be used behind the scenes. @@ -195,8 +195,18 @@ as it gets synchronized over to the second:: list container2 [Nothing there yet, so we wait a bit...] - [If you're an operator running SAIO and just testing, you may need to - run 'swift-init container-sync once' to perform a sync scan.] + +.. note:: + + If you're an operator running SAIO and just testing, each time you + configure a container for synchronization and place objects in the + source container you will need to ensure that container-sync runs + before attempting to retrieve objects from the target container. + That is, you need to run:: + + swift-init container-sync once + +Now expect to see objects copied from the first container to the second:: $ swift -A http://cluster2/auth/v1.0 -U test2:tester2 -K testing2 \ list container2 @@ -340,13 +350,34 @@ synchronize to the second, we could have used this curl command:: What's going on behind the scenes, in the cluster? -------------------------------------------------- -The swift-container-sync does the job of sending updates to the remote -container. +Container ring devices have a directory called ``containers``, where container +databases reside. In addition to ``containers``, each container ring device +also has a directory called ``sync-containers``. 
``sync-containers`` holds +symlinks to container databases that were configured for container sync using +``x-container-sync-to`` and ``x-container-sync-key`` metadata keys. -This is done by scanning the local devices for container databases and -checking for x-container-sync-to and x-container-sync-key metadata values. -If they exist, newer rows since the last sync will trigger PUTs or DELETEs -to the other container. +The swift-container-sync process does the job of sending updates to the remote +container. This is done by scanning ``sync-containers`` for container +databases. For each container db found, newer rows since the last sync will +trigger PUTs or DELETEs to the other container. + +``sync-containers`` is maintained as follows: +Whenever the container-server processes a PUT or a POST request that carries +``x-container-sync-to`` and ``x-container-sync-key`` metadata keys the server +creates a symlink to the container database in ``sync-containers``. Whenever +the container server deletes a synced container, the appropriate symlink +is deleted from ``sync-containers``. + +In addition to the container-server, the container-replicator process does the +job of identifying containers that should be synchronized. This is done by +scanning the local devices for container databases and checking for +x-container-sync-to and x-container-sync-key metadata values. If they exist +then a symlink to the container database is created in a sync-containers +sub-directory on the same device. + +Similarly, when the container sync metadata keys are deleted, the container +server and container-replicator would take care of deleting the symlinks +from ``sync-containers``. .. note:: diff --git a/doc/source/overview_erasure_code.rst b/doc/source/overview_erasure_code.rst old mode 100755 new mode 100644 index b09adcfbd3..64ce5621f2 --- a/doc/source/overview_erasure_code.rst +++ b/doc/source/overview_erasure_code.rst @@ -182,17 +182,13 @@ similar to that of replication with a few notable exceptions: Performance Considerations -------------------------- -Efforts are underway to characterize performance of various Erasure Code -schemes. One of the main goals of the beta release is to perform this -characterization and encourage others to do so and provide meaningful feedback -to the development community. There are many factors that will affect -performance of EC so it is vital that we have multiple characterization -activities happening. - In general, EC has different performance characteristics than replicated data. EC requires substantially more CPU to read and write data, and is more suited for larger objects that are not frequently accessed (eg backups). +Operators are encouraged to characterize the performance of various EC schemes +and share their observations with the developer community. + ---------------------------- Using an Erasure Code Policy ---------------------------- @@ -204,7 +200,7 @@ an EC policy can be setup is shown below:: [storage-policy:2] name = ec104 policy_type = erasure_coding - ec_type = jerasure_rs_vand + ec_type = liberasurecode_rs_vand ec_num_data_fragments = 10 ec_num_parity_fragments = 4 ec_object_segment_size = 1048576 diff --git a/doc/source/overview_large_objects.rst b/doc/source/overview_large_objects.rst index 89a3fd93d7..85a972120c 100644 --- a/doc/source/overview_large_objects.rst +++ b/doc/source/overview_large_objects.rst @@ -45,8 +45,8 @@ Direct API SLO support centers around the user generated manifest file. 
After the user has uploaded the segments into their account a manifest file needs to be -built and uploaded. All object segments, except the last, must be above 1 MB -(by default) in size. Please see the SLO docs for :ref:`slo-doc` further +built and uploaded. All object segments, must be at least 1 byte +in size. Please see the SLO docs for :ref:`slo-doc` further details. ---------------- diff --git a/doc/source/overview_policies.rst b/doc/source/overview_policies.rst old mode 100755 new mode 100644 index 9ae2dcb468..560320ae3e --- a/doc/source/overview_policies.rst +++ b/doc/source/overview_policies.rst @@ -37,8 +37,7 @@ There are many reasons why this might be desirable: .. note:: Today, Swift supports two different policy types: Replication and Erasure - Code. Erasure Code policy is currently a beta release and should not be - used in a Production cluster. See :doc:`overview_erasure_code` for details. + Code. See :doc:`overview_erasure_code` for details. Also note that Diskfile refers to backend object storage plug-in architecture. See :doc:`development_ondisk_backends` for details. @@ -286,6 +285,7 @@ example configuration.:: [swift-hash] # random unique strings that can never change (DO NOT LOSE) + # Use only printable chars (python -c "import string; print(string.printable)") swift_hash_path_prefix = changeme swift_hash_path_suffix = changeme diff --git a/doc/source/overview_ring.rst b/doc/source/overview_ring.rst index ac82562ff1..b121d37135 100644 --- a/doc/source/overview_ring.rst +++ b/doc/source/overview_ring.rst @@ -4,9 +4,9 @@ The Rings The rings determine where data should reside in the cluster. There is a separate ring for account databases, container databases, and individual -objects but each ring works in the same way. These rings are externally -managed, in that the server processes themselves do not modify the rings, they -are instead given new rings modified by other tools. +object storage policies but each ring works in the same way. These rings are +externally managed, in that the server processes themselves do not modify the +rings, they are instead given new rings modified by other tools. The ring uses a configurable number of bits from a path's MD5 hash as a partition index that designates a device. The number of bits kept from the hash @@ -18,10 +18,25 @@ cluster all at once. Another configurable value is the replica count, which indicates how many of the partition->device assignments comprise a single ring. For a given partition -number, each replica's device will not be in the same zone as any other -replica's device. Zones can be used to group devices based on physical -locations, power separations, network separations, or any other attribute that -would lessen multiple replicas being unavailable at the same time. +number, each replica will be assigned to a different device in the ring. + +Devices are added to the ring to describe the capacity available for +part-replica assignment. Devices are placed into failure domains consisting +of region, zone, and server. Regions can be used to describe geo-graphically +systems characterized by lower-bandwidth or higher latency between machines in +different regions. Many rings will consist of only a single region. Zones +can be used to group devices based on physical locations, power separations, +network separations, or any other attribute that would lessen multiple +replicas being unavailable at the same time. 
+ +Devices are given a weight which describes relative weight of the device in +comparison to other devices. + +When building a ring all of each part's replicas will be assigned to devices +according to their weight. Additionally, each replica of a part will attempt +to be assigned to a device who's failure domain does not already have a +replica for the part. Only a single replica of a part may be assigned to each +device - you must have as many devices as replicas. ------------ Ring Builder @@ -91,8 +106,7 @@ Note: The list of devices may contain holes, or indexes set to None, for devices that have been removed from the cluster. Generally, device ids are not reused. Also, some devices may be temporarily disabled by setting their weight to 0.0. To obtain a list of active devices (for uptime polling, for example) -the Python code would look like: ``devices = [device for device in self.devs if -device and device['weight']]`` +the Python code would look like: ``devices = list(self._iter_devs())`` ************************* Partition Assignment List @@ -108,14 +122,24 @@ So, to create a list of device dictionaries assigned to a partition, the Python code would look like: ``devices = [self.devs[part2dev_id[partition]] for part2dev_id in self._replica2part2dev_id]`` -That code is a little simplistic, as it does not account for the -removal of duplicate devices. If a ring has more replicas than -devices, then a partition will have more than one replica on one -device; that's simply the pigeonhole principle at work. - array('H') is used for memory conservation as there may be millions of partitions. +********************* +Partition Shift Value +********************* + +The partition shift value is known internally to the Ring class as _part_shift. +This value used to shift an MD5 hash to calculate the partition on which the +data for that hash should reside. Only the top four bytes of the hash is used +in this process. For example, to compute the partition for the path +/account/container/object the Python code might look like: ``partition = +unpack_from('>I', md5('/account/container/object').digest())[0] >> +self._part_shift`` + +For a ring generated with part_power P, the partition shift value is +32 - P. + ******************* Fractional Replicas ******************* @@ -130,6 +154,21 @@ for the ring. This means that some partitions will have more replicas than others. For example, if a ring has 3.25 replicas, then 25% of its partitions will have four replicas, while the remaining 75% will have just three. +********** +Dispersion +********** + +With each rebalance, the ring builder calculates a dispersion metric. This is +the percentage of partitions in the ring that have too many replicas within a +particular failure domain. + +For example, if you have three servers in a cluster but two replicas for a +partition get placed onto the same server, that partition will count towards +the dispersion metric. + +A lower dispersion value is better, and the value can be used to find the +proper value for "overload". + ******** Overload ******** @@ -168,74 +207,118 @@ on them than the disks in nodes A and B. If 80% full is the warning threshold for the cluster, node C's disks will reach 80% full while A and B's disks are only 72.7% full. -********** -Dispersion -********** +------------------------------- +Partition & Replica Terminology +------------------------------- -With each rebalance, the ring builder calculates a dispersion metric. 
This is -the percentage of partitions in the ring that have too many replicas within a -particular failure domain. +All descriptions of consistent hashing describe the process of breaking the +keyspace up into multiple ranges (vnodes, buckets, etc.) - many more than the +number of "nodes" to which keys in the keyspace must be assigned. Swift calls +these ranges `partitions` - they are partitions of the total keyspace. -For example, if you have three servers in a cluster but two replicas for a -partition get placed onto the same server, that partition will count towards the -dispersion metric. +Each partition will have multiple replicas. Every replica of each partition +must be assigned to a device in the ring. When a describing a specific +replica of a partition (like when it's assigned a device) it is described as a +`part-replica` in that it is a specific `replica` of the specific `partition`. +A single device may be assigned different replicas from many parts, but it may +not be assigned multiple replicas of a single part. -A lower dispersion value is better, and the value can be used to find the proper -value for "overload". +The total number of partitions in a ring is calculated as ``2 ** +``. The total number of part-replicas in a ring is calculated as +`` * 2 ** ``. -********************* -Partition Shift Value -********************* +When considering a device's `weight` it is useful to describe the number of +part-replicas it would like to be assigned. A single device regardless of +weight will never hold more than ``2 ** `` part-replicas because +it can not have more than one replica of any part assigned. The number of +part-replicas a device can take by weights is calculated as it's +`parts_wanted`. The true number of part-replicas assigned to a device can be +compared to it's parts wanted similarly to a calculation of percentage error - +this deviation in the observed result from the idealized target is called a +devices `balance`. -The partition shift value is known internally to the Ring class as _part_shift. -This value used to shift an MD5 hash to calculate the partition on which the -data for that hash should reside. Only the top four bytes of the hash is used -in this process. For example, to compute the partition for the path -/account/container/object the Python code might look like: ``partition = -unpack_from('>I', md5('/account/container/object').digest())[0] >> -self._part_shift`` - -For a ring generated with part_power P, the partition shift value is -32 - P. +When considering a device's `failure domain` it is useful to describe the +number of part-replicas it would like to be assigned. The number of +part-replicas wanted in a failure domain of a tier is the sum of the +part-replicas wanted in the failure domains of it's sub-tier. However, +collectively when the total number of part-replicas in a failure domain +exceeds or is equal to ``2 ** `` it is most obvious that it's no +longer sufficient to consider only the number of total part-replicas, but +rather the fraction of each replica's partitions. Consider for example a ring +with ``3`` replicas and ``3`` servers, while it's necessary for dispersion +that each server hold only ``1/3`` of the total part-replicas it is +additionally constrained to require ``1.0`` replica of *each* partition. It +would not be sufficient to satisfy dispersion if two devices on one of the +servers each held a replica of a single partition, while another server held +none. 
By considering a decimal fraction of one replica's worth of parts in a +failure domain we can derive the total part-replicas wanted in a failure +domain (``1.0 * 2 ** ``). Additionally we infer more about +`which` part-replicas must go in the failure domain. Consider a ring with +three replicas, and two zones, each with two servers (four servers total). +The three replicas worth of partitions will be assigned into two failure +domains at the zone tier. Each zone must hold more than one replica of some +parts. We represent this improper faction of a replica's worth of partitions +in decimal form as ``1.5`` (``3.0 / 2``). This tells us not only the *number* +of total parts (``1.5 * 2 ** ``) but also that *each* partition +must have `at least` one replica in this failure domain (in fact ``0.5`` of +the partitions will have ``2`` replicas). Within each zone the two servers +will hold ``0.75`` of a replica's worth of partitions - this is equal both to +"the fraction of a replica's worth of partitions assigned to each zone +(``1.5``) divided evenly among the number of failure domain's in it's sub-tier +(``2`` servers in each zone, i.e. ``1.5 / 2``)" but *also* "the total number +of replicas (``3.0``) divided evenly among the total number of failure domains +in the server tier (``2`` servers x ``2`` zones = ``4``, i.e. ``3.0 / 4``)". +It is useful to consider that each server in this ring will hold only ``0.75`` +of a replica's worth of partitions which tells that any server should have `at +most` one replica of a given part assigned. In the interests of brevity, some +variable names will often refer to the concept representing the fraction of a +replica's worth of partitions in decimal form as *replicanths* - this is meant +to invoke connotations similar to ordinal numbers as applied to fractions, but +generalized to a replica instead of four*th* or a fif*th*. The 'n' was +probably thrown in because of Blade Runner. ----------------- Building the Ring ----------------- -The initial building of the ring first calculates the number of partitions that -should ideally be assigned to each device based the device's weight. For -example, given a partition power of 20, the ring will have 1,048,576 partitions. -If there are 1,000 devices of equal weight they will each desire 1,048.576 -partitions. The devices are then sorted by the number of partitions they desire -and kept in order throughout the initialization process. +First the ring builder calculates the replicanths wanted at each tier in the +ring's topology based on weight. -Note: each device is also assigned a random tiebreaker value that is used when -two devices desire the same number of partitions. This tiebreaker is not stored -on disk anywhere, and so two different rings created with the same parameters -will have different partition assignments. For repeatable partition assignments, -``RingBuilder.rebalance()`` takes an optional seed value that will be used to -seed Python's pseudo-random number generator. +Then the ring builder calculates the replicanths wanted at each tier in the +ring's topology based on dispersion. -Then, the ring builder assigns each replica of each partition to the device that -desires the most partitions at that point while keeping it as far away as -possible from other replicas. 
The ring builder prefers to assign a replica to a -device in a regions that has no replicas already; should there be no such region -available, the ring builder will try to find a device in a different zone; if -not possible, it will look on a different server; failing that, it will just -look for a device that has no replicas; finally, if all other options are -exhausted, the ring builder will assign the replica to the device that has the -fewest replicas already assigned. Note that assignment of multiple replicas to -one device will only happen if the ring has fewer devices than it has replicas. +Then the ring builder calculates the maximum deviation on a single device between its +weighted replicanths and wanted replicanths. -When building a new ring based on an old ring, the desired number of partitions -each device wants is recalculated. Next the partitions to be reassigned are -gathered up. Any removed devices have all their assigned partitions unassigned -and added to the gathered list. Any partition replicas that (due to the -addition of new devices) can be spread out for better durability are unassigned -and added to the gathered list. Any devices that have more partitions than they -now desire have random partitions unassigned from them and added to the -gathered list. Lastly, the gathered partitions are then reassigned to devices -using a similar method as in the initial assignment described above. +Next we interpolate between the two replicanth values (weighted & wanted) at +each tier using the specified overload (up to the maximum required overload). +It's a linear interpolation, similar to solving for a point on a line between +two points - we calculate the slope across the max required overload and then +calculate the intersection of the line with the desired overload. This +becomes the target (a small sketch of this interpolation appears just after this +ring overview). + +From the target we calculate the minimum and maximum number of replicas any +part may have in a tier. This becomes the replica_plan. + +Finally, we calculate the number of partitions that should ideally be assigned +to each device based on the replica_plan. + +On initial balance, the first time partitions are placed to generate a ring, +we must assign each replica of each partition to the device that desires the +most partitions, excluding any devices that already have their maximum number +of replicas of that part assigned to some parent tier of that device's failure +domain. + +When building a new ring based on an old ring, the desired number of +partitions each device wants is recalculated from the current replica_plan. +Next the partitions to be reassigned are gathered up. Any removed devices have +all their assigned partitions unassigned and added to the gathered list. Any +partition replicas that (due to the addition of new devices) can be spread out +for better durability are unassigned and added to the gathered list. Any +devices that have more partitions than they now desire have random partitions +unassigned from them and added to the gathered list. Lastly, the gathered +partitions are then reassigned to devices using a similar method as in the +initial assignment described above. Whenever a partition has a replica reassigned, the time of the reassignment is recorded. This is taken into account when gathering partitions to reassign so @@ -247,10 +330,9 @@ failure and there's no choice but to make a reassignment. The above processes don't always perfectly rebalance a ring due to the random nature of gathering partitions for reassignment. 
To help reach a more balanced -ring, the rebalance process is repeated until near perfect (less 1% off) or -when the balance doesn't improve by at least 1% (indicating we probably can't -get perfect balance due to wildly imbalanced zones or too many partitions -recently moved). +ring, the rebalance process is repeated a fixed number of times until the +replica_plan is fulfilled or unable to be fulfilled (indicating we probably +can't get perfect balance due to too many partitions recently moved). --------------------- Ring Builder Analyzer --------------------- @@ -263,8 +345,8 @@ History ------- The ring code went through many iterations before arriving at what it is now -and while it has been stable for a while now, the algorithm may be tweaked or -perhaps even fundamentally changed if new ideas emerge. This section will try +and while it has largely been stable, the algorithm has seen a few tweaks and +the occasional fundamental change as new ideas have emerged. This section will try to describe the previous ideas attempted and attempt to explain why they were discarded. @@ -329,15 +411,14 @@ be maintaining the rings themselves anyway and only doing hash lookups, MD5 was chosen for its general availability, good distribution, and adequate speed. The placement algorithm has seen a number of behavioral changes for -unbalanceable rings. The ring builder wants to keep replicas as far -apart as possible while still respecting device weights. In most -cases, the ring builder can achieve both, but sometimes they conflict. -At first, the behavior was to keep the replicas far apart and ignore -device weight, but that made it impossible to gradually go from one -region to two, or from two to three. Then it was changed to favor -device weight over dispersion, but that wasn't so good for rings that -were close to balanceable, like 3 machines with 60TB, 60TB, and 57TB -of disk space; operators were expecting one replica per machine, but -didn't always get it. After that, overload was added to the ring -builder so that operators could choose a balance between dispersion -and device weights. +unbalanceable rings. The ring builder wants to keep replicas as far apart as +possible while still respecting device weights. In most cases, the ring +builder can achieve both, but sometimes they conflict. At first, the behavior +was to keep the replicas far apart and ignore device weight, but that made it +impossible to gradually go from one region to two, or from two to three. Then +it was changed to favor device weight over dispersion, but that wasn't so good +for rings that were close to balanceable, like 3 machines with 60TB, 60TB, and +57TB of disk space; operators were expecting one replica per machine, but +didn't always get it. After that, overload was added to the ring builder so +that operators could choose a balance between dispersion and device weights. +In time the overload concept was improved and made more accurate. 
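The linear interpolation between weighted and wanted replicanths described above can be illustrated with a minimal Python sketch (an editorial illustration under assumed values, not the builder's actual code; the numbers and the helper name ``target_replicanths`` are hypothetical)::

    # Replicanths a tier would receive purely by weight, and the replicanths
    # it needs for dispersion ("wanted"). Both values are hypothetical.
    weighted = 0.6
    wanted = 1.0

    # The maximum required overload is the relative increase that would carry
    # the tier all the way from its weighted share to its wanted share.
    max_required_overload = (wanted - weighted) / weighted

    def target_replicanths(weighted, wanted, overload, max_required_overload):
        """Interpolate linearly between weighted and wanted replicanths."""
        if max_required_overload <= 0:
            return weighted
        # Clamp the configured overload to the maximum required overload,
        # then move that fraction of the way from weighted toward wanted.
        fraction = min(overload, max_required_overload) / max_required_overload
        return weighted + (wanted - weighted) * fraction

    for overload in (0.0, 0.1, max_required_overload):
        print(overload, target_replicanths(weighted, wanted, overload,
                                            max_required_overload))

With ``overload = 0`` the target stays at the weighted replicanths, and at the maximum required overload it reaches the wanted (dispersion) replicanths; the real builder then derives the per-tier replica_plan from such targets.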
diff --git a/doc/source/policies_saio.rst b/doc/source/policies_saio.rst old mode 100755 new mode 100644 diff --git a/etc/account-server.conf-sample b/etc/account-server.conf-sample index 280961c0ae..9fa98c6f20 100644 --- a/etc/account-server.conf-sample +++ b/etc/account-server.conf-sample @@ -35,7 +35,7 @@ bind_port = 6002 # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 diff --git a/etc/container-reconciler.conf-sample b/etc/container-reconciler.conf-sample index 4301641ac4..6e8f109f5d 100644 --- a/etc/container-reconciler.conf-sample +++ b/etc/container-reconciler.conf-sample @@ -17,7 +17,7 @@ # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 diff --git a/etc/container-server.conf-sample b/etc/container-server.conf-sample index b525b7927e..5927f5e230 100644 --- a/etc/container-server.conf-sample +++ b/etc/container-server.conf-sample @@ -41,7 +41,7 @@ bind_port = 6001 # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 diff --git a/etc/internal-client.conf-sample b/etc/internal-client.conf-sample index 2d25d448b6..916e424afc 100644 --- a/etc/internal-client.conf-sample +++ b/etc/internal-client.conf-sample @@ -17,7 +17,7 @@ # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 diff --git a/etc/memcache.conf-sample b/etc/memcache.conf-sample index 7ec55f100f..813ecf9edb 100644 --- a/etc/memcache.conf-sample +++ b/etc/memcache.conf-sample @@ -2,6 +2,7 @@ # You can use this single conf file instead of having memcache_servers set in # several other conf files under [filter:cache] for example. You can specify # multiple servers separated with commas, as in: 10.1.2.3:11211,10.1.2.4:11211 +# (IPv6 addresses must follow rfc3986 section-3.2.2, i.e. 
[::1]:11211) # memcache_servers = 127.0.0.1:11211 # # Sets how memcache values are serialized and deserialized: diff --git a/etc/object-expirer.conf-sample b/etc/object-expirer.conf-sample index 6276fd5cfa..4b161de1d5 100644 --- a/etc/object-expirer.conf-sample +++ b/etc/object-expirer.conf-sample @@ -20,7 +20,7 @@ # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 @@ -80,7 +80,7 @@ use = egg:swift#proxy_logging # access_log_udp_port = 514 # # You can use log_statsd_* from [DEFAULT] or override them here: -# access_log_statsd_host = localhost +# access_log_statsd_host = # access_log_statsd_port = 8125 # access_log_statsd_default_sample_rate = 1.0 # access_log_statsd_sample_rate_factor = 1.0 diff --git a/etc/object-server.conf-sample b/etc/object-server.conf-sample index 815b63cc5d..3c7732416e 100644 --- a/etc/object-server.conf-sample +++ b/etc/object-server.conf-sample @@ -44,7 +44,7 @@ bind_port = 6000 # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 @@ -82,6 +82,11 @@ use = egg:swift#object # set log_address = /dev/log # # max_upload_time = 86400 +# +# slow is the total amount of seconds an object PUT/DELETE request takes at +# least. If it is faster, the object server will sleep this amount of time minus +# the already passed transaction time. This is only useful for simulating slow +# devices on storage nodes during testing and development. # slow = 0 # # Objects smaller than this are not evicted from the buffercache once read @@ -282,6 +287,9 @@ use = egg:swift#recon # log_level = INFO # log_address = /dev/log # +# Time in seconds to wait between auditor passes +# interval = 30 +# # You can set the disk chunk size that the auditor uses making it larger if # you like for more efficient local auditing of larger objects # disk_chunk_size = 65536 diff --git a/etc/proxy-server.conf-sample b/etc/proxy-server.conf-sample index 125e4b3a3c..a80e69be06 100644 --- a/etc/proxy-server.conf-sample +++ b/etc/proxy-server.conf-sample @@ -63,7 +63,7 @@ bind_port = 8080 # log_udp_port = 514 # # You can enable StatsD logging here: -# log_statsd_host = localhost +# log_statsd_host = # log_statsd_port = 8125 # log_statsd_default_sample_rate = 1.0 # log_statsd_sample_rate_factor = 1.0 @@ -171,9 +171,6 @@ use = egg:swift#proxy # the number of seconds configured by timing_expiry. # timing_expiry = 300 # -# The maximum time (seconds) that a large object connection is allowed to last. -# max_large_object_get_time = 86400 -# # Set to the number of nodes to contact for a normal request. You can use # '* replicas' at the end to have it use the number given times the number of # replicas for the ring being used for the request. @@ -287,13 +284,21 @@ user_test5_tester5 = testing5 service # You'll also need to have the keystoneauth middleware enabled and have it in # your main pipeline, as show in the sample pipeline at the top of this file. # +# Following parameters are known to work with keystonemiddleware v2.3.0 +# (above v2.0.0), but checking the latest information in the wiki page[1] +# is recommended. +# 1. 
http://docs.openstack.org/developer/keystonemiddleware/middlewarearchitecture.html#configuration +# # [filter:authtoken] # paste.filter_factory = keystonemiddleware.auth_token:filter_factory -# identity_uri = http://keystonehost:35357/ -# auth_uri = http://keystonehost:5000/ -# admin_tenant_name = service -# admin_user = swift -# admin_password = password +# auth_uri = http://keystonehost:5000 +# auth_url = http://keystonehost:35357 +# auth_plugin = password +# project_domain_id = default +# user_domain_id = default +# project_name = service +# username = swift +# password = password # # delay_auth_decision defaults to False, but leaving it as false will # prevent other auth systems, staticweb, tempurl, formpost, and ACLs from @@ -388,7 +393,8 @@ use = egg:swift#memcache # If not set here, the value for memcache_servers will be read from # memcache.conf (see memcache.conf-sample) or lacking that file, it will # default to the value below. You can specify multiple servers separated with -# commas, as in: 10.1.2.3:11211,10.1.2.4:11211 +# commas, as in: 10.1.2.3:11211,10.1.2.4:11211 (IPv6 addresses must +# follow rfc3986 section-3.2.2, i.e. [::1]:11211) # memcache_servers = 127.0.0.1:11211 # # Sets how memcache values are serialized and deserialized: @@ -568,7 +574,7 @@ use = egg:swift#proxy_logging # access_log_udp_port = 514 # # You can use log_statsd_* from [DEFAULT] or override them here: -# access_log_statsd_host = localhost +# access_log_statsd_host = # access_log_statsd_port = 8125 # access_log_statsd_default_sample_rate = 1.0 # access_log_statsd_sample_rate_factor = 1.0 @@ -628,14 +634,17 @@ use = egg:swift#bulk use = egg:swift#slo # max_manifest_segments = 1000 # max_manifest_size = 2097152 -# min_segment_size = 1048576 -# Start rate-limiting SLO segment serving after the Nth segment of a +# +# Rate limiting applies only to segments smaller than this size (bytes). +# rate_limit_under_size = 1048576 +# +# Start rate-limiting SLO segment serving after the Nth small segment of a # segmented object. # rate_limit_after_segment = 10 # # Once segment rate-limiting kicks in for an object, limit segments served # to N per second. 0 means no rate-limiting. -# rate_limit_segments_per_sec = 0 +# rate_limit_segments_per_sec = 1 # # Time limit on GET requests (seconds) # max_get_time = 86400 diff --git a/etc/swift.conf-sample b/etc/swift.conf-sample old mode 100755 new mode 100644 index 3768dbc9cb..5bd57e6864 --- a/etc/swift.conf-sample +++ b/etc/swift.conf-sample @@ -4,6 +4,7 @@ # the hashing algorithm when determining data placement in the cluster. # These values should remain secret and MUST NOT change # once a cluster has been deployed. +# Use only printable chars (python -c "import string; print(string.printable)") swift_hash_path_suffix = changeme swift_hash_path_prefix = changeme @@ -50,8 +51,7 @@ aliases = yellow, orange #policy_type = replication # The following declares a storage policy of type 'erasure_coding' which uses -# Erasure Coding for data reliability. The 'erasure_coding' storage policy in -# Swift is available as a "beta". Please refer to Swift documentation for +# Erasure Coding for data reliability. Please refer to Swift documentation for # details on how the 'erasure_coding' storage policy is implemented. 
# # Swift uses PyECLib, a Python Erasure coding API library, for encode/decode @@ -73,13 +73,14 @@ aliases = yellow, orange # The example 'deepfreeze10-4' policy defined below is a _sample_ # configuration with an alias of 'df10-4' as well as 10 'data' and 4 'parity' # fragments. 'ec_type' defines the Erasure Coding scheme. -# 'jerasure_rs_vand' (Reed-Solomon Vandermonde) is used as an example below. +# 'liberasurecode_rs_vand' (Reed-Solomon Vandermonde) is used as an example +# below. # #[storage-policy:2] #name = deepfreeze10-4 #aliases = df10-4 #policy_type = erasure_coding -#ec_type = jerasure_rs_vand +#ec_type = liberasurecode_rs_vand #ec_num_data_fragments = 10 #ec_num_parity_fragments = 4 #ec_object_segment_size = 1048576 diff --git a/requirements.txt b/requirements.txt index 13b94d9cb9..3480d4f3b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,10 +4,10 @@ dnspython>=1.12.0;python_version<'3.0' dnspython3>=1.12.0;python_version>='3.0' -eventlet>=0.16.1,!=0.17.0 +eventlet>=0.17.4 # MIT greenlet>=0.3.1 netifaces>=0.5,!=0.10.0,!=0.10.1 pastedeploy>=1.3.3 six>=1.9.0 xattr>=0.4 -PyECLib>=1.0.7 # BSD +PyECLib>=1.2.0 # BSD diff --git a/swift/cli/recon.py b/swift/cli/recon.py index 3af876b709..df85f7121e 100644 --- a/swift/cli/recon.py +++ b/swift/cli/recon.py @@ -1,4 +1,3 @@ -#! /usr/bin/env python # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -22,6 +21,7 @@ from eventlet.green import urllib2, socket from six.moves.urllib.parse import urlparse from swift.common.utils import SWIFT_CONF_FILE from swift.common.ring import Ring +from swift.common.storage_policy import POLICIES from hashlib import md5 import eventlet import json @@ -181,12 +181,12 @@ class SwiftRecon(object): def _ptime(self, timev=None): """ :param timev: a unix timestamp or None - :returns: a pretty string of the current time or provided time + :returns: a pretty string of the current time or provided time in UTC """ if timev: - return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timev)) + return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(timev)) else: - return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) def _md5_file(self, path): """ @@ -203,18 +203,19 @@ class SwiftRecon(object): block = f.read(4096) return md5sum.hexdigest() - def get_devices(self, region_filter, zone_filter, swift_dir, ring_name): + def get_hosts(self, region_filter, zone_filter, swift_dir, ring_names): """ - Get a list of hosts in the ring + Get a list of hosts in the rings. 
:param region_filter: Only list regions matching given filter :param zone_filter: Only list zones matching given filter :param swift_dir: Directory of swift config, usually /etc/swift - :param ring_name: Name of the ring, such as 'object' + :param ring_names: Collection of ring names, such as + ['object', 'object-2'] :returns: a set of tuples containing the ip and port of hosts """ - ring_data = Ring(swift_dir, ring_name=ring_name) - devs = [d for d in ring_data.devs if d] + rings = [Ring(swift_dir, ring_name=n) for n in ring_names] + devs = [d for r in rings for d in r.devs if d] if region_filter is not None: devs = [d for d in devs if d['region'] == region_filter] if zone_filter is not None: @@ -495,16 +496,14 @@ class SwiftRecon(object): elapsed = time.time() - least_recent_time elapsed, elapsed_unit = seconds2timeunit(elapsed) print('Oldest completion was %s (%d %s ago) by %s.' % ( - time.strftime('%Y-%m-%d %H:%M:%S', - time.gmtime(least_recent_time)), + self._ptime(least_recent_time), elapsed, elapsed_unit, host)) if most_recent_url is not None: host = urlparse(most_recent_url).netloc elapsed = time.time() - most_recent_time elapsed, elapsed_unit = seconds2timeunit(elapsed) print('Most recent completion was %s (%d %s ago) by %s.' % ( - time.strftime('%Y-%m-%d %H:%M:%S', - time.gmtime(most_recent_time)), + self._ptime(most_recent_time), elapsed, elapsed_unit, host)) print("=" * 79) @@ -899,12 +898,8 @@ class SwiftRecon(object): continue if (ts_remote < ts_start or ts_remote > ts_end): diff = abs(ts_end - ts_remote) - ts_end_f = time.strftime( - "%Y-%m-%d %H:%M:%S", - time.localtime(ts_end)) - ts_remote_f = time.strftime( - "%Y-%m-%d %H:%M:%S", - time.localtime(ts_remote)) + ts_end_f = self._ptime(ts_end) + ts_remote_f = self._ptime(ts_remote) print("!! %s current time is %s, but remote is %s, " "differs by %.2f sec" % ( @@ -920,6 +915,26 @@ class SwiftRecon(object): matches, len(hosts), errors)) print("=" * 79) + def _get_ring_names(self, policy=None): + ''' + Retrieve name of ring files. + + If no policy is passed and the server type is object, + the ring names of all storage-policies are retrieved. + + :param policy: name or index of storage policy, only applicable + with server_type==object. + :returns: list of ring names. + ''' + if self.server_type == 'object': + ring_names = [p.ring_name for p in POLICIES if ( + p.name == policy or not policy or ( + policy.isdigit() and int(policy) == int(p)))] + else: + ring_names = [self.server_type] + + return ring_names + def main(self): """ Retrieve and report cluster info from hosts running recon middleware. 
@@ -989,6 +1004,9 @@ class SwiftRecon(object): default=5) args.add_option('--swiftdir', default="/etc/swift", help="Default = /etc/swift") + args.add_option('--policy', '-p', + help='Only query object servers in specified ' + 'storage policy (specified as name or index).') options, arguments = args.parse_args() if len(sys.argv) <= 1 or len(arguments) > 1: @@ -1010,8 +1028,14 @@ class SwiftRecon(object): self.suppress_errors = options.suppress self.timeout = options.timeout - hosts = self.get_devices(options.region, options.zone, - swift_dir, self.server_type) + ring_names = self._get_ring_names(options.policy) + if not ring_names: + print('Invalid Storage Policy') + args.print_help() + sys.exit(0) + + hosts = self.get_hosts(options.region, options.zone, + swift_dir, ring_names) print("--> Starting reconnaissance on %s hosts" % len(hosts)) print("=" * 79) @@ -1090,7 +1114,3 @@ def main(): reconnoiter.main() except KeyboardInterrupt: print('\n') - - -if __name__ == '__main__': - main() diff --git a/swift/cli/ring_builder_analyzer.py b/swift/cli/ring_builder_analyzer.py index 1ae35ae031..85526d8f90 100644 --- a/swift/cli/ring_builder_analyzer.py +++ b/swift/cli/ring_builder_analyzer.py @@ -1,4 +1,3 @@ -#! /usr/bin/env python # Copyright (c) 2015 Samuel Merritt # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/swift/cli/ringbuilder.py b/swift/cli/ringbuilder.py old mode 100755 new mode 100644 index c2782f2795..076f1975f8 --- a/swift/cli/ringbuilder.py +++ b/swift/cli/ringbuilder.py @@ -1,4 +1,3 @@ -#! /usr/bin/env python # Copyright (c) 2010-2012 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,6 +24,7 @@ from os.path import basename, abspath, dirname, exists, join as pathjoin from sys import argv as sys_argv, exit, stderr, stdout from textwrap import wrap from time import time +from datetime import timedelta import optparse import math @@ -32,7 +32,7 @@ from six.moves import zip as izip from six.moves import input from swift.common import exceptions -from swift.common.ring import RingBuilder, Ring +from swift.common.ring import RingBuilder, Ring, RingData from swift.common.ring.builder import MAX_BALANCE from swift.common.ring.utils import validate_args, \ validate_and_normalize_ip, build_dev_from_opts, \ @@ -389,11 +389,12 @@ def _parse_remove_values(argvish): class Commands(object): - + @staticmethod def unknown(): print('Unknown command: %s' % argv[2]) exit(EXIT_ERROR) + @staticmethod def create(): """ swift-ring-builder create @@ -417,6 +418,7 @@ swift-ring-builder create builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def default(): """ swift-ring-builder @@ -444,9 +446,28 @@ swift-ring-builder builder.parts, builder.replicas, regions, zones, dev_count, balance, dispersion_trailer)) print('The minimum number of hours before a partition can be ' - 'reassigned is %s' % builder.min_part_hours) + 'reassigned is %s (%s remaining)' % ( + builder.min_part_hours, + timedelta(seconds=builder.min_part_seconds_left))) print('The overload factor is %0.2f%% (%.6f)' % ( builder.overload * 100, builder.overload)) + + # compare ring file against builder file + if not exists(ring_file): + print('Ring file %s not found, ' + 'probably it hasn\'t been written yet' % ring_file) + else: + builder_dict = builder.get_ring().to_dict() + try: + ring_dict = RingData.load(ring_file).to_dict() + except Exception as exc: + print('Ring file %s is invalid: %r' % (ring_file, exc)) + else: + if builder_dict == ring_dict: + 
print('Ring file %s is up-to-date' % ring_file) + else: + print('Ring file %s is obsolete' % ring_file) + if builder.devs: balance_per_dev = builder._build_balance_per_dev() print('Devices: id region zone ip address port ' @@ -463,6 +484,7 @@ swift-ring-builder dev['meta'])) exit(EXIT_SUCCESS) + @staticmethod def search(): """ swift-ring-builder search @@ -513,6 +535,7 @@ swift-ring-builder search dev['meta'])) exit(EXIT_SUCCESS) + @staticmethod def list_parts(): """ swift-ring-builder list_parts [] .. @@ -562,6 +585,7 @@ swift-ring-builder list_parts print('%9d %7d' % (partition, count)) exit(EXIT_SUCCESS) + @staticmethod def add(): """ swift-ring-builder add @@ -612,6 +636,7 @@ swift-ring-builder add builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def set_weight(): """ swift-ring-builder set_weight @@ -644,6 +669,7 @@ swift-ring-builder set_weight builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def set_info(): """ swift-ring-builder set_info @@ -689,6 +715,7 @@ swift-ring-builder set_info builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def remove(): """ swift-ring-builder remove [search-value ...] @@ -754,6 +781,7 @@ swift-ring-builder search builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def rebalance(): """ swift-ring-builder rebalance [options] @@ -787,6 +815,14 @@ swift-ring-builder rebalance [options] handler.setFormatter(formatter) logger.addHandler(handler) + if builder.min_part_seconds_left > 0 and not options.force: + print('No partitions could be reassigned.') + print('The time between rebalances must be at least ' + 'min_part_hours: %s hours (%s remaining)' % ( + builder.min_part_hours, + timedelta(seconds=builder.min_part_seconds_left))) + exit(EXIT_WARNING) + devs_changed = builder.devs_changed try: last_balance = builder.get_balance() @@ -802,8 +838,7 @@ swift-ring-builder rebalance [options] exit(EXIT_ERROR) if not (parts or options.force or removed_devs): print('No partitions could be reassigned.') - print('Either none need to be or none can be due to ' - 'min_part_hours [%s].' 
% builder.min_part_hours) + print('There is no need to do so at this time') exit(EXIT_WARNING) # If we set device's weight to zero, currently balance will be set # special value(MAX_BALANCE) until zero weighted device return all @@ -859,6 +894,7 @@ swift-ring-builder rebalance [options] builder.save(builder_file) exit(status) + @staticmethod def dispersion(): """ swift-ring-builder dispersion [options] @@ -953,6 +989,7 @@ swift-ring-builder dispersion [options] print(template % args) exit(status) + @staticmethod def validate(): """ swift-ring-builder validate @@ -961,6 +998,7 @@ swift-ring-builder validate builder.validate() exit(EXIT_SUCCESS) + @staticmethod def write_ring(): """ swift-ring-builder write_ring @@ -982,6 +1020,7 @@ swift-ring-builder write_ring ring_data.save(ring_file) exit(EXIT_SUCCESS) + @staticmethod def write_builder(): """ swift-ring-builder write_builder [min_part_hours] @@ -1028,6 +1067,7 @@ swift-ring-builder write_builder [min_part_hours] builder.devs[dev_id]['parts'] += 1 builder.save(builder_file) + @staticmethod def pretend_min_part_hours_passed(): """ swift-ring-builder pretend_min_part_hours_passed @@ -1046,6 +1086,7 @@ swift-ring-builder pretend_min_part_hours_passed builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def set_min_part_hours(): """ swift-ring-builder set_min_part_hours @@ -1062,6 +1103,7 @@ swift-ring-builder set_min_part_hours builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def set_replicas(): """ swift-ring-builder set_replicas @@ -1094,6 +1136,7 @@ swift-ring-builder set_replicas builder.save(builder_file) exit(EXIT_SUCCESS) + @staticmethod def set_overload(): """ swift-ring-builder set_overload [%] @@ -1150,11 +1193,12 @@ def main(arguments=None): globals()) print(Commands.default.__doc__.strip()) print() - cmds = [c for c, f in Commands.__dict__.items() - if f.__doc__ and not c.startswith('_') and c != 'default'] + cmds = [c for c in dir(Commands) + if getattr(Commands, c).__doc__ and not c.startswith('_') and + c != 'default'] cmds.sort() for cmd in cmds: - print(Commands.__dict__[cmd].__doc__.strip()) + print(getattr(Commands, cmd).__doc__.strip()) print() print(parse_search_value.__doc__.strip()) print() @@ -1199,13 +1243,9 @@ def main(arguments=None): if argv[0].endswith('-safe'): try: with lock_parent_directory(abspath(builder_file), 15): - Commands.__dict__.get(command, Commands.unknown.__func__)() + getattr(Commands, command, Commands.unknown)() except exceptions.LockTimeout: print("Ring/builder dir currently locked.") exit(2) else: - Commands.__dict__.get(command, Commands.unknown.__func__)() - - -if __name__ == '__main__': - main() + getattr(Commands, command, Commands.unknown)() diff --git a/swift/common/direct_client.py b/swift/common/direct_client.py index bbb1ca4b90..96f2579de0 100644 --- a/swift/common/direct_client.py +++ b/swift/common/direct_client.py @@ -25,6 +25,7 @@ from time import time from eventlet import sleep, Timeout import six +import six.moves.cPickle as pickle from six.moves.http_client import HTTPException from swift.common.bufferedhttp import http_connect @@ -49,6 +50,30 @@ class DirectClientException(ClientException): http_reason=resp.reason, http_headers=headers) +def _make_req(node, part, method, path, _headers, stype, + conn_timeout=5, response_timeout=15): + """ + Make request to backend storage node. + (i.e. 'Account', 'Container', 'Object') + :param node: a node dict from a ring + :param part: an integer, the partion number + :param method: a string, the HTTP method (e.g. 
'PUT', 'DELETE', etc) + :param path: a string, the request path + :param headers: a dict, header name => value + :param stype: a string, describing the type of service + :returns: an HTTPResponse object + """ + with Timeout(conn_timeout): + conn = http_connect(node['ip'], node['port'], node['device'], part, + method, path, headers=_headers) + with Timeout(response_timeout): + resp = conn.getresponse() + resp.read() + if not is_success(resp.status): + raise DirectClientException(stype, method, node, part, path, resp) + return resp + + def _get_direct_account_container(path, stype, node, part, marker=None, limit=None, prefix=None, delimiter=None, conn_timeout=5, @@ -76,6 +101,7 @@ def _get_direct_account_container(path, stype, node, part, if not is_success(resp.status): resp.read() raise DirectClientException(stype, 'GET', node, part, path, resp) + resp_headers = HeaderKeyDict() for header, value in resp.getheaders(): resp_headers[header] = value @@ -126,16 +152,8 @@ def direct_delete_account(node, part, account, conn_timeout=5, headers = {} path = '/%s' % account - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'DELETE', path, - headers=gen_headers(headers, True)) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise DirectClientException('Account', 'DELETE', - node, part, path, resp) + _make_req(node, part, 'DELETE', path, gen_headers(headers, True), + 'Account', conn_timeout, response_timeout) def direct_head_container(node, part, account, container, conn_timeout=5, @@ -153,15 +171,9 @@ def direct_head_container(node, part, account, container, conn_timeout=5, :raises ClientException: HTTP HEAD request failed """ path = '/%s/%s' % (account, container) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'HEAD', path, headers=gen_headers()) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise DirectClientException('Container', 'HEAD', - node, part, path, resp) + resp = _make_req(node, part, 'HEAD', path, gen_headers(), + 'Container', conn_timeout, response_timeout) + resp_headers = HeaderKeyDict() for header, value in resp.getheaders(): resp_headers[header] = value @@ -215,16 +227,8 @@ def direct_delete_container(node, part, account, container, conn_timeout=5, path = '/%s/%s' % (account, container) add_timestamp = 'x-timestamp' not in (k.lower() for k in headers) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'DELETE', path, - headers=gen_headers(headers, add_timestamp)) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise DirectClientException('Container', 'DELETE', - node, part, path, resp) + _make_req(node, part, 'DELETE', path, gen_headers(headers, add_timestamp), + 'Container', conn_timeout, response_timeout) def direct_put_container_object(node, part, account, container, obj, @@ -236,17 +240,9 @@ def direct_put_container_object(node, part, account, container, obj, have_x_timestamp = 'x-timestamp' in (k.lower() for k in headers) path = '/%s/%s/%s' % (account, container, obj) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'PUT', path, - headers=gen_headers(headers, - add_ts=(not have_x_timestamp))) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not 
is_success(resp.status): - raise DirectClientException('Container', 'PUT', - node, part, path, resp) + _make_req(node, part, 'PUT', path, + gen_headers(headers, add_ts=(not have_x_timestamp)), + 'Container', conn_timeout, response_timeout) def direct_delete_container_object(node, part, account, container, obj, @@ -259,16 +255,8 @@ def direct_delete_container_object(node, part, account, container, obj, k.lower() for k in headers)) path = '/%s/%s/%s' % (account, container, obj) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'DELETE', path, headers=headers) - - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise DirectClientException('Container', 'DELETE', - node, part, path, resp) + _make_req(node, part, 'DELETE', path, headers, + 'Container', conn_timeout, response_timeout) def direct_head_object(node, part, account, container, obj, conn_timeout=5, @@ -293,15 +281,9 @@ def direct_head_object(node, part, account, container, obj, conn_timeout=5, headers = gen_headers(headers) path = '/%s/%s/%s' % (account, container, obj) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'HEAD', path, headers=headers) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise DirectClientException('Object', 'HEAD', - node, part, path, resp) + resp = _make_req(node, part, 'HEAD', path, headers, + 'Object', conn_timeout, response_timeout) + resp_headers = HeaderKeyDict() for header, value in resp.getheaders(): resp_headers[header] = value @@ -337,8 +319,8 @@ def direct_get_object(node, part, account, container, obj, conn_timeout=5, resp = conn.getresponse() if not is_success(resp.status): resp.read() - raise DirectClientException('Object', 'GET', - node, part, path, resp) + raise DirectClientException('Object', 'GET', node, part, path, resp) + if resp_chunk_size: def _object_body(): @@ -453,15 +435,8 @@ def direct_post_object(node, part, account, container, name, headers, :raises ClientException: HTTP POST request failed """ path = '/%s/%s/%s' % (account, container, name) - with Timeout(conn_timeout): - conn = http_connect(node['ip'], node['port'], node['device'], part, - 'POST', path, headers=gen_headers(headers, True)) - with Timeout(response_timeout): - resp = conn.getresponse() - resp.read() - if not is_success(resp.status): - raise DirectClientException('Object', 'POST', - node, part, path, resp) + _make_req(node, part, 'POST', path, gen_headers(headers, True), + 'Object', conn_timeout, response_timeout) def direct_delete_object(node, part, account, container, obj, @@ -485,15 +460,36 @@ def direct_delete_object(node, part, account, container, obj, k.lower() for k in headers)) path = '/%s/%s/%s' % (account, container, obj) + _make_req(node, part, 'DELETE', path, headers, + 'Object', conn_timeout, response_timeout) + + +def direct_get_suffix_hashes(node, part, suffixes, conn_timeout=5, + response_timeout=15, headers=None): + """ + Get suffix hashes directly from the object server. 
+ + :param node: node dictionary from the ring + :param part: partition the container is on + :param conn_timeout: timeout in seconds for establishing the connection + :param response_timeout: timeout in seconds for getting the response + :param headers: dict to be passed into HTTPConnection headers + :returns: dict of suffix hashes + :raises ClientException: HTTP REPLICATE request failed + """ + if headers is None: + headers = {} + + path = '/%s' % '-'.join(suffixes) with Timeout(conn_timeout): conn = http_connect(node['ip'], node['port'], node['device'], part, - 'DELETE', path, headers=headers) + 'REPLICATE', path, headers=gen_headers(headers)) with Timeout(response_timeout): resp = conn.getresponse() - resp.read() if not is_success(resp.status): - raise DirectClientException('Object', 'DELETE', + raise DirectClientException('Object', 'REPLICATE', node, part, path, resp) + return pickle.loads(resp.read()) def retry(func, *args, **kwargs): diff --git a/swift/common/internal_client.py b/swift/common/internal_client.py index 7dceda8427..2413d9ad6d 100644 --- a/swift/common/internal_client.py +++ b/swift/common/internal_client.py @@ -20,15 +20,16 @@ import six from six.moves import range from six.moves import urllib import struct -from sys import exc_info +from sys import exc_info, exit import zlib from swift import gettext_ as _ from time import gmtime, strftime, time from zlib import compressobj -from swift.common.utils import quote +from swift.common.exceptions import ClientException from swift.common.http import HTTP_NOT_FOUND, HTTP_MULTIPLE_CHOICES from swift.common.swob import Request +from swift.common.utils import quote from swift.common.wsgi import loadapp, pipeline_property @@ -807,9 +808,14 @@ class SimpleClient(object): self.attempts += 1 try: return self.base_request(method, **kwargs) - except (socket.error, httplib.HTTPException, urllib2.URLError): + except (socket.error, httplib.HTTPException, urllib2.URLError) \ + as err: if self.attempts > retries: - raise + if isinstance(err, urllib2.HTTPError): + raise ClientException('Raise too many retries', + http_status=err.getcode()) + else: + raise sleep(backoff) backoff = min(backoff * 2, self.max_backoff) diff --git a/swift/common/manager.py b/swift/common/manager.py index 03eb0479e9..e67f8a32f7 100644 --- a/swift/common/manager.py +++ b/swift/common/manager.py @@ -162,6 +162,16 @@ def safe_kill(pid, sig, name): os.kill(pid, sig) +def kill_group(pid, sig): + """Send signal to process group + + : param pid: process id + : param sig: signal to send + """ + # Negative PID means process group + os.kill(-pid, sig) + + class UnknownCommandError(Exception): pass @@ -285,11 +295,27 @@ class Manager(object): return 0 # reached interval n watch_pids w/o killing all servers + kill_after_timeout = kwargs.get('kill_after_timeout', False) for server, pids in server_pids.items(): if not killed_pids.issuperset(pids): # some pids of this server were not killed - print(_('Waited %s seconds for %s to die; giving up') % ( - kill_wait, server)) + if kill_after_timeout: + print(_('Waited %s seconds for %s to die; killing') % ( + kill_wait, server)) + # Send SIGKILL to all remaining pids + for pid in set(pids.keys()) - killed_pids: + print(_('Signal %s pid: %s signal: %s') % ( + server, pid, signal.SIGKILL)) + # Send SIGKILL to process group + try: + kill_group(pid, signal.SIGKILL) + except OSError as e: + # PID died before kill_group can take action? 
+ if e.errno != errno.ESRCH: + raise e + else: + print(_('Waited %s seconds for %s to die; giving up') % ( + kill_wait, server)) return 1 @command diff --git a/swift/common/memcached.py b/swift/common/memcached.py index bb359539ae..9640ac6f8f 100644 --- a/swift/common/memcached.py +++ b/swift/common/memcached.py @@ -56,7 +56,7 @@ from eventlet.green import socket from eventlet.pools import Pool from eventlet import Timeout from six.moves import range - +from swift.common import utils DEFAULT_MEMCACHED_PORT = 11211 @@ -101,23 +101,28 @@ class MemcachePoolTimeout(Timeout): class MemcacheConnPool(Pool): - """Connection pool for Memcache Connections""" + """ + Connection pool for Memcache Connections + + The *server* parameter can be a hostname, an IPv4 address, or an IPv6 + address with an optional port. See + :func:`swift.common.utils.parse_socket_string` for details. + """ def __init__(self, server, size, connect_timeout): Pool.__init__(self, max_size=size) - self.server = server + self.host, self.port = utils.parse_socket_string( + server, DEFAULT_MEMCACHED_PORT) self._connect_timeout = connect_timeout def create(self): - if ':' in self.server: - host, port = self.server.split(':') - else: - host = self.server - port = DEFAULT_MEMCACHED_PORT - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + addrs = socket.getaddrinfo(self.host, self.port, socket.AF_UNSPEC, + socket.SOCK_STREAM) + family, socktype, proto, canonname, sockaddr = addrs[0] + sock = socket.socket(family, socket.SOCK_STREAM) sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) with Timeout(self._connect_timeout): - sock.connect((host, int(port))) + sock.connect(sockaddr) return (sock.makefile(), sock) def get(self): diff --git a/swift/common/middleware/bulk.py b/swift/common/middleware/bulk.py index 7fb227b457..0dd4aa12b2 100644 --- a/swift/common/middleware/bulk.py +++ b/swift/common/middleware/bulk.py @@ -13,6 +13,183 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Middleware that will perform many operations on a single request. + +--------------- +Extract Archive +--------------- + +Expand tar files into a Swift account. Request must be a PUT with the +query parameter ``?extract-archive=format`` specifying the format of archive +file. Accepted formats are tar, tar.gz, and tar.bz2. + +For a PUT to the following url:: + + /v1/AUTH_Account/$UPLOAD_PATH?extract-archive=tar.gz + +UPLOAD_PATH is where the files will be expanded to. UPLOAD_PATH can be a +container, a pseudo-directory within a container, or an empty string. The +destination of a file in the archive will be built as follows:: + + /v1/AUTH_Account/$UPLOAD_PATH/$FILE_PATH + +Where FILE_PATH is the file name from the listing in the tar file. + +If the UPLOAD_PATH is an empty string, containers will be auto created +accordingly and files in the tar that would not map to any container (files +in the base directory) will be ignored. + +Only regular files will be uploaded. Empty directories, symlinks, etc will +not be uploaded. + +------------ +Content Type +------------ + +If the content-type header is set in the extract-archive call, Swift will +assign that content-type to all the underlying files. The bulk middleware +will extract the archive file and send the internal files using PUT +operations using the same headers from the original request +(e.g. auth-tokens, content-Type, etc.). 
Notice that any middleware call +that follows the bulk middleware does not know if this was a bulk request +or if these were individual requests sent by the user. + +In order to make Swift detect the content-type for the files based on the +file extension, the content-type in the extract-archive call should not be +set. Alternatively, it is possible to explicitly tell Swift to detect the +content type using this header:: + + X-Detect-Content-Type: true + +For example:: + + curl -X PUT http://127.0.0.1/v1/AUTH_acc/cont/$?extract-archive=tar + -T backup.tar + -H "Content-Type: application/x-tar" + -H "X-Auth-Token: xxx" + -H "X-Detect-Content-Type: true" + +------------------ +Assigning Metadata +------------------ + +The tar file format (1) allows for UTF-8 key/value pairs to be associated +with each file in an archive. If a file has extended attributes, then tar +will store those as key/value pairs. The bulk middleware can read those +extended attributes and convert them to Swift object metadata. Attributes +starting with "user.meta" are converted to object metadata, and +"user.mime_type" is converted to Content-Type. + +For example:: + + setfattr -n user.mime_type -v "application/python-setup" setup.py + setfattr -n user.meta.lunch -v "burger and fries" setup.py + setfattr -n user.meta.dinner -v "baked ziti" setup.py + setfattr -n user.stuff -v "whee" setup.py + +Will get translated to headers:: + + Content-Type: application/python-setup + X-Object-Meta-Lunch: burger and fries + X-Object-Meta-Dinner: baked ziti + +The bulk middleware will handle xattrs stored by both GNU and BSD tar (2). +Only xattrs ``user.mime_type`` and ``user.meta.*`` are processed. Other +attributes are ignored. + +Notes: + +(1) The POSIX 1003.1-2001 (pax) format. The default format on GNU tar +1.27.1 or later. + +(2) Even with pax-format tarballs, different encoders store xattrs slightly +differently; for example, GNU tar stores the xattr "user.userattribute" as +pax header "SCHILY.xattr.user.userattribute", while BSD tar (which uses +libarchive) stores it as "LIBARCHIVE.xattr.user.userattribute". + +-------- +Response +-------- + +The response from bulk operations functions differently from other Swift +responses. This is because a short request body sent from the client could +result in many operations on the proxy server and precautions need to be +made to prevent the request from timing out due to lack of activity. To +this end, the client will always receive a 200 OK response, regardless of +the actual success of the call. The body of the response must be parsed to +determine the actual success of the operation. In addition to this the +client may receive zero or more whitespace characters prepended to the +actual response body while the proxy server is completing the request. + +The format of the response body defaults to text/plain but can be either +json or xml depending on the ``Accept`` header. Acceptable formats are +``text/plain``, ``application/json``, ``application/xml``, and ``text/xml``. +An example body is as follows:: + + {"Response Status": "201 Created", + "Response Body": "", + "Errors": [], + "Number Files Created": 10} + +If all valid files were uploaded successfully the Response Status will be +201 Created. If any files failed to be created the response code +corresponds to the subrequest's error. Possible codes are 400, 401, 502 (on +server errors), etc. In both cases the response body will specify the +number of files successfully uploaded and a list of the files that failed. 
+ +There are proxy logs created for each file (which becomes a subrequest) in +the tar. The subrequest's proxy log will have a swift.source set to "EA" +the log's content length will reflect the unzipped size of the file. If +double proxy-logging is used the leftmost logger will not have a +swift.source set and the content length will reflect the size of the +payload sent to the proxy (the unexpanded size of the tar.gz). + +----------- +Bulk Delete +----------- + +Will delete multiple objects or containers from their account with a +single request. Responds to POST requests with query parameter +``?bulk-delete`` set. The request url is your storage url. The Content-Type +should be set to ``text/plain``. The body of the POST request will be a +newline separated list of url encoded objects to delete. You can delete +10,000 (configurable) objects per request. The objects specified in the +POST request body must be URL encoded and in the form:: + + /container_name/obj_name + +or for a container (which must be empty at time of delete):: + + /container_name + +The response is similar to extract archive as in every response will be a +200 OK and you must parse the response body for actual results. An example +response is:: + + {"Number Not Found": 0, + "Response Status": "200 OK", + "Response Body": "", + "Errors": [], + "Number Deleted": 6} + +If all items were successfully deleted (or did not exist), the Response +Status will be 200 OK. If any failed to delete, the response code +corresponds to the subrequest's error. Possible codes are 400, 401, 502 (on +server errors), etc. In all cases the response body will specify the number +of items successfully deleted, not found, and a list of those that failed. +The return body will be formatted in the way specified in the request's +``Accept`` header. Acceptable formats are ``text/plain``, ``application/json``, +``application/xml``, and ``text/xml``. + +There are proxy logs created for each object or container (which becomes a +subrequest) that is deleted. The subrequest's proxy log will have a +swift.source set to "BD" the log's content length of 0. If double +proxy-logging is used the leftmost logger will not have a +swift.source set and the content length will reflect the size of the +payload sent to the proxy (the list of objects/containers to be deleted). +""" + import json from six.moves.urllib.parse import quote, unquote import tarfile @@ -94,170 +271,6 @@ def pax_key_to_swift_header(pax_key): class Bulk(object): - """ - Middleware that will do many operations on a single request. - - Extract Archive: - - Expand tar files into a swift account. Request must be a PUT with the - query parameter ?extract-archive=format specifying the format of archive - file. Accepted formats are tar, tar.gz, and tar.bz2. - - For a PUT to the following url: - - /v1/AUTH_Account/$UPLOAD_PATH?extract-archive=tar.gz - - UPLOAD_PATH is where the files will be expanded to. UPLOAD_PATH can be a - container, a pseudo-directory within a container, or an empty string. The - destination of a file in the archive will be built as follows: - - /v1/AUTH_Account/$UPLOAD_PATH/$FILE_PATH - - Where FILE_PATH is the file name from the listing in the tar file. - - If the UPLOAD_PATH is an empty string, containers will be auto created - accordingly and files in the tar that would not map to any container (files - in the base directory) will be ignored. - - Only regular files will be uploaded. Empty directories, symlinks, etc will - not be uploaded. 
- - Content Type: - - If the content-type header is set in the extract-archive call, Swift will - assign that content-type to all the underlying files. The bulk middleware - will extract the archive file and send the internal files using PUT - operations using the same headers from the original request - (e.g. auth-tokens, content-Type, etc.). Notice that any middleware call - that follows the bulk middleware does not know if this was a bulk request - or if these were individual requests sent by the user. - - In order to make Swift detect the content-type for the files based on the - file extension, the content-type in the extract-archive call should not be - set. Alternatively, it is possible to explicitly tell swift to detect the - content type using this header: - - X-Detect-Content-Type:true - - For example: - - curl -X PUT http://127.0.0.1/v1/AUTH_acc/cont/$?extract-archive=tar -T - backup.tar -H "Content-Type: application/x-tar" -H "X-Auth-Token: xxx" - -H "X-Detect-Content-Type:true" - - Assigning Metadata: - - The tar file format (1) allows for UTF-8 key/value pairs to be associated - with each file in an archive. If a file has extended attributes, then tar - will store those as key/value pairs. The bulk middleware can read those - extended attributes and convert them to Swift object metadata. Attributes - starting with "user.meta" are converted to object metadata, and - "user.mime_type" is converted to Content-Type. - - For example: - - setfattr -n user.mime_type -v "application/python-setup" setup.py - setfattr -n user.meta.lunch -v "burger and fries" setup.py - setfattr -n user.meta.dinner -v "baked ziti" setup.py - setfattr -n user.stuff -v "whee" setup.py - - Will get translated to headers: - - Content-Type: application/python-setup - X-Object-Meta-Lunch: burger and fries - X-Object-Meta-Dinner: baked ziti - - The bulk middleware will handle xattrs stored by both GNU and BSD tar (2). - Only xattrs user.mime_type and user.meta.* are processed. Other attributes - are ignored. - - Notes: - - (1) The POSIX 1003.1-2001 (pax) format. The default format on GNU tar - 1.27.1 or later. - - (2) Even with pax-format tarballs, different encoders store xattrs slightly - differently; for example, GNU tar stores the xattr "user.userattribute" as - pax header "SCHILY.xattr.user.userattribute", while BSD tar (which uses - libarchive) stores it as "LIBARCHIVE.xattr.user.userattribute". - - Response: - - The response from bulk operations functions differently from other swift - responses. This is because a short request body sent from the client could - result in many operations on the proxy server and precautions need to be - made to prevent the request from timing out due to lack of activity. To - this end, the client will always receive a 200 OK response, regardless of - the actual success of the call. The body of the response must be parsed to - determine the actual success of the operation. In addition to this the - client may receive zero or more whitespace characters prepended to the - actual response body while the proxy server is completing the request. - - The format of the response body defaults to text/plain but can be either - json or xml depending on the Accept header. Acceptable formats are - text/plain, application/json, application/xml, and text/xml. An example - body is as follows: - - {"Response Status": "201 Created", - "Response Body": "", - "Errors": [], - "Number Files Created": 10} - - If all valid files were uploaded successfully the Response Status will be - 201 Created. 
If any files failed to be created the response code - corresponds to the subrequest's error. Possible codes are 400, 401, 502 (on - server errors), etc. In both cases the response body will specify the - number of files successfully uploaded and a list of the files that failed. - - There are proxy logs created for each file (which becomes a subrequest) in - the tar. The subrequest's proxy log will have a swift.source set to "EA" - the log's content length will reflect the unzipped size of the file. If - double proxy-logging is used the leftmost logger will not have a - swift.source set and the content length will reflect the size of the - payload sent to the proxy (the unexpanded size of the tar.gz). - - Bulk Delete: - - Will delete multiple objects or containers from their account with a - single request. Responds to POST requests with query parameter - ?bulk-delete set. The request url is your storage url. The Content-Type - should be set to text/plain. The body of the POST request will be a - newline separated list of url encoded objects to delete. You can delete - 10,000 (configurable) objects per request. The objects specified in the - POST request body must be URL encoded and in the form: - - /container_name/obj_name - - or for a container (which must be empty at time of delete) - - /container_name - - The response is similar to extract archive as in every response will be a - 200 OK and you must parse the response body for actual results. An example - response is: - - {"Number Not Found": 0, - "Response Status": "200 OK", - "Response Body": "", - "Errors": [], - "Number Deleted": 6} - - If all items were successfully deleted (or did not exist), the Response - Status will be 200 OK. If any failed to delete, the response code - corresponds to the subrequest's error. Possible codes are 400, 401, 502 (on - server errors), etc. In all cases the response body will specify the number - of items successfully deleted, not found, and a list of those that failed. - The return body will be formatted in the way specified in the request's - Accept header. Acceptable formats are text/plain, application/json, - application/xml, and text/xml. - - There are proxy logs created for each object or container (which becomes a - subrequest) that is deleted. The subrequest's proxy log will have a - swift.source set to "BD" the log's content length of 0. If double - proxy-logging is used the leftmost logger will not have a - swift.source set and the content length will reflect the size of the - payload sent to the proxy (the list of objects/containers to be deleted). - """ def __init__(self, app, conf, max_containers_per_extraction=10000, max_failed_extractions=1000, max_deletes_per_request=10000, diff --git a/swift/common/middleware/slo.py b/swift/common/middleware/slo.py index 048d8b5add..a5ab1085b2 100644 --- a/swift/common/middleware/slo.py +++ b/swift/common/middleware/slo.py @@ -57,12 +57,11 @@ The format of the list will be: "range": "1048576-2097151"}, ...] The number of object segments is limited to a configurable amount, default -1000. Each segment, except for the final one, must be at least 1 megabyte -(configurable). On upload, the middleware will head every segment passed in to -verify: +1000. Each segment must be at least 1 byte. On upload, the middleware will +head every segment passed in to verify: 1. the segment exists (i.e. the HEAD was successful); - 2. the segment meets minimum size requirements (if not the last segment); + 2. the segment meets minimum size requirements; 3. 
-
- There are proxy logs created for each object or container (which becomes a
- subrequest) that is deleted. The subrequest's proxy log will have a
- swift.source set to "BD" the log's content length of 0. If double
- proxy-logging is used the leftmost logger will not have a
- swift.source set and the content length will reflect the size of the
- payload sent to the proxy (the list of objects/containers to be deleted).
- """

     def __init__(self, app, conf, max_containers_per_extraction=10000,
                  max_failed_extractions=1000, max_deletes_per_request=10000,
diff --git a/swift/common/middleware/slo.py b/swift/common/middleware/slo.py
index 048d8b5add..a5ab1085b2 100644
--- a/swift/common/middleware/slo.py
+++ b/swift/common/middleware/slo.py
@@ -57,12 +57,11 @@ The format of the list will be:
       "range": "1048576-2097151"}, ...]
 
 The number of object segments is limited to a configurable amount, default
-1000. Each segment, except for the final one, must be at least 1 megabyte
-(configurable). On upload, the middleware will head every segment passed in to
-verify:
+1000. Each segment must be at least 1 byte. On upload, the middleware will
+head every segment passed in to verify:
 
 1. the segment exists (i.e. the HEAD was successful);
- 2. the segment meets minimum size requirements (if not the last segment);
+ 2. the segment meets minimum size requirements;
 3. if the user provided a non-null etag, the etag matches;
 4. if the user provided a non-null size_bytes, the size_bytes matches; and
 5. if the user provided a range, it is a singular, syntactically correct range
@@ -121,8 +120,9 @@ finally bytes 2095104 through 2097152 (i.e., the last 2048 bytes) of
 .. note::
 
-    The minimum sized range is min_segment_size, which by
-    default is 1048576 (1MB).
+
+    The minimum sized range is 1 byte. This is the same as the minimum
+    segment size.
 
 -------------------------
@@ -221,7 +221,7 @@
 from swift.common.middleware.bulk import get_response_body, \
     ACCEPTABLE_FORMATS, Bulk
 
 
-DEFAULT_MIN_SEGMENT_SIZE = 1024 * 1024  # 1 MiB
+DEFAULT_RATE_LIMIT_UNDER_SIZE = 1024 * 1024  # 1 MiB
 DEFAULT_MAX_MANIFEST_SEGMENTS = 1000
 DEFAULT_MAX_MANIFEST_SIZE = 1024 * 1024 * 2  # 2 MiB
@@ -231,7 +231,7 @@
 OPTIONAL_SLO_KEYS = set(['range'])
 ALLOWED_SLO_KEYS = REQUIRED_SLO_KEYS | OPTIONAL_SLO_KEYS
 
 
-def parse_and_validate_input(req_body, req_path, min_segment_size):
+def parse_and_validate_input(req_body, req_path):
     """
     Given a request body, parses it and returns a list of dictionaries.
@@ -269,7 +269,6 @@ def parse_and_validate_input(req_body, req_path, min_segment_size):
     vrs, account, _junk = split_path(req_path, 3, 3, True)
 
     errors = []
-    num_segs = len(parsed_data)
     for seg_index, seg_dict in enumerate(parsed_data):
         if not isinstance(seg_dict, dict):
             errors.append("Index %d: not a JSON object" % seg_index)
@@ -315,10 +314,10 @@
             except (TypeError, ValueError):
                 errors.append("Index %d: invalid size_bytes" % seg_index)
                 continue
-            if (seg_size < min_segment_size and seg_index < num_segs - 1):
-                errors.append("Index %d: too small; each segment, except "
-                              "the last, must be at least %d bytes."
-                              % (seg_index, min_segment_size))
+            if seg_size < 1:
+                errors.append("Index %d: too small; each segment must be "
+                              "at least 1 byte."
+                              % (seg_index,))
                 continue
 
         obj_path = '/'.join(['', vrs, account, seg_dict['path'].lstrip('/')])
@@ -461,13 +460,13 @@ class SloGetContext(WSGIContext):
                 # no bytes are needed from this or any future segment
                 break
 
-            range = seg_dict.get('range')
-            if range is None:
+            seg_range = seg_dict.get('range')
+            if seg_range is None:
                 range_start, range_end = 0, seg_length - 1
             else:
                 # We already validated and supplied concrete values
                 # for the range on upload
-                range_start, range_end = map(int, range.split('-'))
+                range_start, range_end = map(int, seg_range.split('-'))
 
             if config_true_value(seg_dict.get('sub_slo')):
                 # do this check here so that we can avoid fetching this last
@@ -662,10 +661,17 @@ class SloGetContext(WSGIContext):
         plain_listing_iter = self._segment_listing_iterator(
             req, ver, account, segments)
 
+        def is_small_segment((seg_dict, start_byte, end_byte)):
+            start = 0 if start_byte is None else start_byte
+            end = int(seg_dict['bytes']) - 1 if end_byte is None else end_byte
+            is_small = (end - start + 1) < self.slo.rate_limit_under_size
+            return is_small
+
         ratelimited_listing_iter = RateLimitedIterator(
             plain_listing_iter,
             self.slo.rate_limit_segments_per_sec,
-            limit_after=self.slo.rate_limit_after_segment)
+            limit_after=self.slo.rate_limit_after_segment,
+            ratelimit_if=is_small_segment)
 
         # self._segment_listing_iterator gives us 3-tuples of (segment dict,
         # start byte, end byte), but SegmentedIterable wants (obj path, etag,
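The ratelimit_if hook added above means only segments smaller than rate_limit_under_size are counted against the segment rate limit; large segments already take long enough to serve. The sketch below illustrates that predicate idea in plain Python; it is not Swift's RateLimitedIterator implementation.

    import time

    def ratelimited(iterable, per_second, limit_after=0,
                    ratelimit_if=lambda item: True):
        # Yield every item, but sleep between items that the predicate
        # flags, once more than limit_after of them have been seen.
        pause = 1.0 / per_second if per_second > 0 else 0.0
        flagged = 0
        for item in iterable:
            if ratelimit_if(item):
                flagged += 1
                if pause and flagged > limit_after:
                    time.sleep(pause)
            yield item

    def is_small(seg):
        return seg['bytes'] < 1024 * 1024  # mirrors rate_limit_under_size

    segments = [{'bytes': 10}, {'bytes': 5 * 2 ** 20}, {'bytes': 100}]
    # The 5 MiB segment is never delayed; only the tiny ones are throttled.
    for seg in ratelimited(segments, per_second=1, limit_after=1,
                           ratelimit_if=is_small):
        pass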
""" - def __init__(self, app, conf, min_segment_size=DEFAULT_MIN_SEGMENT_SIZE, + def __init__(self, app, conf, max_manifest_segments=DEFAULT_MAX_MANIFEST_SEGMENTS, max_manifest_size=DEFAULT_MAX_MANIFEST_SIZE): self.conf = conf @@ -724,12 +730,13 @@ class StaticLargeObject(object): self.logger = get_logger(conf, log_route='slo') self.max_manifest_segments = max_manifest_segments self.max_manifest_size = max_manifest_size - self.min_segment_size = min_segment_size self.max_get_time = int(self.conf.get('max_get_time', 86400)) + self.rate_limit_under_size = int(self.conf.get( + 'rate_limit_under_size', DEFAULT_RATE_LIMIT_UNDER_SIZE)) self.rate_limit_after_segment = int(self.conf.get( 'rate_limit_after_segment', '10')) self.rate_limit_segments_per_sec = int(self.conf.get( - 'rate_limit_segments_per_sec', '0')) + 'rate_limit_segments_per_sec', '1')) self.bulk_deleter = Bulk(app, {}, logger=self.logger) def handle_multipart_get_or_head(self, req, start_response): @@ -783,7 +790,7 @@ class StaticLargeObject(object): raise HTTPLengthRequired(request=req) parsed_data = parse_and_validate_input( req.body_file.read(self.max_manifest_size), - req.path, self.min_segment_size) + req.path) problem_segments = [] if len(parsed_data) > self.max_manifest_segments: @@ -812,6 +819,7 @@ class StaticLargeObject(object): new_env['CONTENT_LENGTH'] = 0 new_env['HTTP_USER_AGENT'] = \ '%s MultipartPUT' % req.environ.get('HTTP_USER_AGENT') + if obj_path != last_obj_path: last_obj_path = obj_path head_seg_resp = \ @@ -840,12 +848,10 @@ class StaticLargeObject(object): seg_dict['range'] = '%d-%d' % (rng[0], rng[1] - 1) segment_length = rng[1] - rng[0] - if segment_length < self.min_segment_size and \ - index < len(parsed_data) - 1: + if segment_length < 1: problem_segments.append( [quote(obj_name), - 'Too small; each segment, except the last, must be ' - 'at least %d bytes.' % self.min_segment_size]) + 'Too small; each segment must be at least 1 byte.']) total_size += segment_length if seg_dict['size_bytes'] is not None and \ seg_dict['size_bytes'] != head_seg_resp.content_length: @@ -1045,18 +1051,17 @@ def filter_factory(global_conf, **local_conf): DEFAULT_MAX_MANIFEST_SEGMENTS)) max_manifest_size = int(conf.get('max_manifest_size', DEFAULT_MAX_MANIFEST_SIZE)) - min_segment_size = int(conf.get('min_segment_size', - DEFAULT_MIN_SEGMENT_SIZE)) register_swift_info('slo', max_manifest_segments=max_manifest_segments, max_manifest_size=max_manifest_size, - min_segment_size=min_segment_size) + # this used to be configurable; report it as 1 for + # clients that might still care + min_segment_size=1) def slo_filter(app): return StaticLargeObject( app, conf, max_manifest_segments=max_manifest_segments, - max_manifest_size=max_manifest_size, - min_segment_size=min_segment_size) + max_manifest_size=max_manifest_size) return slo_filter diff --git a/swift/common/middleware/staticweb.py b/swift/common/middleware/staticweb.py index 4c0b88ce7e..e7552e4f54 100644 --- a/swift/common/middleware/staticweb.py +++ b/swift/common/middleware/staticweb.py @@ -1,4 +1,4 @@ -# Copyright (c) 2010-2012 OpenStack Foundation +# Copyright (c) 2010-2016 OpenStack Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -68,6 +68,12 @@ the .../listing.css style sheet. If you "view source" in your browser on a listing page, you will see the well defined document structure that can be styled. 
diff --git a/swift/common/middleware/staticweb.py b/swift/common/middleware/staticweb.py
index 4c0b88ce7e..e7552e4f54 100644
--- a/swift/common/middleware/staticweb.py
+++ b/swift/common/middleware/staticweb.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2010-2012 OpenStack Foundation
+# Copyright (c) 2010-2016 OpenStack Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -68,6 +68,12 @@
 the .../listing.css style sheet. If you "view source" in your browser on a
 listing page, you will see the well defined document structure that can be
 styled.
+By default, the listings will be rendered with a label of
+"Listing of /v1/account/container/path". This can be altered by
+setting a ``X-Container-Meta-Web-Listings-Label: