From e05149571550c8d1dfbf4d0daf6478108413faba Mon Sep 17 00:00:00 2001
From: Chuck Thier
Date: Fri, 30 Jul 2010 14:57:20 -0500
Subject: [PATCH] Added initial admin guide, and added more to the deployment
 guide, plus cleaned up some of the doc string warnings

---
 .bzrignore                      |   1 +
 doc/source/admin_guide.rst      | 154 ++++++++++++++++++++++++++++++++
 doc/source/deployment_guide.rst |  97 +++++++++++++++++++-
 doc/source/index.rst            |   1 +
 etc/proxy-server.conf-sample    |   3 -
 swift/account/reaper.py         |  12 ++-
 swift/obj/replicator.py         |   2 +-
 swift/proxy/server.py           |   5 --
 8 files changed, 261 insertions(+), 14 deletions(-)
 create mode 100644 doc/source/admin_guide.rst

diff --git a/.bzrignore b/.bzrignore
index 14ebe255fd..489d097671 100644
--- a/.bzrignore
+++ b/.bzrignore
@@ -2,3 +2,4 @@
 *.sw?
 doc/build/*
 dist
+swift.egg-info
diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst
new file mode 100644
index 0000000000..aba899f5fa
--- /dev/null
+++ b/doc/source/admin_guide.rst
@@ -0,0 +1,154 @@
+=====================
+Administrator's Guide
+=====================
+
+------------------
+Managing the Rings
+------------------
+
+Removing a device from the ring::
+
+    swift-ring-builder <builder_file> remove <ip_address>/<device_name>
+
+Removing a server from the ring::
+
+    swift-ring-builder <builder_file> remove <ip_address>
+
+Adding devices to the ring:
+
+See :ref:`ring-preparing`
+
+To see which of a server's devices are in the ring::
+
+    swift-ring-builder <builder_file> search <ip_address>
+
+Once you are done with all changes to the ring, the changes need to be
+"committed"::
+
+    swift-ring-builder <builder_file> rebalance
+
+Once the new rings are built, they should be pushed out to all the servers
+in the cluster.
+
+-----------------------
+Handling System Updates
+-----------------------
+
+It is recommended that system updates and reboots are done a zone at a
+time. This allows the update to happen while the Swift cluster stays
+available and responsive to requests. It is also advisable, when updating
+a zone, to let it run for a while before updating the other zones, to make
+sure the update doesn't have any adverse effects.
+
+----------------------
+Handling Drive Failure
+----------------------
+
+In the event that a drive has failed, the first step is to make sure the
+drive is unmounted. This will make it easier for Swift to work around the
+failure until it has been resolved. If the drive is going to be replaced
+immediately, then it is best to simply replace the drive, format it,
+remount it, and let replication fill it up.
+
+If the drive can't be replaced immediately, then it is best to leave it
+unmounted and remove the drive from the ring. This will allow all the
+replicas that were on that drive to be replicated elsewhere until the
+drive is replaced. Once the drive is replaced, it can be re-added to the
+ring.
+
+-----------------------
+Handling Server Failure
+-----------------------
+
+If a server is having hardware issues, it is a good idea to make sure the
+Swift services are not running. This will allow Swift to work around the
+failure while you troubleshoot.
+
+If the server just needs a reboot, or a small amount of work that should
+only last a couple of hours, then it is probably best to let Swift work
+around the failure and get the machine fixed and back online. When the
+machine comes back online, replication will make sure that anything missed
+during the downtime gets updated.
+
+If the server has more serious issues, then it is probably best to remove
+all of the server's devices from the ring, as in the sketch below.
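+
+For example, a minimal sketch of removing a failed server's devices from
+the object ring, assuming a hypothetical builder file ``object.builder``
+and a failed server at 10.0.0.1 (see ``swift-ring-builder``'s help for
+the exact search syntax)::
+
+    swift-ring-builder object.builder search 10.0.0.1
+    swift-ring-builder object.builder remove 10.0.0.1
+    swift-ring-builder object.builder rebalance
+
+The same would be repeated for the account and container builder files,
+and the rebalanced rings pushed out to all servers in the cluster.
+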
+Once the server has been repaired and is back online, the server's devices
+can be added back into the ring. It is important that the devices are
+reformatted before being put back into the ring, as each device is likely
+to be responsible for a different set of partitions than before.
+
+-----------------------
+Detecting Failed Drives
+-----------------------
+
+It has been our experience that when a drive is about to fail, error
+messages will spew into `/var/log/kern.log`. There is a script called
+`swift-drive-audit` that can be run via cron to watch for bad drives. If
+errors are detected, it will unmount the bad drive, so that Swift can
+work around it. The script takes a configuration file with the following
+settings:
+
+[drive-audit]
+
+================== ========== ===========================================
+Option             Default    Description
+------------------ ---------- -------------------------------------------
+log_facility       LOG_LOCAL0 Syslog log facility
+log_level          INFO       Log level
+device_dir         /srv/node  Directory devices are mounted under
+minutes            60         Number of minutes to look back in
+                              `/var/log/kern.log`
+error_limit        1          Number of errors to find before a device
+                              is unmounted
+================== ========== ===========================================
+
+This script has only been tested on Ubuntu 10.04, so if you are using a
+different distro or OS, some care should be taken before using it in
+production.
+
+--------------
+Cluster Health
+--------------
+
+TODO: Greg, add docs here about how to use swift-stats-populate, and
+swift-stats-report
+
+------------------------
+Debugging Tips and Tools
+------------------------
+
+When a request is made to Swift, it is given a unique transaction id. This
+id should be in every log line that has to do with that request. This can
+be useful when looking at all the services that are hit by a single
+request.
+
+If you need to know where a specific account, container or object is in
+the cluster, `swift-get-nodes` will show the location where each replica
+should be.
+
+If you are looking at an object on the server and need more info,
+`swift-object-info` will display the account, container, replica locations
+and metadata of the object.
+
+If you want to audit the data for an account, `swift-account-audit` can be
+used to crawl the account, checking that all containers and objects can be
+found.
+
+-----------------
+Managing Services
+-----------------
+
+Swift services are generally managed with `swift-init`. The general usage
+is ``swift-init <service> <command>``, where <service> is the swift
+service to manage (for example object, container, account, proxy) and
+<command> is one of:
+
+========== ===============================================
+Command    Description
+---------- -----------------------------------------------
+start      Start the service
+stop       Stop the service
+restart    Restart the service
+shutdown   Attempt to gracefully shut down the service
+reload     Attempt to gracefully restart the service
+========== ===============================================
+
+A graceful shutdown or reload will finish any current requests before
+completely stopping the old service. There is also a special case of
+`swift-init all <command>`, which will run the command for all swift
+services.
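+
+For example (which services are deployed on a given host will vary)::
+
+    swift-init object restart   # restart the object services
+    swift-init proxy reload     # gracefully restart the proxy
+    swift-init all start        # start every swift service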
+
diff --git a/doc/source/deployment_guide.rst b/doc/source/deployment_guide.rst
index bc6fce3d21..0b614fe830 100644
--- a/doc/source/deployment_guide.rst
+++ b/doc/source/deployment_guide.rst
@@ -51,6 +51,8 @@
 Load balancing and network design is left as an exercise to the reader,
 but this is a very important part of the cluster, so time should be spent
 designing the network for a Swift cluster.
 
+.. _ring-preparing:
+
 ------------------
 Preparing the Ring
 ------------------
@@ -320,7 +322,7 @@
 per_diff           1000
 concurrency        8          Number of replication workers to spawn
 run_pause          30         Time in seconds to wait between replication
                               passes
-node_timeout       10         Request timeout to external services 
+node_timeout       10         Request timeout to external services
 conn_timeout       0.5        Connection timeout to external services
 reclaim_age        604800     Time elapsed in seconds before an account
                               can be reclaimed
@@ -353,6 +355,99 @@
 node_timeout       10         Request timeout to external services
 conn_timeout       0.5        Connection timeout to external services
 ================== ========== ===========================================
 
+--------------------------
+Proxy Server Configuration
+--------------------------
+
+[proxy-server]
+
+============================ =============== =============================
+Option                       Default         Description
+---------------------------- --------------- -----------------------------
+log_facility                 LOG_LOCAL0      Syslog log facility
+log_level                    INFO            Log level
+bind_ip                      0.0.0.0         IP Address for server to
+                                             bind to
+bind_port                    80              Port for server to bind to
+cert_file                                    Path to the ssl .crt
+key_file                                     Path to the ssl .key
+swift_dir                    /etc/swift      Swift configuration directory
+log_headers                  True            If True, log headers in each
+                                             request
+workers                      1               Number of workers to fork
+user                         swift           User to run as
+recheck_account_existence    60              Cache timeout in seconds to
+                                             send memcached for account
+                                             existence
+recheck_container_existence  60              Cache timeout in seconds to
+                                             send memcached for container
+                                             existence
+object_chunk_size            65536           Chunk size to read from
+                                             object servers
+client_chunk_size            65536           Chunk size to read from
+                                             clients
+memcache_servers             127.0.0.1:11211 Comma separated list of
+                                             memcached servers ip:port
+node_timeout                 10              Request timeout to external
+                                             services
+client_timeout               60              Timeout to read one chunk
+                                             from a client
+conn_timeout                 0.5             Connection timeout to
+                                             external services
+error_suppression_interval   60              Time in seconds that must
+                                             elapse since the last error
+                                             for a node to be considered
+                                             no longer error limited
+error_suppression_limit      10              Error count to consider a
+                                             node error limited
+rate_limit                   20000.0         Max container level ops per
+                                             second
+account_rate_limit           200.0           Max account level ops per
+                                             second
+rate_limit_account_whitelist                 Comma separated list of
+                                             account name hashes to not
+                                             rate limit
+rate_limit_account_blacklist                 Comma separated list of
+                                             account name hashes to block
+                                             completely
+============================ =============== =============================
+
+[auth-server]
+
+============ =================================== ========================
+Option       Default                             Description
+------------ ----------------------------------- ------------------------
+class        swift.common.auth.DevAuthMiddleware Auth wsgi middleware
+                                                 to use
+ip           127.0.0.1                           IP address of auth
+                                                 server
+port         11000                               Port of auth server
+node_timeout 10                                  Request timeout
+============ =================================== ========================
+
+------------------------
+Memcached Considerations
+------------------------
+
+Several of the services rely on Memcached for caching certain types of
+lookups, such as auth tokens and container/account existence. Swift does
+not do any caching of actual object data. Memcached should be able to run
+on any servers that have available RAM and CPU. At Rackspace, we run
+Memcached on the proxy servers. The `memcache_servers` config option
+in `proxy-server.conf` should contain all memcached servers.
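+
+For example, pointing the proxies at two memcached servers in
+`proxy-server.conf` (the addresses here are placeholders; the sample
+config documents the same comma separated ip:port format)::
+
+    [proxy-server]
+    memcache_servers = 10.1.2.3:11211,10.1.2.4:11211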
+
+-----------
+System Time
+-----------
+
+Time may be relative, but it is relatively important for Swift! Swift uses
+timestamps to determine which is the most recent version of an object.
+It is very important for the system time on each server in the cluster to
+be synced as closely as possible (more so for the proxy server, but in
+general it is a good idea for all the servers). At Rackspace, we use NTP
+with a local NTP server to ensure that the system times are as close as
+possible. This should also be monitored to ensure that the times do not
+vary too much.
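+
+For example, one quick way to spot-check clock offset on a server that
+runs ntpd (a sketch; use whatever your monitoring system provides)::
+
+    # lists each NTP peer along with its measured offset in milliseconds
+    ntpq -p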
 
 ----------------------
 General Service Tuning
 ----------------------
diff --git a/doc/source/index.rst b/doc/source/index.rst
index c6c01fa3cf..60d26dbe32 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -39,6 +39,7 @@ Deployment:
    :maxdepth: 1
 
    deployment_guide
+   admin_guide
 
 Source:
diff --git a/etc/proxy-server.conf-sample b/etc/proxy-server.conf-sample
index b938e5fdec..33d612c724 100644
--- a/etc/proxy-server.conf-sample
+++ b/etc/proxy-server.conf-sample
@@ -12,8 +12,6 @@
 # recheck_account_existence = 60
 # recheck_container_existence = 60
 # object_chunk_size = 8192
-# container_chunk_size = 8192
-# account_chunk_size = 8192
 # client_chunk_size = 8192
 # Default for memcache_servers is below, but you can specify multiple servers
 # with the format: 10.1.2.3:11211,10.1.2.4:11211
@@ -32,7 +30,6 @@
 # account_rate_limit = 200.0
 # rate_limit_account_whitelist = acct1,acct2,etc
 # rate_limit_account_blacklist = acct3,acct4,etc
-# container_put_lock_timeout = 5
 
 # [auth-server]
 # class = swift.common.auth.DevAuthMiddleware
diff --git a/swift/account/reaper.py b/swift/account/reaper.py
index 65b9a2a8ca..93b54e7525 100644
--- a/swift/account/reaper.py
+++ b/swift/account/reaper.py
@@ -201,10 +201,14 @@ class AccountReaper(object):
         :param partition: The partition in the account ring the account
                           is on.
         :param nodes: The primary node dicts for the account to delete.
 
-        * See also: :class:`swift.common.db.AccountBroker` for the broker
-          class.
-        * See also: :func:`swift.common.ring.Ring.get_nodes` for a description
-          of the node dicts.
+        .. seealso::
+
+            :class:`swift.common.db.AccountBroker` for the broker class.
+
+        .. seealso::
+
+            :func:`swift.common.ring.Ring.get_nodes` for a description
+            of the node dicts.
         """
         begin = time()
         account = broker.get_info()['account']
diff --git a/swift/obj/replicator.py b/swift/obj/replicator.py
index 33b6dcb94f..2ca0326b2a 100644
--- a/swift/obj/replicator.py
+++ b/swift/obj/replicator.py
@@ -123,7 +123,7 @@ def invalidate_hash(suffix_dir):
     Invalidates the hash for a suffix_dir in the partition's hashes file.
 
     :param suffix_dir: absolute path to suffix dir whose hash needs
-                       invalidating
+        invalidating
     """
     suffix = os.path.basename(suffix_dir)
diff --git a/swift/proxy/server.py b/swift/proxy/server.py
index c900108601..b09409a9b2 100644
--- a/swift/proxy/server.py
+++ b/swift/proxy/server.py
@@ -949,9 +949,6 @@ class BaseApplication(object):
         self.conn_timeout = float(conf.get('conn_timeout', 0.5))
         self.client_timeout = int(conf.get('client_timeout', 60))
         self.object_chunk_size = int(conf.get('object_chunk_size', 65536))
-        self.container_chunk_size = \
-            int(conf.get('container_chunk_size', 65536))
-        self.account_chunk_size = int(conf.get('account_chunk_size', 65536))
         self.client_chunk_size = int(conf.get('client_chunk_size', 65536))
         self.log_headers = conf.get('log_headers') == 'True'
         self.error_suppression_interval = \
@@ -979,8 +976,6 @@ class BaseApplication(object):
         self.rate_limit_blacklist = [x.strip() for x in
             conf.get('rate_limit_account_blacklist', '').split(',')
             if x.strip()]
-        self.container_put_lock_timeout = \
-            int(conf.get('container_put_lock_timeout', 5))
 
     def get_controller(self, path):
         """